Skip to content

Commit 707dabb

Browse files
par_iter_bp_with_buf
1 parent fec074a commit 707dabb

2 files changed

Lines changed: 112 additions & 60 deletions

File tree

src/packed_seq.rs

Lines changed: 84 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
use core::cell::RefCell;
2+
use std::ops::{Deref, DerefMut};
23
use traits::Seq;
34
use wide::u16x8;
45

@@ -35,6 +36,21 @@ impl RecycledBox {
3536
}
3637
}
3738

39+
impl Deref for RecycledBox {
40+
type Target = SimdBuf;
41+
42+
#[inline(always)]
43+
fn deref(&self) -> &Self::Target {
44+
self.get()
45+
}
46+
}
47+
impl DerefMut for RecycledBox {
48+
#[inline(always)]
49+
fn deref_mut(&mut self) -> &mut SimdBuf {
50+
self.get_mut()
51+
}
52+
}
53+
3854
impl Drop for RecycledBox {
3955
#[inline(always)]
4056
fn drop(&mut self) {
@@ -495,6 +511,72 @@ where
495511

496512
#[inline(always)]
497513
fn par_iter_bp(self, context: usize) -> PaddedIt<impl ChunkIt<S>> {
514+
// Boxed, so it doesn't consume precious registers.
515+
// Without this, cur is not always inlined into a register.
516+
let mut buf = IT_BUF.with_borrow_mut(|v| RecycledBox(v.pop()));
517+
buf.init_if_needed();
518+
self.par_iter_bp_with_buf(context, buf)
519+
}
520+
521+
#[inline(always)]
522+
fn par_iter_bp_delayed(self, context: usize, delay: Delay) -> PaddedIt<impl ChunkIt<(S, S)>> {
523+
self.par_iter_bp_delayed_with_factor(context, delay, 1)
524+
}
525+
526+
/// NOTE: When `self` does not start at a byte boundary, the
527+
/// 'delayed' character is not guaranteed to be `0`.
528+
#[inline(always)]
529+
fn par_iter_bp_delayed_2(
530+
self,
531+
context: usize,
532+
delay1: Delay,
533+
delay2: Delay,
534+
) -> PaddedIt<impl ChunkIt<(S, S, S)>> {
535+
self.par_iter_bp_delayed_2_with_factor(context, delay1, delay2, 1)
536+
}
537+
538+
/// Compares 29 characters at a time.
539+
fn cmp_lcp(&self, other: &Self) -> (std::cmp::Ordering, usize) {
540+
let mut lcp = 0;
541+
let min_len = self.len.min(other.len);
542+
for i in (0..min_len).step_by(Self::K64) {
543+
let len = (min_len - i).min(Self::K64);
544+
let this = self.slice(i..i + len);
545+
let other = other.slice(i..i + len);
546+
let this_word = this.as_u64();
547+
let other_word = other.as_u64();
548+
if this_word != other_word {
549+
// Unfortunately, bases are packed in little endian order, so the default order is reversed.
550+
let eq = this_word ^ other_word;
551+
let t = eq.trailing_zeros() as usize / B * B;
552+
lcp += t / B;
553+
let mask = (Self::CHAR_MASK) << t;
554+
return ((this_word & mask).cmp(&(other_word & mask)), lcp);
555+
}
556+
lcp += len;
557+
}
558+
(self.len.cmp(&other.len), lcp)
559+
}
560+
561+
#[inline(always)]
562+
fn get(&self, index: usize) -> u8 {
563+
let offset = self.offset + index;
564+
let idx = offset / Self::C8;
565+
let offset = offset % Self::C8;
566+
(self.seq[idx] >> (B * offset)) & Self::CHAR_MASK as u8
567+
}
568+
}
569+
570+
impl<'s, const B: usize> PackedSeqBase<'s, B>
571+
where
572+
Bits<B>: SupportedBits,
573+
{
574+
#[inline(always)]
575+
pub fn par_iter_bp_with_buf<BUF: DerefMut<Target = [S; 8]>>(
576+
self,
577+
context: usize,
578+
mut buf: BUF,
579+
) -> PaddedIt<impl ChunkIt<S> + use<'s, B, BUF>> {
498580
#[cfg(target_endian = "big")]
499581
panic!("Big endian architectures are not supported.");
500582

@@ -516,11 +598,6 @@ where
516598
let offsets: [usize; 8] = from_fn(|l| l * bytes_per_chunk);
517599
let mut cur = S::ZERO;
518600

519-
// Boxed, so it doesn't consume precious registers.
520-
// Without this, cur is not always inlined into a register.
521-
let mut buf = IT_BUF.with_borrow_mut(|v| RecycledBox(v.pop()));
522-
buf.init_if_needed();
523-
524601
let simd_char_mask: u32x8 = S::splat(Self::CHAR_MASK as u32);
525602
let simd_b: u32x8 = S::splat(B as u32);
526603

@@ -550,9 +627,9 @@ where
550627
)
551628
},
552629
);
553-
*buf.get_mut() = transpose(data);
630+
*buf = transpose(data);
554631
}
555-
cur = buf.get()[(i % Self::C256) / Self::C32];
632+
cur = buf[(i % Self::C256) / Self::C32];
556633
}
557634
// Extract the current character of each lane (the low bits selected by `CHAR_MASK`).
558635
let chars = cur & simd_char_mask;
@@ -566,59 +643,6 @@ where
566643
PaddedIt { it, padding }
567644
}
568645

569-
#[inline(always)]
570-
fn par_iter_bp_delayed(self, context: usize, delay: Delay) -> PaddedIt<impl ChunkIt<(S, S)>> {
571-
self.par_iter_bp_delayed_with_factor(context, delay, 1)
572-
}
573-
574-
/// NOTE: When `self` does not start at a byte boundary, the
575-
/// 'delayed' character is not guaranteed to be `0`.
576-
#[inline(always)]
577-
fn par_iter_bp_delayed_2(
578-
self,
579-
context: usize,
580-
delay1: Delay,
581-
delay2: Delay,
582-
) -> PaddedIt<impl ChunkIt<(S, S, S)>> {
583-
self.par_iter_bp_delayed_2_with_factor(context, delay1, delay2, 1)
584-
}
585-
586-
/// Compares 29 characters at a time.
587-
fn cmp_lcp(&self, other: &Self) -> (std::cmp::Ordering, usize) {
588-
let mut lcp = 0;
589-
let min_len = self.len.min(other.len);
590-
for i in (0..min_len).step_by(Self::K64) {
591-
let len = (min_len - i).min(Self::K64);
592-
let this = self.slice(i..i + len);
593-
let other = other.slice(i..i + len);
594-
let this_word = this.as_u64();
595-
let other_word = other.as_u64();
596-
if this_word != other_word {
597-
// Unfortunately, bases are packed in little endian order, so the default order is reversed.
598-
let eq = this_word ^ other_word;
599-
let t = eq.trailing_zeros() as usize / B * B;
600-
lcp += t / B;
601-
let mask = (Self::CHAR_MASK) << t;
602-
return ((this_word & mask).cmp(&(other_word & mask)), lcp);
603-
}
604-
lcp += len;
605-
}
606-
(self.len.cmp(&other.len), lcp)
607-
}
608-
609-
#[inline(always)]
610-
fn get(&self, index: usize) -> u8 {
611-
let offset = self.offset + index;
612-
let idx = offset / Self::C8;
613-
let offset = offset % Self::C8;
614-
(self.seq[idx] >> (B * offset)) & Self::CHAR_MASK as u8
615-
}
616-
}
617-
618-
impl<'s, const B: usize> PackedSeqBase<'s, B>
619-
where
620-
Bits<B>: SupportedBits,
621-
{
622646
#[inline(always)]
623647
pub fn par_iter_bp_delayed_with_factor(
624648
self,

src/test.rs

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1000,6 +1000,34 @@ fn par_iter_bp_bench() {
10001000
}
10011001
}
10021002

1003+
#[test]
1004+
#[ignore = "This is a benchmark, not a test"]
1005+
fn par_iter_bp_buf_bench() {
1006+
eprintln!("\nBench PackedSeq::par_iter_bp_buf");
1007+
1008+
let mut buf = [S::ZERO; 8];
1009+
1010+
for len in [100, 150, 200, 1000, 1_000_000] {
1011+
// 1Gbp input.
1012+
let rep = 1_000_000_000 / len;
1013+
let seq = PackedSeqVec::random(len);
1014+
1015+
let start = std::time::Instant::now();
1016+
for _ in 0..rep {
1017+
let PaddedIt { it, .. } = seq.as_slice().par_iter_bp_with_buf(1, &mut buf);
1018+
let mut sum = S::ZERO;
1019+
for x in it {
1020+
sum += x;
1021+
}
1022+
black_box(&sum);
1023+
}
1024+
eprintln!(
1025+
"Len {len:>7} => {:.03} Gbp/s",
1026+
start.elapsed().as_secs_f64().recip()
1027+
);
1028+
}
1029+
}
1030+
10031031
#[test]
10041032
#[ignore = "This is a benchmark, not a test"]
10051033
fn par_iter_bp_delayed_bench() {

0 commit comments

Comments
 (0)