Skip to content

Commit e5b8a03

Browse files
par_iter_bp_delayed_buf
1 parent 707dabb commit e5b8a03

2 files changed

Lines changed: 119 additions & 6 deletions

File tree

src/packed_seq.rs

Lines changed: 93 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,23 @@ impl Drop for RecycledBox {
6060
}
6161
}
6262

63+
/// Owned newtype around `Vec<S>` that dereferences to the inner `Vec<S>`.
///
/// A plain `Vec<S>` dereferences to `[S]`, so it cannot itself satisfy a
/// `DerefMut<Target = Vec<S>>` bound. This wrapper can, which lets an owned,
/// default-constructed (empty) buffer be passed wherever such a bound is used.
#[derive(Default)]
64+
struct SimdVec(Vec<S>);
65+
66+
// Shared access to the wrapped vector: `SimdVec` derefs to `Vec<S>` itself
// (not to a slice), so all `Vec` methods are reachable through the wrapper.
impl Deref for SimdVec {
67+
type Target = Vec<S>;
68+
#[inline(always)]
69+
fn deref(&self) -> &Self::Target {
70+
// Field `0` is the wrapped `Vec<S>`.
&self.0
71+
}
72+
}
73+
// Mutable access to the wrapped vector. Together with `Deref` this makes
// `SimdVec` usable as a `DerefMut<Target = Vec<S>>` value, so the whole `Vec`
// can be replaced or refilled through the wrapper.
impl DerefMut for SimdVec {
74+
#[inline(always)]
75+
fn deref_mut(&mut self) -> &mut Self::Target {
76+
// Field `0` is the wrapped `Vec<S>`.
&mut self.0
77+
}
78+
}
79+
6380
#[doc(hidden)]
6481
pub struct Bits<const B: usize>;
6582
#[doc(hidden)]
@@ -532,7 +549,13 @@ where
532549
delay1: Delay,
533550
delay2: Delay,
534551
) -> PaddedIt<impl ChunkIt<(S, S, S)>> {
535-
self.par_iter_bp_delayed_2_with_factor(context, delay1, delay2, 1)
552+
self.par_iter_bp_delayed_2_with_factor_and_buf(
553+
context,
554+
delay1,
555+
delay2,
556+
1,
557+
SimdVec::default(),
558+
)
536559
}
537560

538561
/// Compares 29 characters at a time.
@@ -647,9 +670,30 @@ where
647670
// Convenience wrapper around `par_iter_bp_delayed_with_factor_and_buf` for
// callers that do not want to manage a buffer themselves.
pub fn par_iter_bp_delayed_with_factor(
648671
self,
649672
context: usize,
650-
Delay(delay): Delay,
673+
delay: Delay,
651674
factor: usize,
652675
) -> PaddedIt<impl ChunkIt<(S, S)> + use<'s, B>> {
676+
// `delay` is forwarded still wrapped in `Delay`; the `_and_buf` variant
// destructures it. A fresh, empty `SimdVec` is passed as the buffer, so
// nothing is reused between calls made through this wrapper.
self.par_iter_bp_delayed_with_factor_and_buf(context, delay, factor, SimdVec::default())
677+
}
678+
679+
/// Variant of `par_iter_bp_delayed_with_factor_and_buf` with `factor` fixed to `1`.
///
/// `buf` is a caller-provided buffer that dereferences mutably to a `Vec<S>`
/// (for example `&mut Vec<S>`). Passing the same buffer on repeated calls lets
/// the callee reuse it instead of creating a new vector each time.
///
/// The returned iterator captures `BUF` (see the `use<'s, B, BUF>` bound), so
/// a borrowed buffer stays borrowed for as long as the iterator is alive.
#[inline(always)]
680+
pub fn par_iter_bp_delayed_with_buf<BUF: DerefMut<Target = Vec<S>>>(
681+
self,
682+
context: usize,
683+
delay: Delay,
684+
buf: BUF,
685+
) -> PaddedIt<impl ChunkIt<(S, S)> + use<'s, B, BUF>> {
686+
// The literal `1` is the `factor` argument.
self.par_iter_bp_delayed_with_factor_and_buf(context, delay, 1, buf)
687+
}
688+
689+
#[inline(always)]
690+
pub fn par_iter_bp_delayed_with_factor_and_buf<BUF: DerefMut<Target = Vec<S>>>(
691+
self,
692+
context: usize,
693+
Delay(delay): Delay,
694+
factor: usize,
695+
mut buf: BUF,
696+
) -> PaddedIt<impl ChunkIt<(S, S)> + use<'s, B, BUF>> {
653697
#[cfg(target_endian = "big")]
654698
panic!("Big endian architectures are not supported.");
655699

@@ -686,7 +730,14 @@ where
686730
// +8: some 'random' padding
687731
let buf_len = (delay / Self::C32 + 8).next_power_of_two();
688732
let buf_mask = buf_len - 1;
689-
let mut buf = vec![S::ZERO; buf_len];
733+
if buf.len() != buf_len {
734+
// This has better codegen than `vec.clear(); vec.resize()`, since the inner `do_reserve_and_handle` of resize is not inlined.
735+
*buf.as_mut() = vec![S::ZERO; buf_len];
736+
} else {
737+
// NOTE: Buf needs to be filled with zeros to guarantee returning 0 values for out-of-bounds characters.
738+
buf.fill(S::ZERO);
739+
}
740+
690741
let mut write_idx = 0;
691742
// We compensate for the first delay/16 triggers of the check below that
692743
// happen before the delay is actually reached.
@@ -758,19 +809,48 @@ where
758809
PaddedIt { it, padding }
759810
}
760811

812+
/// Convenience wrapper around `par_iter_bp_delayed_2_with_factor_and_buf`
/// for callers that do not want to manage a buffer themselves.
///
/// All arguments are forwarded unchanged; see
/// `par_iter_bp_delayed_2_with_factor_and_buf` for the meaning of `factor`.
#[inline(always)]
813+
pub fn par_iter_bp_delayed_2_with_factor(
814+
self,
815+
context: usize,
816+
delay1: Delay,
817+
delay2: Delay,
818+
factor: usize,
819+
) -> PaddedIt<impl ChunkIt<(S, S, S)> + use<'s, B>> {
820+
self.par_iter_bp_delayed_2_with_factor_and_buf(
821+
context,
822+
delay1,
823+
delay2,
824+
factor,
825+
// Fresh, empty buffer: nothing is reused between calls made through
// this wrapper.
SimdVec::default(),
826+
)
827+
}
828+
829+
/// Variant of `par_iter_bp_delayed_2_with_factor_and_buf` with `factor` fixed to `1`.
///
/// `buf` is a caller-provided buffer that dereferences mutably to a `Vec<S>`
/// (for example `&mut Vec<S>`). Passing the same buffer on repeated calls lets
/// the callee reuse it instead of creating a new vector each time.
///
/// The returned iterator captures `BUF` (see the `use<'s, B, BUF>` bound), so
/// a borrowed buffer stays borrowed for as long as the iterator is alive.
#[inline(always)]
830+
pub fn par_iter_bp_delayed_2_with_buf<BUF: DerefMut<Target = Vec<S>>>(
831+
self,
832+
context: usize,
833+
delay1: Delay,
834+
delay2: Delay,
835+
buf: BUF,
836+
) -> PaddedIt<impl ChunkIt<(S, S, S)> + use<'s, B, BUF>> {
837+
// The literal `1` is the `factor` argument.
self.par_iter_bp_delayed_2_with_factor_and_buf(context, delay1, delay2, 1, buf)
838+
}
839+
761840
/// When iterating over 2-bit and 1-bit encoded data in parallel,
762841
/// one must ensure that they have the same stride.
763842
/// On the larger type, set `factor` as the ratio to the smaller one,
764843
/// so that the stride in bytes is a multiple of `factor`,
765844
/// so that the smaller type also has a byte-aligned stride.
766845
#[inline(always)]
767-
pub fn par_iter_bp_delayed_2_with_factor(
846+
pub fn par_iter_bp_delayed_2_with_factor_and_buf<BUF: DerefMut<Target = Vec<S>>>(
768847
self,
769848
context: usize,
770849
Delay(delay1): Delay,
771850
Delay(delay2): Delay,
772851
factor: usize,
773-
) -> PaddedIt<impl ChunkIt<(S, S, S)> + use<'s, B>> {
852+
mut buf: BUF,
853+
) -> PaddedIt<impl ChunkIt<(S, S, S)> + use<'s, B, BUF>> {
774854
#[cfg(target_endian = "big")]
775855
panic!("Big endian architectures are not supported.");
776856

@@ -800,7 +880,14 @@ where
800880
// Even buf_len is nice to only have the write==buf_len check once.
801881
let buf_len = (delay2 / Self::C32 + 8).next_power_of_two();
802882
let buf_mask = buf_len - 1;
803-
let mut buf = vec![S::ZERO; buf_len];
883+
if buf.len() != buf_len {
884+
// This has better codegen than `vec.clear(); vec.resize()`, since the inner `do_reserve_and_handle` of resize is not inlined.
885+
*buf = vec![S::ZERO; buf_len];
886+
} else {
887+
// NOTE: Buf needs to be filled with zeros to guarantee returning 0 values for out-of-bounds characters.
888+
buf.fill(S::ZERO);
889+
}
890+
804891
let mut write_idx = 0;
805892
// We compensate for the first delay/16 triggers of the check below that
806893
// happen before the delay is actually reached.

src/test.rs

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1053,6 +1053,32 @@ fn par_iter_bp_delayed_bench() {
10531053
}
10541054
}
10551055

1056+
// Benchmark (ignored by default) for `par_iter_bp_delayed_with_factor_and_buf`
// when a single caller-owned buffer is reused across many calls, measured for
// several sequence lengths.
#[test]
1057+
#[ignore = "This is a benchmark, not a test"]
1058+
fn par_iter_bp_delayed_buf_bench() {
1059+
eprintln!("\nBench PackedSeq::par_iter_bp_delayed_buf");
1060+
1061+
// Declared outside both loops and only ever lent out as `&mut buf`, so the
// same vector is handed to every call below.
let mut buf = vec![];
1062+
1063+
for len in [100, 150, 200, 1000, 1_000_000] {
1064+
// 1Gbp input.
// `rep * len` is (up to integer-division rounding) 1e9 for every `len`.
1065+
let rep = 1_000_000_000 / len;
1066+
let seq = PackedSeqVec::random(len);
1067+
1068+
let start = std::time::Instant::now();
1069+
for _ in 0..rep {
1070+
// Arguments: context = 1, delay = 27, factor = 1, reused buffer.
let PaddedIt { it, .. } =
1071+
seq.as_slice()
1072+
.par_iter_bp_delayed_with_factor_and_buf(1, Delay(27), 1, &mut buf);
1073+
// Consume the whole iterator; `black_box` is presumably there so the
// optimizer cannot discard the summed result.
black_box(it.map(|(x, y)| x + y).sum::<u32x8>());
1074+
}
1075+
// Roughly 1e9 bases were processed in total, so the reciprocal of the
// elapsed seconds is the throughput in Gbp/s.
eprintln!(
1076+
"Len {len:>7} => {:.03} Gbp/s",
1077+
start.elapsed().as_secs_f64().recip()
1078+
);
1079+
}
1080+
}
1081+
10561082
#[test]
10571083
#[ignore = "This is a benchmark, not a test"]
10581084
fn par_iter_kmer_ambiguity_bench() {

0 commit comments

Comments
 (0)