@@ -60,6 +60,23 @@ impl Drop for RecycledBox {
6060 }
6161}
6262
63+ #[ derive( Default ) ]
64+ struct SimdVec ( Vec < S > ) ;
65+
66+ impl Deref for SimdVec {
67+ type Target = Vec < S > ;
68+ #[ inline( always) ]
69+ fn deref ( & self ) -> & Self :: Target {
70+ & self . 0
71+ }
72+ }
73+ impl DerefMut for SimdVec {
74+ #[ inline( always) ]
75+ fn deref_mut ( & mut self ) -> & mut Self :: Target {
76+ & mut self . 0
77+ }
78+ }
79+
6380#[ doc( hidden) ]
6481pub struct Bits < const B : usize > ;
6582#[ doc( hidden) ]
@@ -532,7 +549,13 @@ where
532549 delay1 : Delay ,
533550 delay2 : Delay ,
534551 ) -> PaddedIt < impl ChunkIt < ( S , S , S ) > > {
535- self . par_iter_bp_delayed_2_with_factor ( context, delay1, delay2, 1 )
552+ self . par_iter_bp_delayed_2_with_factor_and_buf (
553+ context,
554+ delay1,
555+ delay2,
556+ 1 ,
557+ SimdVec :: default ( ) ,
558+ )
536559 }
537560
538561 /// Compares 29 characters at a time.
@@ -647,9 +670,30 @@ where
647670 pub fn par_iter_bp_delayed_with_factor (
648671 self ,
649672 context : usize ,
650- Delay ( delay) : Delay ,
673+ delay : Delay ,
651674 factor : usize ,
652675 ) -> PaddedIt < impl ChunkIt < ( S , S ) > + use < ' s , B > > {
676+ self . par_iter_bp_delayed_with_factor_and_buf ( context, delay, factor, SimdVec :: default ( ) )
677+ }
678+
679+ #[ inline( always) ]
680+ pub fn par_iter_bp_delayed_with_buf < BUF : DerefMut < Target = Vec < S > > > (
681+ self ,
682+ context : usize ,
683+ delay : Delay ,
684+ buf : BUF ,
685+ ) -> PaddedIt < impl ChunkIt < ( S , S ) > + use < ' s , B , BUF > > {
686+ self . par_iter_bp_delayed_with_factor_and_buf ( context, delay, 1 , buf)
687+ }
688+
689+ #[ inline( always) ]
690+ pub fn par_iter_bp_delayed_with_factor_and_buf < BUF : DerefMut < Target = Vec < S > > > (
691+ self ,
692+ context : usize ,
693+ Delay ( delay) : Delay ,
694+ factor : usize ,
695+ mut buf : BUF ,
696+ ) -> PaddedIt < impl ChunkIt < ( S , S ) > + use < ' s , B , BUF > > {
653697 #[ cfg( target_endian = "big" ) ]
654698 panic ! ( "Big endian architectures are not supported." ) ;
655699
@@ -686,7 +730,14 @@ where
686730 // +8: some 'random' padding
687731 let buf_len = ( delay / Self :: C32 + 8 ) . next_power_of_two ( ) ;
688732 let buf_mask = buf_len - 1 ;
689- let mut buf = vec ! [ S :: ZERO ; buf_len] ;
733+ if buf. len ( ) != buf_len {
734+ // This has better codegen than `vec.clear(); vec.resize()`, since the inner `do_reserve_and_handle` of resize is not inlined.
735+ * buf. as_mut ( ) = vec ! [ S :: ZERO ; buf_len] ;
736+ } else {
737+ // NOTE: Buf needs to be filled with zeros to guarantee returning 0 values for out-of-bounds characters.
738+ buf. fill ( S :: ZERO ) ;
739+ }
740+
690741 let mut write_idx = 0 ;
691742 // We compensate for the first delay/16 triggers of the check below that
692743 // happen before the delay is actually reached.
@@ -758,19 +809,48 @@ where
758809 PaddedIt { it, padding }
759810 }
760811
812+ #[ inline( always) ]
813+ pub fn par_iter_bp_delayed_2_with_factor (
814+ self ,
815+ context : usize ,
816+ delay1 : Delay ,
817+ delay2 : Delay ,
818+ factor : usize ,
819+ ) -> PaddedIt < impl ChunkIt < ( S , S , S ) > + use < ' s , B > > {
820+ self . par_iter_bp_delayed_2_with_factor_and_buf (
821+ context,
822+ delay1,
823+ delay2,
824+ factor,
825+ SimdVec :: default ( ) ,
826+ )
827+ }
828+
829+ #[ inline( always) ]
830+ pub fn par_iter_bp_delayed_2_with_buf < BUF : DerefMut < Target = Vec < S > > > (
831+ self ,
832+ context : usize ,
833+ delay1 : Delay ,
834+ delay2 : Delay ,
835+ buf : BUF ,
836+ ) -> PaddedIt < impl ChunkIt < ( S , S , S ) > + use < ' s , B , BUF > > {
837+ self . par_iter_bp_delayed_2_with_factor_and_buf ( context, delay1, delay2, 1 , buf)
838+ }
839+
761840 /// When iterating over 2-bit and 1-bit encoded data in parallel,
762841 /// one must ensure that they have the same stride.
763842 /// On the larger type, set `factor` as the ratio to the smaller one,
764843 /// so that the stride in bytes is a multiple of `factor`,
765844 /// so that the smaller type also has a byte-aligned stride.
766845 #[ inline( always) ]
767- pub fn par_iter_bp_delayed_2_with_factor (
846+ pub fn par_iter_bp_delayed_2_with_factor_and_buf < BUF : DerefMut < Target = Vec < S > > > (
768847 self ,
769848 context : usize ,
770849 Delay ( delay1) : Delay ,
771850 Delay ( delay2) : Delay ,
772851 factor : usize ,
773- ) -> PaddedIt < impl ChunkIt < ( S , S , S ) > + use < ' s , B > > {
852+ mut buf : BUF ,
853+ ) -> PaddedIt < impl ChunkIt < ( S , S , S ) > + use < ' s , B , BUF > > {
774854 #[ cfg( target_endian = "big" ) ]
775855 panic ! ( "Big endian architectures are not supported." ) ;
776856
@@ -800,7 +880,14 @@ where
800880 // Even buf_len is nice to only have the write==buf_len check once.
801881 let buf_len = ( delay2 / Self :: C32 + 8 ) . next_power_of_two ( ) ;
802882 let buf_mask = buf_len - 1 ;
803- let mut buf = vec ! [ S :: ZERO ; buf_len] ;
883+ if buf. len ( ) != buf_len {
884+ // This has better codegen than `vec.clear(); vec.resize()`, since the inner `do_reserve_and_handle` of resize is not inlined.
885+ * buf = vec ! [ S :: ZERO ; buf_len] ;
886+ } else {
887+ // NOTE: Buf needs to be filled with zeros to guarantee returning 0 values for out-of-bounds characters.
888+ buf. fill ( S :: ZERO ) ;
889+ }
890+
804891 let mut write_idx = 0 ;
805892 // We compensate for the first delay/16 triggers of the check below that
806893 // happen before the delay is actually reached.
0 commit comments