Skip to content

Commit 9bfe8d9

Browse files
Add PackedNSeq::from_fastq_with_quality behind file_io feature
1 parent 3dcbfb7 commit 9bfe8d9

3 files changed

Lines changed: 110 additions & 1 deletion

File tree

Cargo.lock

Lines changed: 71 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,16 @@ epserde = { version = "0.8", optional = true }
2424
# Optional pyclass atrributes for some objects.
2525
pyo3 = { version = "0.25", features = ["extension-module"], optional = true }
2626
sux = "0.10.2"
27+
# Optional, for reading PackedSeq[N]Vec directly from a file
28+
needletail = { version = "0.6.3", optional = true }
2729

2830
[features]
2931
# Also needed for tests.
30-
default = ["rand"]
32+
default = ["rand", "file_io"]
33+
34+
# Enables functions to read PackedNSeq directly from a file.
35+
file_io = ["needletail"]
3136

3237
# Hides the `simd` warnings when neither AVX2 nor NEON is detected.
3338
scalar = ["ensure_simd/scalar"]
39+

src/packed_n_seq.rs

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,4 +134,36 @@ impl PackedNSeqVec {
134134
*ambi = (u32::from_ne_bytes(*ambi) | mask).to_ne_bytes();
135135
}
136136
}
137+
138+
#[cfg(feature = "file_io")]
139+
pub fn from_fastq_with_quality(path: &std::path::Path, min_qual: u8) -> Self {
140+
let mut dna = vec![];
141+
let mut qual = vec![];
142+
143+
let mut seqs = Self::default();
144+
145+
let mut reader = needletail::parse_fastx_file(&path).unwrap();
146+
while let Some(record) = reader.next() {
147+
let seqrec = record.expect("Invalid FASTA/Q record");
148+
dna.extend_from_slice(&seqrec.seq());
149+
qual.extend_from_slice(&seqrec.qual().unwrap());
150+
dna.push(b'N');
151+
qual.push(0);
152+
153+
// Convert from ascii dna+qual to packed dna+mask in batches of 16kbp.
154+
if dna.len() > 16000 {
155+
seqs.push_from_ascii_and_quality(&dna, &qual, min_qual);
156+
dna.clear();
157+
qual.clear();
158+
}
159+
}
160+
161+
if dna.len() > 0 {
162+
seqs.push_from_ascii_and_quality(&dna, &qual, min_qual);
163+
dna.clear();
164+
qual.clear();
165+
}
166+
167+
seqs
168+
}
137169
}

0 commit comments

Comments
 (0)