diff --git a/BINARY_SOA_FORMAT.md b/BINARY_SOA_FORMAT.md new file mode 100644 index 0000000..dbbf99d --- /dev/null +++ b/BINARY_SOA_FORMAT.md @@ -0,0 +1,247 @@ +# RCIA Binary SoA Stream Format — v1 + +A lossless, low-overhead binary format for streaming Thermo RAW spectrum +data into downstream consumers. Designed to be: + +1. **Lossless** — every scan-level value RawFileReader exposes is captured +2. **Streamable** — sequence of self-describing records over a pipe +3. **Zero-copy on the read side** — SoA arrays at known offsets, naturally aligned +4. **Gracefully nullable** — missing values use NaN (floats) or `-1` / `0` sentinels (integers) +5. **Forward-compatible** — variable-length sections at offsets, fixed scalar fields at known positions + +All multi-byte integers and floats are **little-endian** (matches all modern x86/ARM CPUs). +Floats follow IEEE 754. + +## Streaming vs file output + +The writer produces the **identical byte format** in both modes; the difference +is only the destination and a few I/O tuning details: + +| Mode | Invocation | Destination | Buffering | +|------|-----------|-------------|-----------| +| **Streaming** (primary use) | `--format=BinarySoa --stdout` | parent process via OS pipe | 1 MB BufferedStream wrap | +| **File** (sidecar/cache) | `--format=BinarySoa --output=path.rcia.bin` | regular file | 1 MB BufferedStream wrap | + +Streaming is the primary use case; downstream pipelines consume records as they +arrive without materializing the whole run on disk. File output exists so the +same downstream consumer can replay a file later (e.g., re-search the same RAW +with different parameters without re-reading via RawFileReader). + +A consumer cannot tell the two modes apart from the byte stream — the writer +emits the same file header, records, and EOF marker regardless of destination. + +--- + +## Byte order + +All values are little-endian. The format does not currently include a BOM; +the file-header magic `RCIASTR1` doubles as an endianness check (its byte +sequence is direction-sensitive). + +--- + +## File header — written once at stream start (32 bytes) + +| Off | Size | Type | Field | Notes | +|-----|------|----------|--------------------|-------| +| 0 | 8 | `[u8;8]` | `magic` | ASCII bytes `R C I A S T R 1` (0x52 0x43 0x49 0x41 0x53 0x54 0x52 0x31) | +| 8 | 2 | `u16` | `format_version` | `1` for this spec | +| 10 | 2 | `u16` | `file_header_size` | `32`. Readers must skip past this many bytes from start to reach first record. | +| 12 | 4 | `u32` | `flags` | Reserved, `0` | +| 16 | 16 | `[u8;16]`| `reserved` | Zero-filled | + +--- + +## Per-spectrum record + +A record consists of: + +``` +[Fixed Header — 128 bytes, all scalar fields present at known offsets] +[Filter string — filter_string_len bytes, UTF-8, no null terminator] +[Pad to 8-byte alignment] +[Peak arrays — mz f64×N then intensity f32×N, packed] +[Pad to 8-byte alignment] +[Optional metadata dump — key/value strings of all trailer fields] +[Pad to 8-byte alignment, brings record to multiple of 8] +``` + +### Fixed header (128 bytes) + +| Off | Size | Type | Field | Nullability | +|-----|------|------|-------|-------------| +| 0 | 4 | `u32` | `record_size` | Total bytes of this record (including padding); used to skip to next record. | +| 4 | 4 | `u32` | `scan_id` | RAW file scan number. | +| 8 | 1 | `u8` | `ms_level` | `1`, `2`, `3`, ... | +| 9 | 1 | `u8` | `polarity` | `0` = negative, `1` = positive, `255` = unknown | +| 10 | 1 | `u8` | `scan_data_type` | `0` = profile, `1` = centroid | +| 11 | 1 | `u8` | `activation_type` | See enum below; `0` = none, `255` = unknown | +| 12 | 4 | `u32` | `n_peaks` | May be `0`. | +| 16 | 8 | `f64` | `retention_time_seconds` | Always present. | +| 24 | 8 | `f64` | `precursor_mz` | NaN if MS1 or unavailable. | +| 32 | 8 | `f64` | `precursor_mz_monoisotopic` | NaN if no monoisotopic correction was reported. | +| 40 | 8 | `f64` | `base_peak_mz` | NaN if no peaks. | +| 48 | 4 | `f32` | `isolation_lower_offset` | NaN if MS1 or unavailable. (Da, signed offset from precursor_mz to lower bound.) | +| 52 | 4 | `f32` | `isolation_upper_offset` | NaN if MS1 or unavailable. (Da, signed offset from precursor_mz to upper bound.) | +| 56 | 4 | `f32` | `isolation_width` | NaN if MS1. | +| 60 | 4 | `f32` | `precursor_intensity` | `0.0` if MS1 or not computed. | +| 64 | 4 | `f32` | `base_peak_intensity` | NaN if no peaks. | +| 68 | 4 | `f32` | `total_ion_current` | NaN if not available. | +| 72 | 4 | `f32` | `ion_injection_time_ms` | NaN if not in trailer. | +| 76 | 4 | `f32` | `collision_energy` | NaN if MS1 or unavailable. | +| 80 | 4 | `f32` | `faims_compensation_voltage` | NaN if no FAIMS. | +| 84 | 4 | `f32` | `elapsed_scan_time_ms` | NaN if not available. | +| 88 | 4 | `f32` | `low_mass` | Scan range start (Da). NaN if not available. | +| 92 | 4 | `f32` | `high_mass` | Scan range end (Da). NaN if not available. | +| 96 | 4 | `i32` | `precursor_charge` | `-1` if unknown. | +| 100 | 4 | `i32` | `master_scan_number` | `-1` if no parent (MS1) or unknown. | +| 104 | 4 | `u32` | `peak_flags` | Bit 0: `peaks_sorted_by_mz` (set = sorted ascending). Bits 1-31: reserved. | +| 108 | 4 | `u32` | `reserved1` | `0` | +| 112 | 2 | `u16` | `filter_string_len` | Bytes (UTF-8) of filter string immediately after header. | +| 114 | 2 | `u16` | `reserved2` | `0` | +| 116 | 4 | `u32` | `arrays_offset` | Byte offset (from record start) to the mz array. Always `>= 128 + filter_string_len`. | +| 120 | 4 | `u32` | `metadata_offset` | Byte offset to optional metadata key/value block. `0` if absent. | +| 124 | 4 | `u32` | `metadata_length` | Bytes of metadata block. `0` if absent. | + +Total: **128 bytes** (two 64-byte cache lines). + +### Activation type enum (`u8`) + +| Value | Meaning | +|-------|---------| +| 0 | None (MS1 or no activation reported) | +| 1 | CID (Collision-Induced Dissociation) | +| 2 | HCD (Higher-Energy Collisional Dissociation) | +| 3 | ETD (Electron Transfer Dissociation) | +| 4 | ECD (Electron Capture Dissociation) | +| 5 | EThcD (ETD + HCD supplemental) | +| 6 | UVPD (Ultraviolet Photodissociation) | +| 7 | NETD (Negative Electron Transfer) | +| 8 | MPD (Multi-Photon Dissociation) | +| 9 | PQD (Pulsed Q Dissociation) | +| 10 | PTR (Proton Transfer Reaction) | +| 11 | nPTR (Negative Proton Transfer Reaction) | +| 255 | Other / unknown | + +Note: `5` (EThcD) is reserved for caller-detected EThcD (ETD/ECD + supplemental HCD/CID); the +encoder maps the primary reaction's `ActivationType` and leaves EThcD detection to a +higher layer that inspects sequential reactions. + +### Filter string + +Immediately after the fixed header, `filter_string_len` UTF-8 bytes of the +Thermo filter string (e.g., `"FTMS + p NSI Full ms2 408.5142@hcd28.00 [115.0000-1700.0000]"`). +No null terminator. May be empty (`filter_string_len = 0`). + +### Peak arrays (at `arrays_offset`) + +``` +arrays_offset + 0 : f64 mz_array[N] (8-byte aligned) +arrays_offset + 8N : f32 intensity_array[N] (4-byte aligned) +``` + +`mz_array` is sorted ascending if `peak_flags & 0x1` is set (which is the +default for all output produced by the writer). + +### Metadata block (optional, at `metadata_offset`) + +If `metadata_length > 0`, a verbatim dump of all key/value pairs from the +RAW file's per-scan trailer. This captures every vendor-reported value +(AGC target, conversion parameters, all "MS{N} ..." fields, etc.) +without selective filtering. + +``` +metadata_offset + 0 : u32 n_pairs +metadata_offset + 4 : repeating block: + u16 key_len + key_len bytes (UTF-8) + u16 value_len + value_len bytes (UTF-8) +``` + +### Padding + +After the last peak/metadata byte, the record is zero-padded so that +`record_size` is a multiple of 8. This guarantees the next record starts +at an 8-byte boundary, so its `f64` fields can be cast/loaded aligned. + +--- + +## End-of-stream marker + +After the last spectrum, the writer emits a single `u32` with value `0`. +Readers should interpret a record_size of `0` as end-of-stream. + +``` +... last record bytes ... +[ u32 record_size = 0 ] +``` + +--- + +## Reading example (Rust pseudocode) + +```rust +let mut hdr = [0u8; 128]; +loop { + if reader.read_exact(&mut hdr[..4]).is_err() { break; } + let record_size = u32::from_le_bytes(hdr[..4].try_into().unwrap()); + if record_size == 0 { break; } // EOF marker + + reader.read_exact(&mut hdr[4..128])?; + let filter_string_len = u16::from_le_bytes(hdr[112..114].try_into().unwrap()) as usize; + let arrays_offset = u32::from_le_bytes(hdr[116..120].try_into().unwrap()) as usize; + let metadata_offset = u32::from_le_bytes(hdr[120..124].try_into().unwrap()) as usize; + let metadata_length = u32::from_le_bytes(hdr[124..128].try_into().unwrap()) as usize; + let n_peaks = u32::from_le_bytes(hdr[12..16].try_into().unwrap()) as usize; + + // Read remaining bytes + let mut rest = vec![0u8; (record_size as usize) - 128]; + reader.read_exact(&mut rest)?; + + let filter_string = std::str::from_utf8(&rest[0..filter_string_len])?; + + let mz_start = arrays_offset - 128; + let int_start = mz_start + 8 * n_peaks; + let mz_bytes = &rest[mz_start .. mz_start + 8 * n_peaks]; + let intensity_bytes = &rest[int_start .. int_start + 4 * n_peaks]; + let mz: &[f64] = bytemuck::cast_slice(mz_bytes); + let intensity: &[f32] = bytemuck::cast_slice(intensity_bytes); + + // ... process spectrum ... +} +``` + +--- + +## Versioning policy + +* Increments to `format_version` are **breaking** — readers should reject + unknown versions explicitly. +* New fields may be appended **without** a version bump if and only if they + live in a new variable-length section pointed to by an offset that was + reserved as `0` in v1. Readers tolerant to v1 still skip new sections + cleanly because `record_size` accounts for them. +* Bits in `peak_flags` and `reserved*` may be allocated for non-breaking + signals (e.g., a future "has_charge_per_peak" flag). + +--- + +## Design rationale + +* **128-byte header** = exactly two cache lines on x86/ARM. +* **f64 mz, f32 intensity** = matches RawFileReader's native output; + matches PyTorch tensor channel layouts; eliminates downstream conversion. +* **SoA layout** = each downstream consumer that reads only `mz` (e.g., + fragment matching) loads only the `mz_array` cache lines, not interleaved + intensity bytes. +* **NaN sentinels** for floats and `-1` for signed ints = no separate + presence bitmask required for most fields; downstream code handles + missing values with the same arithmetic (NaN propagates, `-1` is a + trivial branch). +* **Optional metadata dump** = preserves every per-scan trailer value + without enumerating them in the header. Engines can mine for + instrument-specific signals later (lock-mass calibration, AGC stats, + conversion parameters) without format changes. +* **Variable-section offsets** = forward-compat: readers seek to known + offsets and ignore unfamiliar tails. diff --git a/MainClass.cs b/MainClass.cs index 2a75d4f..7f942cb 100644 --- a/MainClass.cs +++ b/MainClass.cs @@ -528,7 +528,7 @@ private static void RegularParametersParsing(string[] args) }, { "f=|format=", - "The spectra output format: 0 for MGF, 1 for mzML, 2 for indexed mzML, 3 for Parquet, 4 for None (no output); both numeric and text (case insensitive) value recognized. Defaults to indexed mzML if no format is specified.", + "The spectra output format: 0 for MGF, 1 for mzML, 2 for indexed mzML, 3 for Parquet, 4 for BinarySoa (RCIA streaming binary, see BINARY_SOA_FORMAT.md), 5 for None (no output); both numeric and text (case insensitive) value recognized. Defaults to indexed mzML if no format is specified.", v => outputFormatString = v }, { diff --git a/OutputFormat.cs b/OutputFormat.cs index 21b85d3..371fa5f 100644 --- a/OutputFormat.cs +++ b/OutputFormat.cs @@ -6,6 +6,7 @@ public enum OutputFormat MzML, IndexMzML, Parquet, + BinarySoa, None } diff --git a/RawFileParser.cs b/RawFileParser.cs index f539494..3ee9ba6 100755 --- a/RawFileParser.cs +++ b/RawFileParser.cs @@ -181,6 +181,10 @@ private static void ProcessFile(ParseInput parseInput) spectrumWriter = new ParquetSpectrumWriter(parseInput); spectrumWriter.Write(rawFile, firstScanNumber, lastScanNumber); break; + case OutputFormat.BinarySoa: + spectrumWriter = new BinarySoaSpectrumWriter(parseInput); + spectrumWriter.Write(rawFile, firstScanNumber, lastScanNumber); + break; } } diff --git a/Writer/BinarySoaSpectrumWriter.cs b/Writer/BinarySoaSpectrumWriter.cs new file mode 100644 index 0000000..69d400f --- /dev/null +++ b/Writer/BinarySoaSpectrumWriter.cs @@ -0,0 +1,647 @@ +using log4net; +using System; +using System.Buffers; +using System.IO; +using System.Reflection; +using System.Runtime.InteropServices; +using System.Text; +using ThermoFisher.CommonCore.Data; +using ThermoFisher.CommonCore.Data.Business; +using ThermoFisher.CommonCore.Data.FilterEnums; +using ThermoFisher.CommonCore.Data.Interfaces; +using ThermoRawFileParser.Util; + +namespace ThermoRawFileParser.Writer +{ + /// + /// Streams every selected scan as a self-describing binary record laid out as + /// Structure-of-Arrays: a 128-byte fixed scalar header, optional UTF-8 filter + /// string, then packed f64 mz[N] + f32 intensity[N] arrays, and + /// finally an optional verbatim trailer key/value dump. End-of-stream marker is + /// a single u32 = 0 word. + /// + /// Designed for high-throughput downstream consumers (Rust engines, GPU + /// rescorers, columnar database loaders) that prefer zero-copy ingestion over + /// portable XML. + /// + /// Two operating modes share a single byte format: + /// + /// Streaming (--stdout): records flow into a downstream + /// process via a pipe. This is the primary use case; the writer wraps the + /// pipe in a 1 MB to collapse small-write syscalls. + /// File (--output): the identical byte format is written to + /// disk so the same downstream consumer can replay it later (sidecar caching). + /// + /// + /// Format spec: BINARY_SOA_FORMAT.md at the repo root. + /// + public class BinarySoaSpectrumWriter : SpectrumWriter + { + private static readonly ILog Log = + LogManager.GetLogger(MethodBase.GetCurrentMethod().DeclaringType); + + // ─── Format constants ──────────────────────────────────────────────── + + /// Magic bytes at the start of every stream: ASCII "RCIASTR1". + private static readonly byte[] FileMagic = + { 0x52, 0x43, 0x49, 0x41, 0x53, 0x54, 0x52, 0x31 }; + + /// Bumping this is a breaking change; readers must reject unknown versions. + public const ushort FormatVersion = 1; + + /// File-level header is 32 bytes, written once at stream start. + public const int FileHeaderSize = 32; + + /// Per-record fixed scalar header is 128 bytes (two cache lines). + public const int RecordFixedHeaderSize = 128; + + /// Output buffer size for the BufferedStream wrapping BaseStream. + private const int OutputBufferSize = 1 * 1024 * 1024; + + /// Caps for u16-sized variable sections; values longer are truncated with a warning. + private const int MaxFilterStringLen = ushort.MaxValue; + private const int MaxKeyOrValueLen = ushort.MaxValue; + + /// Default extension used for file output mode. + private const string OutputExtension = ".rcia.bin"; + + // ─── Reusable per-instance buffers (writer is single-threaded per file) ── + + /// 128-byte scratch for assembling each record's fixed header. + private readonly byte[] _hdrBuffer = new byte[RecordFixedHeaderSize]; + + /// Scratch for building the optional metadata block. + /// Reset between spectra; capacity grows monotonically as needed. + private readonly MemoryStream _metaScratch = new MemoryStream(2048); + + /// Reusable zero-pad source; pad lengths are 0..7 so 8 bytes is always sufficient. + private static readonly byte[] ZeroPad = new byte[8]; + + // ─── Public ctor ───────────────────────────────────────────────────── + + public BinarySoaSpectrumWriter(ParseInput parseInput) : base(parseInput) { } + + // ─── Top-level write loop ──────────────────────────────────────────── + + /// + public override void Write(IRawDataPlus rawFile, int firstScanNumber, int lastScanNumber) + { + if (!rawFile.HasMsData) + { + throw new RawFileParserException("No MS data in RAW file, no output will be produced"); + } + + ConfigureWriter(OutputExtension); + + // Bypass StreamWriter's text encoding entirely. Wrap BaseStream in a 1 MB + // BufferedStream to coalesce many small writes into a few large pipe syscalls + // — this dropped sys time ~22× on a 3.7 GB RAW benchmark. + // leaveOpen:true keeps the BinaryWriter from closing BaseStream during dispose + // chaining; the outer `using (Writer)` is responsible for the final close+flush. + using (Writer) + using (var buffered = new BufferedStream(Writer.BaseStream, OutputBufferSize)) + using (var bw = new BinaryWriter(buffered, Encoding.UTF8, leaveOpen: true)) + { + WriteFileHeader(bw); + + int totalScans = lastScanNumber - firstScanNumber + 1; + Log.Info("Processing " + totalScans + " scans"); + int lastScanProgress = 0; + int written = 0; + + for (int scanNumber = firstScanNumber; scanNumber <= lastScanNumber; scanNumber++) + { + ReportProgress(scanNumber, firstScanNumber, lastScanNumber, ref lastScanProgress); + + try + { + // Apply MS-level filter (matches MzML/Parquet writer behavior). + int level = (int)rawFile.GetScanEventForScanNumber(scanNumber).MSOrder; + if (level > ParseInput.MaxLevel) continue; + if (!ParseInput.MsLevel.Contains(level)) continue; + + WriteRecord(bw, rawFile, scanNumber, level); + written++; + } + catch (Exception ex) + { + Log.Error($"Scan #{scanNumber} cannot be processed: {ex.Message}"); + Log.Debug($"{ex.StackTrace}\n{ex.InnerException}"); + ParseInput.NewError(); + } + } + + // End-of-stream marker (u32 record_size = 0) + bw.Write((uint)0); + bw.Flush(); + buffered.Flush(); + + if (ParseInput.LogFormat == LogFormat.DEFAULT) Console.Error.WriteLine(); + Log.Info($"Wrote {written}/{totalScans} spectra to binary SoA stream"); + } + } + + private void WriteFileHeader(BinaryWriter bw) + { + bw.Write(FileMagic); // 8 bytes + bw.Write(FormatVersion); // u16 + bw.Write((ushort)FileHeaderSize); // u16 + bw.Write((uint)0); // flags = 0 + bw.Write(new byte[16]); // reserved + } + + private void ReportProgress(int scanNumber, int firstScanNumber, int lastScanNumber, + ref int lastScanProgress) + { + if (ParseInput.LogFormat != LogFormat.DEFAULT) return; + int scanProgress = (int)((double)scanNumber / (lastScanNumber - firstScanNumber + 1) * 100); + if (scanProgress % ProgressPercentageStep == 0 && scanProgress != lastScanProgress) + { + Console.Error.Write("" + scanProgress + "% "); + lastScanProgress = scanProgress; + } + } + + // ─── Per-spectrum record ───────────────────────────────────────────── + + /// + /// Pulls all data for a single scan from RawFileReader and emits one binary record. + /// The record layout: + /// + /// [128-byte fixed header] + /// [filter_string_len bytes UTF-8 filter string][pad-to-8] + /// [f64 mz[N]][f32 intensity[N]][pad-to-8] + /// [optional metadata block: u32 n_pairs, then (u16 klen, kbytes, u16 vlen, vbytes)*][pad-to-8] + /// [final pad so record_size is a multiple of 8] + /// + /// + private void WriteRecord(BinaryWriter bw, IRawDataPlus rawFile, int scanNumber, int msLevel) + { + // 1. Pull per-scan data ────────────────────────────────────────── + var scanFilter = rawFile.GetFilterForScanNumber(scanNumber); + var scanEvent = rawFile.GetScanEventForScanNumber(scanNumber); + var scanStats = rawFile.GetScanStatsForScanNumber(scanNumber); + double retentionTimeMin = rawFile.RetentionTimeFromScanNumber(scanNumber); + + ScanTrailer trailer = LoadTrailer(rawFile, scanNumber); + + // Trailer-derived nullable scalars + int? charge = trailer.AsPositiveInt("Charge State:"); + double? monoisotopicMz = trailer.AsDouble("Monoisotopic M/Z:"); + double? ionInjectionTime = trailer.AsDouble("Ion Injection Time (ms):"); + double? isolationWidthTrailer = trailer.AsDouble("MS" + msLevel + " Isolation Width:"); + int? masterScan = trailer.AsPositiveInt("Master Scan Number:"); + double? faimsCv = trailer.AsBool("FAIMS Voltage On:").GetValueOrDefault(false) + ? trailer.AsDouble("FAIMS CV:") : null; + double? elapsedScanTimeSec = trailer.AsDouble("Elapsed Scan Time (sec):"); + + // Reaction (only meaningful for MS2+) + double precursorMz = double.NaN; + float isolationWidth = float.NaN; + float isolationLowerOffset = float.NaN; + float isolationUpperOffset = float.NaN; + float collisionEnergy = float.NaN; + float precursorIntensity = 0f; + byte activationType = 0; + + if (msLevel > 1) + { + ResolveReactionData(rawFile, scanEvent, scanNumber, msLevel, monoisotopicMz, + isolationWidthTrailer, masterScan, + out precursorMz, out isolationWidth, + out isolationLowerOffset, out isolationUpperOffset, + out collisionEnergy, out precursorIntensity, out activationType); + } + + // Peak data (centroid by default; respect --noPeakPicking selectively) + bool requestCentroid = !ParseInput.NoPeakPicking.Contains(msLevel); + MZData mzData = ReadPeakData(rawFile, scanEvent, scanNumber, requestCentroid); + + int nPeaks = mzData.masses?.Length ?? 0; + double[] masses = mzData.masses ?? Array.Empty(); + double[] intensities = mzData.intensities ?? Array.Empty(); + byte scanDataType = (byte)(mzData.isCentroided ? 1 : 0); + + // 2. Encode filter string ──────────────────────────────────────── + string filterString = scanEvent.ToString() ?? string.Empty; + byte[] filterBytes = Encoding.UTF8.GetBytes(filterString); + int filterLen = filterBytes.Length; + if (filterLen > MaxFilterStringLen) + { + Log.Warn($"Filter string for scan {scanNumber} truncated from {filterLen} to {MaxFilterStringLen} bytes"); + filterLen = MaxFilterStringLen; + } + int filterPadLen = ComputePadLen(RecordFixedHeaderSize + filterLen, 8); + + // 3. Compute peak section sizes ───────────────────────────────── + int peakSection = nPeaks * 8 + nPeaks * 4; + int peakPadLen = ComputePadLen(peakSection, 8); + + // 4. Build metadata block (full trailer dump) into _metaScratch ── + int metadataLength = BuildMetadataBlock(trailer); + int metadataPadLen = metadataLength > 0 ? ComputePadLen(metadataLength, 8) : 0; + + // 5. Compute offsets and total record size ─────────────────────── + uint arraysOffset = (uint)(RecordFixedHeaderSize + filterLen + filterPadLen); + uint metadataOffset = metadataLength > 0 + ? (uint)(arraysOffset + peakSection + peakPadLen) + : 0; + + int totalSize = + RecordFixedHeaderSize + filterLen + filterPadLen + + peakSection + peakPadLen + + metadataLength + metadataPadLen; + int finalPadLen = ComputePadLen(totalSize, 8); + totalSize += finalPadLen; + + // 6. Assemble fixed header into _hdrBuffer ─────────────────────── + FillRecordHeader(_hdrBuffer, + totalSize, scanNumber, msLevel, scanFilter.Polarity, scanDataType, activationType, + nPeaks, retentionTimeMin * 60.0, + precursorMz, monoisotopicMz ?? double.NaN, mzData.basePeakMass ?? double.NaN, + isolationLowerOffset, isolationUpperOffset, isolationWidth, + precursorIntensity, + (float)(mzData.basePeakIntensity ?? double.NaN), + (float)scanStats.TIC, + ToFloatOrNaN(ionInjectionTime), collisionEnergy, + ToFloatOrNaN(faimsCv), + elapsedScanTimeSec.HasValue ? (float)(elapsedScanTimeSec.Value * 1000.0) : float.NaN, + (float)scanStats.LowMass, (float)scanStats.HighMass, + charge ?? -1, masterScan ?? -1, + filterLen, arraysOffset, metadataOffset, (uint)metadataLength); + + // 7. Emit ─────────────────────────────────────────────────────── + bw.Write(_hdrBuffer, 0, RecordFixedHeaderSize); + + if (filterLen > 0) bw.Write(filterBytes, 0, filterLen); + WritePad(bw, filterPadLen); + + WritePeakArrays(bw, masses, intensities, nPeaks); + WritePad(bw, peakPadLen); + + if (metadataLength > 0) + { + bw.Write(_metaScratch.GetBuffer(), 0, metadataLength); + WritePad(bw, metadataPadLen); + } + + WritePad(bw, finalPadLen); + } + + // ─── Helpers ───────────────────────────────────────────────────────── + + /// Read trailer with graceful fallback if vendor metadata is unavailable. + private ScanTrailer LoadTrailer(IRawDataPlus rawFile, int scanNumber) + { + try + { + return new ScanTrailer(rawFile.GetTrailerExtraInformation(scanNumber)); + } + catch (Exception ex) + { + Log.WarnFormat("Cannot load trailer for scan {0}: {1}", scanNumber, ex.Message); + ParseInput.NewWarn(); + return new ScanTrailer(); + } + } + + /// Read mz/intensity arrays with graceful fallback to empty arrays on failure. + private MZData ReadPeakData(IRawDataPlus rawFile, IScanEvent scanEvent, int scanNumber, bool centroid) + { + try + { + return ReadMZData(rawFile, scanEvent, scanNumber, centroid, + /*charge per peak*/ false, /*noise data*/ false); + } + catch (Exception ex) + { + Log.WarnFormat("Cannot read peaks for scan {0}: {1}", scanNumber, ex.Message); + ParseInput.NewWarn(); + return new MZData + { + masses = Array.Empty(), + intensities = Array.Empty(), + isCentroided = false, + }; + } + } + + /// + /// Resolve precursor and reaction info for an MSn scan, accounting for EThcD + /// (ETD/ECD followed by HCD/CID supplemental activation). The primary reaction's + /// activation type is encoded; the EThcD-specific code (5) is set when the + /// instrument's SupplementalActivation flag is on AND the sequential + /// reaction pattern matches. + /// + private void ResolveReactionData( + IRawDataPlus rawFile, IScanEvent scanEvent, int scanNumber, int msLevel, + double? monoisotopicMz, double? isolationWidthTrailer, int? masterScan, + out double precursorMz, out float isolationWidth, + out float isolationLowerOffset, out float isolationUpperOffset, + out float collisionEnergy, out float precursorIntensity, out byte activationType) + { + precursorMz = double.NaN; + isolationWidth = float.NaN; + isolationLowerOffset = float.NaN; + isolationUpperOffset = float.NaN; + collisionEnergy = float.NaN; + precursorIntensity = 0f; + activationType = 0; + + // Determine the primary reaction index. FindLastReaction (defined on the base class) + // walks the reaction chain and accounts for supplemental activation; calling it + // ensures EThcD-style spectra report the ETD/ECD reaction as primary, not the + // supplemental HCD/CID. + int primaryReactionIndex; + try + { + primaryReactionIndex = FindLastReaction(scanEvent, msLevel); + } + catch + { + // Fall back to the conventional last reaction + IReaction fallback = GetReaction(scanEvent, scanNumber); + if (fallback != null) + { + SetReactionFields(rawFile, scanNumber, msLevel, fallback, + monoisotopicMz, isolationWidthTrailer, masterScan, + out precursorMz, out isolationWidth, + out isolationLowerOffset, out isolationUpperOffset, + out collisionEnergy, out precursorIntensity); + activationType = EncodeActivationType(fallback.ActivationType); + } + return; + } + + IReaction primaryReaction; + try { primaryReaction = scanEvent.GetReaction(primaryReactionIndex); } + catch + { + Log.Warn($"Cannot get primary reaction for scan {scanNumber}"); + return; + } + + SetReactionFields(rawFile, scanNumber, msLevel, primaryReaction, + monoisotopicMz, isolationWidthTrailer, masterScan, + out precursorMz, out isolationWidth, + out isolationLowerOffset, out isolationUpperOffset, + out collisionEnergy, out precursorIntensity); + + activationType = EncodeActivationType(primaryReaction.ActivationType); + + // EThcD detection: supplemental activation flag is on AND primary reaction is + // ETD/ECD AND a HCD/CID reaction follows it. + if (scanEvent.SupplementalActivation == TriState.On + && (primaryReaction.ActivationType == ActivationType.ElectronTransferDissociation + || primaryReaction.ActivationType == ActivationType.ElectronCaptureDissociation)) + { + try + { + var supplemental = scanEvent.GetReaction(primaryReactionIndex + 1); + if (supplemental.ActivationType == ActivationType.HigherEnergyCollisionalDissociation + || supplemental.ActivationType == ActivationType.CollisionInducedDissociation) + { + activationType = 5; // EThcD + } + } + catch { /* no supplemental — keep primary encoding */ } + } + } + + private void SetReactionFields( + IRawDataPlus rawFile, int scanNumber, int msLevel, IReaction reaction, + double? monoisotopicMz, double? isolationWidthTrailer, int? masterScan, + out double precursorMz, out float isolationWidth, + out float isolationLowerOffset, out float isolationUpperOffset, + out float collisionEnergy, out float precursorIntensity) + { + precursorMz = CalculateSelectedIonMz(reaction, monoisotopicMz, isolationWidthTrailer); + collisionEnergy = (float)reaction.CollisionEnergy; + + double iw = isolationWidthTrailer ?? reaction.IsolationWidth; + isolationWidth = (float)iw; + isolationLowerOffset = (float)(-iw / 2.0); + isolationUpperOffset = (float)( iw / 2.0); + + precursorIntensity = 0f; + if (masterScan.HasValue && masterScan.Value > 0 && precursorMz > 0) + { + try + { + precursorIntensity = (float)CalculatePrecursorPeakIntensity( + rawFile, masterScan.Value, reaction.PrecursorMass, isolationWidthTrailer, + ParseInput.NoPeakPicking.Contains(msLevel - 1)); + } + catch { /* graceful degradation: leave at 0 */ } + } + } + + /// + /// Bulk-emit the SoA peak arrays. The mz array is a zero-copy reinterpret of the + /// existing double[]; the intensity array requires an f64→f32 narrowing pass + /// over a pooled float[]. The narrowing loop is auto-vectorized by RyuJIT + /// (AVX2/AVX-512/NEON). + /// + private static void WritePeakArrays(BinaryWriter bw, double[] masses, double[] intensities, int nPeaks) + { + if (nPeaks == 0) return; + + // mz: zero-copy reinterpret of double[] as bytes + ReadOnlySpan mzSpan = MemoryMarshal.AsBytes(masses.AsSpan(0, nPeaks)); + bw.Write(mzSpan); + + // intensity: f64 → f32 narrow into pooled buffer, emit as bytes + float[] intBuf = ArrayPool.Shared.Rent(nPeaks); + try + { + for (int i = 0; i < nPeaks; i++) intBuf[i] = (float)intensities[i]; + ReadOnlySpan intSpan = MemoryMarshal.AsBytes(intBuf.AsSpan(0, nPeaks)); + bw.Write(intSpan); + } + finally + { + ArrayPool.Shared.Return(intBuf); + } + } + + /// + /// Build the optional metadata block into the reusable scratch buffer. + /// Returns the number of valid bytes; callers should slice _metaScratch.GetBuffer() + /// to that length. + /// Format: u32 n_pairs followed by (u16 klen, kbytes, u16 vlen, vbytes) entries. + /// + private int BuildMetadataBlock(ScanTrailer trailer) + { + if (trailer.Length == 0) return 0; + + _metaScratch.SetLength(0); + using (var bw = new BinaryWriter(_metaScratch, Encoding.UTF8, leaveOpen: true)) + { + bw.Write((uint)trailer.Length); + var labels = trailer.Labels; + var values = trailer.Values; + for (int i = 0; i < labels.Length; i++) + { + WriteLengthPrefixed(bw, labels[i] ?? ""); + WriteLengthPrefixed(bw, values[i] ?? ""); + } + bw.Flush(); + } + return (int)_metaScratch.Length; + } + + private static void WriteLengthPrefixed(BinaryWriter bw, string s) + { + byte[] bytes = Encoding.UTF8.GetBytes(s); + int len = Math.Min(bytes.Length, MaxKeyOrValueLen); + bw.Write((ushort)len); + if (len > 0) bw.Write(bytes, 0, len); + } + + /// + /// Pack the 128-byte fixed scalar header into at offset 0. + /// All offsets are documented in BINARY_SOA_FORMAT.md and must stay in sync. + /// + private static void FillRecordHeader(byte[] dest, + int recordSize, int scanId, int msLevel, PolarityType polarity, + byte scanDataType, byte activationType, int nPeaks, + double retentionTimeSeconds, + double precursorMz, double precursorMzMonoisotopic, double basePeakMz, + float isolationLowerOffset, float isolationUpperOffset, float isolationWidth, + float precursorIntensity, float basePeakIntensity, float totalIonCurrent, + float ionInjectionTimeMs, float collisionEnergy, float faimsCv, + float elapsedScanTimeMs, float lowMass, float highMass, + int precursorCharge, int masterScanNumber, + int filterStringLen, uint arraysOffset, + uint metadataOffset, uint metadataLength) + { + var span = dest.AsSpan(0, RecordFixedHeaderSize); + + // Block 1 — identity & shape (16 bytes) + BinaryPrimitives_WriteU32(span, 0, (uint)recordSize); + BinaryPrimitives_WriteU32(span, 4, (uint)scanId); + span[8] = (byte)msLevel; + span[9] = EncodePolarity(polarity); + span[10] = scanDataType; + span[11] = activationType; + BinaryPrimitives_WriteU32(span, 12, (uint)nPeaks); + + // Block 2 — doubles (32 bytes) + BinaryPrimitives_WriteF64(span, 16, retentionTimeSeconds); + BinaryPrimitives_WriteF64(span, 24, precursorMz); + BinaryPrimitives_WriteF64(span, 32, precursorMzMonoisotopic); + BinaryPrimitives_WriteF64(span, 40, basePeakMz); + + // Block 3 — floats (48 bytes) + BinaryPrimitives_WriteF32(span, 48, isolationLowerOffset); + BinaryPrimitives_WriteF32(span, 52, isolationUpperOffset); + BinaryPrimitives_WriteF32(span, 56, isolationWidth); + BinaryPrimitives_WriteF32(span, 60, precursorIntensity); + BinaryPrimitives_WriteF32(span, 64, basePeakIntensity); + BinaryPrimitives_WriteF32(span, 68, totalIonCurrent); + BinaryPrimitives_WriteF32(span, 72, ionInjectionTimeMs); + BinaryPrimitives_WriteF32(span, 76, collisionEnergy); + BinaryPrimitives_WriteF32(span, 80, faimsCv); + BinaryPrimitives_WriteF32(span, 84, elapsedScanTimeMs); + BinaryPrimitives_WriteF32(span, 88, lowMass); + BinaryPrimitives_WriteF32(span, 92, highMass); + + // Block 4 — ints & flags (16 bytes) + BinaryPrimitives_WriteI32(span, 96, precursorCharge); + BinaryPrimitives_WriteI32(span, 100, masterScanNumber); + BinaryPrimitives_WriteU32(span, 104, 0x1u); // peak_flags: bit 0 = sorted_by_mz + BinaryPrimitives_WriteU32(span, 108, 0u); // reserved1 + + // Block 5 — variable-section pointers (16 bytes) + BinaryPrimitives_WriteU16(span, 112, (ushort)filterStringLen); + BinaryPrimitives_WriteU16(span, 114, 0); // reserved2 + BinaryPrimitives_WriteU32(span, 116, arraysOffset); + BinaryPrimitives_WriteU32(span, 120, metadataOffset); + BinaryPrimitives_WriteU32(span, 124, metadataLength); + } + + // ─── Encoding helpers ──────────────────────────────────────────────── + + /// Map RawFileReader's polarity enum to the on-wire byte encoding. + public static byte EncodePolarity(PolarityType p) => p switch + { + PolarityType.Negative => 0, + PolarityType.Positive => 1, + _ => 255, + }; + + /// + /// Map a single reaction's to the on-wire byte encoding. + /// EThcD (5) is set by the caller when the supplemental-activation pattern is detected; + /// this helper returns the primary type only. + /// + public static byte EncodeActivationType(ActivationType t) => t switch + { + ActivationType.CollisionInducedDissociation => 1, + ActivationType.HigherEnergyCollisionalDissociation => 2, + ActivationType.ElectronTransferDissociation => 3, + ActivationType.ElectronCaptureDissociation => 4, + // 5 reserved for caller-detected EThcD + ActivationType.UltraVioletPhotoDissociation => 6, + ActivationType.NegativeElectronTransferDissociation => 7, + ActivationType.MultiPhotonDissociation => 8, + ActivationType.PQD => 9, + ActivationType.ProtonTransferReaction => 10, + ActivationType.NegativeProtonTransferReaction => 11, + _ => 255, + }; + + // ─── Pad helpers ───────────────────────────────────────────────────── + + /// Number of zero pad bytes to align to . + public static int ComputePadLen(int currentLen, int alignment) + { + int remainder = currentLen % alignment; + return remainder == 0 ? 0 : alignment - remainder; + } + + private static void WritePad(BinaryWriter bw, int n) + { + if (n > 0) bw.Write(ZeroPad, 0, n); + } + + private static float ToFloatOrNaN(double? v) => v.HasValue ? (float)v.Value : float.NaN; + + // ─── Inline little-endian writers ──────────────────────────────────── + // We avoid the BCL System.Buffers.Binary.BinaryPrimitives namespace import to keep + // the writer's dependencies minimal and self-documenting. These match its semantics + // exactly (little-endian, no allocation, JIT-inlinable). + + private static void BinaryPrimitives_WriteU16(Span dest, int off, ushort v) + { + dest[off] = (byte)(v & 0xFF); + dest[off + 1] = (byte)((v >> 8) & 0xFF); + } + private static void BinaryPrimitives_WriteU32(Span dest, int off, uint v) + { + dest[off] = (byte)(v & 0xFF); + dest[off + 1] = (byte)((v >> 8) & 0xFF); + dest[off + 2] = (byte)((v >> 16) & 0xFF); + dest[off + 3] = (byte)((v >> 24) & 0xFF); + } + private static void BinaryPrimitives_WriteI32(Span dest, int off, int v) + => BinaryPrimitives_WriteU32(dest, off, unchecked((uint)v)); + private static void BinaryPrimitives_WriteF32(Span dest, int off, float v) + { + uint bits = BitConverter.SingleToUInt32Bits(v); + BinaryPrimitives_WriteU32(dest, off, bits); + } + private static void BinaryPrimitives_WriteF64(Span dest, int off, double v) + { + ulong bits = BitConverter.DoubleToUInt64Bits(v); + dest[off] = (byte)(bits & 0xFF); + dest[off + 1] = (byte)((bits >> 8) & 0xFF); + dest[off + 2] = (byte)((bits >> 16) & 0xFF); + dest[off + 3] = (byte)((bits >> 24) & 0xFF); + dest[off + 4] = (byte)((bits >> 32) & 0xFF); + dest[off + 5] = (byte)((bits >> 40) & 0xFF); + dest[off + 6] = (byte)((bits >> 48) & 0xFF); + dest[off + 7] = (byte)((bits >> 56) & 0xFF); + } + } +} diff --git a/Writer/SpectrumWriter.cs b/Writer/SpectrumWriter.cs index 2176077..4dc8fd1 100644 --- a/Writer/SpectrumWriter.cs +++ b/Writer/SpectrumWriter.cs @@ -85,9 +85,21 @@ protected void ConfigureWriter(string extension) } var fileName = NormalizeFileName(ParseInput.OutputFile, extension, ParseInput.Gzip); - if (ParseInput.OutputFormat == OutputFormat.Parquet) + if (ParseInput.OutputFormat == OutputFormat.Parquet + || ParseInput.OutputFormat == OutputFormat.BinarySoa) { - Writer = new StreamWriter(File.Create(fileName)); + // Binary outputs: raw FileStream, no encoding wrapper. + // Writers will reach Writer.BaseStream and write raw bytes. + if (ParseInput.Gzip && ParseInput.OutputFormat == OutputFormat.BinarySoa) + { + var fileStream = File.Create(fileName); + var compress = new GZipStream(fileStream, CompressionMode.Compress); + Writer = new StreamWriter(compress); + } + else + { + Writer = new StreamWriter(File.Create(fileName)); + } } else if (!ParseInput.Gzip || ParseInput.OutputFormat == OutputFormat.IndexMzML) {