diff --git a/BINARY_SOA_FORMAT.md b/BINARY_SOA_FORMAT.md
new file mode 100644
index 0000000..dbbf99d
--- /dev/null
+++ b/BINARY_SOA_FORMAT.md
@@ -0,0 +1,247 @@
+# RCIA Binary SoA Stream Format — v1
+
+A lossless, low-overhead binary format for streaming Thermo RAW spectrum
+data into downstream consumers. Designed to be:
+
+1. **Lossless** — every scan-level value RawFileReader exposes is captured
+2. **Streamable** — sequence of self-describing records over a pipe
+3. **Zero-copy on the read side** — SoA arrays at known offsets, naturally aligned
+4. **Gracefully nullable** — missing values use NaN (floats) or `-1` / `0` sentinels (integers)
+5. **Forward-compatible** — variable-length sections at offsets, fixed scalar fields at known positions
+
+All multi-byte integers and floats are **little-endian** (matches all modern x86/ARM CPUs).
+Floats follow IEEE 754.
+
+## Streaming vs file output
+
+The writer produces the **identical byte format** in both modes; the difference
+is only the destination and a few I/O tuning details:
+
+| Mode | Invocation | Destination | Buffering |
+|------|-----------|-------------|-----------|
+| **Streaming** (primary use) | `--format=BinarySoa --stdout` | parent process via OS pipe | 1 MB BufferedStream wrap |
+| **File** (sidecar/cache) | `--format=BinarySoa --output=path.rcia.bin` | regular file | 1 MB BufferedStream wrap |
+
+Streaming is the primary use case; downstream pipelines consume records as they
+arrive without materializing the whole run on disk. File output exists so the
+same downstream consumer can replay a file later (e.g., re-search the same RAW
+with different parameters without re-reading via RawFileReader).
+
+A consumer cannot tell the two modes apart from the byte stream — the writer
+emits the same file header, records, and EOF marker regardless of destination.
+
+---
+
+## Byte order
+
+All values are little-endian. The format does not currently include a BOM;
+the file-header magic `RCIASTR1` doubles as an endianness check (its byte
+sequence is direction-sensitive).
+
+---
+
+## File header — written once at stream start (32 bytes)
+
+| Off | Size | Type | Field | Notes |
+|-----|------|----------|--------------------|-------|
+| 0 | 8 | `[u8;8]` | `magic` | ASCII bytes `R C I A S T R 1` (0x52 0x43 0x49 0x41 0x53 0x54 0x52 0x31) |
+| 8 | 2 | `u16` | `format_version` | `1` for this spec |
+| 10 | 2 | `u16` | `file_header_size` | `32`. Readers must skip past this many bytes from start to reach first record. |
+| 12 | 4 | `u32` | `flags` | Reserved, `0` |
+| 16 | 16 | `[u8;16]`| `reserved` | Zero-filled |
+
+---
+
+## Per-spectrum record
+
+A record consists of:
+
+```
+[Fixed Header — 128 bytes, all scalar fields present at known offsets]
+[Filter string — filter_string_len bytes, UTF-8, no null terminator]
+[Pad to 8-byte alignment]
+[Peak arrays — mz f64×N then intensity f32×N, packed]
+[Pad to 8-byte alignment]
+[Optional metadata dump — key/value strings of all trailer fields]
+[Pad to 8-byte alignment, brings record to multiple of 8]
+```
+
+### Fixed header (128 bytes)
+
+| Off | Size | Type | Field | Nullability |
+|-----|------|------|-------|-------------|
+| 0 | 4 | `u32` | `record_size` | Total bytes of this record (including padding); used to skip to next record. |
+| 4 | 4 | `u32` | `scan_id` | RAW file scan number. |
+| 8 | 1 | `u8` | `ms_level` | `1`, `2`, `3`, ... |
+| 9 | 1 | `u8` | `polarity` | `0` = negative, `1` = positive, `255` = unknown |
+| 10 | 1 | `u8` | `scan_data_type` | `0` = profile, `1` = centroid |
+| 11 | 1 | `u8` | `activation_type` | See enum below; `0` = none, `255` = unknown |
+| 12 | 4 | `u32` | `n_peaks` | May be `0`. |
+| 16 | 8 | `f64` | `retention_time_seconds` | Always present. |
+| 24 | 8 | `f64` | `precursor_mz` | NaN if MS1 or unavailable. |
+| 32 | 8 | `f64` | `precursor_mz_monoisotopic` | NaN if no monoisotopic correction was reported. |
+| 40 | 8 | `f64` | `base_peak_mz` | NaN if no peaks. |
+| 48 | 4 | `f32` | `isolation_lower_offset` | NaN if MS1 or unavailable. (Da, signed offset from precursor_mz to lower bound.) |
+| 52 | 4 | `f32` | `isolation_upper_offset` | NaN if MS1 or unavailable. (Da, signed offset from precursor_mz to upper bound.) |
+| 56 | 4 | `f32` | `isolation_width` | NaN if MS1. |
+| 60 | 4 | `f32` | `precursor_intensity` | `0.0` if MS1 or not computed. |
+| 64 | 4 | `f32` | `base_peak_intensity` | NaN if no peaks. |
+| 68 | 4 | `f32` | `total_ion_current` | NaN if not available. |
+| 72 | 4 | `f32` | `ion_injection_time_ms` | NaN if not in trailer. |
+| 76 | 4 | `f32` | `collision_energy` | NaN if MS1 or unavailable. |
+| 80 | 4 | `f32` | `faims_compensation_voltage` | NaN if no FAIMS. |
+| 84 | 4 | `f32` | `elapsed_scan_time_ms` | NaN if not available. |
+| 88 | 4 | `f32` | `low_mass` | Scan range start (Da). NaN if not available. |
+| 92 | 4 | `f32` | `high_mass` | Scan range end (Da). NaN if not available. |
+| 96 | 4 | `i32` | `precursor_charge` | `-1` if unknown. |
+| 100 | 4 | `i32` | `master_scan_number` | `-1` if no parent (MS1) or unknown. |
+| 104 | 4 | `u32` | `peak_flags` | Bit 0: `peaks_sorted_by_mz` (set = sorted ascending). Bits 1-31: reserved. |
+| 108 | 4 | `u32` | `reserved1` | `0` |
+| 112 | 2 | `u16` | `filter_string_len` | Bytes (UTF-8) of filter string immediately after header. |
+| 114 | 2 | `u16` | `reserved2` | `0` |
+| 116 | 4 | `u32` | `arrays_offset` | Byte offset (from record start) to the mz array. Always `>= 128 + filter_string_len`. |
+| 120 | 4 | `u32` | `metadata_offset` | Byte offset to optional metadata key/value block. `0` if absent. |
+| 124 | 4 | `u32` | `metadata_length` | Bytes of metadata block. `0` if absent. |
+
+Total: **128 bytes** (two 64-byte cache lines).
+
+### Activation type enum (`u8`)
+
+| Value | Meaning |
+|-------|---------|
+| 0 | None (MS1 or no activation reported) |
+| 1 | CID (Collision-Induced Dissociation) |
+| 2 | HCD (Higher-Energy Collisional Dissociation) |
+| 3 | ETD (Electron Transfer Dissociation) |
+| 4 | ECD (Electron Capture Dissociation) |
+| 5 | EThcD (ETD + HCD supplemental) |
+| 6 | UVPD (Ultraviolet Photodissociation) |
+| 7 | NETD (Negative Electron Transfer) |
+| 8 | MPD (Multi-Photon Dissociation) |
+| 9 | PQD (Pulsed Q Dissociation) |
+| 10 | PTR (Proton Transfer Reaction) |
+| 11 | nPTR (Negative Proton Transfer Reaction) |
+| 255 | Other / unknown |
+
+Note: `5` (EThcD) is reserved for caller-detected EThcD (ETD/ECD + supplemental HCD/CID); the
+encoder maps the primary reaction's `ActivationType` and leaves EThcD detection to a
+higher layer that inspects sequential reactions.
+
+### Filter string
+
+Immediately after the fixed header, `filter_string_len` UTF-8 bytes of the
+Thermo filter string (e.g., `"FTMS + p NSI Full ms2 408.5142@hcd28.00 [115.0000-1700.0000]"`).
+No null terminator. May be empty (`filter_string_len = 0`).
+
+### Peak arrays (at `arrays_offset`)
+
+```
+arrays_offset + 0 : f64 mz_array[N] (8-byte aligned)
+arrays_offset + 8N : f32 intensity_array[N] (4-byte aligned)
+```
+
+`mz_array` is sorted ascending if `peak_flags & 0x1` is set (which is the
+default for all output produced by the writer).
+
+### Metadata block (optional, at `metadata_offset`)
+
+If `metadata_length > 0`, a verbatim dump of all key/value pairs from the
+RAW file's per-scan trailer. This captures every vendor-reported value
+(AGC target, conversion parameters, all "MS{N} ..." fields, etc.)
+without selective filtering.
+
+```
+metadata_offset + 0 : u32 n_pairs
+metadata_offset + 4 : repeating block:
+ u16 key_len
+ key_len bytes (UTF-8)
+ u16 value_len
+ value_len bytes (UTF-8)
+```
+
+### Padding
+
+After the last peak/metadata byte, the record is zero-padded so that
+`record_size` is a multiple of 8. This guarantees the next record starts
+at an 8-byte boundary, so its `f64` fields can be cast/loaded aligned.
+
+---
+
+## End-of-stream marker
+
+After the last spectrum, the writer emits a single `u32` with value `0`.
+Readers should interpret a record_size of `0` as end-of-stream.
+
+```
+... last record bytes ...
+[ u32 record_size = 0 ]
+```
+
+---
+
+## Reading example (Rust pseudocode)
+
+```rust
+let mut hdr = [0u8; 128];
+loop {
+ if reader.read_exact(&mut hdr[..4]).is_err() { break; }
+ let record_size = u32::from_le_bytes(hdr[..4].try_into().unwrap());
+ if record_size == 0 { break; } // EOF marker
+
+ reader.read_exact(&mut hdr[4..128])?;
+ let filter_string_len = u16::from_le_bytes(hdr[112..114].try_into().unwrap()) as usize;
+ let arrays_offset = u32::from_le_bytes(hdr[116..120].try_into().unwrap()) as usize;
+ let metadata_offset = u32::from_le_bytes(hdr[120..124].try_into().unwrap()) as usize;
+ let metadata_length = u32::from_le_bytes(hdr[124..128].try_into().unwrap()) as usize;
+ let n_peaks = u32::from_le_bytes(hdr[12..16].try_into().unwrap()) as usize;
+
+ // Read remaining bytes
+ let mut rest = vec![0u8; (record_size as usize) - 128];
+ reader.read_exact(&mut rest)?;
+
+ let filter_string = std::str::from_utf8(&rest[0..filter_string_len])?;
+
+ let mz_start = arrays_offset - 128;
+ let int_start = mz_start + 8 * n_peaks;
+ let mz_bytes = &rest[mz_start .. mz_start + 8 * n_peaks];
+ let intensity_bytes = &rest[int_start .. int_start + 4 * n_peaks];
+ let mz: &[f64] = bytemuck::cast_slice(mz_bytes);
+ let intensity: &[f32] = bytemuck::cast_slice(intensity_bytes);
+
+ // ... process spectrum ...
+}
+```
+
+---
+
+## Versioning policy
+
+* Increments to `format_version` are **breaking** — readers should reject
+ unknown versions explicitly.
+* New fields may be appended **without** a version bump if and only if they
+ live in a new variable-length section pointed to by an offset that was
+ reserved as `0` in v1. Readers tolerant to v1 still skip new sections
+ cleanly because `record_size` accounts for them.
+* Bits in `peak_flags` and `reserved*` may be allocated for non-breaking
+ signals (e.g., a future "has_charge_per_peak" flag).
+
+---
+
+## Design rationale
+
+* **128-byte header** = exactly two cache lines on x86/ARM.
+* **f64 mz, f32 intensity** = matches RawFileReader's native output;
+ matches PyTorch tensor channel layouts; eliminates downstream conversion.
+* **SoA layout** = each downstream consumer that reads only `mz` (e.g.,
+ fragment matching) loads only the `mz_array` cache lines, not interleaved
+ intensity bytes.
+* **NaN sentinels** for floats and `-1` for signed ints = no separate
+ presence bitmask required for most fields; downstream code handles
+ missing values with the same arithmetic (NaN propagates, `-1` is a
+ trivial branch).
+* **Optional metadata dump** = preserves every per-scan trailer value
+ without enumerating them in the header. Engines can mine for
+ instrument-specific signals later (lock-mass calibration, AGC stats,
+ conversion parameters) without format changes.
+* **Variable-section offsets** = forward-compat: readers seek to known
+ offsets and ignore unfamiliar tails.
diff --git a/MainClass.cs b/MainClass.cs
index 2a75d4f..7f942cb 100644
--- a/MainClass.cs
+++ b/MainClass.cs
@@ -528,7 +528,7 @@ private static void RegularParametersParsing(string[] args)
},
{
"f=|format=",
- "The spectra output format: 0 for MGF, 1 for mzML, 2 for indexed mzML, 3 for Parquet, 4 for None (no output); both numeric and text (case insensitive) value recognized. Defaults to indexed mzML if no format is specified.",
+ "The spectra output format: 0 for MGF, 1 for mzML, 2 for indexed mzML, 3 for Parquet, 4 for BinarySoa (RCIA streaming binary, see BINARY_SOA_FORMAT.md), 5 for None (no output); both numeric and text (case insensitive) value recognized. Defaults to indexed mzML if no format is specified.",
v => outputFormatString = v
},
{
diff --git a/OutputFormat.cs b/OutputFormat.cs
index 21b85d3..371fa5f 100644
--- a/OutputFormat.cs
+++ b/OutputFormat.cs
@@ -6,6 +6,7 @@ public enum OutputFormat
MzML,
IndexMzML,
Parquet,
+ BinarySoa,
None
}
diff --git a/RawFileParser.cs b/RawFileParser.cs
index f539494..3ee9ba6 100755
--- a/RawFileParser.cs
+++ b/RawFileParser.cs
@@ -181,6 +181,10 @@ private static void ProcessFile(ParseInput parseInput)
spectrumWriter = new ParquetSpectrumWriter(parseInput);
spectrumWriter.Write(rawFile, firstScanNumber, lastScanNumber);
break;
+ case OutputFormat.BinarySoa:
+ spectrumWriter = new BinarySoaSpectrumWriter(parseInput);
+ spectrumWriter.Write(rawFile, firstScanNumber, lastScanNumber);
+ break;
}
}
diff --git a/Writer/BinarySoaSpectrumWriter.cs b/Writer/BinarySoaSpectrumWriter.cs
new file mode 100644
index 0000000..69d400f
--- /dev/null
+++ b/Writer/BinarySoaSpectrumWriter.cs
@@ -0,0 +1,647 @@
+using log4net;
+using System;
+using System.Buffers;
+using System.IO;
+using System.Reflection;
+using System.Runtime.InteropServices;
+using System.Text;
+using ThermoFisher.CommonCore.Data;
+using ThermoFisher.CommonCore.Data.Business;
+using ThermoFisher.CommonCore.Data.FilterEnums;
+using ThermoFisher.CommonCore.Data.Interfaces;
+using ThermoRawFileParser.Util;
+
+namespace ThermoRawFileParser.Writer
+{
+ ///
+ /// Streams every selected scan as a self-describing binary record laid out as
+ /// Structure-of-Arrays: a 128-byte fixed scalar header, optional UTF-8 filter
+ /// string, then packed f64 mz[N] + f32 intensity[N] arrays, and
+ /// finally an optional verbatim trailer key/value dump. End-of-stream marker is
+ /// a single u32 = 0 word.
+ ///
+ /// Designed for high-throughput downstream consumers (Rust engines, GPU
+ /// rescorers, columnar database loaders) that prefer zero-copy ingestion over
+ /// portable XML.
+ ///
+ /// Two operating modes share a single byte format:
+ ///
+ /// - Streaming (--stdout): records flow into a downstream
+ /// process via a pipe. This is the primary use case; the writer wraps the
+ /// pipe in a 1 MB to collapse small-write syscalls.
+ /// - File (--output): the identical byte format is written to
+ /// disk so the same downstream consumer can replay it later (sidecar caching).
+ ///
+ ///
+ /// Format spec: BINARY_SOA_FORMAT.md at the repo root.
+ ///
+ public class BinarySoaSpectrumWriter : SpectrumWriter
+ {
+ private static readonly ILog Log =
+ LogManager.GetLogger(MethodBase.GetCurrentMethod().DeclaringType);
+
+ // ─── Format constants ────────────────────────────────────────────────
+
+ /// Magic bytes at the start of every stream: ASCII "RCIASTR1".
+ private static readonly byte[] FileMagic =
+ { 0x52, 0x43, 0x49, 0x41, 0x53, 0x54, 0x52, 0x31 };
+
+ /// Bumping this is a breaking change; readers must reject unknown versions.
+ public const ushort FormatVersion = 1;
+
+ /// File-level header is 32 bytes, written once at stream start.
+ public const int FileHeaderSize = 32;
+
+ /// Per-record fixed scalar header is 128 bytes (two cache lines).
+ public const int RecordFixedHeaderSize = 128;
+
+ /// Output buffer size for the BufferedStream wrapping BaseStream.
+ private const int OutputBufferSize = 1 * 1024 * 1024;
+
+ /// Caps for u16-sized variable sections; values longer are truncated with a warning.
+ private const int MaxFilterStringLen = ushort.MaxValue;
+ private const int MaxKeyOrValueLen = ushort.MaxValue;
+
+ /// Default extension used for file output mode.
+ private const string OutputExtension = ".rcia.bin";
+
+ // ─── Reusable per-instance buffers (writer is single-threaded per file) ──
+
+ /// 128-byte scratch for assembling each record's fixed header.
+ private readonly byte[] _hdrBuffer = new byte[RecordFixedHeaderSize];
+
+ /// Scratch for building the optional metadata block.
+ /// Reset between spectra; capacity grows monotonically as needed.
+ private readonly MemoryStream _metaScratch = new MemoryStream(2048);
+
+ /// Reusable zero-pad source; pad lengths are 0..7 so 8 bytes is always sufficient.
+ private static readonly byte[] ZeroPad = new byte[8];
+
+ // ─── Public ctor ─────────────────────────────────────────────────────
+
+ public BinarySoaSpectrumWriter(ParseInput parseInput) : base(parseInput) { }
+
+ // ─── Top-level write loop ────────────────────────────────────────────
+
+ ///
+ public override void Write(IRawDataPlus rawFile, int firstScanNumber, int lastScanNumber)
+ {
+ if (!rawFile.HasMsData)
+ {
+ throw new RawFileParserException("No MS data in RAW file, no output will be produced");
+ }
+
+ ConfigureWriter(OutputExtension);
+
+ // Bypass StreamWriter's text encoding entirely. Wrap BaseStream in a 1 MB
+ // BufferedStream to coalesce many small writes into a few large pipe syscalls
+ // — this dropped sys time ~22× on a 3.7 GB RAW benchmark.
+ // leaveOpen:true keeps the BinaryWriter from closing BaseStream during dispose
+ // chaining; the outer `using (Writer)` is responsible for the final close+flush.
+ using (Writer)
+ using (var buffered = new BufferedStream(Writer.BaseStream, OutputBufferSize))
+ using (var bw = new BinaryWriter(buffered, Encoding.UTF8, leaveOpen: true))
+ {
+ WriteFileHeader(bw);
+
+ int totalScans = lastScanNumber - firstScanNumber + 1;
+ Log.Info("Processing " + totalScans + " scans");
+ int lastScanProgress = 0;
+ int written = 0;
+
+ for (int scanNumber = firstScanNumber; scanNumber <= lastScanNumber; scanNumber++)
+ {
+ ReportProgress(scanNumber, firstScanNumber, lastScanNumber, ref lastScanProgress);
+
+ try
+ {
+ // Apply MS-level filter (matches MzML/Parquet writer behavior).
+ int level = (int)rawFile.GetScanEventForScanNumber(scanNumber).MSOrder;
+ if (level > ParseInput.MaxLevel) continue;
+ if (!ParseInput.MsLevel.Contains(level)) continue;
+
+ WriteRecord(bw, rawFile, scanNumber, level);
+ written++;
+ }
+ catch (Exception ex)
+ {
+ Log.Error($"Scan #{scanNumber} cannot be processed: {ex.Message}");
+ Log.Debug($"{ex.StackTrace}\n{ex.InnerException}");
+ ParseInput.NewError();
+ }
+ }
+
+ // End-of-stream marker (u32 record_size = 0)
+ bw.Write((uint)0);
+ bw.Flush();
+ buffered.Flush();
+
+ if (ParseInput.LogFormat == LogFormat.DEFAULT) Console.Error.WriteLine();
+ Log.Info($"Wrote {written}/{totalScans} spectra to binary SoA stream");
+ }
+ }
+
+ private void WriteFileHeader(BinaryWriter bw)
+ {
+ bw.Write(FileMagic); // 8 bytes
+ bw.Write(FormatVersion); // u16
+ bw.Write((ushort)FileHeaderSize); // u16
+ bw.Write((uint)0); // flags = 0
+ bw.Write(new byte[16]); // reserved
+ }
+
+ private void ReportProgress(int scanNumber, int firstScanNumber, int lastScanNumber,
+ ref int lastScanProgress)
+ {
+ if (ParseInput.LogFormat != LogFormat.DEFAULT) return;
+ int scanProgress = (int)((double)scanNumber / (lastScanNumber - firstScanNumber + 1) * 100);
+ if (scanProgress % ProgressPercentageStep == 0 && scanProgress != lastScanProgress)
+ {
+ Console.Error.Write("" + scanProgress + "% ");
+ lastScanProgress = scanProgress;
+ }
+ }
+
+ // ─── Per-spectrum record ─────────────────────────────────────────────
+
+ ///
+ /// Pulls all data for a single scan from RawFileReader and emits one binary record.
+ /// The record layout:
+ ///
+ /// [128-byte fixed header]
+ /// [filter_string_len bytes UTF-8 filter string][pad-to-8]
+ /// [f64 mz[N]][f32 intensity[N]][pad-to-8]
+ /// [optional metadata block: u32 n_pairs, then (u16 klen, kbytes, u16 vlen, vbytes)*][pad-to-8]
+ /// [final pad so record_size is a multiple of 8]
+ ///
+ ///
+ private void WriteRecord(BinaryWriter bw, IRawDataPlus rawFile, int scanNumber, int msLevel)
+ {
+ // 1. Pull per-scan data ──────────────────────────────────────────
+ var scanFilter = rawFile.GetFilterForScanNumber(scanNumber);
+ var scanEvent = rawFile.GetScanEventForScanNumber(scanNumber);
+ var scanStats = rawFile.GetScanStatsForScanNumber(scanNumber);
+ double retentionTimeMin = rawFile.RetentionTimeFromScanNumber(scanNumber);
+
+ ScanTrailer trailer = LoadTrailer(rawFile, scanNumber);
+
+ // Trailer-derived nullable scalars
+ int? charge = trailer.AsPositiveInt("Charge State:");
+ double? monoisotopicMz = trailer.AsDouble("Monoisotopic M/Z:");
+ double? ionInjectionTime = trailer.AsDouble("Ion Injection Time (ms):");
+ double? isolationWidthTrailer = trailer.AsDouble("MS" + msLevel + " Isolation Width:");
+ int? masterScan = trailer.AsPositiveInt("Master Scan Number:");
+ double? faimsCv = trailer.AsBool("FAIMS Voltage On:").GetValueOrDefault(false)
+ ? trailer.AsDouble("FAIMS CV:") : null;
+ double? elapsedScanTimeSec = trailer.AsDouble("Elapsed Scan Time (sec):");
+
+ // Reaction (only meaningful for MS2+)
+ double precursorMz = double.NaN;
+ float isolationWidth = float.NaN;
+ float isolationLowerOffset = float.NaN;
+ float isolationUpperOffset = float.NaN;
+ float collisionEnergy = float.NaN;
+ float precursorIntensity = 0f;
+ byte activationType = 0;
+
+ if (msLevel > 1)
+ {
+ ResolveReactionData(rawFile, scanEvent, scanNumber, msLevel, monoisotopicMz,
+ isolationWidthTrailer, masterScan,
+ out precursorMz, out isolationWidth,
+ out isolationLowerOffset, out isolationUpperOffset,
+ out collisionEnergy, out precursorIntensity, out activationType);
+ }
+
+ // Peak data (centroid by default; respect --noPeakPicking selectively)
+ bool requestCentroid = !ParseInput.NoPeakPicking.Contains(msLevel);
+ MZData mzData = ReadPeakData(rawFile, scanEvent, scanNumber, requestCentroid);
+
+ int nPeaks = mzData.masses?.Length ?? 0;
+ double[] masses = mzData.masses ?? Array.Empty();
+ double[] intensities = mzData.intensities ?? Array.Empty();
+ byte scanDataType = (byte)(mzData.isCentroided ? 1 : 0);
+
+ // 2. Encode filter string ────────────────────────────────────────
+ string filterString = scanEvent.ToString() ?? string.Empty;
+ byte[] filterBytes = Encoding.UTF8.GetBytes(filterString);
+ int filterLen = filterBytes.Length;
+ if (filterLen > MaxFilterStringLen)
+ {
+ Log.Warn($"Filter string for scan {scanNumber} truncated from {filterLen} to {MaxFilterStringLen} bytes");
+ filterLen = MaxFilterStringLen;
+ }
+ int filterPadLen = ComputePadLen(RecordFixedHeaderSize + filterLen, 8);
+
+ // 3. Compute peak section sizes ─────────────────────────────────
+ int peakSection = nPeaks * 8 + nPeaks * 4;
+ int peakPadLen = ComputePadLen(peakSection, 8);
+
+ // 4. Build metadata block (full trailer dump) into _metaScratch ──
+ int metadataLength = BuildMetadataBlock(trailer);
+ int metadataPadLen = metadataLength > 0 ? ComputePadLen(metadataLength, 8) : 0;
+
+ // 5. Compute offsets and total record size ───────────────────────
+ uint arraysOffset = (uint)(RecordFixedHeaderSize + filterLen + filterPadLen);
+ uint metadataOffset = metadataLength > 0
+ ? (uint)(arraysOffset + peakSection + peakPadLen)
+ : 0;
+
+ int totalSize =
+ RecordFixedHeaderSize + filterLen + filterPadLen
+ + peakSection + peakPadLen
+ + metadataLength + metadataPadLen;
+ int finalPadLen = ComputePadLen(totalSize, 8);
+ totalSize += finalPadLen;
+
+ // 6. Assemble fixed header into _hdrBuffer ───────────────────────
+ FillRecordHeader(_hdrBuffer,
+ totalSize, scanNumber, msLevel, scanFilter.Polarity, scanDataType, activationType,
+ nPeaks, retentionTimeMin * 60.0,
+ precursorMz, monoisotopicMz ?? double.NaN, mzData.basePeakMass ?? double.NaN,
+ isolationLowerOffset, isolationUpperOffset, isolationWidth,
+ precursorIntensity,
+ (float)(mzData.basePeakIntensity ?? double.NaN),
+ (float)scanStats.TIC,
+ ToFloatOrNaN(ionInjectionTime), collisionEnergy,
+ ToFloatOrNaN(faimsCv),
+ elapsedScanTimeSec.HasValue ? (float)(elapsedScanTimeSec.Value * 1000.0) : float.NaN,
+ (float)scanStats.LowMass, (float)scanStats.HighMass,
+ charge ?? -1, masterScan ?? -1,
+ filterLen, arraysOffset, metadataOffset, (uint)metadataLength);
+
+ // 7. Emit ───────────────────────────────────────────────────────
+ bw.Write(_hdrBuffer, 0, RecordFixedHeaderSize);
+
+ if (filterLen > 0) bw.Write(filterBytes, 0, filterLen);
+ WritePad(bw, filterPadLen);
+
+ WritePeakArrays(bw, masses, intensities, nPeaks);
+ WritePad(bw, peakPadLen);
+
+ if (metadataLength > 0)
+ {
+ bw.Write(_metaScratch.GetBuffer(), 0, metadataLength);
+ WritePad(bw, metadataPadLen);
+ }
+
+ WritePad(bw, finalPadLen);
+ }
+
+ // ─── Helpers ─────────────────────────────────────────────────────────
+
+ /// Read trailer with graceful fallback if vendor metadata is unavailable.
+ private ScanTrailer LoadTrailer(IRawDataPlus rawFile, int scanNumber)
+ {
+ try
+ {
+ return new ScanTrailer(rawFile.GetTrailerExtraInformation(scanNumber));
+ }
+ catch (Exception ex)
+ {
+ Log.WarnFormat("Cannot load trailer for scan {0}: {1}", scanNumber, ex.Message);
+ ParseInput.NewWarn();
+ return new ScanTrailer();
+ }
+ }
+
+ /// Read mz/intensity arrays with graceful fallback to empty arrays on failure.
+ private MZData ReadPeakData(IRawDataPlus rawFile, IScanEvent scanEvent, int scanNumber, bool centroid)
+ {
+ try
+ {
+ return ReadMZData(rawFile, scanEvent, scanNumber, centroid,
+ /*charge per peak*/ false, /*noise data*/ false);
+ }
+ catch (Exception ex)
+ {
+ Log.WarnFormat("Cannot read peaks for scan {0}: {1}", scanNumber, ex.Message);
+ ParseInput.NewWarn();
+ return new MZData
+ {
+ masses = Array.Empty(),
+ intensities = Array.Empty(),
+ isCentroided = false,
+ };
+ }
+ }
+
+ ///
+ /// Resolve precursor and reaction info for an MSn scan, accounting for EThcD
+ /// (ETD/ECD followed by HCD/CID supplemental activation). The primary reaction's
+ /// activation type is encoded; the EThcD-specific code (5) is set when the
+ /// instrument's SupplementalActivation flag is on AND the sequential
+ /// reaction pattern matches.
+ ///
+ private void ResolveReactionData(
+ IRawDataPlus rawFile, IScanEvent scanEvent, int scanNumber, int msLevel,
+ double? monoisotopicMz, double? isolationWidthTrailer, int? masterScan,
+ out double precursorMz, out float isolationWidth,
+ out float isolationLowerOffset, out float isolationUpperOffset,
+ out float collisionEnergy, out float precursorIntensity, out byte activationType)
+ {
+ precursorMz = double.NaN;
+ isolationWidth = float.NaN;
+ isolationLowerOffset = float.NaN;
+ isolationUpperOffset = float.NaN;
+ collisionEnergy = float.NaN;
+ precursorIntensity = 0f;
+ activationType = 0;
+
+ // Determine the primary reaction index. FindLastReaction (defined on the base class)
+ // walks the reaction chain and accounts for supplemental activation; calling it
+ // ensures EThcD-style spectra report the ETD/ECD reaction as primary, not the
+ // supplemental HCD/CID.
+ int primaryReactionIndex;
+ try
+ {
+ primaryReactionIndex = FindLastReaction(scanEvent, msLevel);
+ }
+ catch
+ {
+ // Fall back to the conventional last reaction
+ IReaction fallback = GetReaction(scanEvent, scanNumber);
+ if (fallback != null)
+ {
+ SetReactionFields(rawFile, scanNumber, msLevel, fallback,
+ monoisotopicMz, isolationWidthTrailer, masterScan,
+ out precursorMz, out isolationWidth,
+ out isolationLowerOffset, out isolationUpperOffset,
+ out collisionEnergy, out precursorIntensity);
+ activationType = EncodeActivationType(fallback.ActivationType);
+ }
+ return;
+ }
+
+ IReaction primaryReaction;
+ try { primaryReaction = scanEvent.GetReaction(primaryReactionIndex); }
+ catch
+ {
+ Log.Warn($"Cannot get primary reaction for scan {scanNumber}");
+ return;
+ }
+
+ SetReactionFields(rawFile, scanNumber, msLevel, primaryReaction,
+ monoisotopicMz, isolationWidthTrailer, masterScan,
+ out precursorMz, out isolationWidth,
+ out isolationLowerOffset, out isolationUpperOffset,
+ out collisionEnergy, out precursorIntensity);
+
+ activationType = EncodeActivationType(primaryReaction.ActivationType);
+
+ // EThcD detection: supplemental activation flag is on AND primary reaction is
+ // ETD/ECD AND a HCD/CID reaction follows it.
+ if (scanEvent.SupplementalActivation == TriState.On
+ && (primaryReaction.ActivationType == ActivationType.ElectronTransferDissociation
+ || primaryReaction.ActivationType == ActivationType.ElectronCaptureDissociation))
+ {
+ try
+ {
+ var supplemental = scanEvent.GetReaction(primaryReactionIndex + 1);
+ if (supplemental.ActivationType == ActivationType.HigherEnergyCollisionalDissociation
+ || supplemental.ActivationType == ActivationType.CollisionInducedDissociation)
+ {
+ activationType = 5; // EThcD
+ }
+ }
+ catch { /* no supplemental — keep primary encoding */ }
+ }
+ }
+
+ private void SetReactionFields(
+ IRawDataPlus rawFile, int scanNumber, int msLevel, IReaction reaction,
+ double? monoisotopicMz, double? isolationWidthTrailer, int? masterScan,
+ out double precursorMz, out float isolationWidth,
+ out float isolationLowerOffset, out float isolationUpperOffset,
+ out float collisionEnergy, out float precursorIntensity)
+ {
+ precursorMz = CalculateSelectedIonMz(reaction, monoisotopicMz, isolationWidthTrailer);
+ collisionEnergy = (float)reaction.CollisionEnergy;
+
+ double iw = isolationWidthTrailer ?? reaction.IsolationWidth;
+ isolationWidth = (float)iw;
+ isolationLowerOffset = (float)(-iw / 2.0);
+ isolationUpperOffset = (float)( iw / 2.0);
+
+ precursorIntensity = 0f;
+ if (masterScan.HasValue && masterScan.Value > 0 && precursorMz > 0)
+ {
+ try
+ {
+ precursorIntensity = (float)CalculatePrecursorPeakIntensity(
+ rawFile, masterScan.Value, reaction.PrecursorMass, isolationWidthTrailer,
+ ParseInput.NoPeakPicking.Contains(msLevel - 1));
+ }
+ catch { /* graceful degradation: leave at 0 */ }
+ }
+ }
+
+ ///
+ /// Bulk-emit the SoA peak arrays. The mz array is a zero-copy reinterpret of the
+ /// existing double[]; the intensity array requires an f64→f32 narrowing pass
+ /// over a pooled float[]. The narrowing loop is auto-vectorized by RyuJIT
+ /// (AVX2/AVX-512/NEON).
+ ///
+ private static void WritePeakArrays(BinaryWriter bw, double[] masses, double[] intensities, int nPeaks)
+ {
+ if (nPeaks == 0) return;
+
+ // mz: zero-copy reinterpret of double[] as bytes
+ ReadOnlySpan mzSpan = MemoryMarshal.AsBytes(masses.AsSpan(0, nPeaks));
+ bw.Write(mzSpan);
+
+ // intensity: f64 → f32 narrow into pooled buffer, emit as bytes
+ float[] intBuf = ArrayPool.Shared.Rent(nPeaks);
+ try
+ {
+ for (int i = 0; i < nPeaks; i++) intBuf[i] = (float)intensities[i];
+ ReadOnlySpan intSpan = MemoryMarshal.AsBytes(intBuf.AsSpan(0, nPeaks));
+ bw.Write(intSpan);
+ }
+ finally
+ {
+ ArrayPool.Shared.Return(intBuf);
+ }
+ }
+
+ ///
+ /// Build the optional metadata block into the reusable scratch buffer.
+ /// Returns the number of valid bytes; callers should slice _metaScratch.GetBuffer()
+ /// to that length.
+ /// Format: u32 n_pairs followed by (u16 klen, kbytes, u16 vlen, vbytes) entries.
+ ///
+ private int BuildMetadataBlock(ScanTrailer trailer)
+ {
+ if (trailer.Length == 0) return 0;
+
+ _metaScratch.SetLength(0);
+ using (var bw = new BinaryWriter(_metaScratch, Encoding.UTF8, leaveOpen: true))
+ {
+ bw.Write((uint)trailer.Length);
+ var labels = trailer.Labels;
+ var values = trailer.Values;
+ for (int i = 0; i < labels.Length; i++)
+ {
+ WriteLengthPrefixed(bw, labels[i] ?? "");
+ WriteLengthPrefixed(bw, values[i] ?? "");
+ }
+ bw.Flush();
+ }
+ return (int)_metaScratch.Length;
+ }
+
+ private static void WriteLengthPrefixed(BinaryWriter bw, string s)
+ {
+ byte[] bytes = Encoding.UTF8.GetBytes(s);
+ int len = Math.Min(bytes.Length, MaxKeyOrValueLen);
+ bw.Write((ushort)len);
+ if (len > 0) bw.Write(bytes, 0, len);
+ }
+
+ ///
+ /// Pack the 128-byte fixed scalar header into at offset 0.
+ /// All offsets are documented in BINARY_SOA_FORMAT.md and must stay in sync.
+ ///
+ private static void FillRecordHeader(byte[] dest,
+ int recordSize, int scanId, int msLevel, PolarityType polarity,
+ byte scanDataType, byte activationType, int nPeaks,
+ double retentionTimeSeconds,
+ double precursorMz, double precursorMzMonoisotopic, double basePeakMz,
+ float isolationLowerOffset, float isolationUpperOffset, float isolationWidth,
+ float precursorIntensity, float basePeakIntensity, float totalIonCurrent,
+ float ionInjectionTimeMs, float collisionEnergy, float faimsCv,
+ float elapsedScanTimeMs, float lowMass, float highMass,
+ int precursorCharge, int masterScanNumber,
+ int filterStringLen, uint arraysOffset,
+ uint metadataOffset, uint metadataLength)
+ {
+ var span = dest.AsSpan(0, RecordFixedHeaderSize);
+
+ // Block 1 — identity & shape (16 bytes)
+ BinaryPrimitives_WriteU32(span, 0, (uint)recordSize);
+ BinaryPrimitives_WriteU32(span, 4, (uint)scanId);
+ span[8] = (byte)msLevel;
+ span[9] = EncodePolarity(polarity);
+ span[10] = scanDataType;
+ span[11] = activationType;
+ BinaryPrimitives_WriteU32(span, 12, (uint)nPeaks);
+
+ // Block 2 — doubles (32 bytes)
+ BinaryPrimitives_WriteF64(span, 16, retentionTimeSeconds);
+ BinaryPrimitives_WriteF64(span, 24, precursorMz);
+ BinaryPrimitives_WriteF64(span, 32, precursorMzMonoisotopic);
+ BinaryPrimitives_WriteF64(span, 40, basePeakMz);
+
+ // Block 3 — floats (48 bytes)
+ BinaryPrimitives_WriteF32(span, 48, isolationLowerOffset);
+ BinaryPrimitives_WriteF32(span, 52, isolationUpperOffset);
+ BinaryPrimitives_WriteF32(span, 56, isolationWidth);
+ BinaryPrimitives_WriteF32(span, 60, precursorIntensity);
+ BinaryPrimitives_WriteF32(span, 64, basePeakIntensity);
+ BinaryPrimitives_WriteF32(span, 68, totalIonCurrent);
+ BinaryPrimitives_WriteF32(span, 72, ionInjectionTimeMs);
+ BinaryPrimitives_WriteF32(span, 76, collisionEnergy);
+ BinaryPrimitives_WriteF32(span, 80, faimsCv);
+ BinaryPrimitives_WriteF32(span, 84, elapsedScanTimeMs);
+ BinaryPrimitives_WriteF32(span, 88, lowMass);
+ BinaryPrimitives_WriteF32(span, 92, highMass);
+
+ // Block 4 — ints & flags (16 bytes)
+ BinaryPrimitives_WriteI32(span, 96, precursorCharge);
+ BinaryPrimitives_WriteI32(span, 100, masterScanNumber);
+ BinaryPrimitives_WriteU32(span, 104, 0x1u); // peak_flags: bit 0 = sorted_by_mz
+ BinaryPrimitives_WriteU32(span, 108, 0u); // reserved1
+
+ // Block 5 — variable-section pointers (16 bytes)
+ BinaryPrimitives_WriteU16(span, 112, (ushort)filterStringLen);
+ BinaryPrimitives_WriteU16(span, 114, 0); // reserved2
+ BinaryPrimitives_WriteU32(span, 116, arraysOffset);
+ BinaryPrimitives_WriteU32(span, 120, metadataOffset);
+ BinaryPrimitives_WriteU32(span, 124, metadataLength);
+ }
+
+ // ─── Encoding helpers ────────────────────────────────────────────────
+
+ /// Map RawFileReader's polarity enum to the on-wire byte encoding.
+ public static byte EncodePolarity(PolarityType p) => p switch
+ {
+ PolarityType.Negative => 0,
+ PolarityType.Positive => 1,
+ _ => 255,
+ };
+
+ ///
+ /// Map a single reaction's to the on-wire byte encoding.
+ /// EThcD (5) is set by the caller when the supplemental-activation pattern is detected;
+ /// this helper returns the primary type only.
+ ///
+ public static byte EncodeActivationType(ActivationType t) => t switch
+ {
+ ActivationType.CollisionInducedDissociation => 1,
+ ActivationType.HigherEnergyCollisionalDissociation => 2,
+ ActivationType.ElectronTransferDissociation => 3,
+ ActivationType.ElectronCaptureDissociation => 4,
+ // 5 reserved for caller-detected EThcD
+ ActivationType.UltraVioletPhotoDissociation => 6,
+ ActivationType.NegativeElectronTransferDissociation => 7,
+ ActivationType.MultiPhotonDissociation => 8,
+ ActivationType.PQD => 9,
+ ActivationType.ProtonTransferReaction => 10,
+ ActivationType.NegativeProtonTransferReaction => 11,
+ _ => 255,
+ };
+
+ // ─── Pad helpers ─────────────────────────────────────────────────────
+
+ /// Number of zero pad bytes to align to .
+ public static int ComputePadLen(int currentLen, int alignment)
+ {
+ int remainder = currentLen % alignment;
+ return remainder == 0 ? 0 : alignment - remainder;
+ }
+
+ private static void WritePad(BinaryWriter bw, int n)
+ {
+ if (n > 0) bw.Write(ZeroPad, 0, n);
+ }
+
+ private static float ToFloatOrNaN(double? v) => v.HasValue ? (float)v.Value : float.NaN;
+
+ // ─── Inline little-endian writers ────────────────────────────────────
+ // We avoid the BCL System.Buffers.Binary.BinaryPrimitives namespace import to keep
+ // the writer's dependencies minimal and self-documenting. These match its semantics
+ // exactly (little-endian, no allocation, JIT-inlinable).
+
+ private static void BinaryPrimitives_WriteU16(Span dest, int off, ushort v)
+ {
+ dest[off] = (byte)(v & 0xFF);
+ dest[off + 1] = (byte)((v >> 8) & 0xFF);
+ }
+ private static void BinaryPrimitives_WriteU32(Span dest, int off, uint v)
+ {
+ dest[off] = (byte)(v & 0xFF);
+ dest[off + 1] = (byte)((v >> 8) & 0xFF);
+ dest[off + 2] = (byte)((v >> 16) & 0xFF);
+ dest[off + 3] = (byte)((v >> 24) & 0xFF);
+ }
+ private static void BinaryPrimitives_WriteI32(Span dest, int off, int v)
+ => BinaryPrimitives_WriteU32(dest, off, unchecked((uint)v));
+ private static void BinaryPrimitives_WriteF32(Span dest, int off, float v)
+ {
+ uint bits = BitConverter.SingleToUInt32Bits(v);
+ BinaryPrimitives_WriteU32(dest, off, bits);
+ }
+ private static void BinaryPrimitives_WriteF64(Span dest, int off, double v)
+ {
+ ulong bits = BitConverter.DoubleToUInt64Bits(v);
+ dest[off] = (byte)(bits & 0xFF);
+ dest[off + 1] = (byte)((bits >> 8) & 0xFF);
+ dest[off + 2] = (byte)((bits >> 16) & 0xFF);
+ dest[off + 3] = (byte)((bits >> 24) & 0xFF);
+ dest[off + 4] = (byte)((bits >> 32) & 0xFF);
+ dest[off + 5] = (byte)((bits >> 40) & 0xFF);
+ dest[off + 6] = (byte)((bits >> 48) & 0xFF);
+ dest[off + 7] = (byte)((bits >> 56) & 0xFF);
+ }
+ }
+}
diff --git a/Writer/SpectrumWriter.cs b/Writer/SpectrumWriter.cs
index 2176077..4dc8fd1 100644
--- a/Writer/SpectrumWriter.cs
+++ b/Writer/SpectrumWriter.cs
@@ -85,9 +85,21 @@ protected void ConfigureWriter(string extension)
}
var fileName = NormalizeFileName(ParseInput.OutputFile, extension, ParseInput.Gzip);
- if (ParseInput.OutputFormat == OutputFormat.Parquet)
+ if (ParseInput.OutputFormat == OutputFormat.Parquet
+ || ParseInput.OutputFormat == OutputFormat.BinarySoa)
{
- Writer = new StreamWriter(File.Create(fileName));
+ // Binary outputs: raw FileStream, no encoding wrapper.
+ // Writers will reach Writer.BaseStream and write raw bytes.
+ if (ParseInput.Gzip && ParseInput.OutputFormat == OutputFormat.BinarySoa)
+ {
+ var fileStream = File.Create(fileName);
+ var compress = new GZipStream(fileStream, CompressionMode.Compress);
+ Writer = new StreamWriter(compress);
+ }
+ else
+ {
+ Writer = new StreamWriter(File.Create(fileName));
+ }
}
else if (!ParseInput.Gzip || ParseInput.OutputFormat == OutputFormat.IndexMzML)
{