Implement per-stream DataQueues

jsiegle · jsiegle · commit b543af915529 · 2025-12-29T14:17:11.000-08:00
diff --git a/Source/Processors/RecordNode/BinaryFormat/BinaryRecording.cpp b/Source/Processors/RecordNode/BinaryFormat/BinaryRecording.cpp
@@ -685,9 +685,8 @@ void BinaryRecording::writeContinuousData (int writeChannel,
             wroteFirstSampleNumber[streamId] = true;
         }
 
-        for (int i = 0; i < size; i++)
-            /* Generate int sample number */
-            m_sampleNumberBuffer[i] = baseSampleNumber + i;
+        /* Generate sequential sample numbers using SIMD-optimized fill */
+        SIMDConverter::fillSequentialInt64 (m_sampleNumberBuffer, baseSampleNumber, size);
 
         /* Write int timestamps to disc */
         m_dataTimestampFiles[fileIndex]->writeData (m_sampleNumberBuffer, size * sizeof (int64));
@@ -764,9 +763,10 @@ void BinaryRecording::writeSpike (int electrodeIndex, const Spike* spike)
         m_intBuffer.malloc (totalSamples);
     }
 
-    double multFactor = 1 / (float (0x7fff) * channel->getChannelBitVolts (0));
-    FloatVectorOperations::copyWithMultiply (m_scaledBuffer.getData(), spike->getDataPointer(), multFactor, totalSamples);
-    AudioDataConverters::convertFloatToInt16LE (m_scaledBuffer.getData(), m_intBuffer.getData(), totalSamples);
+    /* Convert spike waveforms from float to int16 using SIMD-optimized conversion.
+       Scale factor converts microvolts to int16 units: output = input / bitVolts */
+    float scaleFactor = 1.0f / channel->getChannelBitVolts (0);
+    SIMDConverter::convertFloatToInt16 (spike->getDataPointer(), m_intBuffer.getData(), scaleFactor, totalSamples);
     rec->data->writeData (m_intBuffer.getData(), totalSamples * sizeof (int16));
 
     int64 sampleIdx = spike->getSampleNumber();
@@ -848,6 +848,8 @@ void BinaryRecording::writeContinuousDataBatch (const int* writeChannels,
     // Get starting sample position (all channels in a stream have same position)
     uint64 startPos = m_samplesWritten[writeChannels[0]];
 
+    //LOGD("BinaryRecording::writeContinuousDataBatch: Writing ", numSamples, " samples for ", numChannels, " channels at position ", startPos, " to file index ", fileIndex);
+
     // Try batch interleaving if we have all channels for this file
     // The file's channel count is determined by the stream's channel count
     // If we have a partial batch, fall back to per-channel writes
@@ -894,8 +896,8 @@ void BinaryRecording::writeContinuousDataBatch (const int* writeChannels,
                 wroteFirstSampleNumber[streamId] = true;
             }
 
-            for (int s = 0; s < numSamples; s++)
-                m_sampleNumberBuffer[s] = baseSampleNumber + s;
+            /* Generate sequential sample numbers using SIMD-optimized fill */
+            SIMDConverter::fillSequentialInt64 (m_sampleNumberBuffer, baseSampleNumber, numSamples);
 
             m_dataTimestampFiles[fileIndex]->writeData (m_sampleNumberBuffer, numSamples * sizeof (int64));
             m_dataTimestampFiles[fileIndex]->increaseRecordCount (numSamples);
@@ -921,8 +923,7 @@ void BinaryRecording::writeTimestampSyncText (uint64 streamId, int64 sampleNumbe
         jassert (fsn == sampleNumber);
 
     m_syncTextFile->writeText (syncString + "\r\n", false, false, nullptr);
-
-    m_syncTextFile->flush();
+    // Note: flush removed - file will be flushed on close or by OS buffering
 }
 
 RecordEngineManager* BinaryRecording::getEngineManager()
diff --git a/Source/Processors/RecordNode/BinaryFormat/NpyFile.cpp b/Source/Processors/RecordNode/BinaryFormat/NpyFile.cpp
@@ -76,7 +76,8 @@ bool NpyFile::openFile (String path)
         LOGD ("Re-creating file: ", path);
     }
 
-    m_file = file.createOutputStream();
+    // Use 64KB buffer to reduce system calls
+    m_file = file.createOutputStream (65536);
 
     if (! m_file)
         return false;
diff --git a/Source/Processors/RecordNode/BinaryFormat/NpyFile.h b/Source/Processors/RecordNode/BinaryFormat/NpyFile.h
@@ -124,7 +124,7 @@ class PLUGIN_API NpyFile
     unsigned int m_dim2;
 
     /** flush file buffer to disk and update the .npy header every this many records: */
-    const int recordBufferSize { 1024 };
+    const int recordBufferSize { 32768 };
 };
 
 #endif
diff --git a/Source/Processors/RecordNode/BinaryFormat/SIMDConverter.cpp b/Source/Processors/RecordNode/BinaryFormat/SIMDConverter.cpp
@@ -605,3 +605,71 @@ void SIMDConverter::interleaveInt16 (const int16_t* const* channelData,
             return;
     }
 }
+
+// ============================================================================
+// Sequential int64 fill: output[i] = baseValue + i (used for sample numbers)
+// ============================================================================
+
+void SIMDConverter::fillSequentialInt64 (int64_t* output, int64_t baseValue, int numSamples)
+{
+    if (numSamples <= 0) // zero or negative count: leave the buffer untouched
+        return;
+
+#if defined(__ARM_NEON) || defined(__ARM_NEON__)
+    // ARM NEON path: store 2 int64 values (one 128-bit vector) per iteration
+    const int simdWidth = 2;
+    int i = 0;
+    
+    // Running vector holding the next two values to store, starting at [base, base+1]
+    int64x2_t vbase = { baseValue, baseValue + 1 };
+    const int64x2_t vIncrement = { 2, 2 }; // both lanes advance by simdWidth after each store
+    
+    const int numFullIterations = numSamples / simdWidth;
+    
+    for (int iter = 0; iter < numFullIterations; ++iter)
+    {
+        vst1q_s64 (reinterpret_cast<int64_t*> (output + i), vbase);
+        vbase = vaddq_s64 (vbase, vIncrement);
+        i += simdWidth;
+    }
+    
+    // Scalar tail: at most one value is left over (when numSamples is odd)
+    int64_t currentValue = baseValue + i; // i == number of values already written
+    for (; i < numSamples; ++i)
+    {
+        output[i] = currentValue++;
+    }
+
+#elif defined(__SSE2__) || defined(_M_X64) || defined(_M_IX86)
+    // x86 SSE2 path: store 2 int64 values (one 128-bit vector) per iteration
+    const int simdWidth = 2;
+    int i = 0;
+    
+    // _mm_set_epi64x takes (high, low), so lane 0 = base and lane 1 = base+1
+    __m128i vbase = _mm_set_epi64x (baseValue + 1, baseValue);
+    const __m128i vIncrement = _mm_set_epi64x (2, 2); // both lanes advance by simdWidth after each store
+    
+    const int numFullIterations = numSamples / simdWidth;
+    
+    for (int iter = 0; iter < numFullIterations; ++iter)
+    {
+        _mm_storeu_si128 (reinterpret_cast<__m128i*> (output + i), vbase); // unaligned store
+        vbase = _mm_add_epi64 (vbase, vIncrement);
+        i += simdWidth;
+    }
+    
+    // Scalar tail: at most one value is left over (when numSamples is odd)
+    int64_t currentValue = baseValue + i; // i == number of values already written
+    for (; i < numSamples; ++i)
+    {
+        output[i] = currentValue++;
+    }
+
+#else
+    // Scalar fallback when neither NEON nor SSE2 is detected - a plain loop the compiler may auto-vectorize
+    for (int i = 0; i < numSamples; ++i)
+    {
+        output[i] = baseValue + i;
+    }
+#endif
+}
diff --git a/Source/Processors/RecordNode/BinaryFormat/SIMDConverter.h b/Source/Processors/RecordNode/BinaryFormat/SIMDConverter.h
@@ -136,6 +136,18 @@ class SIMDConverter
      */
     static TileConfig getRecommendedTileConfig (int numChannels);
 
+    /**
+     * Fills a buffer with sequential int64 values starting from a base value.
+     * This is optimized for generating sample number arrays.
+     * 
+     * output[i] = baseValue + i  for i in [0, numSamples)
+     * 
+     * @param output     Pointer to output int64 buffer; must have room for numSamples values
+     * @param baseValue  Starting value (output[0] = baseValue)
+     * @param numSamples Number of samples to fill; values <= 0 leave the buffer untouched
+     */
+    static void fillSequentialInt64 (int64_t* output, int64_t baseValue, int numSamples);
+
 private:
     // Implementation functions for each SIMD type
     static void convertScalar (const float* input, int16_t* output, float scale, int numSamples);
diff --git a/Source/Processors/RecordNode/BinaryFormat/SequentialBlockFile.cpp b/Source/Processors/RecordNode/BinaryFormat/SequentialBlockFile.cpp
@@ -138,10 +138,10 @@ bool SequentialBlockFile::writeChannelBatch (uint64 startPos, int16* const* chan
         return false;
     }
 
+    // Batch writing requires all channels - return false to signal caller should use per-channel writes
     if (numChannels != m_nChannels)
     {
-        printf ("[RN]SequentialBlockFile::writeChannelBatch: channel count mismatch (%d vs %d)\n", 
-                numChannels, m_nChannels);
+        printf ("[RN]SequentialBlockFile::writeChannelBatch channel count mismatch: expected %d, got %d\n", m_nChannels, numChannels);
         return false;
     }
 
diff --git a/Source/Processors/RecordNode/BinaryFormat/SequentialBlockFile.h b/Source/Processors/RecordNode/BinaryFormat/SequentialBlockFile.h
@@ -92,7 +92,7 @@ class PLUGIN_API SequentialBlockFile
     void allocateBlocks (uint64 startIndex, int numSamples);
 
     /** Compile-time params */
-    const int streamBufferSize { 0 };
+    const int streamBufferSize { 65536 };  // 64KB buffer to reduce system calls
     const int blockArrayInitSize { 128 };
 };
 #endif // !SEQUENTIALBLOCKFILE_H
diff --git a/Source/Processors/RecordNode/DataQueue.cpp b/Source/Processors/RecordNode/DataQueue.cpp
@@ -242,12 +242,48 @@ bool DataQueue::startRead (std::vector<CircularBufferIndexes>& dataBufferIdxs,
 
     m_readInProgress = true;
 
+    // First pass: find the minimum samples available across ALL channels
+    // This ensures we don't read from some channels while others are still being written
+    int minSamplesAvailable = INT_MAX;
     for (int chan = 0; chan < m_numChans; ++chan)
     {
         int readyToRead = m_fifos.getUnchecked (chan)->getNumReady();
+        if (readyToRead < minSamplesAvailable)
+            minSamplesAvailable = readyToRead;
+    }
+
+    // Apply nMax limit to the minimum
+    int samplesToRead = ((minSamplesAvailable > nMax) && (nMax > 0)) ? nMax : minSamplesAvailable;
 
-        int samplesToRead = ((readyToRead > nMax) && (nMax > 0)) ? nMax : readyToRead;
+    // If at least one channel has no samples ready, nothing can be read consistently
+    if (samplesToRead == 0)
+    {
+        // Initialize all indexes to zero
+        for (int chan = 0; chan < m_numChans; ++chan)
+        {
+            CircularBufferIndexes& idx = dataBufferIdxs[chan];
+            idx.index1 = 0;
+            idx.size1 = 0;
+            idx.index2 = 0;
+            idx.size2 = 0;
+            m_readSamples[chan] = 0;
+        }
+        for (int chan = 0; chan < m_numFTSChans; ++chan)
+        {
+            CircularBufferIndexes& idx = timestampBufferIdxs[chan];
+            idx.index1 = 0;
+            idx.size1 = 0;
+            idx.index2 = 0;
+            idx.size2 = 0;
+            m_readFTSSamples[chan] = 0;
+        }
+        m_readInProgress = false;
+        return false;
+    }
 
+    // Second pass: read the same number of samples from all channels
+    for (int chan = 0; chan < m_numChans; ++chan)
+    {
         CircularBufferIndexes& idx = dataBufferIdxs[chan];
 
         m_fifos.getUnchecked (chan)->prepareToRead (samplesToRead, idx.index1, idx.size1, idx.index2, idx.size2);
@@ -274,13 +310,21 @@ bool DataQueue::startRead (std::vector<CircularBufferIndexes>& dataBufferIdxs,
         m_lastReadSampleNumbers[chan] = sampleNum + idx.size1 + idx.size2;
     }
 
+    // Also find minimum for timestamp streams and read consistently
+    int minFTSSamples = INT_MAX;
     for (int chan = 0; chan < m_numFTSChans; ++chan)
     {
-        CircularBufferIndexes& idx = timestampBufferIdxs[chan];
         int readyToRead = m_FTSFifos.getUnchecked (chan)->getNumReady();
-        int samplesToRead = ((readyToRead > nMax) && (nMax > 0)) ? nMax : readyToRead;
+        if (readyToRead < minFTSSamples)
+            minFTSSamples = readyToRead;
+    }
+    int ftsToRead = ((minFTSSamples > nMax) && (nMax > 0)) ? nMax : minFTSSamples;
+
+    for (int chan = 0; chan < m_numFTSChans; ++chan)
+    {
+        CircularBufferIndexes& idx = timestampBufferIdxs[chan];
 
-        m_FTSFifos.getUnchecked (chan)->prepareToRead (samplesToRead, idx.index1, idx.size1, idx.index2, idx.size2);
+        m_FTSFifos.getUnchecked (chan)->prepareToRead (ftsToRead, idx.index1, idx.size1, idx.index2, idx.size2);
         m_readFTSSamples[chan] = idx.size1 + idx.size2;
     }
 
diff --git a/Source/Processors/RecordNode/RecordNode.cpp b/Source/Processors/RecordNode/RecordNode.cpp
diff --git a/Source/Processors/RecordNode/RecordNode.h b/Source/Processors/RecordNode/RecordNode.h
diff --git a/Source/Processors/RecordNode/RecordThread.cpp b/Source/Processors/RecordNode/RecordThread.cpp
diff --git a/Source/Processors/RecordNode/RecordThread.h b/Source/Processors/RecordNode/RecordThread.h

Original file line number	Diff line number	Diff line change
`@@ -76,7 +76,8 @@ bool NpyFile::openFile (String path)`
`76`	`76`	`LOGD ("Re-creating file: ", path);`
`77`	`77`	`}`
`78`	`78`
`79`		`- m_file = file.createOutputStream();`
	`79`	`+ // Use 64KB buffer to reduce system calls`
	`80`	`+ m_file = file.createOutputStream (65536);`
`80`	`81`
`81`	`82`	`if (! m_file)`
`82`	`83`	`return false;`
Original file line number	Diff line number	Diff line change
`@@ -138,10 +138,10 @@ bool SequentialBlockFile::writeChannelBatch (uint64 startPos, int16* const* chan`
`138`	`138`	`return false;`
`139`	`139`	`}`
`140`	`140`
	`141`	`+ // Batch writing requires all channels - return false to signal caller should use per-channel writes`
`141`	`142`	`if (numChannels != m_nChannels)`
`142`	`143`	`{`
`143`		`- printf ("[RN]SequentialBlockFile::writeChannelBatch: channel count mismatch (%d vs %d)\n",`
`144`		`- numChannels, m_nChannels);`
	`144`	`+ printf ("[RN]SequentialBlockFile::writeChannelBatch channel count mismatch: expected %d, got %d\n", m_nChannels, numChannels);`
`145`	`145`	`return false;`
`146`	`146`	`}`
`147`	`147`