Skip to content

Commit bd7feb8

Browse files
committed
Implement SIMD interleaving when writing a batch of channels
1 parent a1427a5 commit bd7feb8

5 files changed

Lines changed: 566 additions & 42 deletions

File tree

Source/Processors/RecordNode/BinaryFormat/BinaryRecording.cpp

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -848,12 +848,30 @@ void BinaryRecording::writeContinuousDataBatch (const int* writeChannels,
848848
// Get starting sample position (all channels in a stream have same position)
849849
uint64 startPos = m_samplesWritten[writeChannels[0]];
850850

851-
// Write all channels at once using batch interleaving
852-
m_continuousFiles[fileIndex]->writeChannelBatch (
853-
startPos,
854-
m_batchIntBufferPtrs.data(),
855-
numChannels,
856-
numSamples);
851+
// Try batch interleaving if we have all channels for this file
852+
// The file's channel count is determined by the stream's channel count
853+
// If we have a partial batch, fall back to per-channel writes
854+
bool useBatchWrite = m_continuousFiles[fileIndex] != nullptr &&
855+
m_continuousFiles[fileIndex]->writeChannelBatch (
856+
startPos,
857+
m_batchIntBufferPtrs.data(),
858+
numChannels,
859+
numSamples);
860+
861+
if (! useBatchWrite)
862+
{
863+
// Fall back to per-channel writes for partial batches
864+
for (int i = 0; i < numChannels; i++)
865+
{
866+
int writeChannel = writeChannels[i];
867+
int channelIdx = m_channelIndexes[writeChannel];
868+
m_continuousFiles[fileIndex]->writeChannel (
869+
startPos,
870+
channelIdx,
871+
m_batchIntBufferPtrs[i],
872+
numSamples);
873+
}
874+
}
857875

858876
// Update samples written for all channels
859877
for (int i = 0; i < numChannels; i++)

Source/Processors/RecordNode/BinaryFormat/SIMDConverter.cpp

Lines changed: 246 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -359,3 +359,249 @@ void SIMDConverter::convertFloatToInt16Batch (const float* const* inputs,
359359
convertFloatToInt16 (inputs[ch], outputs[ch], scaleFactors[ch], numSamples);
360360
}
361361
}
362+
363+
// ============================================================================
364+
// Interleaving implementation
365+
// ============================================================================
366+
367+
// Returns the cache-blocking tile dimensions to use when interleaving
// `numChannels` channels of int16 data.
//
// The goal is to keep a tile's working set resident in L1 cache
// (roughly tileSamples * tileChannels * sizeof(int16_t), plus the slice
// of the interleaved output buffer the tile touches). The thresholds
// below come from the TileSizeTuning_Interleaving benchmark rather than
// from an analytical model.
SIMDConverter::TileConfig SIMDConverter::getRecommendedTileConfig (int numChannels)
{
    // Low channel counts: a single channel tile covers everything, so use
    // a long sample tile to amortize loop overhead.
    if (numChannels < 64)
        return { 1024, numChannels };

    // Medium channel counts: favor the longer sample dimension.
    if (numChannels < 384)
        return { 512, 64 };

    // Neuropixels-class counts (384+) and very high counts (1024+) both
    // benchmarked best with 256-sample x 128-channel tiles, thanks to
    // better SIMD vector utilization and cache-line efficiency.
    return { 256, 128 };
}
408+
409+
// Scalar interleaving (fallback and reference implementation)
410+
// Reference (non-SIMD) interleaver.
//
// Copies per-channel sample buffers into a single interleaved buffer laid
// out as [ch0_s0, ch1_s0, ..., chN_s0, ch0_s1, ...]. Work is blocked into
// tiles of tileSamples x tileChannels so that the channel pointers being
// read and the output region being written stay cache-resident.
static void interleaveScalar (const int16_t* const* channelData,
                              int16_t* output,
                              int numChannels,
                              int numSamples,
                              int tileSamples,
                              int tileChannels)
{
    for (int s0 = 0; s0 < numSamples; s0 += tileSamples)
    {
        const int sLimit = std::min (s0 + tileSamples, numSamples);

        for (int c0 = 0; c0 < numChannels; c0 += tileChannels)
        {
            const int cLimit = std::min (c0 + tileChannels, numChannels);

            // Samples form the outer loop of the tile so writes to the
            // interleaved output stay sequential within each output row.
            for (int s = s0; s < sLimit; s++)
            {
                int16_t* dst = output + s * numChannels + c0;

                for (int c = c0; c < cLimit; c++)
                    *dst++ = channelData[c][s];
            }
        }
    }
}
439+
440+
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
// ARM NEON optimized interleaving.
//
// Same tiled traversal as interleaveScalar, but within each output row the
// channel loop is unrolled 8-wide: eight scalar loads (one per channel)
// are packed into a single int16x8_t and written with one vst1q_s16. NEON
// has no gather instruction, so the loads remain scalar; the benefit is
// the single wide, contiguous store into the interleaved output.
static void interleaveNEON (const int16_t* const* channelData,
                            int16_t* output,
                            int numChannels,
                            int numSamples,
                            int tileSamples,
                            int tileChannels)
{
    // Process in tiles for cache efficiency
    for (int sampleTile = 0; sampleTile < numSamples; sampleTile += tileSamples)
    {
        int sampleEnd = std::min (sampleTile + tileSamples, numSamples);

        for (int channelTile = 0; channelTile < numChannels; channelTile += tileChannels)
        {
            int channelEnd = std::min (channelTile + tileChannels, numChannels);

            // Process this tile
            for (int s = sampleTile; s < sampleEnd; s++)
            {
                int16_t* outPtr = output + s * numChannels + channelTile;
                int ch = channelTile;

                // Pack 8 channels at a time; only taken when at least 8
                // channels remain in the tile.
                while (ch + 8 <= channelEnd)
                {
                    int16x8_t vals = {
                        channelData[ch + 0][s],
                        channelData[ch + 1][s],
                        channelData[ch + 2][s],
                        channelData[ch + 3][s],
                        channelData[ch + 4][s],
                        channelData[ch + 5][s],
                        channelData[ch + 6][s],
                        channelData[ch + 7][s]
                    };

                    // Store 8 values contiguously to the output row
                    vst1q_s16 (outPtr, vals);

                    outPtr += 8;
                    ch += 8;
                }

                // Scalar tail for tiles narrower than 8 channels
                while (ch < channelEnd)
                {
                    *outPtr++ = channelData[ch][s];
                    ch++;
                }
            }
        }
    }
}
#endif
501+
502+
#if defined(__SSE2__) || defined(_M_X64) || defined(_M_IX86)
// x86 SSE2 optimized interleaving.
//
// Same tiled traversal as interleaveScalar, but within each output row the
// channel loop is unrolled 8-wide: eight scalar loads (one per channel)
// are packed into a single __m128i with _mm_set_epi16 and written with one
// unaligned 128-bit store. SSE2 has no gather instruction, so the loads
// remain scalar; the benefit is the single wide, contiguous store.
static void interleaveSSE2 (const int16_t* const* channelData,
                            int16_t* output,
                            int numChannels,
                            int numSamples,
                            int tileSamples,
                            int tileChannels)
{
    // Process in tiles for cache efficiency
    for (int sampleTile = 0; sampleTile < numSamples; sampleTile += tileSamples)
    {
        int sampleEnd = std::min (sampleTile + tileSamples, numSamples);

        for (int channelTile = 0; channelTile < numChannels; channelTile += tileChannels)
        {
            int channelEnd = std::min (channelTile + tileChannels, numChannels);

            // Process this tile
            for (int s = sampleTile; s < sampleEnd; s++)
            {
                int16_t* outPtr = output + s * numChannels + channelTile;
                int ch = channelTile;

                // Pack 8 channels at a time; only taken when at least 8
                // channels remain in the tile.
                while (ch + 8 <= channelEnd)
                {
                    // _mm_set_epi16 takes arguments highest-word first,
                    // so ch+7 comes first to keep channels in order.
                    __m128i vals = _mm_set_epi16 (
                        channelData[ch + 7][s],
                        channelData[ch + 6][s],
                        channelData[ch + 5][s],
                        channelData[ch + 4][s],
                        channelData[ch + 3][s],
                        channelData[ch + 2][s],
                        channelData[ch + 1][s],
                        channelData[ch + 0][s]
                    );

                    // Store 8 values contiguously (output row may be unaligned)
                    _mm_storeu_si128 (reinterpret_cast<__m128i*> (outPtr), vals);

                    outPtr += 8;
                    ch += 8;
                }

                // Scalar tail for tiles narrower than 8 channels
                while (ch < channelEnd)
                {
                    *outPtr++ = channelData[ch][s];
                    ch++;
                }
            }
        }
    }
}
#endif
561+
562+
void SIMDConverter::interleaveInt16 (const int16_t* const* channelData,
563+
int16_t* output,
564+
int numChannels,
565+
int numSamples,
566+
int tileSamples,
567+
int tileChannels)
568+
{
569+
if (numSamples <= 0 || numChannels <= 0)
570+
return;
571+
572+
// Use recommended tile sizes if not specified
573+
if (tileSamples <= 0 || tileChannels <= 0)
574+
{
575+
TileConfig config = getRecommendedTileConfig (numChannels);
576+
if (tileSamples <= 0)
577+
tileSamples = config.tileSamples;
578+
if (tileChannels <= 0)
579+
tileChannels = config.tileChannels;
580+
}
581+
582+
// Clamp tile sizes to actual dimensions
583+
tileSamples = std::min (tileSamples, numSamples);
584+
tileChannels = std::min (tileChannels, numChannels);
585+
586+
SIMDType simd = getAvailableSIMD();
587+
588+
switch (simd)
589+
{
590+
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
591+
case SIMDType::NEON:
592+
interleaveNEON (channelData, output, numChannels, numSamples, tileSamples, tileChannels);
593+
return;
594+
#endif
595+
596+
#if defined(__SSE2__) || defined(_M_X64) || defined(_M_IX86)
597+
case SIMDType::SSE4_1:
598+
case SIMDType::SSE2:
599+
interleaveSSE2 (channelData, output, numChannels, numSamples, tileSamples, tileChannels);
600+
return;
601+
#endif
602+
603+
default:
604+
interleaveScalar (channelData, output, numChannels, numSamples, tileSamples, tileChannels);
605+
return;
606+
}
607+
}

Source/Processors/RecordNode/BinaryFormat/SIMDConverter.h

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,42 @@ class SIMDConverter
100100
int numChannels,
101101
int numSamples);
102102

103+
/**
104+
* Interleaves data from multiple channel buffers into a single interleaved output buffer.
105+
* This is optimized for the memory access patterns in disk recording.
106+
*
107+
* Output format: [ch0_s0, ch1_s0, ..., chN_s0, ch0_s1, ch1_s1, ..., chN_s1, ...]
108+
*
109+
* @param channelData Array of pointers to int16 data for each channel (contiguous per channel)
110+
* @param output Pointer to output buffer (will contain interleaved data)
111+
* @param numChannels Number of channels
112+
* @param numSamples Number of samples per channel
113+
* @param tileSamples Tile size for sample dimension (for cache blocking, 0 = auto)
114+
* @param tileChannels Tile size for channel dimension (for cache blocking, 0 = auto)
115+
*/
116+
static void interleaveInt16 (const int16_t* const* channelData,
117+
int16_t* output,
118+
int numChannels,
119+
int numSamples,
120+
int tileSamples = 0,
121+
int tileChannels = 0);
122+
123+
/**
124+
* Recommended tile sizes based on cache characteristics.
125+
* These are tuned for typical L1 cache sizes (32-64KB).
126+
*/
127+
struct TileConfig
128+
{
129+
int tileSamples; // Number of samples per tile
130+
int tileChannels; // Number of channels per tile
131+
};
132+
133+
/**
134+
* Returns recommended tile configuration for the given channel count.
135+
* Optimizes for L1 cache efficiency.
136+
*/
137+
static TileConfig getRecommendedTileConfig (int numChannels);
138+
103139
private:
104140
// Implementation functions for each SIMD type
105141
static void convertScalar (const float* input, int16_t* output, float scale, int numSamples);

0 commit comments

Comments
 (0)