@@ -359,3 +359,249 @@ void SIMDConverter::convertFloatToInt16Batch (const float* const* inputs,
359359 convertFloatToInt16 (inputs[ch], outputs[ch], scaleFactors[ch], numSamples);
360360 }
361361}
362+
363+ // ============================================================================
364+ // Interleaving implementation
365+ // ============================================================================
366+
367+ SIMDConverter::TileConfig SIMDConverter::getRecommendedTileConfig (int numChannels)
368+ {
369+ // Target: Keep working set in L1 cache (~32KB)
370+ // Working set = tileSamples * tileChannels * 2 bytes (int16)
371+ // Plus output buffer: tileSamples * numChannels * 2 bytes
372+
373+ // Tuned based on benchmark results (TileSizeTuning_Interleaving test)
374+ // Larger tile channels (128) with larger tile samples perform better
375+ // due to better SIMD vector utilization and cache line efficiency
376+
377+ TileConfig config;
378+
379+ if (numChannels >= 1024 )
380+ {
381+ // Very high channel count: use larger tiles for better throughput
382+ // Benchmark showed 256x64 is ~10% faster than 128x32 at 1536 channels
383+ config.tileSamples = 256 ;
384+ config.tileChannels = 128 ;
385+ }
386+ else if (numChannels >= 384 )
387+ {
388+ // Neuropixels range: 256x128 provides best performance
389+ // ~2% better than 256x64 at 384-768 channels
390+ config.tileSamples = 256 ;
391+ config.tileChannels = 128 ;
392+ }
393+ else if (numChannels >= 64 )
394+ {
395+ // Medium channel count: larger sample tiles
396+ config.tileSamples = 512 ;
397+ config.tileChannels = 64 ;
398+ }
399+ else
400+ {
401+ // Low channel count: process all channels together
402+ config.tileSamples = 1024 ;
403+ config.tileChannels = numChannels;
404+ }
405+
406+ return config;
407+ }
408+
// Scalar interleaving (fallback and reference implementation).
//
// Walks the data in (tileSamples x tileChannels) blocks so both the
// per-channel input rows and the interleaved output stay cache-resident.
// Within a tile, samples run on the outer loop so output writes are
// strictly sequential.
static void interleaveScalar (const int16_t* const* channelData,
                              int16_t* output,
                              int numChannels,
                              int numSamples,
                              int tileSamples,
                              int tileChannels)
{
    for (int s0 = 0; s0 < numSamples; s0 += tileSamples)
    {
        const int sLimit = std::min (s0 + tileSamples, numSamples);

        for (int c0 = 0; c0 < numChannels; c0 += tileChannels)
        {
            const int cLimit = std::min (c0 + tileChannels, numChannels);

            for (int s = s0; s < sLimit; ++s)
            {
                // Destination for this sample's slice of channels.
                int16_t* dst = output + s * numChannels + c0;

                for (int ch = c0; ch < cLimit; ++ch)
                    *dst++ = channelData[ch][s];
            }
        }
    }
}
439+
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
// ARM NEON optimized interleaving.
//
// Same tiling scheme as interleaveScalar. Within a tile row, 8 channels are
// gathered per iteration (NEON has no gather instruction, so lanes are loaded
// individually) and written with a single contiguous vst1q_s16 store.
// Fix: removed the unused local `tileWidth` (dead code / -Wunused warning).
static void interleaveNEON (const int16_t* const* channelData,
                            int16_t* output,
                            int numChannels,
                            int numSamples,
                            int tileSamples,
                            int tileChannels)
{
    // Process in tiles for cache efficiency
    for (int sampleTile = 0; sampleTile < numSamples; sampleTile += tileSamples)
    {
        const int sampleEnd = std::min (sampleTile + tileSamples, numSamples);

        for (int channelTile = 0; channelTile < numChannels; channelTile += tileChannels)
        {
            const int channelEnd = std::min (channelTile + tileChannels, numChannels);

            // Process this tile: samples outer so output writes are sequential.
            for (int s = sampleTile; s < sampleEnd; s++)
            {
                int16_t* outPtr = output + s * numChannels + channelTile;
                int ch = channelTile;

                // Gather 8 channels at a time; beneficial when the tile is
                // at least 8 channels wide.
                while (ch + 8 <= channelEnd)
                {
                    int16x8_t vals = {
                        channelData[ch + 0][s],
                        channelData[ch + 1][s],
                        channelData[ch + 2][s],
                        channelData[ch + 3][s],
                        channelData[ch + 4][s],
                        channelData[ch + 5][s],
                        channelData[ch + 6][s],
                        channelData[ch + 7][s]
                    };

                    // Store 8 values contiguously to the interleaved output.
                    vst1q_s16 (outPtr, vals);

                    outPtr += 8;
                    ch += 8;
                }

                // Scalar tail for the remaining (< 8) channels.
                while (ch < channelEnd)
                {
                    *outPtr++ = channelData[ch][s];
                    ch++;
                }
            }
        }
    }
}
#endif
501+
#if defined(__SSE2__) || defined(_M_X64) || defined(_M_IX86)
// x86 SSE2 optimized interleaving.
//
// Same tiling scheme as interleaveScalar. SSE2 has no gather instruction, so
// 8 int16 values are collected with scalar loads via _mm_set_epi16 (arguments
// are highest-lane first) and written with one unaligned 128-bit store.
// Fix: removed the unused local `tileWidth` (dead code / -Wunused warning).
static void interleaveSSE2 (const int16_t* const* channelData,
                            int16_t* output,
                            int numChannels,
                            int numSamples,
                            int tileSamples,
                            int tileChannels)
{
    // Process in tiles for cache efficiency
    for (int sampleTile = 0; sampleTile < numSamples; sampleTile += tileSamples)
    {
        const int sampleEnd = std::min (sampleTile + tileSamples, numSamples);

        for (int channelTile = 0; channelTile < numChannels; channelTile += tileChannels)
        {
            const int channelEnd = std::min (channelTile + tileChannels, numChannels);

            // Process this tile: samples outer so output writes are sequential.
            for (int s = sampleTile; s < sampleEnd; s++)
            {
                int16_t* outPtr = output + s * numChannels + channelTile;
                int ch = channelTile;

                // Gather 8 channels at a time into one vector register.
                while (ch + 8 <= channelEnd)
                {
                    // _mm_set_epi16 takes lanes high-to-low, hence the
                    // reversed argument order.
                    __m128i vals = _mm_set_epi16 (
                        channelData[ch + 7][s],
                        channelData[ch + 6][s],
                        channelData[ch + 5][s],
                        channelData[ch + 4][s],
                        channelData[ch + 3][s],
                        channelData[ch + 2][s],
                        channelData[ch + 1][s],
                        channelData[ch + 0][s]
                    );

                    // Unaligned store of 8 values to the interleaved output.
                    _mm_storeu_si128 (reinterpret_cast<__m128i*> (outPtr), vals);

                    outPtr += 8;
                    ch += 8;
                }

                // Scalar tail for the remaining (< 8) channels.
                while (ch < channelEnd)
                {
                    *outPtr++ = channelData[ch][s];
                    ch++;
                }
            }
        }
    }
}
#endif
561+
562+ void SIMDConverter::interleaveInt16 (const int16_t * const * channelData,
563+ int16_t * output,
564+ int numChannels,
565+ int numSamples,
566+ int tileSamples,
567+ int tileChannels)
568+ {
569+ if (numSamples <= 0 || numChannels <= 0 )
570+ return ;
571+
572+ // Use recommended tile sizes if not specified
573+ if (tileSamples <= 0 || tileChannels <= 0 )
574+ {
575+ TileConfig config = getRecommendedTileConfig (numChannels);
576+ if (tileSamples <= 0 )
577+ tileSamples = config.tileSamples ;
578+ if (tileChannels <= 0 )
579+ tileChannels = config.tileChannels ;
580+ }
581+
582+ // Clamp tile sizes to actual dimensions
583+ tileSamples = std::min (tileSamples, numSamples);
584+ tileChannels = std::min (tileChannels, numChannels);
585+
586+ SIMDType simd = getAvailableSIMD ();
587+
588+ switch (simd)
589+ {
590+ #if defined(__ARM_NEON) || defined(__ARM_NEON__)
591+ case SIMDType::NEON:
592+ interleaveNEON (channelData, output, numChannels, numSamples, tileSamples, tileChannels);
593+ return ;
594+ #endif
595+
596+ #if defined(__SSE2__) || defined(_M_X64) || defined(_M_IX86)
597+ case SIMDType::SSE4_1:
598+ case SIMDType::SSE2:
599+ interleaveSSE2 (channelData, output, numChannels, numSamples, tileSamples, tileChannels);
600+ return ;
601+ #endif
602+
603+ default :
604+ interleaveScalar (channelData, output, numChannels, numSamples, tileSamples, tileChannels);
605+ return ;
606+ }
607+ }
0 commit comments