Skip to content

Commit 9b6af24

Browse files
committed
Use SIMD for RecordNode float-to-int conversion
1 parent f0f3f78 commit 9b6af24

4 files changed

Lines changed: 1079 additions & 0 deletions

File tree

Source/Processors/RecordNode/BinaryFormat/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ add_sources(open-ephys
99
NpyFile.h
1010
SequentialBlockFile.cpp
1111
SequentialBlockFile.h
12+
SIMDConverter.cpp
13+
SIMDConverter.h
1214
)
1315

1416
#add nested directories
Lines changed: 361 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,361 @@
1+
/*
2+
------------------------------------------------------------------
3+
4+
This file is part of the Open Ephys GUI
5+
Copyright (C) 2024 Open Ephys
6+
7+
------------------------------------------------------------------
8+
9+
This program is free software: you can redistribute it and/or modify
10+
it under the terms of the GNU General Public License as published by
11+
the Free Software Foundation, either version 3 of the License, or
12+
(at your option) any later version.
13+
14+
This program is distributed in the hope that it will be useful,
15+
but WITHOUT ANY WARRANTY; without even the implied warranty of
16+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17+
GNU General Public License for more details.
18+
19+
You should have received a copy of the GNU General Public License
20+
along with this program. If not, see <http://www.gnu.org/licenses/>.
21+
22+
*/
23+
24+
#include "SIMDConverter.h"
25+
26+
#include <algorithm>
27+
#include <cmath>
28+
29+
// Platform-specific includes
30+
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
31+
#include <arm_neon.h>
32+
#endif
33+
34+
#if defined(__SSE2__) || defined(_M_X64) || defined(_M_IX86)
35+
#include <emmintrin.h> // SSE2
36+
#endif
37+
38+
#if defined(__SSE4_1__)
39+
#include <smmintrin.h> // SSE4.1
40+
#endif
41+
42+
// For CPUID on x86
43+
#if defined(_MSC_VER)
44+
#include <intrin.h>
45+
#elif defined(__GNUC__) || defined(__clang__)
46+
#if defined(__x86_64__) || defined(__i386__)
47+
#include <cpuid.h>
48+
#endif
49+
#endif
50+
51+
// Static member initialization
52+
SIMDConverter::SIMDType SIMDConverter::s_detectedSIMD = SIMDType::None;
53+
bool SIMDConverter::s_simdDetected = false;
54+
55+
SIMDConverter::SIMDType SIMDConverter::getAvailableSIMD()
56+
{
57+
if (s_simdDetected)
58+
return s_detectedSIMD;
59+
60+
s_simdDetected = true;
61+
62+
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
63+
// ARM NEON is always available when compiled with NEON support
64+
s_detectedSIMD = SIMDType::NEON;
65+
return s_detectedSIMD;
66+
#endif
67+
68+
#if defined(__x86_64__) || defined(__i386__) || defined(_M_X64) || defined(_M_IX86)
69+
// Runtime detection for x86 using CPUID
70+
int cpuInfo[4] = { 0 };
71+
72+
#if defined(_MSC_VER)
73+
__cpuid(cpuInfo, 1);
74+
#elif defined(__GNUC__) || defined(__clang__)
75+
__cpuid(1, cpuInfo[0], cpuInfo[1], cpuInfo[2], cpuInfo[3]);
76+
#endif
77+
78+
// Check SSE4.1 (bit 19 of ECX)
79+
bool hasSSE4_1 = (cpuInfo[2] & (1 << 19)) != 0;
80+
// Check SSE2 (bit 26 of EDX)
81+
bool hasSSE2 = (cpuInfo[3] & (1 << 26)) != 0;
82+
83+
#if defined(__SSE4_1__)
84+
if (hasSSE4_1)
85+
{
86+
s_detectedSIMD = SIMDType::SSE4_1;
87+
return s_detectedSIMD;
88+
}
89+
#endif
90+
91+
#if defined(__SSE2__) || defined(_M_X64) || defined(_M_IX86)
92+
if (hasSSE2)
93+
{
94+
s_detectedSIMD = SIMDType::SSE2;
95+
return s_detectedSIMD;
96+
}
97+
#endif
98+
#endif
99+
100+
s_detectedSIMD = SIMDType::None;
101+
return s_detectedSIMD;
102+
}
103+
104+
std::string SIMDConverter::getSIMDTypeString()
105+
{
106+
switch (getAvailableSIMD())
107+
{
108+
case SIMDType::NEON: return "ARM NEON";
109+
case SIMDType::SSE4_1: return "x86 SSE4.1";
110+
case SIMDType::SSE2: return "x86 SSE2";
111+
case SIMDType::AVX2: return "x86 AVX2";
112+
case SIMDType::None: return "Scalar (no SIMD)";
113+
default: return "Unknown";
114+
}
115+
}
116+
117+
// ============================================================================
118+
// Scalar implementation (fallback)
119+
// ============================================================================
120+
121+
void SIMDConverter::convertScalar (const float* input, int16_t* output, float scale, int numSamples)
122+
{
123+
for (int i = 0; i < numSamples; ++i)
124+
{
125+
// Scale, round, and clamp in one operation
126+
float scaled = input[i] * scale;
127+
int32_t rounded = static_cast<int32_t> (std::round (scaled));
128+
// Clamp to int16 range
129+
rounded = std::max (-32768, std::min (32767, rounded));
130+
output[i] = static_cast<int16_t> (rounded);
131+
}
132+
}
133+
134+
// ============================================================================
135+
// ARM NEON implementation
136+
// ============================================================================
137+
138+
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
139+
void SIMDConverter::convertNEON (const float* input, int16_t* output, float scale, int numSamples)
140+
{
141+
const float32x4_t vscale = vdupq_n_f32 (scale);
142+
143+
int i = 0;
144+
145+
// Process 8 samples at a time (2 x float32x4 -> 1 x int16x8)
146+
const int simdWidth = 8;
147+
const int numFullIterations = numSamples / simdWidth;
148+
149+
for (int iter = 0; iter < numFullIterations; ++iter)
150+
{
151+
// Load 8 floats (2 x 4)
152+
float32x4_t f0 = vld1q_f32 (input + i);
153+
float32x4_t f1 = vld1q_f32 (input + i + 4);
154+
155+
// Multiply by scale factor
156+
f0 = vmulq_f32 (f0, vscale);
157+
f1 = vmulq_f32 (f1, vscale);
158+
159+
// Convert to int32 with rounding (vcvtnq_s32_f32 rounds to nearest)
160+
int32x4_t i0 = vcvtnq_s32_f32 (f0);
161+
int32x4_t i1 = vcvtnq_s32_f32 (f1);
162+
163+
// Narrow to int16 with saturation (combines two int32x4 into one int16x8)
164+
int16x4_t lo = vqmovn_s32 (i0);
165+
int16x4_t hi = vqmovn_s32 (i1);
166+
int16x8_t result = vcombine_s16 (lo, hi);
167+
168+
// Store 8 int16 values
169+
vst1q_s16 (output + i, result);
170+
171+
i += simdWidth;
172+
}
173+
174+
// Handle remaining samples with scalar code
175+
for (; i < numSamples; ++i)
176+
{
177+
float scaled = input[i] * scale;
178+
int32_t rounded = static_cast<int32_t> (std::round (scaled));
179+
rounded = std::max (-32768, std::min (32767, rounded));
180+
output[i] = static_cast<int16_t> (rounded);
181+
}
182+
}
183+
#endif
184+
185+
// ============================================================================
186+
// x86 SSE2 implementation
187+
// ============================================================================
188+
189+
#if defined(__SSE2__) || defined(_M_X64) || defined(_M_IX86)
190+
void SIMDConverter::convertSSE2 (const float* input, int16_t* output, float scale, int numSamples)
191+
{
192+
const __m128 vscale = _mm_set1_ps (scale);
193+
194+
int i = 0;
195+
196+
// Process 8 samples at a time (2 x 4 floats -> 8 int16s)
197+
const int simdWidth = 8;
198+
const int simdIterations = numSamples / simdWidth;
199+
200+
for (int iter = 0; iter < simdIterations; ++iter)
201+
{
202+
// Load 8 floats (2 x 4)
203+
__m128 f0 = _mm_loadu_ps (input + i);
204+
__m128 f1 = _mm_loadu_ps (input + i + 4);
205+
206+
// Multiply by scale factor
207+
f0 = _mm_mul_ps (f0, vscale);
208+
f1 = _mm_mul_ps (f1, vscale);
209+
210+
// Convert to int32 (truncates toward zero, but we'll round first)
211+
// SSE2 doesn't have direct round-to-nearest, so we use a trick:
212+
// add/subtract 0.5 and truncate
213+
const __m128 half = _mm_set1_ps (0.5f);
214+
const __m128 negHalf = _mm_set1_ps (-0.5f);
215+
const __m128 zero = _mm_setzero_ps();
216+
217+
// Select +0.5 for positive, -0.5 for negative
218+
__m128 sign0 = _mm_cmpge_ps (f0, zero);
219+
__m128 sign1 = _mm_cmpge_ps (f1, zero);
220+
221+
__m128 offset0 = _mm_or_ps (_mm_and_ps (sign0, half), _mm_andnot_ps (sign0, negHalf));
222+
__m128 offset1 = _mm_or_ps (_mm_and_ps (sign1, half), _mm_andnot_ps (sign1, negHalf));
223+
224+
f0 = _mm_add_ps (f0, offset0);
225+
f1 = _mm_add_ps (f1, offset1);
226+
227+
// Convert to int32 (truncates)
228+
__m128i i0 = _mm_cvttps_epi32 (f0);
229+
__m128i i1 = _mm_cvttps_epi32 (f1);
230+
231+
// Pack two int32x4 into one int16x8 with signed saturation
232+
__m128i result = _mm_packs_epi32 (i0, i1);
233+
234+
// Store 8 int16 values
235+
_mm_storeu_si128 (reinterpret_cast<__m128i*> (output + i), result);
236+
237+
i += simdWidth;
238+
}
239+
240+
// Handle remaining samples with scalar code
241+
for (; i < numSamples; ++i)
242+
{
243+
float scaled = input[i] * scale;
244+
int32_t rounded = static_cast<int32_t> (std::round (scaled));
245+
rounded = std::max (-32768, std::min (32767, rounded));
246+
output[i] = static_cast<int16_t> (rounded);
247+
}
248+
}
249+
#endif
250+
251+
// ============================================================================
252+
// x86 SSE4.1 implementation (uses roundps for proper rounding)
253+
// ============================================================================
254+
255+
#if defined(__SSE4_1__)
256+
void SIMDConverter::convertSSE4_1 (const float* input, int16_t* output, float scale, int numSamples)
257+
{
258+
const __m128 vscale = _mm_set1_ps (scale);
259+
260+
int i = 0;
261+
262+
// Process 8 samples at a time
263+
const int simdWidth = 8;
264+
const int simdIterations = numSamples / simdWidth;
265+
266+
for (int iter = 0; iter < simdIterations; ++iter)
267+
{
268+
// Load 8 floats (2 x 4)
269+
__m128 f0 = _mm_loadu_ps (input + i);
270+
__m128 f1 = _mm_loadu_ps (input + i + 4);
271+
272+
// Multiply by scale factor
273+
f0 = _mm_mul_ps (f0, vscale);
274+
f1 = _mm_mul_ps (f1, vscale);
275+
276+
// Round to nearest integer (SSE4.1)
277+
// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC = 0x00 | 0x08 = 0x08
278+
f0 = _mm_round_ps (f0, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
279+
f1 = _mm_round_ps (f1, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
280+
281+
// Convert to int32
282+
__m128i i0 = _mm_cvtps_epi32 (f0);
283+
__m128i i1 = _mm_cvtps_epi32 (f1);
284+
285+
// Pack two int32x4 into one int16x8 with signed saturation
286+
__m128i result = _mm_packs_epi32 (i0, i1);
287+
288+
// Store 8 int16 values
289+
_mm_storeu_si128 (reinterpret_cast<__m128i*> (output + i), result);
290+
291+
i += simdWidth;
292+
}
293+
294+
// Handle remaining samples with scalar code
295+
for (; i < numSamples; ++i)
296+
{
297+
float scaled = input[i] * scale;
298+
int32_t rounded = static_cast<int32_t> (std::round (scaled));
299+
rounded = std::max (-32768, std::min (32767, rounded));
300+
output[i] = static_cast<int16_t> (rounded);
301+
}
302+
}
303+
#endif
304+
305+
// ============================================================================
306+
// Main dispatch function
307+
// ============================================================================
308+
309+
void SIMDConverter::convertFloatToInt16 (const float* input,
310+
int16_t* output,
311+
float scaleFactor,
312+
int numSamples)
313+
{
314+
if (numSamples <= 0)
315+
return;
316+
317+
SIMDType simd = getAvailableSIMD();
318+
319+
switch (simd)
320+
{
321+
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
322+
case SIMDType::NEON:
323+
convertNEON (input, output, scaleFactor, numSamples);
324+
return;
325+
#endif
326+
327+
#if defined(__SSE4_1__)
328+
case SIMDType::SSE4_1:
329+
convertSSE4_1 (input, output, scaleFactor, numSamples);
330+
return;
331+
#endif
332+
333+
#if defined(__SSE2__) || defined(_M_X64) || defined(_M_IX86)
334+
case SIMDType::SSE2:
335+
convertSSE2 (input, output, scaleFactor, numSamples);
336+
return;
337+
#endif
338+
339+
default:
340+
convertScalar (input, output, scaleFactor, numSamples);
341+
return;
342+
}
343+
}
344+
345+
// ============================================================================
346+
// Batch conversion (multiple channels)
347+
// ============================================================================
348+
349+
void SIMDConverter::convertFloatToInt16Batch (const float* const* inputs,
350+
int16_t* const* outputs,
351+
const float* scaleFactors,
352+
int numChannels,
353+
int numSamples)
354+
{
355+
// For now, simply call single-channel conversion in a loop
356+
// Future optimization: process multiple channels in parallel using wider SIMD
357+
for (int ch = 0; ch < numChannels; ++ch)
358+
{
359+
convertFloatToInt16 (inputs[ch], outputs[ch], scaleFactors[ch], numSamples);
360+
}
361+
}

0 commit comments

Comments
 (0)