Skip to content

Commit a43fa81

Browse files
authored
Merge pull request InsightSoftwareConsortium#6004 from hjmjohnson/fftw-compute-optimized-defaults
PERF: Enable FFTW SIMD codelets with per-CPU introspection at configure time
2 parents 6dcda76 + 418a2c2 commit a43fa81

1 file changed

Lines changed: 81 additions & 19 deletions

File tree

CMake/itkExternal_FFTW.cmake

Lines changed: 81 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,30 @@
11
#
22
# Encapsulates building FFTW as an External Project.
33
#
4-
# NOTE: internal building of fftw is for convenience,
5-
# and the version of fftw built here does not
6-
# use modern hardware optimizations.
4+
# SIMD codelet selection
5+
# ----------------------
6+
# FFTW SIMD codelets are generated C routines built on SIMD intrinsics, baked into the
7+
# library at compile time. Passing -march=native to the ITK build does
8+
# NOT activate them; they must be requested explicitly via FFTW's own
9+
# CMake options (ENABLE_NEON, ENABLE_SSE, ENABLE_SSE2, ENABLE_AVX, ENABLE_AVX2).
710
#
8-
# The build configuration chosen to be
9-
# generalizable to as many hardware platforms.
10-
# Being backward compatible for decades
11-
# old hardware is the goal of this internal
12-
# representation.
11+
# This file detects appropriate defaults at cmake configure time:
1312
#
14-
# This is primarily used to support testing
15-
# and should not be used for production
16-
# builds where performance is a concern.
13+
# Native builds (CMAKE_CROSSCOMPILING is false):
14+
# - ARM64 (aarch64/arm64): NEON=ON (mandatory in ARMv8); x86 SIMD off.
15+
# - x86/x86_64: each of SSE, SSE2, AVX, AVX2 is probed individually
16+
# via __builtin_cpu_supports() / CheckCSourceRuns so that the
17+
# detected flags match the actual build-host CPU. A pre-AVX CPU (older
18+
# than Sandy Bridge) gets SSE+SSE2 only; Haswell or later gets all four.
19+
# - Other architectures, or x86 with a non-GNU/Clang compiler (e.g. MSVC): all SIMD off (conservative fallback).
20+
#
21+
# Cross-compiled builds (CMAKE_CROSSCOMPILING is true):
22+
# - ARM64: NEON=ON (mandatory); x86 SIMD off.
23+
# - x86_64: SSE+SSE2 only (baseline; AVX/AVX2 not assumed for target).
24+
# - Other: all SIMD off.
25+
#
26+
# Every flag is an individually overridable cache option, e.g.:
27+
# cmake -DFFTW_ENABLE_AVX2=OFF ...
1728
#
1829
# These instructions follow the guidance provided for modern cmake usage as described:
1930
# https://github.com/dev-cafe/cmake-cookbook/blob/master/chapter-08/recipe-03/c-example/external/upstream/fftw3/CMakeLists.txt
@@ -53,6 +64,55 @@ if(NOT ITK_USE_SYSTEM_FFTW)
5364
set(_fftw_url "https://data.kitware.com/api/v1/file/hashsum/sha512/${_fftw_url_hash}/download")
5465

5566
set(FFTW_STAGED_INSTALL_PREFIX "${ITK_BINARY_DIR}/fftw")
67+
68+
# Detect SIMD defaults (see file header for full policy description).
69+
# CheckCSourceRuns results are cached after the first cmake configure run.
70+
include(CheckCSourceRuns)
71+
72+
set(_fftw_default_neon OFF)
73+
set(_fftw_default_sse OFF)
74+
set(_fftw_default_sse2 OFF)
75+
set(_fftw_default_avx OFF)
76+
set(_fftw_default_avx2 OFF)
77+
78+
if(NOT CMAKE_CROSSCOMPILING)
79+
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64|ARM64")
80+
# NEON is mandatory in ARMv8/AArch64 — every arm64 CPU has it.
81+
set(_fftw_default_neon ON)
82+
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64|i686")
83+
# Probe each x86 SIMD level individually via CPUID so the defaults
84+
# are accurate for the actual build-host CPU (e.g. a pre-AVX Westmere
85+
# or a pre-AVX2 Ivy Bridge gets only the levels its hardware supports).
86+
# __builtin_cpu_supports is a GCC/Clang intrinsic; skip on MSVC.
87+
if(CMAKE_C_COMPILER_ID MATCHES "GNU|Clang|AppleClang")
88+
foreach(_fftw_simd IN ITEMS sse sse2 avx avx2)
89+
check_c_source_runs(
90+
"int main(void){return __builtin_cpu_supports(\"${_fftw_simd}\")?0:1;}"
91+
_fftw_cpu_has_${_fftw_simd}
92+
)
93+
if(_fftw_cpu_has_${_fftw_simd})
94+
set(_fftw_default_${_fftw_simd} ON)
95+
endif()
96+
endforeach()
97+
endif()
98+
endif()
99+
else()
100+
# Cross-compiling: conservative architecture-level fallback.
101+
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64|ARM64")
102+
set(_fftw_default_neon ON)
103+
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64")
104+
# SSE/SSE2 are baseline on all 64-bit x86 CPUs; AVX/AVX2 not assumed.
105+
set(_fftw_default_sse ON)
106+
set(_fftw_default_sse2 ON)
107+
endif()
108+
endif()
109+
110+
option(FFTW_ENABLE_NEON "Enable FFTW NEON SIMD codelets (ARM64)" ${_fftw_default_neon})
111+
option(FFTW_ENABLE_SSE "Enable FFTW SSE SIMD codelets (x86)" ${_fftw_default_sse})
112+
option(FFTW_ENABLE_SSE2 "Enable FFTW SSE2 SIMD codelets (x86)" ${_fftw_default_sse2})
113+
option(FFTW_ENABLE_AVX "Enable FFTW AVX SIMD codelets (x86)" ${_fftw_default_avx})
114+
option(FFTW_ENABLE_AVX2 "Enable FFTW AVX2 SIMD codelets (x86)" ${_fftw_default_avx2})
115+
56116
set(PROJ_FFTWD_DEPENDS "")
57117
if(ITK_USE_FFTWF)
58118
itk_download_attempt_check(FFTW)
@@ -76,14 +136,15 @@ if(NOT ITK_USE_SYSTEM_FFTW)
76136
-DCMAKE_INSTALL_LIBDIR:STRING=${CMAKE_INSTALL_LIBDIR}
77137
-DCMAKE_INSTALL_BINDIR:STRING=${CMAKE_INSTALL_BINDIR}
78138
-DDISABLE_FORTRAN:BOOL=ON
79-
-DENABLE_AVX:BOOL=OFF
80-
-DENABLE_AVX2:BOOL=OFF
139+
-DENABLE_AVX:BOOL=${FFTW_ENABLE_AVX}
140+
-DENABLE_AVX2:BOOL=${FFTW_ENABLE_AVX2}
81141
-DENABLE_FLOAT:BOOL=ON
82142
-DENABLE_LONG_DOUBLE:BOOL=OFF
143+
-DENABLE_NEON:BOOL=${FFTW_ENABLE_NEON}
83144
-DENABLE_OPENMP:BOOL=OFF
84145
-DENABLE_QUAD_PRECISION:BOOL=OFF
85-
-DENABLE_SSE:BOOL=OFF
86-
-DENABLE_SSE2:BOOL=OFF
146+
-DENABLE_SSE:BOOL=${FFTW_ENABLE_SSE}
147+
-DENABLE_SSE2:BOOL=${FFTW_ENABLE_SSE2}
87148
-DENABLE_THREADS:BOOL=ON
88149
-DCMAKE_APPLE_SILICON_PROCESSOR:STRING=${CMAKE_APPLE_SILICON_PROCESSOR}
89150
-DCMAKE_C_COMPILER_LAUNCHER:PATH=${CMAKE_C_COMPILER_LAUNCHER}
@@ -132,14 +193,15 @@ if(NOT ITK_USE_SYSTEM_FFTW)
132193
-DCMAKE_INSTALL_LIBDIR:STRING=${CMAKE_INSTALL_LIBDIR}
133194
-DCMAKE_INSTALL_BINDIR:STRING=${CMAKE_INSTALL_BINDIR}
134195
-DDISABLE_FORTRAN:BOOL=ON
135-
-DENABLE_AVX:BOOL=OFF
136-
-DENABLE_AVX2:BOOL=OFF
196+
-DENABLE_AVX:BOOL=${FFTW_ENABLE_AVX}
197+
-DENABLE_AVX2:BOOL=${FFTW_ENABLE_AVX2}
137198
-DENABLE_FLOAT:BOOL=OFF
138199
-DENABLE_LONG_DOUBLE:BOOL=OFF
200+
-DENABLE_NEON:BOOL=${FFTW_ENABLE_NEON}
139201
-DENABLE_OPENMP:BOOL=OFF
140202
-DENABLE_QUAD_PRECISION:BOOL=OFF
141-
-DENABLE_SSE:BOOL=OFF
142-
-DENABLE_SSE2:BOOL=OFF
203+
-DENABLE_SSE:BOOL=${FFTW_ENABLE_SSE}
204+
-DENABLE_SSE2:BOOL=${FFTW_ENABLE_SSE2}
143205
-DENABLE_THREADS:BOOL=ON
144206
-DCMAKE_APPLE_SILICON_PROCESSOR:STRING=${CMAKE_APPLE_SILICON_PROCESSOR}
145207
-DCMAKE_C_COMPILER_LAUNCHER:PATH=${CMAKE_C_COMPILER_LAUNCHER}

0 commit comments

Comments
 (0)