From 0f5dab178f30162b0a3fbe9f7ee06e0b8bb611ed Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 12 May 2026 12:07:57 -0700 Subject: [PATCH] x86: use generic x86-64-v{N} mcpu and explicit mattrs The x86 backend previously assumed AVX-512 microarchitectures form a strict feature hierarchy (Zen4 implies Cannonlake implies Skylake-AVX512 implies AVX512). It then drove `-mcpu` off the highest level in that chain, picking vendor-specific CPU names like `znver4` or `cannonlake`. That assumption isn't quite right: vendor-specific mcpu choices enable vendor-specific features for us (e.g. `-mcpu=znver4` turns on `sse4a`, which Cannonlake doesn't have). A Halide flag higher in the hierarchy was therefore not actually a strict superset of one below it -- code built for `AVX512_Zen4` could use SSE4a and fail to run on a Cannonlake CPU even though Halide treats Zen4 as "Cannonlake and above". Switch `mcpu_target()` to pick only generic `x86-64-v{N}` levels, and have `mattrs()` explicitly enable every feature Halide tracks on top of that baseline. The Halide feature flags keep their existing meaning, but a level like `AVX512_Zen4` now produces code that runs on the intersection of Zen4 and Cannonlake rather than the union, preserving the "each level is a superset of the level below" invariant the rest of the backend depends on. Per-CPU tuning via `mcpu_tune()` is untouched, so users who want znver4/znver5-specific scheduling still get it. Verified with `make correctness_simd_op_check_x86` (passes) and by running `correctness_vector_reductions` under Intel SDE's `-chip_check_die` for pnr/snb/hsw/skx/cnl/icx/spr with matching Halide targets (all pass). Co-Authored-By: Claude Opus 4.7 (1M context) --- src/CodeGen_X86.cpp | 137 +++++++++++++++++++++++++++----------------- 1 file changed, 86 insertions(+), 51 deletions(-) diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index 4e42ecdc0703..4ff685a7a4e2 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -1718,31 +1718,34 @@ void CodeGen_X86::visit(const Store *op) { } string CodeGen_X86::mcpu_target() const { - // Perform an ad-hoc guess for the -mcpu given features. - // WARNING: this is used to drive -mcpu, *NOT* -mtune! - // The CPU choice here *WILL* affect -mattrs! - if (target.has_feature(Target::AVX512_SapphireRapids)) { - return "sapphirerapids"; - } else if (target.has_feature(Target::AVX512_Zen5)) { - return "znver5"; - } else if (target.has_feature(Target::AVX512_Zen4)) { - return "znver4"; - } else if (target.has_feature(Target::AVX512_Cannonlake)) { - return "cannonlake"; - } else if (target.has_feature(Target::AVX512_Skylake)) { - return "skylake-avx512"; - } else if (target.has_feature(Target::AVX512_KNL)) { - return "knl"; + // Use generic x86-64 microarchitecture levels rather than specific + // CPU names. mattrs() turns on the actual features Halide knows + // about on top of this baseline. This avoids LLVM enabling + // vendor-specific extras for us (e.g. sse4a on AMD CPUs like + // "znver4"), which would otherwise mean that a Halide feature + // higher in the hierarchy (Zen4) failed to be a strict superset + // of one below it (Cannonlake). With this change, a Halide flag + // like AVX512_Zen4 produces code that runs on the intersection + // of Zen4 and Cannonlake hardware -- not the union. + if (target.has_feature(Target::AVX512_Skylake)) { + // x86-64-v4: SSE4.2, POPCNT, AVX, AVX2, BMI1/2, F16C, FMA, + // LZCNT, MOVBE, plus AVX512F/BW/CD/DQ/VL. + return "x86-64-v4"; } else if (target.has_feature(Target::AVX2)) { - return "haswell"; + // x86-64-v3: SSE4.2, POPCNT, AVX, AVX2, BMI1/2, F16C, FMA, + // LZCNT, MOVBE. Also covers AVX512 / AVX512_KNL, since both + // imply AVX2 (via complete_x86_target), but neither requires + // BW/DQ/VL which would come for free with v4. + return "x86-64-v3"; } else if (target.has_feature(Target::AVX)) { - return "corei7-avx"; - } else if (target.has_feature(Target::SSE41)) { - // We want SSE4.1 but not SSE4.2, hence "penryn" rather than "corei7" - return "penryn"; + // x86-64-v2: SSE3, SSSE3, SSE4.1, SSE4.2, POPCNT. AVX is + // added on top via mattrs. + return "x86-64-v2"; } else { - // Default should not include SSSE3, hence "k8" rather than "core2" - return "k8"; + // Plain x86-64 baseline (SSE, SSE2). The Halide SSE41 level + // (which deliberately stops short of SSE4.2) adds its + // features individually in mattrs. + return "x86-64"; } } @@ -1814,10 +1817,35 @@ string CodeGen_X86::mcpu_tune() const { return mcpu_target(); // Detect "best" CPU from the enabled ISA's. } -// FIXME: we should lower everything here, instead of relying -// that -mcpu= (`mcpu_target()`) implies/sets features for us. +// All features Halide knows about are turned on individually here, +// rather than relying on `-mcpu=` (set by `mcpu_target()`) to pull +// them in. `mcpu_target()` picks only generic x86-64-v{N} levels, so +// we don't accidentally inherit vendor-specific features (e.g. sse4a) +// that aren't present across all CPUs at a given Halide feature +// level. string CodeGen_X86::mattrs() const { std::vector attrs; + + // Features added on top of the plain x86-64 baseline. These are + // already covered by x86-64-v2 (and above), but Halide's SSE41 + // feature level intentionally stops short of SSE4.2, so SSE41 + // alone uses the plain x86-64 mcpu and adds these by hand. + if (target.has_feature(Target::SSE41) && !target.has_feature(Target::AVX)) { + attrs.emplace_back("+sse3"); + attrs.emplace_back("+ssse3"); + attrs.emplace_back("+sse4.1"); + } + + // Features added on top of the x86-64-v2 baseline. AVX without + // AVX2 only needs +avx (the rest of v3 -- BMI/F16C/FMA -- aren't + // guaranteed at the Halide AVX level). + if (target.has_feature(Target::AVX) && !target.has_feature(Target::AVX2)) { + attrs.emplace_back("+avx"); + } + + // FMA / F16C are guaranteed when AVX2 is set (and thus already + // included in x86-64-v3), but are also exposed as independent + // Halide flags, so emit them explicitly when requested. if (target.has_feature(Target::FMA)) { attrs.emplace_back("+fma"); } @@ -1827,39 +1855,46 @@ string CodeGen_X86::mattrs() const { if (target.has_feature(Target::F16C)) { attrs.emplace_back("+f16c"); } + + // AVX512 features. Any AVX512 variant implies AVX2 (via + // complete_x86_target), so the mcpu baseline is at least + // x86-64-v3. Skylake-and-above selects x86-64-v4, which already + // includes F/CD/BW/DQ/VL, but we still add those features + // explicitly so the bare AVX512 / AVX512_KNL paths (which use + // x86-64-v3) also get them. if (target.has_feature(Target::AVX512) || target.has_feature(Target::AVX512_KNL) || target.has_feature(Target::AVX512_Skylake) || target.has_feature(Target::AVX512_Cannonlake)) { attrs.emplace_back("+avx512f"); attrs.emplace_back("+avx512cd"); - if (target.has_feature(Target::AVX512_KNL)) { - attrs.emplace_back("+avx512pf"); - attrs.emplace_back("+avx512er"); - } - if (target.has_feature(Target::AVX512_Skylake) || - target.has_feature(Target::AVX512_Cannonlake)) { - attrs.emplace_back("+avx512vl"); - attrs.emplace_back("+avx512bw"); - attrs.emplace_back("+avx512dq"); - } - if (target.has_feature(Target::AVX512_Cannonlake)) { - attrs.emplace_back("+avx512ifma"); - attrs.emplace_back("+avx512vbmi"); - } - if (target.has_feature(Target::AVX512_Zen4)) { - attrs.emplace_back("+avx512bf16"); - attrs.emplace_back("+avx512vnni"); - attrs.emplace_back("+avx512bitalg"); - attrs.emplace_back("+avx512vbmi2"); - } - if (target.has_feature(Target::AVXVNNI)) { - attrs.emplace_back("+avxvnni"); - } - if (target.has_feature(Target::AVX512_SapphireRapids)) { - attrs.emplace_back("+amx-int8"); - attrs.emplace_back("+amx-bf16"); - } + } + if (target.has_feature(Target::AVX512_KNL)) { + attrs.emplace_back("+avx512pf"); + attrs.emplace_back("+avx512er"); + } + if (target.has_feature(Target::AVX512_Skylake) || + target.has_feature(Target::AVX512_Cannonlake)) { + attrs.emplace_back("+avx512vl"); + attrs.emplace_back("+avx512bw"); + attrs.emplace_back("+avx512dq"); + } + if (target.has_feature(Target::AVX512_Cannonlake)) { + attrs.emplace_back("+avx512ifma"); + attrs.emplace_back("+avx512vbmi"); + } + if (target.has_feature(Target::AVX512_Zen4)) { + attrs.emplace_back("+avx512bf16"); + attrs.emplace_back("+avx512vnni"); + attrs.emplace_back("+avx512bitalg"); + attrs.emplace_back("+avx512vbmi2"); + } + if (target.has_feature(Target::AVXVNNI)) { + attrs.emplace_back("+avxvnni"); + } + if (target.has_feature(Target::AVX512_SapphireRapids)) { + attrs.emplace_back("+amx-int8"); + attrs.emplace_back("+amx-bf16"); } if (gather_might_be_slow(target)) { attrs.emplace_back("+prefer-no-gather");