From 0f5dab178f30162b0a3fbe9f7ee06e0b8bb611ed Mon Sep 17 00:00:00 2001
From: Andrew Adams <andrew.b.adams@gmail.com>
Date: Tue, 12 May 2026 12:07:57 -0700
Subject: [PATCH] x86: use generic x86-64-v{N} mcpu and explicit mattrs

The x86 backend previously assumed AVX-512 microarchitectures form a
strict feature hierarchy (Zen4 implies Cannonlake implies Skylake-AVX512
implies AVX512). It then drove `-mcpu` off the highest level in that
chain, picking vendor-specific CPU names like `znver4` or `cannonlake`.

That assumption isn't quite right: vendor-specific mcpu choices enable
vendor-specific features for us (e.g. `-mcpu=znver4` turns on `sse4a`,
which Cannonlake doesn't have). A Halide flag higher in the hierarchy
was therefore not actually a strict superset of one below it -- code
built for `AVX512_Zen4` could use SSE4a and fail to run on a Cannonlake
CPU even though Halide treats Zen4 as "Cannonlake and above".

Switch `mcpu_target()` to pick only generic `x86-64-v{N}` levels, and
have `mattrs()` explicitly enable every feature Halide tracks on top of
that baseline. The Halide feature flags keep their existing meaning,
but a level like `AVX512_Zen4` now produces code that runs on the
intersection of Zen4 and Cannonlake rather than the union, preserving
the "each level is a superset of the level below" invariant the rest of
the backend depends on. Per-CPU tuning via `mcpu_tune()` is untouched,
so users who want znver4/znver5-specific scheduling still get it.

Verified with `make correctness_simd_op_check_x86` (passes) and by
running `correctness_vector_reductions` under Intel SDE's
`-chip_check_die` for pnr/snb/hsw/skx/cnl/icx/spr with matching Halide
targets (all pass).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/CodeGen_X86.cpp | 137 +++++++++++++++++++++++++++-----------------
 1 file changed, 86 insertions(+), 51 deletions(-)

diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp
index 4e42ecdc0703..4ff685a7a4e2 100644
--- a/src/CodeGen_X86.cpp
+++ b/src/CodeGen_X86.cpp
@@ -1718,31 +1718,34 @@ void CodeGen_X86::visit(const Store *op) {
 }
 
 string CodeGen_X86::mcpu_target() const {
-    // Perform an ad-hoc guess for the -mcpu given features.
-    // WARNING: this is used to drive -mcpu, *NOT* -mtune!
-    //          The CPU choice here *WILL* affect -mattrs!
-    if (target.has_feature(Target::AVX512_SapphireRapids)) {
-        return "sapphirerapids";
-    } else if (target.has_feature(Target::AVX512_Zen5)) {
-        return "znver5";
-    } else if (target.has_feature(Target::AVX512_Zen4)) {
-        return "znver4";
-    } else if (target.has_feature(Target::AVX512_Cannonlake)) {
-        return "cannonlake";
-    } else if (target.has_feature(Target::AVX512_Skylake)) {
-        return "skylake-avx512";
-    } else if (target.has_feature(Target::AVX512_KNL)) {
-        return "knl";
+    // Use generic x86-64 microarchitecture levels rather than specific
+    // CPU names. mattrs() turns on the actual features Halide knows
+    // about on top of this baseline. This avoids LLVM enabling
+    // vendor-specific extras for us (e.g. sse4a on AMD CPUs like
+    // "znver4"), which would otherwise mean that a Halide feature
+    // higher in the hierarchy (Zen4) failed to be a strict superset
+    // of one below it (Cannonlake). With this change, a Halide flag
+    // like AVX512_Zen4 produces code that runs on the intersection
+    // of Zen4 and Cannonlake hardware -- not the union.
+    if (target.has_feature(Target::AVX512_Skylake)) {
+        // x86-64-v4: SSE4.2, POPCNT, AVX, AVX2, BMI1/2, F16C, FMA,
+        // LZCNT, MOVBE, plus AVX512F/BW/CD/DQ/VL.
+        return "x86-64-v4";
     } else if (target.has_feature(Target::AVX2)) {
-        return "haswell";
+        // x86-64-v3: SSE4.2, POPCNT, AVX, AVX2, BMI1/2, F16C, FMA,
+        // LZCNT, MOVBE. Also covers AVX512 / AVX512_KNL, since both
+        // imply AVX2 (via complete_x86_target), but neither requires
+        // BW/DQ/VL which would come for free with v4.
+        return "x86-64-v3";
     } else if (target.has_feature(Target::AVX)) {
-        return "corei7-avx";
-    } else if (target.has_feature(Target::SSE41)) {
-        // We want SSE4.1 but not SSE4.2, hence "penryn" rather than "corei7"
-        return "penryn";
+        // x86-64-v2: SSE3, SSSE3, SSE4.1, SSE4.2, POPCNT. AVX is
+        // added on top via mattrs.
+        return "x86-64-v2";
     } else {
-        // Default should not include SSSE3, hence "k8" rather than "core2"
-        return "k8";
+        // Plain x86-64 baseline (SSE, SSE2). The Halide SSE41 level
+        // (which deliberately stops short of SSE4.2) adds its
+        // features individually in mattrs.
+        return "x86-64";
     }
 }
 
@@ -1814,10 +1817,35 @@ string CodeGen_X86::mcpu_tune() const {
     return mcpu_target();  // Detect "best" CPU from the enabled ISA's.
 }
 
-// FIXME: we should lower everything here, instead of relying
-//        that -mcpu= (`mcpu_target()`) implies/sets features for us.
+// All features Halide knows about are turned on individually here,
+// rather than relying on `-mcpu=` (set by `mcpu_target()`) to pull
+// them in. `mcpu_target()` picks only generic x86-64-v{N} levels, so
+// we don't accidentally inherit vendor-specific features (e.g. sse4a)
+// that aren't present across all CPUs at a given Halide feature
+// level.
 string CodeGen_X86::mattrs() const {
     std::vector<std::string_view> attrs;
+
+    // Features added on top of the plain x86-64 baseline. These are
+    // already covered by x86-64-v2 (and above), but Halide's SSE41
+    // feature level intentionally stops short of SSE4.2, so SSE41
+    // alone uses the plain x86-64 mcpu and adds these by hand.
+    if (target.has_feature(Target::SSE41) && !target.has_feature(Target::AVX)) {
+        attrs.emplace_back("+sse3");
+        attrs.emplace_back("+ssse3");
+        attrs.emplace_back("+sse4.1");
+    }
+
+    // Features added on top of the x86-64-v2 baseline. AVX without
+    // AVX2 only needs +avx (the rest of v3 -- BMI/F16C/FMA -- aren't
+    // guaranteed at the Halide AVX level).
+    if (target.has_feature(Target::AVX) && !target.has_feature(Target::AVX2)) {
+        attrs.emplace_back("+avx");
+    }
+
+    // FMA / F16C are guaranteed when AVX2 is set (and thus already
+    // included in x86-64-v3), but are also exposed as independent
+    // Halide flags, so emit them explicitly when requested.
     if (target.has_feature(Target::FMA)) {
         attrs.emplace_back("+fma");
     }
@@ -1827,39 +1855,46 @@ string CodeGen_X86::mattrs() const {
     if (target.has_feature(Target::F16C)) {
         attrs.emplace_back("+f16c");
     }
+
+    // AVX512 features. Any AVX512 variant implies AVX2 (via
+    // complete_x86_target), so the mcpu baseline is at least
+    // x86-64-v3. Skylake-and-above selects x86-64-v4, which already
+    // includes F/CD/BW/DQ/VL, but we still add those features
+    // explicitly so the bare AVX512 / AVX512_KNL paths (which use
+    // x86-64-v3) also get them.
     if (target.has_feature(Target::AVX512) ||
         target.has_feature(Target::AVX512_KNL) ||
         target.has_feature(Target::AVX512_Skylake) ||
         target.has_feature(Target::AVX512_Cannonlake)) {
         attrs.emplace_back("+avx512f");
         attrs.emplace_back("+avx512cd");
-        if (target.has_feature(Target::AVX512_KNL)) {
-            attrs.emplace_back("+avx512pf");
-            attrs.emplace_back("+avx512er");
-        }
-        if (target.has_feature(Target::AVX512_Skylake) ||
-            target.has_feature(Target::AVX512_Cannonlake)) {
-            attrs.emplace_back("+avx512vl");
-            attrs.emplace_back("+avx512bw");
-            attrs.emplace_back("+avx512dq");
-        }
-        if (target.has_feature(Target::AVX512_Cannonlake)) {
-            attrs.emplace_back("+avx512ifma");
-            attrs.emplace_back("+avx512vbmi");
-        }
-        if (target.has_feature(Target::AVX512_Zen4)) {
-            attrs.emplace_back("+avx512bf16");
-            attrs.emplace_back("+avx512vnni");
-            attrs.emplace_back("+avx512bitalg");
-            attrs.emplace_back("+avx512vbmi2");
-        }
-        if (target.has_feature(Target::AVXVNNI)) {
-            attrs.emplace_back("+avxvnni");
-        }
-        if (target.has_feature(Target::AVX512_SapphireRapids)) {
-            attrs.emplace_back("+amx-int8");
-            attrs.emplace_back("+amx-bf16");
-        }
+    }
+    if (target.has_feature(Target::AVX512_KNL)) {
+        attrs.emplace_back("+avx512pf");
+        attrs.emplace_back("+avx512er");
+    }
+    if (target.has_feature(Target::AVX512_Skylake) ||
+        target.has_feature(Target::AVX512_Cannonlake)) {
+        attrs.emplace_back("+avx512vl");
+        attrs.emplace_back("+avx512bw");
+        attrs.emplace_back("+avx512dq");
+    }
+    if (target.has_feature(Target::AVX512_Cannonlake)) {
+        attrs.emplace_back("+avx512ifma");
+        attrs.emplace_back("+avx512vbmi");
+    }
+    if (target.has_feature(Target::AVX512_Zen4)) {
+        attrs.emplace_back("+avx512bf16");
+        attrs.emplace_back("+avx512vnni");
+        attrs.emplace_back("+avx512bitalg");
+        attrs.emplace_back("+avx512vbmi2");
+    }
+    if (target.has_feature(Target::AVXVNNI)) {
+        attrs.emplace_back("+avxvnni");
+    }
+    if (target.has_feature(Target::AVX512_SapphireRapids)) {
+        attrs.emplace_back("+amx-int8");
+        attrs.emplace_back("+amx-bf16");
     }
     if (gather_might_be_slow(target)) {
         attrs.emplace_back("+prefer-no-gather");