Commit df582dd

Author: Mark Saroufim
Fix moe-mxfp4 check_implementation: avoid cloned weight comparison
aiter's fused_moe produces different results when weight tensors are cloned (same values, different memory). The eval harness clones data before passing to the submission, so comparing cloned-weight output against original-weight output always fails. Since fused_moe doesn't mutate inputs, we use a custom check_implementation that compares the submission output against a fresh ref_kernel run on the original (un-cloned) data.
1 parent 55af3b9 commit df582dd
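The "same values, different memory" failure mode described in the commit message can be sketched in NumPy (used here as a stand-in for the torch tensors in the actual harness; `copy()` plays the role of `clone()`):

```python
import numpy as np

# A weight matrix and a deep copy of it: the values compare equal,
# but the copy lives at a different memory address. A fused kernel whose
# tiling or reduction order depends on pointer alignment can then produce
# numerically different (though individually valid) outputs for the two,
# which is why comparing cloned-weight output against original-weight
# output always fails.
w = np.arange(12.0).reshape(3, 4)
w_clone = w.copy()

assert np.array_equal(w, w_clone)            # identical values
assert w.ctypes.data != w_clone.ctypes.data  # distinct memory
```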

1 file changed: problems/amd_202602/moe-mxfp4/reference.py

Lines changed: 13 additions & 2 deletions
@@ -1,4 +1,4 @@
-from utils import make_match_reference
+from utils import make_match_reference, verbose_allclose
 from task import input_t, output_t
 import torch
 import torch.nn.functional as F
@@ -296,4 +296,15 @@ def ref_kernel(data: input_t) -> output_t:
 
 
 
-check_implementation = make_match_reference(ref_kernel, rtol=5e-2, atol=5e-2)
+def check_implementation(data, submission_output):
+    """
+    Custom check that re-runs ref_kernel on the ORIGINAL (un-cloned) data.
+
+    aiter's fused_moe is sensitive to weight tensor memory layout — cloned
+    weight tensors (as produced by the eval harness's _clone_data) yield
+    different results even though the values are identical. Because fused_moe
+    does NOT mutate its inputs, comparing the submission output against a fresh
+    ref_kernel run on the same data object is safe and correct.
+    """
+    expected = ref_kernel(data)
+    return verbose_allclose(submission_output, expected, rtol=5e-2, atol=5e-2)
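For reference, the rtol/atol comparison that `verbose_allclose` performs can be sketched with the standard elementwise closeness criterion, `|actual - expected| <= atol + rtol * |expected|`. This is an assumption about its semantics; the helper name and message format below are illustrative, and NumPy is used so the sketch stands alone:

```python
import numpy as np

def sketch_allclose(actual, expected, rtol=5e-2, atol=5e-2):
    # Elementwise tolerance test: |actual - expected| <= atol + rtol * |expected|.
    # Returns None on success, or a short message describing the first mismatch,
    # mirroring the message-or-None convention a verbose checker might use.
    actual = np.asarray(actual, dtype=np.float64)
    expected = np.asarray(expected, dtype=np.float64)
    if actual.shape != expected.shape:
        return f"shape mismatch: {actual.shape} vs {expected.shape}"
    diff = np.abs(actual - expected)
    bound = atol + rtol * np.abs(expected)
    bad = diff > bound
    if bad.any():
        # Report the first offending element.
        idx = tuple(int(i[0]) for i in np.nonzero(bad))
        return f"mismatch at {idx}: {actual[idx]} vs {expected[idx]}"
    return None
```

With rtol=atol=5e-2, an element within `0.05 + 0.05 * |expected|` of the reference passes, which is the loose tolerance appropriate for low-precision MXFP4 MoE outputs.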
