|
1 | 1 | import torch |
| 2 | +import torch.nn.functional as F |
2 | 3 | from task import input_t, output_t |
3 | 4 | from utils import make_match_reference |
4 | 5 |
|
5 | 6 | CHUNK_SIZE = 64 |
6 | 7 |
|
| 8 | +# Use FLA's Triton kernels as reference (same Triton tl.dot as Helion) |
| 9 | +from fla.ops.common.chunk_o import chunk_fwd_o as fla_chunk_fwd_o |
| 10 | +from fla.ops.common.chunk_delta_h import chunk_gated_delta_rule_fwd_h as fla_chunk_fwd_h |
| 11 | +from fla.ops.gated_delta_rule.wy_fast import recompute_w_u_fwd as fla_recompute_w_u_fwd |
| 12 | +from fla.ops.utils import chunk_local_cumsum, solve_tril |
| 13 | +from fla.ops.common.chunk_scaled_dot_kkt import chunk_scaled_dot_kkt_fwd |
| 14 | + |
7 | 15 |
|
def generate_input(B: int, T: int, H: int, K: int, V: int, seed: int) -> input_t:
    """Produce pipeline-derived inputs (q, k, v_new, h, g_cumsum) for the kernel.

    Base tensors (q, k, v, beta, g) are drawn from the seeded global RNG, then
    pushed through FLA's gated-delta-rule forward pipeline so the returned
    v_new / h / g_cumsum are mutually consistent with q and k.
    """
    torch.manual_seed(seed)

    def _randn(*shape):
        # Fresh fp32 CUDA tensor from the (seeded) global generator.
        return torch.randn(*shape, dtype=torch.float32, device="cuda")

    q = _randn(B, T, H, K)
    # Keys are L2-normalized along the feature dimension.
    k = F.normalize(_randn(B, T, H, K), p=2, dim=-1)
    v = _randn(B, T, H, V)
    # beta in (0, 1); g = logsigmoid(...) is <= 0 so exp(g) stays in (0, 1].
    beta = torch.sigmoid(_randn(B, T, H))
    g = F.logsigmoid(_randn(B, T, H))

    # FLA pipeline: per-chunk cumsum of gates -> A = inv(tril(K K^T scaled))
    # -> (w, u) recomputation -> chunk hidden states h and rewritten values v_new.
    g_cumsum = chunk_local_cumsum(g, chunk_size=CHUNK_SIZE)
    A = chunk_scaled_dot_kkt_fwd(k=k, g=g_cumsum, beta=beta, output_dtype=torch.float32)
    A = solve_tril(A=A, output_dtype=k.dtype)
    w, u = fla_recompute_w_u_fwd(k=k, v=v, beta=beta, A=A, g=g_cumsum)
    h, v_new, _ = fla_chunk_fwd_h(k=k, w=w, u=u, g=g_cumsum, output_final_state=False)

    return tuple(t.contiguous() for t in (q, k, v_new, h, g_cumsum))
19 | 31 |
|
20 | 32 |
|
def ref_kernel(data: input_t) -> output_t:
    """Reference output computed with FLA's Triton chunked output kernel.

    Delegates entirely to ``fla_chunk_fwd_o`` (same Triton ``tl.dot`` path as
    the implementation under test), scaled by the standard 1/sqrt(K) factor.
    """
    q, k, v_new, h, g = data
    head_dim = q.shape[-1]
    # Conventional attention scaling: K ** -0.5 == 1 / sqrt(head_dim).
    return fla_chunk_fwd_o(q=q, k=k, v=v_new, h=h, g=g, scale=head_dim ** -0.5)
57 | 39 |
|
58 | 40 |
|
# NOTE(review): tolerances were loosened to 1e-2 — presumably to absorb
# fp32 accumulation-order differences between Triton kernels; confirm the
# bound is tight enough to still catch real bugs.
check_implementation = make_match_reference(ref_kernel, rtol=1e-2, atol=1e-2)
0 commit comments