
Commit 24b5bce

yf225 and claude committed
Fix NaN in gated_deltanet_chunk_fwd_o reference and submission
The reference kernel computed exp(g_i - g_j) before applying the causal mask. When the g values are very negative (they are cumulative sums of negative increments), the upper-triangle differences g_i - g_j overflow exp() to inf, and inf * 0 (the causal mask) produces NaN.

Fix: zero out g_diff in the upper triangle before calling exp(), so we never compute exp(large_positive). Apply the same fix in the submission kernel, which had a similar issue with exp(-g) overflowing.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 5465b80 commit 24b5bce
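The failure mode in the commit message can be reproduced in isolation. A minimal sketch (illustrative, not from the repo; numpy stands in for the torch float32 numerics):

```python
import numpy as np

# float32 exp() overflows for arguments above ~88.7.
big_diff = np.float32(150.0)        # e.g. g_i - g_j in the upper triangle
e = np.exp(big_diff)                # overflows to inf
masked_after = e * np.float32(0.0)  # inf * 0 -> nan: masking after exp is too late
masked_before = np.exp(np.float32(0.0)) * np.float32(0.0)  # mask first -> 0.0
```

This is why the fix must zero the argument before exp() rather than zero the result afterwards.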

2 files changed: 16 additions & 9 deletions

problems/helion/gated_deltanet_chunk_fwd_o_py/reference.py

Lines changed: 5 additions & 3 deletions
@@ -103,9 +103,11 @@ def ref_kernel(data: input_t) -> output_t:
     v_c = v_new.float().reshape(B, NT, C, H, V).permute(0, 1, 3, 2, 4)
     g_c = g.float().reshape(B, NT, C, H).permute(0, 1, 3, 2)
     o_inter = (q_c @ h.float()) * torch.exp(g_c).unsqueeze(-1)
-    qk = q_c @ k_c.transpose(-1, -2) * torch.exp(g_c.unsqueeze(-1) - g_c.unsqueeze(-2))
-    causal = torch.tril(torch.ones(C, C, device=q.device))
-    o = (o_inter + (qk * causal) @ v_c) * scale
+    causal = torch.tril(torch.ones(C, C, dtype=torch.bool, device=q.device))
+    g_diff = g_c.unsqueeze(-1) - g_c.unsqueeze(-2)
+    g_diff = torch.where(causal, g_diff, torch.zeros_like(g_diff))
+    qk = q_c @ k_c.transpose(-1, -2) * torch.exp(g_diff) * causal
+    o = (o_inter + qk @ v_c) * scale
     return o.permute(0, 1, 3, 2, 4).reshape(B, T, H, V).to(q.dtype)
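To see the two orderings of the reference kernel side by side, here is a small sketch of the intra-chunk term only (shapes, names, and the numpy stand-in for torch are all illustrative):

```python
import numpy as np

rng = np.random.default_rng(0)
C, K, Vd = 4, 3, 2  # chunk length, key dim, value dim
q = rng.standard_normal((C, K)).astype(np.float32)
k = rng.standard_normal((C, K)).astype(np.float32)
v = rng.standard_normal((C, Vd)).astype(np.float32)
# Gates: cumulative sums of negative increments, so g decays steeply.
g = np.cumsum(rng.uniform(-80.0, -40.0, C)).astype(np.float32)

causal = np.tril(np.ones((C, C), dtype=bool))
g_diff = g[:, None] - g[None, :]  # upper triangle: large positive values

# Old order: exp() first, mask second -> inf * 0 = nan.
qk_old = (q @ k.T) * np.exp(g_diff) * causal

# New order: zero g_diff above the diagonal, then exp() -> finite.
g_diff_m = np.where(causal, g_diff, 0.0).astype(np.float32)
qk_new = (q @ k.T) * np.exp(g_diff_m) * causal
o_intra = qk_new @ v
```

On the causal (lower-triangle) entries the two orderings agree exactly; the fix only removes the poisoned upper-triangle values.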

problems/helion/gated_deltanet_chunk_fwd_o_py/submission.py

Lines changed: 11 additions & 6 deletions
@@ -54,15 +54,20 @@ def kernel(
         c_idx = tile_t.begin // C

         g_vals = g[b_idx, tile_t, h_idx]
-        q_s = q[b_idx, tile_t, h_idx, :] * torch.exp(g_vals)[:, None]
-        k_s = k[b_idx, tile_t, h_idx, :] * torch.exp(-g_vals)[:, None]
+        q_tile = q[b_idx, tile_t, h_idx, :]
+        k_tile = k[b_idx, tile_t, h_idx, :]
+        v_tile = v[b_idx, tile_t, h_idx, :]

-        sim = hl.dot(q_s, k_s.T)
+        # intra-chunk: q @ k^T * exp(g_i - g_j), with causal mask
+        qk = hl.dot(q_tile, k_tile.T)
         idx = hl.arange(tile_t.block_size)
-        mask = idx[:, None] >= idx[None, :]
-        sim = torch.where(mask, sim, 0.0)
-        local_out = hl.dot(sim.to(v.dtype), v[b_idx, tile_t, h_idx, :])
+        g_diff = g_vals[:, None] - g_vals[None, :]
+        causal_mask = idx[:, None] >= idx[None, :]
+        sim = torch.where(causal_mask, qk * torch.exp(g_diff), 0.0)
+        local_out = hl.dot(sim.to(v.dtype), v_tile)

+        # inter-chunk: (q @ h) * exp(g)
+        q_s = q_tile * torch.exp(g_vals)[:, None]
         global_out = hl.dot(q_s, h[b_idx, c_idx, h_idx, :, :])

         out[b_idx, tile_t, h_idx, :] = ((global_out + local_out) * scale).to(out.dtype)
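The submission's original formulation factored exp(g_i - g_j) as exp(g_i) * exp(-g_j). Once -g exceeds the float32 exp() range, exp(-g) is inf while exp(g) underflows to 0, so 0 * inf = NaN lands inside the causal region itself. A sketch of the two formulations (numpy stands in for the Helion/torch numerics; names illustrative):

```python
import numpy as np

C = 4
g = np.cumsum(np.full(C, -60.0, dtype=np.float32))  # [-60, -120, -180, -240]
q = np.ones((C, 2), dtype=np.float32)
k = np.ones((C, 2), dtype=np.float32)
causal = np.tril(np.ones((C, C), dtype=bool))

# Old: pre-scale q by exp(g) and k by exp(-g).
# exp(-g) overflows to inf for g <= -120, exp(g) underflows to 0,
# and 0 * inf = nan appears inside the causal (lower) triangle.
sim_old = (q * np.exp(g)[:, None]) @ (k * np.exp(-g)[:, None]).T

# New: form pairwise differences inside the chunk. Causal entries
# satisfy g_i - g_j <= 0, so exp() never overflows where it matters;
# the where() selection simply discards the upper triangle.
g_diff = g[:, None] - g[None, :]
sim_new = np.where(causal, (q @ k.T) * np.exp(g_diff), 0.0)
```

Note that unlike the reference fix, the submission leaves exp(g_diff) unmasked and relies on where() to select, not multiply: any inf produced above the diagonal is discarded rather than multiplied by zero.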

0 commit comments
