Commit 6c7120f
Remove initial_state from gated_deltanet chunk_fwd_h problem (#110)
initial_state is an inference-only feature (multi-turn/streaming) and is not used during training. Simplify the problem to always start from zeros, matching the typical training workload.
1 parent eea891f commit 6c7120f

4 files changed: 22 additions & 28 deletions


problems/helion/gated_deltanet_chunk_fwd_h_py/reference.py

Lines changed: 4 additions & 8 deletions
@@ -5,23 +5,19 @@
 CHUNK_SIZE = 64
 
 
-def generate_input(B: int, T: int, H: int, K: int, V: int, use_initial_state: bool, seed: int) -> input_t:
+def generate_input(B: int, T: int, H: int, K: int, V: int, seed: int) -> input_t:
     gen = torch.Generator(device="cuda")
     gen.manual_seed(seed)
     k = torch.randn(B, T, H, K, dtype=torch.float32, device="cuda", generator=gen).contiguous()
     w = torch.randn(B, T, H, K, dtype=torch.float32, device="cuda", generator=gen).contiguous()
     u = torch.randn(B, T, H, V, dtype=torch.float32, device="cuda", generator=gen).contiguous()
     # Use negative values for g to keep exp(g) bounded in (0, 1] and prevent overflow
     g = -torch.abs(torch.randn(B, T, H, dtype=torch.float32, device="cuda", generator=gen)).contiguous()
-    if use_initial_state:
-        initial_state = torch.randn(B, H, K, V, dtype=torch.float32, device="cuda", generator=gen).contiguous()
-    else:
-        initial_state = torch.zeros(B, H, K, V, dtype=torch.float32, device="cuda").contiguous()
-    return k, w, u, g, initial_state
+    return k, w, u, g
 
 
 def ref_kernel(data: input_t) -> output_t:
-    k, w, u, g, initial_state = data
+    k, w, u, g = data
     B, T, H, K = k.shape
     V = u.shape[-1]
     BT = CHUNK_SIZE
@@ -32,7 +28,7 @@ def ref_kernel(data: input_t) -> output_t:
 
     for b in range(B):
         for hh in range(H):
-            b_h = initial_state[b, hh].float().clone()  # [K, V]
+            b_h = torch.zeros(K, V, dtype=torch.float32, device=k.device)
 
             for c in range(NT):
                 cs = c * BT

problems/helion/gated_deltanet_chunk_fwd_h_py/submission.py

Lines changed: 2 additions & 2 deletions
@@ -4,7 +4,7 @@
 def custom_kernel(data: input_t) -> output_t:
     import torch
 
-    k, w, u, g, initial_state = data
+    k, w, u, g = data
     B, T, H, K = k.shape
     V = u.shape[-1]
     BT = 64
@@ -15,7 +15,7 @@ def custom_kernel(data: input_t) -> output_t:
 
     for b in range(B):
         for hh in range(H):
-            b_h = initial_state[b, hh].float().clone()
+            b_h = torch.zeros(K, V, dtype=torch.float32, device=k.device)
 
             for c in range(NT):
                 cs = c * BT
Lines changed: 1 addition & 2 deletions
@@ -1,7 +1,7 @@
 from typing import TypedDict, TypeVar
 import torch
 
-input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor])
+input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor])
 output_t = TypeVar("output_t", bound=tuple[torch.Tensor, torch.Tensor])
 
 
 class TestSpec(TypedDict):
@@ -10,5 +10,4 @@ class TestSpec(TypedDict):
     H: int
     K: int
     V: int
-    use_initial_state: bool
     seed: int
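With use_initial_state gone, a test case is just shape parameters plus a seed, and inputs are plain 4-tuples. A minimal sketch of how such a spec drives input construction, using CPU tensors and toy sizes for illustration (the actual reference.py builds float32 CUDA tensors, and the B/T fields are implied by the test entries in task.yml):

```python
import torch
from typing import TypedDict

# Mirrors the simplified TestSpec (use_initial_state removed).
class TestSpec(TypedDict):
    B: int
    T: int
    H: int
    K: int
    V: int
    seed: int

spec: TestSpec = {"B": 1, "T": 64, "H": 2, "K": 8, "V": 8, "seed": 4242}
gen = torch.Generator().manual_seed(spec["seed"])
k = torch.randn(spec["B"], spec["T"], spec["H"], spec["K"], generator=gen)
w = torch.randn(spec["B"], spec["T"], spec["H"], spec["K"], generator=gen)
u = torch.randn(spec["B"], spec["T"], spec["H"], spec["V"], generator=gen)
# Negative g keeps exp(g) in (0, 1], as in the reference generator
g = -torch.abs(torch.randn(spec["B"], spec["T"], spec["H"], generator=gen))
data = (k, w, u, g)  # the new 4-tuple input_t
```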

problems/helion/gated_deltanet_chunk_fwd_h_py/task.yml

Lines changed: 15 additions & 16 deletions
@@ -17,20 +17,19 @@ description: |
   The sequence is divided into chunks of BT=64 timesteps. Processing is sequential
   across chunks but parallel across (B, H) and within each chunk:
 
-  For each (b, h) pair, starting with h_state = initial_state[b, h] (zeros or provided):
+  For each (b, h) pair, starting with h_state = zeros(K, V):
     For each chunk c = 0, 1, ..., NT-1:
       1. Store: h_out[b, c, h] = h_state
       2. Compute: v_new = u - w @ h_state
       3. Gate: v_gated[t] = v_new[t] * exp(g[last_t] - g[t])
       4. Decay: h_state = h_state * exp(g[last_t])
       5. Update: h_state = h_state + k^T @ v_gated
 
-  Input: tuple(k, w, u, g, initial_state) where:
+  Input: tuple(k, w, u, g) where:
   - k: torch.Tensor of shape [B, T, H, K] (float32) — keys
   - w: torch.Tensor of shape [B, T, H, K] (float32) — WY-transformed keys
   - u: torch.Tensor of shape [B, T, H, V] (float32) — WY-transformed values
   - g: torch.Tensor of shape [B, T, H] (float32) — cumulative gate
-  - initial_state: torch.Tensor of shape [B, H, K, V] (float32) — initial hidden state (zeros or random)
 
   Output: tuple(h, v_new) where:
   - h: torch.Tensor of shape [B, NT, H, K, V] (float32) — per-chunk hidden states
@@ -39,7 +38,7 @@ description: |
   Constraint: T must be a multiple of 64. NT = T // 64.
 
   See also: Helion examples/gdn_fwd_h.py for a related implementation
-  (simpler variant that returns only h, without v_new or initial_state support).
+  (simpler variant that returns only h, without v_new output).
 
 config:
   main: "eval.py"
@@ -48,20 +47,20 @@ templates:
   Python: "../template.py"
 
 tests:
-  - {"B": 1, "T": 64, "H": 2, "K": 64, "V": 64, "use_initial_state": false, "seed": 4242}
-  - {"B": 2, "T": 128, "H": 4, "K": 64, "V": 64, "use_initial_state": true, "seed": 5236}
-  - {"B": 1, "T": 256, "H": 4, "K": 64, "V": 128, "use_initial_state": false, "seed": 1001}
-  - {"B": 1, "T": 64, "H": 1, "K": 128, "V": 128, "use_initial_state": true, "seed": 5531}
-  - {"B": 2, "T": 128, "H": 2, "K": 100, "V": 100, "use_initial_state": true, "seed": 9173}
+  - {"B": 1, "T": 64, "H": 2, "K": 64, "V": 64, "seed": 4242}
+  - {"B": 2, "T": 128, "H": 4, "K": 64, "V": 64, "seed": 5236}
+  - {"B": 1, "T": 256, "H": 4, "K": 64, "V": 128, "seed": 1001}
+  - {"B": 1, "T": 64, "H": 1, "K": 128, "V": 128, "seed": 5531}
+  - {"B": 2, "T": 128, "H": 2, "K": 100, "V": 100, "seed": 9173}
 
 benchmarks:
-  - {"B": 1, "T": 64, "H": 1, "K": 64, "V": 64, "use_initial_state": false, "seed": 31232}
-  - {"B": 2, "T": 512, "H": 3, "K": 64, "V": 64, "use_initial_state": true, "seed": 4052}
-  - {"B": 2, "T": 1024, "H": 3, "K": 64, "V": 64, "use_initial_state": false, "seed": 2146}
-  - {"B": 3, "T": 1024, "H": 4, "K": 100, "V": 100, "use_initial_state": true, "seed": 3129}
-  - {"B": 4, "T": 1024, "H": 4, "K": 128, "V": 128, "use_initial_state": false, "seed": 54352}
-  - {"B": 2, "T": 1536, "H": 4, "K": 128, "V": 128, "use_initial_state": true, "seed": 71234}
-  - {"B": 4, "T": 2048, "H": 8, "K": 64, "V": 64, "use_initial_state": true, "seed": 82345}
+  - {"B": 1, "T": 64, "H": 1, "K": 64, "V": 64, "seed": 31232}
+  - {"B": 2, "T": 512, "H": 3, "K": 64, "V": 64, "seed": 4052}
+  - {"B": 2, "T": 1024, "H": 3, "K": 64, "V": 64, "seed": 2146}
+  - {"B": 3, "T": 1024, "H": 4, "K": 100, "V": 100, "seed": 3129}
+  - {"B": 4, "T": 1024, "H": 4, "K": 128, "V": 128, "seed": 54352}
+  - {"B": 2, "T": 1536, "H": 4, "K": 128, "V": 128, "seed": 71234}
+  - {"B": 4, "T": 2048, "H": 8, "K": 64, "V": 64, "seed": 82345}
 
 test_timeout: 180
 benchmark_timeout: 180
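The five-step recurrence spelled out in the task.yml description can be sketched end to end in plain PyTorch. This is a toy-sized CPU sketch, not the Helion kernel; the function name chunk_fwd_h and the small shapes are ours, and the real problem runs on CUDA with BT=64 chunks:

```python
import torch

def chunk_fwd_h(k, w, u, g, BT=64):
    """Chunked gated-deltanet forward state pass, starting from zeros."""
    B, T, H, K = k.shape
    V = u.shape[-1]
    NT = T // BT  # T must be a multiple of BT
    h = torch.zeros(B, NT, H, K, V, dtype=torch.float32, device=k.device)
    v_new = torch.empty(B, T, H, V, dtype=torch.float32, device=k.device)
    for b in range(B):
        for hh in range(H):
            h_state = torch.zeros(K, V, dtype=torch.float32, device=k.device)
            for c in range(NT):
                cs, ce = c * BT, (c + 1) * BT
                h[b, c, hh] = h_state                # 1. store state entering the chunk
                kc = k[b, cs:ce, hh]                 # [BT, K]
                wc = w[b, cs:ce, hh]                 # [BT, K]
                vc = u[b, cs:ce, hh] - wc @ h_state  # 2. v_new for this chunk [BT, V]
                v_new[b, cs:ce, hh] = vc
                g_c = g[b, cs:ce, hh]                # [BT] cumulative gate
                g_last = g_c[-1]
                v_gated = vc * torch.exp(g_last - g_c).unsqueeze(-1)  # 3. gate
                h_state = h_state * torch.exp(g_last)                 # 4. decay
                h_state = h_state + kc.t() @ v_gated                  # 5. update
    return h, v_new

# Toy run (CPU; the problem itself uses CUDA tensors)
B, T, H, K, V = 1, 128, 2, 8, 8
gen = torch.Generator().manual_seed(0)
k = torch.randn(B, T, H, K, generator=gen)
w = torch.randn(B, T, H, K, generator=gen)
u = torch.randn(B, T, H, V, generator=gen)
g = -torch.abs(torch.randn(B, T, H, generator=gen))
h, vn = chunk_fwd_h(k, w, u, g)
```

Since the initial state is now always zeros, h[:, 0] (the state stored before the first chunk) is all zeros, which gives a quick sanity check on any submission.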
