|
| 1 | +files: |
| 2 | + - {"name": "submission.py", "source": "@SUBMISSION@"} |
| 3 | + - {"name": "task.py", "source": "task.py"} |
| 4 | + - {"name": "utils.py", "source": "../utils.py"} |
| 5 | + - {"name": "reference.py", "source": "reference.py"} |
| 6 | + - {"name": "eval.py", "source": "../eval.py"} |
| 7 | + |
| 8 | +lang: "py" |
| 9 | + |
| 10 | +description: | |
| 11 | + Implement a causal depthwise 1D convolution kernel. |
| 12 | +
|
| 13 | + This is a core component of Mamba/Mamba-2 architectures. Each channel is |
| 14 | + convolved independently (depthwise) with causal (left) zero-padding so that |
| 15 | + out[b, d, t] depends only on x[b, d, t-W+1 : t+1] (the current time step and the W-1 preceding ones). |
| 16 | +
|
| 17 | + For each batch b, channel d, and time t: |
| 18 | + out[b, d, t] = bias[d] + sum_{k=0}^{W-1} weight[d, k] * x[b, d, t - W + 1 + k] |
| 19 | + where x at negative time indices (t - W + 1 + k < 0) is treated as zero. |
| 20 | +
|
| 21 | + Input: tuple(x, weight, bias) where (B = batch size, D = channels, S = sequence length, W = kernel width): |
| 22 | + - x: torch.Tensor of shape [B, D, S] (float32) |
| 23 | + - weight: torch.Tensor of shape [D, W] (float32) |
| 24 | + - bias: torch.Tensor of shape [D] (float32) |
| 25 | +
|
| 26 | + Output: torch.Tensor of shape [B, D, S] (float32) |
| 27 | +
|
| 28 | +config: |
| 29 | + main: "eval.py" |
| 30 | + |
| 31 | +templates: |
| 32 | + Python: "../template.py" |
| 33 | + |
| 34 | +tests: |
| 35 | + - {"B": 1, "D": 64, "S": 64, "W": 4, "seed": 4242} |
| 36 | + - {"B": 2, "D": 128, "S": 128, "W": 4, "seed": 5236} |
| 37 | + - {"B": 1, "D": 256, "S": 256, "W": 3, "seed": 1001} |
| 38 | + - {"B": 1, "D": 128, "S": 64, "W": 8, "seed": 5531} |
| 39 | + - {"B": 4, "D": 64, "S": 128, "W": 4, "seed": 9173} |
| 40 | + |
| 41 | +benchmarks: |
| 42 | + - {"B": 1, "D": 768, "S": 512, "W": 4, "seed": 31232} |
| 43 | + - {"B": 1, "D": 768, "S": 2048, "W": 4, "seed": 4052} |
| 44 | + - {"B": 1, "D": 1536, "S": 2048, "W": 4, "seed": 2146} |
| 45 | + - {"B": 1, "D": 2560, "S": 2048, "W": 4, "seed": 3129} |
| 46 | + - {"B": 1, "D": 2560, "S": 4096, "W": 4, "seed": 54352} |
| 47 | + |
| 48 | +test_timeout: 180 |
| 49 | +benchmark_timeout: 180 |
| 50 | +ranked_timeout: 420 |
| 51 | +ranking_by: "geom" |
0 commit comments