Commit 9ca8ea5

Fix: add better l2 cache clear
1 parent 66065c2 commit 9ca8ea5

1 file changed

Lines changed: 41 additions & 17 deletions

problems/nvidia/nvfp4_group_gemm/utils.py

```diff
@@ -28,11 +28,7 @@ def get_device(use_cuda: bool = True) -> torch.device:
 # Adapted from https://github.com/linkedin/Liger-Kernel/blob/main/test/utils.py
 @torch.no_grad()
 def verbose_allclose(
-    received: torch.Tensor,
-    expected: torch.Tensor,
-    rtol=1e-05,
-    atol=1e-08,
-    max_print=5
+    received: torch.Tensor, expected: torch.Tensor, rtol=1e-05, atol=1e-08, max_print=5
 ) -> list[str]:
     """
     Assert that two tensors are element-wise equal within a tolerance, providing detailed information about mismatches.
@@ -64,9 +60,13 @@ def verbose_allclose(
     nan_mismatched = torch.logical_xor(torch.isnan(received), torch.isnan(expected))

     # Find +inf mismatched elements
-    posinf_mismatched = torch.logical_xor(torch.isposinf(received), torch.isposinf(expected))
+    posinf_mismatched = torch.logical_xor(
+        torch.isposinf(received), torch.isposinf(expected)
+    )
     # Find -inf mismatched elements
-    neginf_mismatched = torch.logical_xor(torch.isneginf(received), torch.isneginf(expected))
+    neginf_mismatched = torch.logical_xor(
+        torch.isneginf(received), torch.isneginf(expected)
+    )

     # Find all mismatched elements
     mismatched = torch.logical_or(
@@ -87,14 +87,18 @@ def verbose_allclose(
             i = tuple(index.tolist())
             mismatch_details.append(f"ERROR AT {i}: {received[i]} {expected[i]}")
         if num_mismatched > max_print:
-            mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+            mismatch_details.append(
+                f"... and {num_mismatched - max_print} more mismatched elements."
+            )
         return mismatch_details

     return []


 @torch.no_grad()
-def verbose_allequal(received: torch.Tensor, expected: torch.Tensor, max_print: int=5):
+def verbose_allequal(
+    received: torch.Tensor, expected: torch.Tensor, max_print: int = 5
+):
     """
     Assert that two tensors are element-wise perfectly equal, providing detailed information about mismatches.

@@ -120,32 +124,43 @@ def verbose_allequal(received: torch.Tensor, expected: torch.Tensor, max_print:
             i = tuple(index.tolist())
             mismatch_details.append(f"ERROR AT {i}: {received[i]} {expected[i]}")
         if num_mismatched > max_print:
-            mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+            mismatch_details.append(
+                f"... and {num_mismatched - max_print} more mismatched elements."
+            )
         return mismatch_details

     return []


-def match_reference(data, output, reference: callable, rtol=1e-05, atol=1e-08) -> tuple[bool, str]:
+def match_reference(
+    data, output, reference: callable, rtol=1e-05, atol=1e-08
+) -> tuple[bool, str]:
     """
     Convenient "default" implementation for tasks' `check_implementation` function.
     """
     expected = reference(data)

     if len(output) != len(expected):
-        return False, f"output length mismatch: got {len(output)}, expected {len(expected)}"
+        return (
+            False,
+            f"output length mismatch: got {len(output)}, expected {len(expected)}",
+        )

     for i, (output_i, expected_i) in enumerate(zip(output, expected)):
         reasons = verbose_allclose(output_i, expected_i, rtol=rtol, atol=atol)
         if len(reasons) > 0:
-            return False, f"mismatch found! custom implementation doesn't match reference: {i} {reasons}"
+            return (
+                False,
+                f"mismatch found! custom implementation doesn't match reference: {i} {reasons}",
+            )

-    return True, ''
+    return True, ""


 def make_match_reference(reference: callable, **kwargs):
     def wrapped(data, output):
         return match_reference(data, output, reference=reference, **kwargs)
+
     return wrapped


@@ -156,7 +171,7 @@ def __init__(self):
         self.cublas = None

     def __enter__(self):
-        self.cublas = os.environ.get('CUBLAS_WORKSPACE_CONFIG', '')
+        self.cublas = os.environ.get("CUBLAS_WORKSPACE_CONFIG", "")
         self.allow_tf32 = torch.backends.cudnn.allow_tf32
         self.deterministic = torch.backends.cudnn.deterministic
         torch.backends.cudnn.allow_tf32 = False
@@ -168,7 +183,8 @@ def __exit__(self, exc_type, exc_value, traceback):
         torch.backends.cudnn.allow_tf32 = self.allow_tf32
         torch.backends.cudnn.deterministic = self.deterministic
         torch.use_deterministic_algorithms(False)
-        os.environ['CUBLAS_WORKSPACE_CONFIG'] = self.cublas
+        os.environ["CUBLAS_WORKSPACE_CONFIG"] = self.cublas
+

 def clear_l2_cache():
     # import cupy as cp
@@ -177,4 +193,12 @@ def clear_l2_cache():
     dummy = torch.empty((32, 1024, 1024), dtype=torch.int64, device="cuda")
     # write stuff to
     dummy.fill_(42)
-    del dummy
+    del dummy
+
+
+def clear_l2_cache_large():
+    # import cupy as cp
+    # cp.cuda.runtime.deviceSetLimit(cp.cuda.runtime.cudaLimitPersistingL2CacheSize, 0)
+    # create a large dummy tensor
+    dummy = torch.randn((16000, 1024, 1024), device="cuda")
+    del dummy
```
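
The substantive change is the new `clear_l2_cache_large()` at the bottom; everything else is mechanical reformatting (Black-style line wrapping and double-quote normalization). Both cache-clearing helpers rely on the same trick: writing a buffer far larger than the GPU's L2 cache evicts whatever earlier kernels left resident, so each timed run starts cold. The original helper fills 32 × 1024 × 1024 int64 values (256 MiB); the large variant allocates 16000 × 1024 × 1024 float32 values, roughly 64 GB, which presumes a device with very large memory. Below is a minimal sketch of how such a helper is typically used in a timing loop; the `bench` wrapper and `kernel` argument are illustrative, not part of this repo.

```python
# Illustrative sketch (not from this commit): timing a CUDA kernel with a
# cold L2 cache between iterations. Assumes the helper above is importable
# from this utils module.
import torch

from utils import clear_l2_cache  # or clear_l2_cache_large on big-memory GPUs

def bench(kernel, *args, iters: int = 10) -> float:
    times = []
    for _ in range(iters):
        clear_l2_cache()  # evict the previous iteration's data from L2
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()
        kernel(*args)
        end.record()
        torch.cuda.synchronize()  # wait until `end` has fired before reading it
        times.append(start.elapsed_time(end))  # elapsed time in milliseconds
    return min(times)
```

Taking the minimum over iterations is a common choice for kernel microbenchmarks, since it is the statistic least affected by unrelated machine noise.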

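The correctness helpers read the same way after the reformat: `match_reference` runs the reference on the inputs and compares each output tensor with `verbose_allclose`, and `make_match_reference` is a small factory that closes over the reference and tolerances. A task would wire up its checker roughly like this; `ref_kernel` and the tolerances are made up for illustration, not taken from this commit.

```python
# Illustrative only: wiring a task's checker via make_match_reference.
# `ref_kernel` is a stand-in reference implementation returning a list of
# tensors, which is the shape of output match_reference expects.
import torch

from utils import make_match_reference

def ref_kernel(data):
    # toy reference: one output tensor per input tensor
    return [x * 2.0 for x in data]

check_implementation = make_match_reference(ref_kernel, rtol=1e-3, atol=1e-3)

inputs = [torch.randn(4, 4)]
ok, message = check_implementation(inputs, ref_kernel(inputs))
assert ok, message  # the reference trivially matches itself, so message is ""
```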