Commit 118b06a

Author: Mark Saroufim
Rename submission modes: benchmark→private, leaderboard→public
This renames the user-facing submission modes for clarity:

- BENCHMARK → PRIVATE (run benchmarks without affecting leaderboard ranking)
- LEADERBOARD → PUBLIC (official submission to the public leaderboard)

Also adds SECRET mode for internal secret validation runs.
Updates Discord commands: /benchmark → /private, /ranked → /public
1 parent 1eb2687 · commit 118b06a

13 files changed: 86 additions, 79 deletions
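As a quick reference, the rename amounts to the mapping below. This is an illustrative sketch, not code from the commit; LEGACY_MODE_NAMES and modernize_mode are hypothetical helpers for translating stored pre-rename mode strings:

# Hypothetical helper; not part of this commit.
LEGACY_MODE_NAMES = {
    "benchmark": "private",     # BENCHMARK -> PRIVATE
    "leaderboard": "public",    # LEADERBOARD -> PUBLIC
}

def modernize_mode(mode: str) -> str:
    # Map a pre-rename mode string to its post-rename equivalent.
    return LEGACY_MODE_NAMES.get(mode, mode)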

src/kernelbot/api/api_utils.py

2 additions, 2 deletions

@@ -213,9 +213,9 @@ async def to_submit_info(

     allowed_modes = [
         SubmissionMode.TEST,
-        SubmissionMode.BENCHMARK,
+        SubmissionMode.PRIVATE,
         SubmissionMode.PROFILE,
-        SubmissionMode.LEADERBOARD,
+        SubmissionMode.PUBLIC,
     ]
     if submission_mode_enum not in allowed_modes:
         raise HTTPException(
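The effect of the new allowed_modes list: API clients may request test, private, profile, or public runs, while secret stays server-initiated. A minimal sketch of the gate, assuming the enum is importable from src/libkernelbot/consts.py and a 400 status (the raise is truncated in the diff, so the status code is an assumption):

from fastapi import HTTPException

from libkernelbot.consts import SubmissionMode  # import path assumed

ALLOWED_API_MODES = {SubmissionMode.TEST, SubmissionMode.PRIVATE,
                     SubmissionMode.PROFILE, SubmissionMode.PUBLIC}

def check_mode(mode: SubmissionMode) -> None:
    # SECRET is deliberately absent: secret runs are only scheduled server-side.
    if mode not in ALLOWED_API_MODES:
        raise HTTPException(status_code=400, detail=f"mode {mode.value!r} not allowed")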

src/kernelbot/cogs/leaderboard_cog.py

8 additions, 8 deletions

@@ -64,7 +64,7 @@ async def post_submit_hook(self, interaction: discord.Interaction, sub_id: int):
         for run in sub_data["runs"]:
             if (
                 not run["secret"]
-                and run["mode"] == SubmissionMode.LEADERBOARD.value
+                and run["mode"] == SubmissionMode.PUBLIC.value
                 and run["passed"]
             ):
                 result_lines.append(generate_run_verdict(self.bot.backend, run, sub_data))

@@ -134,7 +134,7 @@ async def submit(
         reporter = MultiProgressReporterDiscord(interaction)
         sub_id, results = await self.bot.backend.submit_full(req, mode, reporter)

-        if mode == SubmissionMode.LEADERBOARD:
+        if mode == SubmissionMode.PUBLIC:
             await self.post_submit_hook(interaction, sub_id)
         return sub_id

@@ -157,23 +157,23 @@ async def submit_test(
             interaction, leaderboard_name, script, mode=SubmissionMode.TEST, gpu=gpu
         )

-    @app_commands.command(name="benchmark", description="Start a benchmarking run")
+    @app_commands.command(name="private", description="Start a private benchmarking run")
     @app_commands.describe(
         leaderboard_name="Name of the competition / kernel to optimize",
         script="The Python / CUDA script file to run",
         gpu="Select GPU. Leave empty for interactive or automatic selection.",
     )
     @app_commands.autocomplete(leaderboard_name=leaderboard_name_autocomplete)
     @with_error_handling
-    async def submit_bench(
+    async def submit_private(
         self,
         interaction: discord.Interaction,
         script: discord.Attachment,
         leaderboard_name: Optional[str],
         gpu: Optional[str],
     ):
         return await self.submit(
-            interaction, leaderboard_name, script, mode=SubmissionMode.BENCHMARK, gpu=gpu
+            interaction, leaderboard_name, script, mode=SubmissionMode.PRIVATE, gpu=gpu
         )

     @app_commands.command(name="profile", description="Start a profiling run")

@@ -196,7 +196,7 @@ async def submit_profile(
         )

     @app_commands.command(
-        name="ranked", description="Start a ranked run for an official leaderboard submission"
+        name="public", description="Start a public run for an official leaderboard submission"
     )
     @app_commands.describe(
         leaderboard_name="Name of the competition / kernel to optimize",

@@ -205,15 +205,15 @@ async def submit_profile(
     )
     @app_commands.autocomplete(leaderboard_name=leaderboard_name_autocomplete)
     @with_error_handling
-    async def submit_ranked(
+    async def submit_public(
         self,
         interaction: discord.Interaction,
         script: discord.Attachment,
         leaderboard_name: Optional[str] = None,
         gpu: Optional[str] = None,
     ):
         return await self.submit(
-            interaction, leaderboard_name, script, mode=SubmissionMode.LEADERBOARD, gpu=gpu
+            interaction, leaderboard_name, script, mode=SubmissionMode.PUBLIC, gpu=gpu
         )
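All four slash commands funnel into the shared submit() helper and differ only in the mode they forward. The resulting command-to-mode mapping, restated as a table (illustrative only; the cog wires this through decorators rather than a dict):

COMMAND_MODES = {
    "/test": "test",
    "/private": "private",   # formerly /benchmark
    "/profile": "profile",
    "/public": "public",     # formerly /ranked
}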

src/kernelbot/cogs/verify_run_cog.py

4 additions, 4 deletions

@@ -171,8 +171,8 @@ async def verify_modal_run(
     @app_commands.choices(
         mode=[
             Choice(name=SubmissionMode.TEST.name, value=SubmissionMode.TEST.value),
-            Choice(name=SubmissionMode.BENCHMARK.name, value=SubmissionMode.BENCHMARK.value),
-            Choice(name=SubmissionMode.LEADERBOARD.name, value=SubmissionMode.LEADERBOARD.value),
+            Choice(name=SubmissionMode.PRIVATE.name, value=SubmissionMode.PRIVATE.value),
+            Choice(name=SubmissionMode.PUBLIC.name, value=SubmissionMode.PUBLIC.value),
             Choice(name="All", value="all"),
         ]
     )

@@ -194,9 +194,9 @@ async def verify_task(

         modes = []
         if mode is None:
-            modes = [SubmissionMode.LEADERBOARD]
+            modes = [SubmissionMode.PUBLIC]
         elif mode.value == "all":
-            modes = [SubmissionMode.TEST, SubmissionMode.BENCHMARK, SubmissionMode.LEADERBOARD]
+            modes = [SubmissionMode.TEST, SubmissionMode.PRIVATE, SubmissionMode.PUBLIC]
         else:
             modes = [SubmissionMode(mode.value)]
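The mode resolution in verify_task, restated as a standalone sketch (hypothetical helper name; behavior as in the diff: default to the public path, and "all" expands to the three user-visible run modes):

from typing import Optional

def resolve_verify_modes(choice: Optional[str]) -> list[str]:
    # Mode values as strings, matching SubmissionMode(...).value.
    if choice is None:
        return ["public"]                     # default: verify the ranked path
    if choice == "all":
        return ["test", "private", "public"]
    return [choice]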

src/libkernelbot/backend.py

12 additions, 10 deletions

@@ -86,7 +86,7 @@ async def submit_full(
             for gpu in selected_gpus
         ]

-        if mode == SubmissionMode.LEADERBOARD:
+        if mode == SubmissionMode.PUBLIC:
             tasks += [
                 self.submit_leaderboard(
                     sub_id,

@@ -95,7 +95,7 @@ async def submit_full(
                     gpu,
                     reporter.add_run(f"{gpu.name} on {gpu.runner} (secret)"),
                     req.task,
-                    SubmissionMode.PRIVATE,
+                    SubmissionMode.SECRET,
                     req.secret_seed,
                 )
                 for gpu in selected_gpus

@@ -142,12 +142,14 @@ async def submit_leaderboard(  # noqa: C901

        if result.success:
            score = None
+            # Check for the mode's result key (public or secret)
+            mode_key = mode.value
            if (
-                "leaderboard" in result.runs
-                and result.runs["leaderboard"].run.success
-                and result.runs["leaderboard"].run.passed
+                mode_key in result.runs
+                and result.runs[mode_key].run.success
+                and result.runs[mode_key].run.passed
            ):
-                score = compute_score(result, task, submission_id)
+                score = compute_score(result, task, submission_id, mode_key)

        # verifyruns uses a fake submission id of -1
        if submission_id != -1:

@@ -159,8 +161,8 @@ async def submit_leaderboard(  # noqa: C901
                    end=value.end,
                    mode=key,
                    runner=gpu_type.name,
-                    score=None if key != "leaderboard" else score,
-                    secret=mode == SubmissionMode.PRIVATE,
+                    score=None if key != mode_key else score,
+                    secret=mode == SubmissionMode.SECRET,
                    compilation=value.compilation,
                    result=value.run,
                    system=result.system,

@@ -207,7 +209,7 @@ async def handle_submission(
        await reporter.update_title(reporter.title + " ✅ success")

        short_report = make_short_report(
-            result.runs, full=mode in [SubmissionMode.PRIVATE, SubmissionMode.LEADERBOARD]
+            result.runs, full=mode in [SubmissionMode.PUBLIC, SubmissionMode.SECRET]
        )

        stream_msg = (

@@ -222,7 +224,7 @@ async def handle_submission(
        )

        await reporter.push(short_report)
-        if mode != SubmissionMode.PRIVATE:
+        if mode != SubmissionMode.SECRET:
            try:
                # does the last message of the short report start with ✅ or ❌?
                verdict = short_report[-1][0]
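The core change in submit_leaderboard: the result key is no longer the literal "leaderboard" but the submitting mode's own value, so a PUBLIC run scores runs["public"] and its companion SECRET run scores runs["secret"]. A condensed sketch of that logic (compute_score is passed in to keep the sketch self-contained):

def score_for_mode(result, task, submission_id, mode_key, compute_score):
    # mode_key is mode.value ("public" or "secret") and doubles as the
    # key under which the ranked run is stored in result.runs.
    score = None
    run = result.runs.get(mode_key)
    if run is not None and run.run.success and run.run.passed:
        score = compute_score(result, task, submission_id, mode_key)
    return score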

src/libkernelbot/consts.py

10 additions, 9 deletions

@@ -82,21 +82,22 @@ class SubmissionMode(Enum):
     """
     Different types of submission that can be made:
     Test: Run tests and give detailed results about passed/failed tests. These have short timeouts.
-    Benchmark: Run larger benchmarks. Each benchmark is tested once, and then run multiple times.
+    Private: Run benchmarks privately. Each benchmark is tested once, and then run multiple times.
+        Returns detailed timing results but doesn't affect leaderboard ranking.
     Profile: Gather profiling information. One selected benchmark is run under the profiler. No
     testing is performed in this mode (sometimes, you need to profile deliberately broken code)
-    Leaderboard: Official submission to the leaderboard. This first runs public tests, then a
-    repeated invocation of a single benchmark. Feedback for the secret benchmark is only very
-    limited (no stdout/stderr).
-    Private: Special run that does test followed by leaderboard (on a secret seed), but gives only
-    very limited feedback.
+    Public: Official submission to the leaderboard. This first runs public tests, then a
+    repeated invocation of a single benchmark. If all tests pass, the submission is evaluated
+    and ranked on the public leaderboard.
+    Secret: Internal mode for running the full evaluation flow with a secret seed. This is used
+    for secret validation runs that accompany public submissions.
     """

     TEST = "test"
-    BENCHMARK = "benchmark"
-    PROFILE = "profile"
-    LEADERBOARD = "leaderboard"
     PRIVATE = "private"
+    PROFILE = "profile"
+    PUBLIC = "public"
+    SECRET = "secret"


 class Language(Enum):
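With the new values in place, string/enum round-tripping behaves as usual for a value-backed Enum, and stale pre-rename strings now fail loudly (import path assumed):

from libkernelbot.consts import SubmissionMode  # import path assumed

assert SubmissionMode("public") is SubmissionMode.PUBLIC
assert SubmissionMode.PRIVATE.value == "private"

try:
    SubmissionMode("leaderboard")   # pre-rename string
except ValueError:
    pass  # raised: LEADERBOARD no longer exists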

src/libkernelbot/launchers/github.py

2 additions, 2 deletions

@@ -49,8 +49,8 @@ def get_timeout(config: dict) -> int:
     mode = config.get("mode")
     sec_map = {
         SubmissionMode.TEST.value: config.get("test_timeout"),
-        SubmissionMode.BENCHMARK.value: config.get("benchmark_timeout"),
-        SubmissionMode.LEADERBOARD.value: config.get("ranked_timeout"),
+        SubmissionMode.PRIVATE.value: config.get("benchmark_timeout"),
+        SubmissionMode.PUBLIC.value: config.get("ranked_timeout"),
     }
     seconds = sec_map.get(mode) or DEFAULT_GITHUB_TIMEOUT_MINUTES * 60
     return math.ceil(seconds / 60)
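A worked example under the new keys (the 180-second value is made up): a private run with benchmark_timeout = 180 yields ceil(180 / 60) = 3 minutes. Note that "secret" has no entry in sec_map, so a secret run falls back to the default:

config = {"mode": "private", "benchmark_timeout": 180}
# get_timeout(config) -> math.ceil(180 / 60) == 3  (minutes)

config = {"mode": "secret"}
# sec_map.get("secret") is None -> DEFAULT_GITHUB_TIMEOUT_MINUTES * 60 applies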

src/libkernelbot/report.py

16 additions, 12 deletions

@@ -176,8 +176,8 @@ def make_short_report(runs: dict[str, EvalResult], full=True) -> list[str]:  # n
     elif full:
         result.append("❌ Tests missing")

-    if "benchmark" in runs:
-        bench_run = runs["benchmark"].run
+    if "private" in runs:
+        bench_run = runs["private"].run
         if not bench_run.success:
             result.append("❌ Running benchmarks failed" + _short_fail_reason(bench_run))
             return result

@@ -202,16 +202,18 @@ def make_short_report(runs: dict[str, EvalResult], full=True) -> list[str]:  # n
     else:
         result.append("✅ Profiling successful")

-    if "leaderboard" in runs:
-        lb_run = runs["leaderboard"].run
+    # Check for public or secret run results
+    ranked_key = "public" if "public" in runs else ("secret" if "secret" in runs else None)
+    if ranked_key:
+        lb_run = runs[ranked_key].run
         if not lb_run.success:
-            result.append("❌ Running leaderboard failed" + _short_fail_reason(lb_run))
+            result.append("❌ Running ranked submission failed" + _short_fail_reason(lb_run))
         elif not lb_run.passed:
-            result.append("❌ Leaderboard run failed")
+            result.append("❌ Ranked submission failed")
         else:
-            result.append("✅ Leaderboard run successful")
+            result.append("✅ Ranked submission successful")
     elif full:
-        result.append("❌ Leaderboard missing")
+        result.append("❌ Ranked submission missing")
     return result

@@ -339,8 +341,8 @@ def generate_report(result: FullResult, extra_text: str = "") -> RunResultReport
         num_tests = int(test_run.result.get("test-count", 0))
         report.add_log(f"✅ Passed {num_tests}/{num_tests} tests", make_test_log(test_run))

-    if "benchmark" in runs:
-        bench_run = runs["benchmark"]
+    if "private" in runs:
+        bench_run = runs["private"]
         if _handle_crash_report(report, bench_run):
             return report

@@ -378,8 +380,8 @@ def generate_report(result: FullResult, extra_text: str = "") -> RunResultReport
             base64.b64decode(prof_run.profile.trace),
         )

-    if "leaderboard" in runs:
-        bench_run = runs["leaderboard"]
+    # Check for public or secret run results
+    ranked_key = "public" if "public" in runs else ("secret" if "secret" in runs else None)
+    if ranked_key:
+        bench_run = runs[ranked_key]
         if _handle_crash_report(report, bench_run):
             return report
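Both report functions use the same fallback: prefer the "public" result, otherwise take "secret", otherwise skip the ranked section. The inline expression, restated as a hypothetical helper:

from typing import Optional

def find_ranked_key(runs: dict) -> Optional[str]:
    # Prefer the public run; a secret-only result dict still gets reported.
    return "public" if "public" in runs else ("secret" if "secret" in runs else None)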

src/libkernelbot/run_eval.py

8 additions, 8 deletions

@@ -556,8 +556,8 @@ def run_single_evaluation(
     if mode == "test":
         timeout = test_timeout
         cases.write(tests)
-    elif mode in ["benchmark", "profile", "leaderboard"]:
-        timeout = ranked_timeout if mode == "leaderboard" else benchmark_timeout
+    elif mode in ["private", "profile", "public", "secret"]:
+        timeout = ranked_timeout if mode in ["public", "secret"] else benchmark_timeout
         if ranking_by == "last":
             cases.write(benchmarks.splitlines(keepends=True)[-1])
         else:

@@ -801,22 +801,22 @@ def run_evaluation(
             common_args["benchmarks"] = benchmark
             results[f"{mode}.{i}"] = call(mode=mode, **common_args)

-    elif mode in ["test", "benchmark"]:
+    elif mode in ["test", "private"]:
         results[mode] = call(mode=mode, **common_args)
-    elif mode in ["private", "leaderboard"]:
+    elif mode in ["public", "secret"]:
         # first, run the tests
         results["test"] = call(mode="test", **common_args)

         if not results["test"].run or not results["test"].run.passed:
             return results

-        results["benchmark"] = call(mode="benchmark", **common_args)
+        results["private"] = call(mode="private", **common_args)

-        if not results["benchmark"].run or not results["benchmark"].run.passed:
+        if not results["private"].run or not results["private"].run.passed:
             return results

-        # if they pass, run the leaderboard validation
-        results["leaderboard"] = call(mode="leaderboard", **common_args)
+        # if they pass, run the public/secret validation
+        results[mode] = call(mode=mode, **common_args)
     else:
         raise AssertionError("Invalid mode")
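The ranked flow in run_evaluation now reads: run the public tests, then a "private" benchmark pass, and only if both pass, a final run stored under the submitting mode itself. A condensed sketch of that control flow:

def run_ranked(mode, call, common_args):
    # Condensed from run_evaluation, for mode in {"public", "secret"}.
    results = {}
    results["test"] = call(mode="test", **common_args)
    if not results["test"].run or not results["test"].run.passed:
        return results                      # stop after failing tests
    results["private"] = call(mode="private", **common_args)
    if not results["private"].run or not results["private"].run.passed:
        return results                      # stop after failing benchmarks
    results[mode] = call(mode=mode, **common_args)   # "public" or "secret"
    return results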

src/libkernelbot/submission.py

4 additions, 4 deletions

@@ -169,8 +169,8 @@ def _get_popcorn_directives(submission: str) -> dict:  # noqa: C901
     return popcorn_info


-def compute_score(result: FullResult, task: LeaderboardTask, submission_id: int) -> float:
-    num_benchmarks = int(result.runs["leaderboard"].run.result["benchmark-count"])
+def compute_score(result: FullResult, task: LeaderboardTask, submission_id: int, mode_key: str = "public") -> float:
+    num_benchmarks = int(result.runs[mode_key].run.result["benchmark-count"])
     if task.ranking_by == RankCriterion.LAST:
         if num_benchmarks != 1:
             logger.error(

@@ -182,11 +182,11 @@ def compute_score(result: FullResult, task: LeaderboardTask, submission_id: int)
             raise KernelBotError(
                 f"Expected submission to have exactly one benchmark, got {num_benchmarks}."
             )
-        score = float(result.runs["leaderboard"].run.result["benchmark.0.mean"]) / 1e9
+        score = float(result.runs[mode_key].run.result["benchmark.0.mean"]) / 1e9
     else:
         scores = []
         for i in range(num_benchmarks):
-            scores.append(float(result.runs["leaderboard"].run.result[f"benchmark.{i}.mean"]) / 1e9)
+            scores.append(float(result.runs[mode_key].run.result[f"benchmark.{i}.mean"]) / 1e9)
         if task.ranking_by == RankCriterion.MEAN:
             score = sum(scores) / len(scores)
         elif task.ranking_by == RankCriterion.GEOM:
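With the new mode_key parameter, a secret validation run scores its own results instead of reading a "leaderboard" key. A worked example under RankCriterion.MEAN with fabricated per-benchmark means (nanoseconds, converted to seconds):

# Fabricated run result for illustration:
secret_result = {
    "benchmark-count": "2",
    "benchmark.0.mean": "2.0e6",   # 2 ms in nanoseconds
    "benchmark.1.mean": "4.0e6",   # 4 ms
}
means = [float(secret_result[f"benchmark.{i}.mean"]) / 1e9
         for i in range(int(secret_result["benchmark-count"]))]
score = sum(means) / len(means)   # == 0.003 seconds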
