Commit 118b06a

Author: Mark Saroufim
Rename submission modes: benchmark→private, leaderboard→public
This renames the user-facing submission modes for clarity:

- BENCHMARK → PRIVATE (run benchmarks without affecting leaderboard ranking)
- LEADERBOARD → PUBLIC (official submission to the public leaderboard)

Also adds SECRET mode for internal secret validation runs.
Updates Discord commands: /benchmark → /private, /ranked → /public
1 parent 1eb2687 · commit 118b06a

13 files changed: 86 additions, 79 deletions
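As a quick reference, the rename amounts to the mapping below. This is an illustrative sketch, not code from the commit; LEGACY_MODE_NAMES and modernize_mode are hypothetical helpers for translating stored pre-rename mode strings:

# Hypothetical helper; not part of this commit.
LEGACY_MODE_NAMES = {
    "benchmark": "private",     # BENCHMARK -> PRIVATE
    "leaderboard": "public",    # LEADERBOARD -> PUBLIC
}

def modernize_mode(mode: str) -> str:
    # Map a pre-rename mode string to its post-rename equivalent.
    return LEGACY_MODE_NAMES.get(mode, mode)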

src/kernelbot/api/api_utils.py

2 additions, 2 deletions

@@ -213,9 +213,9 @@ async def to_submit_info(

     allowed_modes = [
         SubmissionMode.TEST,
-        SubmissionMode.BENCHMARK,
+        SubmissionMode.PRIVATE,
         SubmissionMode.PROFILE,
-        SubmissionMode.LEADERBOARD,
+        SubmissionMode.PUBLIC,
     ]
     if submission_mode_enum not in allowed_modes:
         raise HTTPException(
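The effect of the new allowed_modes list: API clients may request test, private, profile, or public runs, while secret stays server-initiated. A minimal sketch of the gate, assuming the enum is importable from src/libkernelbot/consts.py and a 400 status (the raise is truncated in the diff, so the status code is an assumption):

from fastapi import HTTPException

from libkernelbot.consts import SubmissionMode  # import path assumed

ALLOWED_API_MODES = {SubmissionMode.TEST, SubmissionMode.PRIVATE,
                     SubmissionMode.PROFILE, SubmissionMode.PUBLIC}

def check_mode(mode: SubmissionMode) -> None:
    # SECRET is deliberately absent: secret runs are only scheduled server-side.
    if mode not in ALLOWED_API_MODES:
        raise HTTPException(status_code=400, detail=f"mode {mode.value!r} not allowed")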

src/kernelbot/cogs/leaderboard_cog.py

8 additions, 8 deletions

@@ -64,7 +64,7 @@ async def post_submit_hook(self, interaction: discord.Interaction, sub_id: int):
         for run in sub_data["runs"]:
             if (
                 not run["secret"]
-                and run["mode"] == SubmissionMode.LEADERBOARD.value
+                and run["mode"] == SubmissionMode.PUBLIC.value
                 and run["passed"]
             ):
                 result_lines.append(generate_run_verdict(self.bot.backend, run, sub_data))

@@ -134,7 +134,7 @@ async def submit(
         reporter = MultiProgressReporterDiscord(interaction)
         sub_id, results = await self.bot.backend.submit_full(req, mode, reporter)

-        if mode == SubmissionMode.LEADERBOARD:
+        if mode == SubmissionMode.PUBLIC:
             await self.post_submit_hook(interaction, sub_id)
         return sub_id

@@ -157,23 +157,23 @@ async def submit_test(
             interaction, leaderboard_name, script, mode=SubmissionMode.TEST, gpu=gpu
         )

-    @app_commands.command(name="benchmark", description="Start a benchmarking run")
+    @app_commands.command(name="private", description="Start a private benchmarking run")
     @app_commands.describe(
         leaderboard_name="Name of the competition / kernel to optimize",
         script="The Python / CUDA script file to run",
         gpu="Select GPU. Leave empty for interactive or automatic selection.",
     )
     @app_commands.autocomplete(leaderboard_name=leaderboard_name_autocomplete)
     @with_error_handling
-    async def submit_bench(
+    async def submit_private(
         self,
         interaction: discord.Interaction,
         script: discord.Attachment,
         leaderboard_name: Optional[str],
         gpu: Optional[str],
     ):
         return await self.submit(
-            interaction, leaderboard_name, script, mode=SubmissionMode.BENCHMARK, gpu=gpu
+            interaction, leaderboard_name, script, mode=SubmissionMode.PRIVATE, gpu=gpu
         )

     @app_commands.command(name="profile", description="Start a profiling run")

@@ -196,7 +196,7 @@ async def submit_profile(
         )

     @app_commands.command(
-        name="ranked", description="Start a ranked run for an official leaderboard submission"
+        name="public", description="Start a public run for an official leaderboard submission"
     )
     @app_commands.describe(
         leaderboard_name="Name of the competition / kernel to optimize",

@@ -205,15 +205,15 @@ async def submit_profile(
     )
     @app_commands.autocomplete(leaderboard_name=leaderboard_name_autocomplete)
     @with_error_handling
-    async def submit_ranked(
+    async def submit_public(
         self,
         interaction: discord.Interaction,
         script: discord.Attachment,
         leaderboard_name: Optional[str] = None,
         gpu: Optional[str] = None,
     ):
         return await self.submit(
-            interaction, leaderboard_name, script, mode=SubmissionMode.LEADERBOARD, gpu=gpu
+            interaction, leaderboard_name, script, mode=SubmissionMode.PUBLIC, gpu=gpu
         )
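All four slash commands funnel into the shared submit() helper and differ only in the mode they forward. The resulting command-to-mode mapping, restated as a table (illustrative only; the cog wires this through decorators rather than a dict):

COMMAND_MODES = {
    "/test": "test",
    "/private": "private",   # formerly /benchmark
    "/profile": "profile",
    "/public": "public",     # formerly /ranked
}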

src/kernelbot/cogs/verify_run_cog.py

4 additions, 4 deletions

@@ -171,8 +171,8 @@ async def verify_modal_run(
     @app_commands.choices(
         mode=[
             Choice(name=SubmissionMode.TEST.name, value=SubmissionMode.TEST.value),
-            Choice(name=SubmissionMode.BENCHMARK.name, value=SubmissionMode.BENCHMARK.value),
-            Choice(name=SubmissionMode.LEADERBOARD.name, value=SubmissionMode.LEADERBOARD.value),
+            Choice(name=SubmissionMode.PRIVATE.name, value=SubmissionMode.PRIVATE.value),
+            Choice(name=SubmissionMode.PUBLIC.name, value=SubmissionMode.PUBLIC.value),
             Choice(name="All", value="all"),
         ]
     )

@@ -194,9 +194,9 @@ async def verify_task(

         modes = []
         if mode is None:
-            modes = [SubmissionMode.LEADERBOARD]
+            modes = [SubmissionMode.PUBLIC]
         elif mode.value == "all":
-            modes = [SubmissionMode.TEST, SubmissionMode.BENCHMARK, SubmissionMode.LEADERBOARD]
+            modes = [SubmissionMode.TEST, SubmissionMode.PRIVATE, SubmissionMode.PUBLIC]
         else:
             modes = [SubmissionMode(mode.value)]
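The mode resolution in verify_task, restated as a standalone sketch (hypothetical helper name; behavior as in the diff: default to the public path, and "all" expands to the three user-visible run modes):

from typing import Optional

def resolve_verify_modes(choice: Optional[str]) -> list[str]:
    # Mode values as strings, matching SubmissionMode(...).value.
    if choice is None:
        return ["public"]                     # default: verify the ranked path
    if choice == "all":
        return ["test", "private", "public"]
    return [choice]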

src/libkernelbot/backend.py

12 additions, 10 deletions

@@ -86,7 +86,7 @@ async def submit_full(
             for gpu in selected_gpus
         ]

-        if mode == SubmissionMode.LEADERBOARD:
+        if mode == SubmissionMode.PUBLIC:
             tasks += [
                 self.submit_leaderboard(
                     sub_id,

@@ -95,7 +95,7 @@ async def submit_full(
                     gpu,
                     reporter.add_run(f"{gpu.name} on {gpu.runner} (secret)"),
                     req.task,
-                    SubmissionMode.PRIVATE,
+                    SubmissionMode.SECRET,
                     req.secret_seed,
                 )
                 for gpu in selected_gpus

@@ -142,12 +142,14 @@ async def submit_leaderboard(  # noqa: C901

        if result.success:
            score = None
+            # Check for the mode's result key (public or secret)
+            mode_key = mode.value
            if (
-                "leaderboard" in result.runs
-                and result.runs["leaderboard"].run.success
-                and result.runs["leaderboard"].run.passed
+                mode_key in result.runs
+                and result.runs[mode_key].run.success
+                and result.runs[mode_key].run.passed
            ):
-                score = compute_score(result, task, submission_id)
+                score = compute_score(result, task, submission_id, mode_key)

        # verifyruns uses a fake submission id of -1
        if submission_id != -1:

@@ -159,8 +161,8 @@ async def submit_leaderboard(  # noqa: C901
                    end=value.end,
                    mode=key,
                    runner=gpu_type.name,
-                    score=None if key != "leaderboard" else score,
-                    secret=mode == SubmissionMode.PRIVATE,
+                    score=None if key != mode_key else score,
+                    secret=mode == SubmissionMode.SECRET,
                    compilation=value.compilation,
                    result=value.run,
                    system=result.system,

@@ -207,7 +209,7 @@ async def handle_submission(
        await reporter.update_title(reporter.title + " ✅ success")

        short_report = make_short_report(
-            result.runs, full=mode in [SubmissionMode.PRIVATE, SubmissionMode.LEADERBOARD]
+            result.runs, full=mode in [SubmissionMode.PUBLIC, SubmissionMode.SECRET]
        )

        stream_msg = (

@@ -222,7 +224,7 @@ async def handle_submission(
        )

        await reporter.push(short_report)
-        if mode != SubmissionMode.PRIVATE:
+        if mode != SubmissionMode.SECRET:
            try:
                # does the last message of the short report start with ✅ or ❌?
                verdict = short_report[-1][0]
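The core change in submit_leaderboard: the result key is no longer the literal "leaderboard" but the submitting mode's own value, so a PUBLIC run scores runs["public"] and its companion SECRET run scores runs["secret"]. A condensed sketch of that logic (compute_score is passed in to keep the sketch self-contained):

def score_for_mode(result, task, submission_id, mode_key, compute_score):
    # mode_key is mode.value ("public" or "secret") and doubles as the
    # key under which the ranked run is stored in result.runs.
    score = None
    run = result.runs.get(mode_key)
    if run is not None and run.run.success and run.run.passed:
        score = compute_score(result, task, submission_id, mode_key)
    return score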

src/libkernelbot/consts.py

10 additions, 9 deletions

@@ -82,21 +82,22 @@ class SubmissionMode(Enum):
     """
     Different types of submission that can be made:
     Test: Run tests and give detailed results about passed/failed tests. These have short timeouts.
-    Benchmark: Run larger benchmarks. Each benchmark is tested once, and then run multiple times.
+    Private: Run benchmarks privately. Each benchmark is tested once, and then run multiple times.
+        Returns detailed timing results but doesn't affect leaderboard ranking.
     Profile: Gather profiling information. One selected benchmark is run under the profiler. No
     testing is performed in this mode (sometimes, you need to profile deliberately broken code)
-    Leaderboard: Official submission to the leaderboard. This first runs public tests, then a
-    repeated invocation of a single benchmark. Feedback for the secret benchmark is only very
-    limited (no stdout/stderr).
-    Private: Special run that does test followed by leaderboard (on a secret seed), but gives only
-    very limited feedback.
+    Public: Official submission to the leaderboard. This first runs public tests, then a
+    repeated invocation of a single benchmark. If all tests pass, the submission is evaluated
+    and ranked on the public leaderboard.
+    Secret: Internal mode for running the full evaluation flow with a secret seed. This is used
+    for secret validation runs that accompany public submissions.
     """

     TEST = "test"
-    BENCHMARK = "benchmark"
-    PROFILE = "profile"
-    LEADERBOARD = "leaderboard"
     PRIVATE = "private"
+    PROFILE = "profile"
+    PUBLIC = "public"
+    SECRET = "secret"


 class Language(Enum):
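With the new values in place, string/enum round-tripping behaves as usual for a value-backed Enum, and stale pre-rename strings now fail loudly (import path assumed):

from libkernelbot.consts import SubmissionMode  # import path assumed

assert SubmissionMode("public") is SubmissionMode.PUBLIC
assert SubmissionMode.PRIVATE.value == "private"

try:
    SubmissionMode("leaderboard")   # pre-rename string
except ValueError:
    pass  # raised: LEADERBOARD no longer exists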

src/libkernelbot/launchers/github.py

2 additions, 2 deletions

@@ -49,8 +49,8 @@ def get_timeout(config: dict) -> int:
     mode = config.get("mode")
     sec_map = {
         SubmissionMode.TEST.value: config.get("test_timeout"),
-        SubmissionMode.BENCHMARK.value: config.get("benchmark_timeout"),
-        SubmissionMode.LEADERBOARD.value: config.get("ranked_timeout"),
+        SubmissionMode.PRIVATE.value: config.get("benchmark_timeout"),
+        SubmissionMode.PUBLIC.value: config.get("ranked_timeout"),
     }
     seconds = sec_map.get(mode) or DEFAULT_GITHUB_TIMEOUT_MINUTES * 60
     return math.ceil(seconds / 60)
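A worked example under the new keys (the 180-second value is made up): a private run with benchmark_timeout = 180 yields ceil(180 / 60) = 3 minutes. Note that "secret" has no entry in sec_map, so a secret run falls back to the default:

config = {"mode": "private", "benchmark_timeout": 180}
# get_timeout(config) -> math.ceil(180 / 60) == 3  (minutes)

config = {"mode": "secret"}
# sec_map.get("secret") is None -> DEFAULT_GITHUB_TIMEOUT_MINUTES * 60 applies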

src/libkernelbot/report.py

16 additions, 12 deletions

@@ -176,8 +176,8 @@ def make_short_report(runs: dict[str, EvalResult], full=True) -> list[str]:  # n
     elif full:
         result.append("❌ Tests missing")

-    if "benchmark" in runs:
-        bench_run = runs["benchmark"].run
+    if "private" in runs:
+        bench_run = runs["private"].run
         if not bench_run.success:
             result.append("❌ Running benchmarks failed" + _short_fail_reason(bench_run))
             return result

@@ -202,16 +202,18 @@ def make_short_report(runs: dict[str, EvalResult], full=True) -> list[str]:  # n
     else:
         result.append("✅ Profiling successful")

-    if "leaderboard" in runs:
-        lb_run = runs["leaderboard"].run
+    # Check for public or secret run results
+    ranked_key = "public" if "public" in runs else ("secret" if "secret" in runs else None)
+    if ranked_key:
+        lb_run = runs[ranked_key].run
         if not lb_run.success:
-            result.append("❌ Running leaderboard failed" + _short_fail_reason(lb_run))
+            result.append("❌ Running ranked submission failed" + _short_fail_reason(lb_run))
         elif not lb_run.passed:
-            result.append("❌ Leaderboard run failed")
+            result.append("❌ Ranked submission failed")
         else:
-            result.append("✅ Leaderboard run successful")
+            result.append("✅ Ranked submission successful")
     elif full:
-        result.append("❌ Leaderboard missing")
+        result.append("❌ Ranked submission missing")
     return result

@@ -339,8 +341,8 @@ def generate_report(result: FullResult, extra_text: str = "") -> RunResultReport
         num_tests = int(test_run.result.get("test-count", 0))
         report.add_log(f"✅ Passed {num_tests}/{num_tests} tests", make_test_log(test_run))

-    if "benchmark" in runs:
-        bench_run = runs["benchmark"]
+    if "private" in runs:
+        bench_run = runs["private"]
         if _handle_crash_report(report, bench_run):
             return report

@@ -378,8 +380,8 @@ def generate_report(result: FullResult, extra_text: str = "") -> RunResultReport
             base64.b64decode(prof_run.profile.trace),
         )

-    if "leaderboard" in runs:
-        bench_run = runs["leaderboard"]
+    # Check for public or secret run results
+    ranked_key = "public" if "public" in runs else ("secret" if "secret" in runs else None)
+    if ranked_key:
+        bench_run = runs[ranked_key]
         if _handle_crash_report(report, bench_run):
             return report
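Both report functions use the same fallback: prefer the "public" result, otherwise take "secret", otherwise skip the ranked section. The inline expression, restated as a hypothetical helper:

from typing import Optional

def find_ranked_key(runs: dict) -> Optional[str]:
    # Prefer the public run; a secret-only result dict still gets reported.
    return "public" if "public" in runs else ("secret" if "secret" in runs else None)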

src/libkernelbot/run_eval.py

8 additions, 8 deletions

@@ -556,8 +556,8 @@ def run_single_evaluation(
     if mode == "test":
         timeout = test_timeout
         cases.write(tests)
-    elif mode in ["benchmark", "profile", "leaderboard"]:
-        timeout = ranked_timeout if mode == "leaderboard" else benchmark_timeout
+    elif mode in ["private", "profile", "public", "secret"]:
+        timeout = ranked_timeout if mode in ["public", "secret"] else benchmark_timeout
         if ranking_by == "last":
             cases.write(benchmarks.splitlines(keepends=True)[-1])
         else:

@@ -801,22 +801,22 @@ def run_evaluation(
             common_args["benchmarks"] = benchmark
             results[f"{mode}.{i}"] = call(mode=mode, **common_args)

-    elif mode in ["test", "benchmark"]:
+    elif mode in ["test", "private"]:
         results[mode] = call(mode=mode, **common_args)
-    elif mode in ["private", "leaderboard"]:
+    elif mode in ["public", "secret"]:
         # first, run the tests
         results["test"] = call(mode="test", **common_args)

         if not results["test"].run or not results["test"].run.passed:
             return results

-        results["benchmark"] = call(mode="benchmark", **common_args)
+        results["private"] = call(mode="private", **common_args)

-        if not results["benchmark"].run or not results["benchmark"].run.passed:
+        if not results["private"].run or not results["private"].run.passed:
             return results

-        # if they pass, run the leaderboard validation
-        results["leaderboard"] = call(mode="leaderboard", **common_args)
+        # if they pass, run the public/secret validation
+        results[mode] = call(mode=mode, **common_args)
     else:
         raise AssertionError("Invalid mode")
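The ranked flow in run_evaluation now reads: run the public tests, then a "private" benchmark pass, and only if both pass, a final run stored under the submitting mode itself. A condensed sketch of that control flow:

def run_ranked(mode, call, common_args):
    # Condensed from run_evaluation, for mode in {"public", "secret"}.
    results = {}
    results["test"] = call(mode="test", **common_args)
    if not results["test"].run or not results["test"].run.passed:
        return results                      # stop after failing tests
    results["private"] = call(mode="private", **common_args)
    if not results["private"].run or not results["private"].run.passed:
        return results                      # stop after failing benchmarks
    results[mode] = call(mode=mode, **common_args)   # "public" or "secret"
    return results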

src/libkernelbot/submission.py

4 additions, 4 deletions

@@ -169,8 +169,8 @@ def _get_popcorn_directives(submission: str) -> dict:  # noqa: C901
     return popcorn_info


-def compute_score(result: FullResult, task: LeaderboardTask, submission_id: int) -> float:
-    num_benchmarks = int(result.runs["leaderboard"].run.result["benchmark-count"])
+def compute_score(result: FullResult, task: LeaderboardTask, submission_id: int, mode_key: str = "public") -> float:
+    num_benchmarks = int(result.runs[mode_key].run.result["benchmark-count"])
     if task.ranking_by == RankCriterion.LAST:
         if num_benchmarks != 1:
             logger.error(

@@ -182,11 +182,11 @@ def compute_score(result: FullResult, task: LeaderboardTask, submission_id: int)
             raise KernelBotError(
                 f"Expected submission to have exactly one benchmark, got {num_benchmarks}."
             )
-        score = float(result.runs["leaderboard"].run.result["benchmark.0.mean"]) / 1e9
+        score = float(result.runs[mode_key].run.result["benchmark.0.mean"]) / 1e9
     else:
         scores = []
         for i in range(num_benchmarks):
-            scores.append(float(result.runs["leaderboard"].run.result[f"benchmark.{i}.mean"]) / 1e9)
+            scores.append(float(result.runs[mode_key].run.result[f"benchmark.{i}.mean"]) / 1e9)
         if task.ranking_by == RankCriterion.MEAN:
             score = sum(scores) / len(scores)
         elif task.ranking_by == RankCriterion.GEOM:
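With the new mode_key parameter, a secret validation run scores its own results instead of reading a "leaderboard" key. A worked example under RankCriterion.MEAN with fabricated per-benchmark means (nanoseconds, converted to seconds):

# Fabricated run result for illustration:
secret_result = {
    "benchmark-count": "2",
    "benchmark.0.mean": "2.0e6",   # 2 ms in nanoseconds
    "benchmark.1.mean": "4.0e6",   # 4 ms
}
means = [float(secret_result[f"benchmark.{i}.mean"]) / 1e9
         for i in range(int(secret_result["benchmark-count"]))]
score = sum(means) / len(means)   # == 0.003 seconds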
