
Commit ee62fdc

Committed by Mark Saroufim
Fix test files to use new private/public mode naming
Update test data keys and expected values:
- test_report.py: Change "benchmark"/"leaderboard" keys to "private"/"public"
- test_submission.py: Update compute_score test to use the "public" key
- test_backend.py: Update mode values and mock data keys
1 parent: 118b06a
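In short, the commit swaps the run-type keys used in the test fixtures. A minimal illustrative sketch of the mapping (the key names are taken from this diff; the helper itself is hypothetical and not part of libkernelbot):

# Illustrative only: run-type keys before and after this commit.
OLD_TO_NEW = {
    "benchmark": "private",    # un-ranked benchmark run
    "leaderboard": "public",   # ranked submission run
}

def rename_run_keys(runs: dict) -> dict:
    """Map old run-type keys to the new private/public naming."""
    return {OLD_TO_NEW.get(key, key): value for key, value in runs.items()}

# {"test": ..., "benchmark": ..., "profile": ..., "leaderboard": ...}
# becomes {"test": ..., "private": ..., "profile": ..., "public": ...}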

3 files changed: 23 additions & 23 deletions

tests/test_backend.py

Lines changed: 4 additions & 4 deletions
@@ -101,7 +101,7 @@ async def test_handle_submission(bot: backend.KernelBackend, task_directory):
         "benchmarks": [{"dtype": "float32", "input_size": 10000}],
         "lang": "py",
         "main": "kernel.py",
-        "mode": "leaderboard",
+        "mode": "public",
         "multi_gpu": False,
         "ranked_timeout": 180,
         "ranking_by": "geom",
@@ -232,7 +232,7 @@ async def test_submit_full(bot: backend.KernelBackend, task_directory):
     task = db.get_leaderboard("submit-leaderboard")["task"]

     eval_result = create_eval_result("benchmark")
-    mock_launcher = _mock_launcher(bot, {"leaderboard": eval_result})
+    mock_launcher = _mock_launcher(bot, {"public": eval_result})

     from libkernelbot.submission import ProcessedSubmissionRequest

@@ -300,7 +300,7 @@ async def test_submit_full(bot: backend.KernelBackend, task_directory):
             "stdout": "log stdout",
             "success": True,
         },
-        "mode": "leaderboard",
+        "mode": "public",
         "passed": True,
         "result": {
             "benchmark-count": "1",
@@ -344,7 +344,7 @@ async def test_submit_full(bot: backend.KernelBackend, task_directory):
             "stdout": "log stdout",
             "success": True,
         },
-        "mode": "leaderboard",
+        "mode": "public",
         "passed": True,
         "result": {
             "benchmark-count": "1",

tests/test_report.py

Lines changed: 15 additions & 15 deletions
@@ -241,7 +241,7 @@ def test_make_short_report_benchmarking_failed(sample_eval_result: EvalResult):
     sample_eval_result.run.success = False
     sample_eval_result.compilation = None
     sample_eval_result.run.exit_code = consts.ExitCode.CUDA_FAIL
-    runs = {"benchmark": sample_eval_result}
+    runs = {"private": sample_eval_result}

     result = make_short_report(runs, full=False)
     assert result == ["❌ Running benchmarks failed (cuda api error)"]
@@ -274,27 +274,27 @@ def test_make_short_report_leaderboard_failed(sample_eval_result: EvalResult):
     sample_eval_result.run.success = False
     sample_eval_result.compilation = None
     sample_eval_result.run.exit_code = consts.ExitCode.TEST_SPEC
-    runs = {"leaderboard": sample_eval_result}
+    runs = {"public": sample_eval_result}

     result = make_short_report(runs, full=False)
-    assert result == ["❌ Running leaderboard failed (internal error 113)"]
+    assert result == ["❌ Running ranked submission failed (internal error 113)"]

     sample_eval_result.run.success = True
     sample_eval_result.run.passed = False
     sample_eval_result.run.exit_code = consts.ExitCode.VALIDATE_FAIL
     result = make_short_report(runs)
     # TODO is this actually possible? Should profiling do **any** correctness testing?
-    assert result == ["❌ Tests missing", "❌ Benchmarks missing", "❌ Leaderboard run failed"]
+    assert result == ["❌ Tests missing", "❌ Benchmarks missing", "❌ Ranked submission failed"]


 def test_make_short_report_empty():
     result = make_short_report({})
-    assert result == ["❌ Tests missing", "❌ Benchmarks missing", "❌ Leaderboard missing"]
+    assert result == ["❌ Tests missing", "❌ Benchmarks missing", "❌ Ranked submission missing"]


 def test_make_short_report_full_success():
     runs = {}
-    for run_type in ["test", "benchmark", "profile", "leaderboard"]:
+    for run_type in ["test", "private", "profile", "public"]:
         runs[run_type] = EvalResult(
             start=datetime.datetime.now() - datetime.timedelta(minutes=5),
             end=datetime.datetime.now(),
@@ -318,7 +318,7 @@ def test_make_short_report_full_success():
         "✅ Testing successful",
         "✅ Benchmarking successful",
         "✅ Profiling successful",
-        "✅ Leaderboard run successful",
+        "✅ Ranked submission successful",
     ]
     assert result == expected

@@ -331,7 +331,7 @@ def test_make_short_report_missing_components():
         "✅ Compilation successful",
         "✅ Testing successful",
         "❌ Benchmarks missing",
-        "❌ Leaderboard missing",
+        "❌ Ranked submission missing",
     ]
     assert result == expected

@@ -532,7 +532,7 @@ def test_generate_report_test_failure(sample_full_result: FullResult):
 def test_generate_report_benchmark_failure(sample_full_result: FullResult):
     from libkernelbot.report import Log, Text

-    sample_full_result.runs["benchmark"] = create_eval_result()
+    sample_full_result.runs["private"] = create_eval_result()
     report = generate_report(sample_full_result)
     assert report.data == [
         Text(
@@ -557,8 +557,8 @@ def test_generate_report_benchmark_failure(sample_full_result: FullResult):
         Log(header="Benchmarks", content="❗ Could not find any benchmarks"),
     ]

-    sample_full_result.runs["benchmark"].run.passed = False
-    sample_full_result.runs["benchmark"].run.result = {
+    sample_full_result.runs["private"].run.passed = False
+    sample_full_result.runs["private"].run.result = {
         "benchmark-count": "2",
         "benchmark.0.status": "pass",
         "benchmark.0.spec": "Basic functionality",
@@ -607,7 +607,7 @@ def test_generate_report_benchmark_failure(sample_full_result: FullResult):
 def test_generate_report_leaderboard_failure(sample_full_result: FullResult):
     from libkernelbot.report import Log, Text

-    sample_full_result.runs["leaderboard"] = create_eval_result()
+    sample_full_result.runs["public"] = create_eval_result()
     report = generate_report(sample_full_result)
     assert report.data == [
         Text(
@@ -632,9 +632,9 @@ def test_generate_report_leaderboard_failure(sample_full_result: FullResult):
         Log(header="Ranked Benchmark", content="❗ Could not find any benchmarks"),
     ]

-    sample_full_result.runs["leaderboard"].run.success = False
-    sample_full_result.runs["leaderboard"].run.exit_code = consts.ExitCode.TIMEOUT_EXPIRED
-    sample_full_result.runs["leaderboard"].run.duration = 10.0
+    sample_full_result.runs["public"].run.success = False
+    sample_full_result.runs["public"].run.exit_code = consts.ExitCode.TIMEOUT_EXPIRED
+    sample_full_result.runs["public"].run.duration = 10.0

     report = generate_report(sample_full_result)
     assert report.data == [
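The updated assertions above pin down the wording tied to each renamed run type: "private" is still reported as benchmarking, while "public" is now reported as the ranked submission. A rough summary of the expected strings asserted in these tests (the dicts below are illustrative, not the actual report.py internals):

# Hypothetical summary of the report wording asserted above; not the real implementation.
SUCCESS_LINES = {
    "test": "✅ Testing successful",
    "private": "✅ Benchmarking successful",
    "profile": "✅ Profiling successful",
    "public": "✅ Ranked submission successful",
}
MISSING_LINES = {
    "test": "❌ Tests missing",
    "private": "❌ Benchmarks missing",
    "public": "❌ Ranked submission missing",
}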

tests/test_submission.py

Lines changed: 4 additions & 4 deletions
@@ -303,7 +303,7 @@ def test_compute_score():
     # Test LAST ranking with single benchmark
     mock_task.ranking_by = RankCriterion.LAST
     mock_result.runs = {
-        "leaderboard": mock.Mock(
+        "public": mock.Mock(
             run=mock.Mock(
                 result={
                     "benchmark-count": "1",
@@ -317,7 +317,7 @@ def test_compute_score():

     # Test MEAN ranking with multiple benchmarks
     mock_task.ranking_by = RankCriterion.MEAN
-    mock_result.runs["leaderboard"].run.result = {
+    mock_result.runs["public"].run.result = {
         "benchmark-count": "2",
         "benchmark.0.mean": "1000000000",  # 1 second
         "benchmark.1.mean": "3000000000",  # 3 seconds
@@ -327,7 +327,7 @@ def test_compute_score():

     # Test GEOM ranking with multiple benchmarks
     mock_task.ranking_by = RankCriterion.GEOM
-    mock_result.runs["leaderboard"].run.result = {
+    mock_result.runs["public"].run.result = {
         "benchmark-count": "2",
         "benchmark.0.mean": "4000000000",  # 4 seconds
         "benchmark.1.mean": "9000000000",  # 9 seconds
@@ -337,7 +337,7 @@ def test_compute_score():

     # Test LAST with multiple benchmarks (should raise error)
     mock_task.ranking_by = RankCriterion.LAST
-    mock_result.runs["leaderboard"].run.result["benchmark-count"] = "2"
+    mock_result.runs["public"].run.result["benchmark-count"] = "2"
     with pytest.raises(KernelBotError, match="exactly one benchmark"):
         submission.compute_score(mock_result, mock_task, 1)

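The mocked results above also make the scoring arithmetic concrete: benchmark means are given in nanoseconds, LAST requires exactly one benchmark, and MEAN and GEOM presumably take the arithmetic and geometric mean of the per-benchmark means. A small sketch of that arithmetic under those assumptions (the helper is illustrative, not the real submission.compute_score):

import math

def score_sketch(means_ns: list[float], ranking_by: str) -> float:
    """Illustrative score in seconds for the mocked benchmark means above."""
    means_s = [ns / 1e9 for ns in means_ns]  # nanoseconds -> seconds
    if ranking_by == "LAST":
        if len(means_s) != 1:
            raise ValueError("LAST expects exactly one benchmark")
        return means_s[0]
    if ranking_by == "MEAN":
        return sum(means_s) / len(means_s)
    if ranking_by == "GEOM":
        return math.prod(means_s) ** (1 / len(means_s))
    raise ValueError(f"unknown ranking criterion: {ranking_by}")

# MEAN of 1 s and 3 s -> 2.0 s; GEOM of 4 s and 9 s -> 6.0 s.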
