
Commit 15e9f75

formatting
Signed-off-by: Oleg Silkin <97077423+RobotSail@users.noreply.github.com>
1 parent 1892a79 commit 15e9f75

1 file changed

src/instructlab/eval/leaderboard.py (34 additions & 30 deletions)
@@ -1,19 +1,21 @@
-from lm_eval.evaluator import simple_evaluate
-from .evaluator import Evaluator
-
-
+# Standard
+from enum import StrEnum
 from pathlib import Path
+import gc
 import json
-from lm_eval.evaluator import simple_evaluate
-import typing as t
 import os
-import torch.multiprocessing as mp
+import typing as t
+
+# Third Party
 from accelerate import Accelerator
-import torch.distributed as dist
-import torch
+from lm_eval.evaluator import simple_evaluate
 from torch import cuda
-import gc
-from enum import StrEnum
+import torch
+import torch.distributed as dist
+import torch.multiprocessing as mp
+
+# Local
+from .evaluator import Evaluator
 
 
 class ParsedScores(t.TypedDict):
@@ -137,6 +139,7 @@ def worker(rank, world_size, args: LeaderboardArgs, result_queue: mp.Queue):
 def evaluate_with_hf(args: LeaderboardArgs) -> t.Dict[str, t.Any]:
     # we need to use torch.multiprocessing to run each task in a separate process,
     # and then combine the results
+    # Third Party
     import torch.multiprocessing as mp
 
     num_processes = args["num_gpus"]
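
The comment in this hunk describes the approach rather than showing it: each GPU rank runs in its own process, every rank pushes either a results dict or None into a shared queue, and exactly one rank is expected to report real results. A minimal runnable sketch of that spawn-and-collect pattern, with a dummy worker standing in for the module's real one (which takes a LeaderboardArgs and runs simple_evaluate):

# Sketch only: a dummy payload mimics "only rank 0 reports a result"
import torch.multiprocessing as mp


def _worker(rank: int, world_size: int, result_queue) -> None:
    result = {"score": 0.5} if rank == 0 else None
    result_queue.put((rank, result))


if __name__ == "__main__":
    world_size = 2
    queue = mp.Queue()
    procs = [
        mp.Process(target=_worker, args=(rank, world_size, queue))
        for rank in range(world_size)
    ]
    for p in procs:
        p.start()
    # drain the queue before join so a large payload cannot deadlock a worker
    results = dict(queue.get() for _ in procs)
    for p in procs:
        p.join()
    assert (
        len([res for res in results.values() if res is not None]) == 1
    ), "we expect exactly 1 process to return a results dict properly"
    print([res for res in results.values() if res is not None][0])

torch.multiprocessing wraps the standard library module, so the same pattern works with plain multiprocessing; the torch variant is used here for its tensor-sharing support.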
@@ -166,9 +169,9 @@ def evaluate_with_hf(args: LeaderboardArgs) -> t.Dict[str, t.Any]:
         p.join()
 
     # extract the result which is not None
-    assert len([res for res in results.values() if res is not None]) == 1, (
-        "we expect exactly 1 process to return a results dict properly"
-    )
+    assert (
+        len([res for res in results.values() if res is not None]) == 1
+    ), "we expect exactly 1 process to return a results dict properly"
     results_dict = [res for res in results.values() if res is not None][0]
     return results_dict
 
@@ -234,9 +237,9 @@ def parse_bbh(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
     parsed_scores = parse_multitask_results(
         result_dict, LeaderboardV2Tasks.BBH.value, "acc_norm"
     )
-    assert len(parsed_scores["subtasks"]) == 24, (
-        "there should be 24 subtasks of bbh run"
-    )
+    assert (
+        len(parsed_scores["subtasks"]) == 24
+    ), "there should be 24 subtasks of bbh run"
     return parsed_scores
 
 
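parse_bbh delegates to parse_multitask_results, which this commit doesn't touch. A hypothetical sketch of what such a helper does, assuming lm-eval's results layout where each task name maps metric keys such as "acc_norm,none" to floats (the key format, the "_" prefix convention, and the unweighted mean are assumptions, not this module's actual code):

# Standard
import typing as t


def parse_multitask_results_sketch(
    result_dict: t.Dict[str, t.Any], benchmark_prefix: str, metric: str
) -> t.Dict[str, t.Any]:
    # gather every subtask under the prefix, e.g. the 24 leaderboard_bbh_*
    # tasks, reading the "metric,filter" keys that lm-eval emits
    key = f"{metric},none"
    subtasks = {
        task: metrics[key]
        for task, metrics in result_dict["results"].items()
        if task.startswith(benchmark_prefix + "_") and key in metrics
    }
    # score the benchmark as the unweighted mean of its subtask scores
    return {"score": sum(subtasks.values()) / len(subtasks), "subtasks": subtasks}
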
@@ -287,9 +290,9 @@ def parse_ifeval(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
             scores.append(value)
             target_metrics.remove(metric)
 
-    assert len(scores) == 2, (
-        f"there should only be 2 values extracted in ifeval, got: {len(scores)}"
-    )
+    assert (
+        len(scores) == 2
+    ), f"there should only be 2 values extracted in ifeval, got: {len(scores)}"
     return {
         "score": sum(scores) / 2,
     }
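
The two values averaged here are presumably ifeval's prompt-level and instruction-level strict accuracies. A hedged sketch of the loop this hunk reformats, with the metric keys assumed from lm-eval's leaderboard_ifeval output rather than taken from this file:

def parse_ifeval_sketch(result_dict):
    ifeval_metrics = result_dict["results"]["leaderboard_ifeval"]
    # assumed keys; lm-eval suffixes each metric with its filter name
    target_metrics = ["prompt_level_strict_acc,none", "inst_level_strict_acc,none"]
    scores = []
    for metric, value in ifeval_metrics.items():
        if metric in target_metrics:
            scores.append(value)
            target_metrics.remove(metric)
    assert (
        len(scores) == 2
    ), f"there should only be 2 values extracted in ifeval, got: {len(scores)}"
    return {"score": sum(scores) / 2}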
@@ -313,9 +316,9 @@ def parse_gpqa(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
     parsed_scores = parse_multitask_results(
         result_dict, LeaderboardV2Tasks.GPQA.value, "acc_norm"
    )
-    assert len(parsed_scores["subtasks"]) == 3, (
-        f"Expected 3 gpqa scores, got {len(parsed_scores['subtasks'])}"
-    )
+    assert (
+        len(parsed_scores["subtasks"]) == 3
+    ), f"Expected 3 gpqa scores, got {len(parsed_scores['subtasks'])}"
     return parsed_scores
 
 
@@ -326,9 +329,9 @@ def parse_math_hard(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
     parsed_scores = parse_multitask_results(
         result_dict, LeaderboardV2Tasks.MATH_HARD.value, "exact_match"
     )
-    assert len(parsed_scores["subtasks"]) == 7, (
-        f"leaderboard_math_hard should have 7 subtasks, found: {len(parsed_scores['subtasks'])}"
-    )
+    assert (
+        len(parsed_scores["subtasks"]) == 7
+    ), f"leaderboard_math_hard should have 7 subtasks, found: {len(parsed_scores['subtasks'])}"
     return parsed_scores
 
 
@@ -363,9 +366,9 @@ def get_scores_from_result_dicts(
     # this is just a sanity check step
     benchmarks_already_covered = set(parsed_scores.keys())
     overlapping_benchmarks = benchmarks_already_covered & benchmarks_to_parse
-    assert len(benchmarks_already_covered & benchmarks_to_parse) == 0, (
-        f"expected no overlapping benchmarks but found the following to overlap: {list(overlapping_benchmarks)}"
-    )
+    assert (
+        len(benchmarks_already_covered & benchmarks_to_parse) == 0
+    ), f"expected no overlapping benchmarks but found the following to overlap: {list(overlapping_benchmarks)}"
 
     # now actually add them
     for benchmark in benchmarks_to_parse:
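
The assertion above is plain set intersection; a tiny worked example of the condition it enforces:

covered = {"leaderboard_bbh", "leaderboard_ifeval"}       # already parsed
to_parse = {"leaderboard_gpqa", "leaderboard_math_hard"}  # about to be added
overlap = covered & to_parse  # empty here, so the assert passes
assert len(overlap) == 0, f"expected no overlapping benchmarks but found: {list(overlap)}"

If "leaderboard_bbh" appeared in both sets, the assert would fire and name it in the message.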
@@ -579,5 +582,6 @@ def run(
         results["overall_score"] = calculate_overall_leaderboard_score(results)
 
         self._results = results
-        self.save_to_file(output_file)
+        if output_file:
+            self.save_to_file(output_file)
         return results
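
Unlike the other hunks, this last one changes behavior rather than formatting: run now writes results only when an output_file is actually supplied. A self-contained sketch of the guard, with a hypothetical JSON write standing in for the evaluator's save_to_file method:

# Standard
import json
from pathlib import Path


def finish_run(results: dict, output_file=None) -> dict:
    # persist only when a destination was supplied; the write below is a
    # hypothetical stand-in for self.save_to_file(output_file)
    if output_file:
        Path(output_file).write_text(json.dumps(results, indent=2))
    return results


# with output_file=None nothing is written; the dict is just returned
scores = finish_run({"overall_score": 0.5})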
