-from lm_eval.evaluator import simple_evaluate
-from .evaluator import Evaluator
-
-
+# Standard
+from enum import StrEnum
 from pathlib import Path
+import gc
 import json
-from lm_eval.evaluator import simple_evaluate
-import typing as t
 import os
-import torch.multiprocessing as mp
+import typing as t
+
+# Third Party
 from accelerate import Accelerator
-import torch.distributed as dist
-import torch
+from lm_eval.evaluator import simple_evaluate
 from torch import cuda
-import gc
-from enum import StrEnum
+import torch
+import torch.distributed as dist
+import torch.multiprocessing as mp
+
+# Local
+from .evaluator import Evaluator
 
 
 class ParsedScores(t.TypedDict):
@@ -137,6 +139,7 @@ def worker(rank, world_size, args: LeaderboardArgs, result_queue: mp.Queue):
 def evaluate_with_hf(args: LeaderboardArgs) -> t.Dict[str, t.Any]:
     # we need to use torch.multiprocessing to run each task in a separate process,
     # and then combine the results
+    # Third Party
     import torch.multiprocessing as mp
 
     num_processes = args["num_gpus"]
@@ -166,9 +169,9 @@ def evaluate_with_hf(args: LeaderboardArgs) -> t.Dict[str, t.Any]:
         p.join()
 
     # extract the result which is not None
-    assert len([res for res in results.values() if res is not None]) == 1, (
-        "we expect exactly 1 process to return a results dict properly"
-    )
+    assert (
+        len([res for res in results.values() if res is not None]) == 1
+    ), "we expect exactly 1 process to return a results dict properly"
     results_dict = [res for res in results.values() if res is not None][0]
     return results_dict
 
@@ -234,9 +237,9 @@ def parse_bbh(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
     parsed_scores = parse_multitask_results(
         result_dict, LeaderboardV2Tasks.BBH.value, "acc_norm"
     )
-    assert len(parsed_scores["subtasks"]) == 24, (
-        "there should be 24 subtasks of bbh run"
-    )
+    assert (
+        len(parsed_scores["subtasks"]) == 24
+    ), "there should be 24 subtasks of bbh run"
     return parsed_scores
 
 
@@ -287,9 +290,9 @@ def parse_ifeval(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
         scores.append(value)
         target_metrics.remove(metric)
 
-    assert len(scores) == 2, (
-        f"there should only be 2 values extracted in ifeval, got: {len(scores)}"
-    )
+    assert (
+        len(scores) == 2
+    ), f"there should only be 2 values extracted in ifeval, got: {len(scores)}"
     return {
         "score": sum(scores) / 2,
     }
@@ -313,9 +316,9 @@ def parse_gpqa(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
     parsed_scores = parse_multitask_results(
         result_dict, LeaderboardV2Tasks.GPQA.value, "acc_norm"
     )
-    assert len(parsed_scores["subtasks"]) == 3, (
-        f"Expected 3 gpqa scores, got {len(parsed_scores['subtasks'])}"
-    )
+    assert (
+        len(parsed_scores["subtasks"]) == 3
+    ), f"Expected 3 gpqa scores, got {len(parsed_scores['subtasks'])}"
     return parsed_scores
 
 
@@ -326,9 +329,9 @@ def parse_math_hard(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
     parsed_scores = parse_multitask_results(
         result_dict, LeaderboardV2Tasks.MATH_HARD.value, "exact_match"
     )
-    assert len(parsed_scores["subtasks"]) == 7, (
-        f"leaderboard_math_hard should have 7 subtasks, found: {len(parsed_scores['subtasks'])}"
-    )
+    assert (
+        len(parsed_scores["subtasks"]) == 7
+    ), f"leaderboard_math_hard should have 7 subtasks, found: {len(parsed_scores['subtasks'])}"
     return parsed_scores
 
 
@@ -363,9 +366,9 @@ def get_scores_from_result_dicts(
     # this is just a sanity check step
     benchmarks_already_covered = set(parsed_scores.keys())
     overlapping_benchmarks = benchmarks_already_covered & benchmarks_to_parse
-    assert len(benchmarks_already_covered & benchmarks_to_parse) == 0, (
-        f"expected no overlapping benchmarks but found the following to overlap: {list(overlapping_benchmarks)}"
-    )
+    assert (
+        len(benchmarks_already_covered & benchmarks_to_parse) == 0
+    ), f"expected no overlapping benchmarks but found the following to overlap: {list(overlapping_benchmarks)}"
 
     # now actually add them
     for benchmark in benchmarks_to_parse:
@@ -579,5 +582,6 @@ def run(
         results["overall_score"] = calculate_overall_leaderboard_score(results)
 
         self._results = results
-        self.save_to_file(output_file)
+        if output_file:
+            self.save_to_file(output_file)
         return results
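
For context on the multi-process pattern these hunks touch (each GPU gets its own worker process, results flow back through a queue, and exactly one rank is expected to produce the final results dict), here is a minimal sketch. The worker body below is a hypothetical stand-in, not the repository's actual implementation, which runs lm_eval's simple_evaluate on each rank.

# Minimal sketch of the spawn-and-collect pattern (assumed details).
import torch.multiprocessing as mp


def worker(rank, world_size, args, result_queue):
    # ... run the benchmark on this rank's GPU (placeholder) ...
    scores = {"rank": rank}
    # only rank 0 reports results; other ranks report None
    result_queue.put((rank, scores if rank == 0 else None))


if __name__ == "__main__":
    world_size = 2  # e.g. args["num_gpus"]
    ctx = mp.get_context("spawn")
    result_queue = ctx.Queue()
    procs = [
        ctx.Process(target=worker, args=(rank, world_size, {}, result_queue))
        for rank in range(world_size)
    ]
    for p in procs:
        p.start()
    # drain the queue before joining to avoid blocking on large payloads
    results = dict(result_queue.get() for _ in range(world_size))
    for p in procs:
        p.join()
    # mirrors the assert in the diff: exactly one rank returned a dict
    final = [res for res in results.values() if res is not None]
    assert len(final) == 1
    print(final[0])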