-from lm_eval.evaluator import simple_evaluate
-from .evaluator import Evaluator
-
-
+# Standard
+from enum import StrEnum
 from pathlib import Path
+import gc
 import json
-from lm_eval.evaluator import simple_evaluate
-import typing as t
 import os
-import torch.multiprocessing as mp
+import typing as t
+
+# Third Party
 from accelerate import Accelerator
-import torch.distributed as dist
-import torch
+from lm_eval.evaluator import simple_evaluate
 from torch import cuda
-import gc
-from enum import StrEnum
+import torch
+import torch.distributed as dist
+import torch.multiprocessing as mp
+
+# Local
+from .evaluator import Evaluator
 
 
 class ParsedScores(t.TypedDict):
@@ -137,6 +139,7 @@ def worker(rank, world_size, args: LeaderboardArgs, result_queue: mp.Queue):
 def evaluate_with_hf(args: LeaderboardArgs) -> t.Dict[str, t.Any]:
     # we need to use torch.multiprocessing to run each task in a separate process,
     # and then combine the results
+    # Third Party
     import torch.multiprocessing as mp
 
     num_processes = args["num_gpus"]
@@ -166,9 +169,9 @@ def evaluate_with_hf(args: LeaderboardArgs) -> t.Dict[str, t.Any]:
         p.join()
 
     # extract the result which is not None
-    assert len([res for res in results.values() if res is not None]) == 1, (
-        "we expect exactly 1 process to return a results dict properly"
-    )
+    assert (
+        len([res for res in results.values() if res is not None]) == 1
+    ), "we expect exactly 1 process to return a results dict properly"
     results_dict = [res for res in results.values() if res is not None][0]
     return results_dict
 
@@ -234,9 +237,9 @@ def parse_bbh(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
     parsed_scores = parse_multitask_results(
         result_dict, LeaderboardV2Tasks.BBH.value, "acc_norm"
     )
-    assert len(parsed_scores["subtasks"]) == 24, (
-        "there should be 24 subtasks of bbh run"
-    )
+    assert (
+        len(parsed_scores["subtasks"]) == 24
+    ), "there should be 24 subtasks of bbh run"
     return parsed_scores
 
 
@@ -287,9 +290,9 @@ def parse_ifeval(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
         scores.append(value)
         target_metrics.remove(metric)
 
-    assert len(scores) == 2, (
-        f"there should only be 2 values extracted in ifeval, got: {len(scores)}"
-    )
+    assert (
+        len(scores) == 2
+    ), f"there should only be 2 values extracted in ifeval, got: {len(scores)}"
     return {
         "score": sum(scores) / 2,
     }
@@ -313,9 +316,9 @@ def parse_gpqa(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
     parsed_scores = parse_multitask_results(
         result_dict, LeaderboardV2Tasks.GPQA.value, "acc_norm"
     )
-    assert len(parsed_scores["subtasks"]) == 3, (
-        f"Expected 3 gpqa scores, got {len(parsed_scores['subtasks'])}"
-    )
+    assert (
+        len(parsed_scores["subtasks"]) == 3
+    ), f"Expected 3 gpqa scores, got {len(parsed_scores['subtasks'])}"
     return parsed_scores
 
 
@@ -326,9 +329,9 @@ def parse_math_hard(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
     parsed_scores = parse_multitask_results(
         result_dict, LeaderboardV2Tasks.MATH_HARD.value, "exact_match"
     )
-    assert len(parsed_scores["subtasks"]) == 7, (
-        f"leaderboard_math_hard should have 7 subtasks, found: {len(parsed_scores['subtasks'])}"
-    )
+    assert (
+        len(parsed_scores["subtasks"]) == 7
+    ), f"leaderboard_math_hard should have 7 subtasks, found: {len(parsed_scores['subtasks'])}"
     return parsed_scores
 
 
@@ -363,9 +366,9 @@ def get_scores_from_result_dicts(
     # this is just a sanity check step
     benchmarks_already_covered = set(parsed_scores.keys())
     overlapping_benchmarks = benchmarks_already_covered & benchmarks_to_parse
-    assert len(benchmarks_already_covered & benchmarks_to_parse) == 0, (
-        f"expected no overlapping benchmarks but found the following to overlap: {list(overlapping_benchmarks)}"
-    )
+    assert (
+        len(benchmarks_already_covered & benchmarks_to_parse) == 0
+    ), f"expected no overlapping benchmarks but found the following to overlap: {list(overlapping_benchmarks)}"
 
     # now actually add them
     for benchmark in benchmarks_to_parse:
@@ -579,5 +582,6 @@ def run(
         results["overall_score"] = calculate_overall_leaderboard_score(results)
 
         self._results = results
-        self.save_to_file(output_file)
+        if output_file:
+            self.save_to_file(output_file)
         return results
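
For context on the multi-process pattern these hunks touch (each GPU gets its own worker process, results flow back through a queue, and exactly one rank is expected to produce the final results dict), here is a minimal sketch. The worker body below is a hypothetical stand-in, not the repository's actual implementation, which runs lm_eval's simple_evaluate on each rank.

# Minimal sketch of the spawn-and-collect pattern (assumed details).
import torch.multiprocessing as mp


def worker(rank, world_size, args, result_queue):
    # ... run the benchmark on this rank's GPU (placeholder) ...
    scores = {"rank": rank}
    # only rank 0 reports results; other ranks report None
    result_queue.put((rank, scores if rank == 0 else None))


if __name__ == "__main__":
    world_size = 2  # e.g. args["num_gpus"]
    ctx = mp.get_context("spawn")
    result_queue = ctx.Queue()
    procs = [
        ctx.Process(target=worker, args=(rank, world_size, {}, result_queue))
        for rank in range(world_size)
    ]
    for p in procs:
        p.start()
    # drain the queue before joining to avoid blocking on large payloads
    results = dict(result_queue.get() for _ in range(world_size))
    for p in procs:
        p.join()
    # mirrors the assert in the diff: exactly one rank returned a dict
    final = [res for res in results.values() if res is not None]
    assert len(final) == 1
    print(final[0])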