Skip to content

Commit 52328fd

Browse files
author
Dan McPherson
committed
Allow max_workers to be passed in after evaluator is created
The reason this is necessary is that serving_gpus might not be available when the evaluator is constructed. A typical flow might be:
- Create evaluator
- Launch server  # This is when serving_gpus is probably calculated
- generate
- judge

The new logic allows max_workers and serving_gpus to be passed in as the generate and judge steps occur. Once all the known callers have been updated (eval and training) we can remove the two attrs from the constructors and make the logic a little simpler.

Signed-off-by: Dan McPherson <dmcphers@redhat.com>
1 parent 83f9d95 commit 52328fd

1 file changed

Lines changed: 80 additions & 30 deletions

File tree

src/instructlab/eval/mt_bench.py

Lines changed: 80 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -52,31 +52,49 @@ def __init__(
5252
self.output_dir = output_dir
5353
self.serving_gpus = serving_gpus
5454
self.merge_system_user_message = merge_system_user_message
55+
self.max_workers = self._calc_max_workers(max_workers, serving_gpus)
5556

56-
if max_workers == "auto":
57-
try:
58-
# Not available on all platforms
59-
usable_cpu_count = len(os.sched_getaffinity(0)) # type: ignore[attr-defined]
60-
except AttributeError:
61-
usable_cpu_count = multiprocessing.cpu_count()
62-
if serving_gpus is not None:
63-
# Tune max_workers based on hardware configuration: min(#GPUs being used * 10, #CPU cores)
64-
# Please see https://github.com/instructlab/instructlab/issues/2050 for detailed explanation
65-
self.max_workers = min(max(serving_gpus, 1) * 10, usable_cpu_count)
66-
logger.debug("Auto tuning max_workers to %s", self.max_workers)
57+
def _calc_max_workers(
58+
self, max_workers: int | str | None, serving_gpus: int | None
59+
):
60+
calculated_max_workers = None
61+
if max_workers is not None:
62+
if max_workers == "auto":
63+
try:
64+
# Not available on all platforms
65+
usable_cpu_count = len(os.sched_getaffinity(0)) # type: ignore[attr-defined]
66+
except AttributeError:
67+
usable_cpu_count = multiprocessing.cpu_count()
68+
if serving_gpus is not None:
69+
# Tune max_workers based on hardware configuration: min(#GPUs being used * 10, #CPU cores)
70+
# Please see https://github.com/instructlab/instructlab/issues/2050 for detailed explanation
71+
calculated_max_workers = min(
72+
max(serving_gpus, 1) * 10, usable_cpu_count
73+
)
74+
logger.debug(
75+
"Auto tuning max_workers to %s", calculated_max_workers
76+
)
77+
else:
78+
# Don't be too aggressive when serving_gpus isn't specified. Use half the cpu count.
79+
calculated_max_workers = usable_cpu_count // 2
80+
logger.debug(
81+
"max_workers set to auto but serving_gpus is not specified. Defaulting to (cpu count / 2): %s",
82+
calculated_max_workers,
83+
)
6784
else:
68-
# Don't be too aggressive when serving_gpus isn't specified. Use half the cpu count.
69-
self.max_workers = usable_cpu_count // 2
70-
logger.debug(
71-
"max_workers set to auto but serving_gpus is not specified. Defaulting to (cpu count / 2): %s",
72-
self.max_workers,
73-
)
85+
if isinstance(max_workers, int) and max_workers > 0:
86+
logger.debug("max_workers specified as: %s", max_workers)
87+
calculated_max_workers = max_workers
88+
else:
89+
raise InvalidMaxWorkersError(max_workers)
90+
return calculated_max_workers
91+
92+
def _get_effective_max_workers(self, max_workers, serving_gpus):
93+
if max_workers is not None:
94+
effective_max_workers = self._calc_max_workers(max_workers, serving_gpus)
7495
else:
75-
if isinstance(max_workers, int) and max_workers > 0:
76-
logger.debug("max_workers specified as: %s", max_workers)
77-
self.max_workers = max_workers
78-
else:
79-
raise InvalidMaxWorkersError(max_workers)
96+
effective_max_workers = self.max_workers
97+
return effective_max_workers
8098

8199

82100
class MTBenchEvaluator(AbstractMTBenchEvaluator):
@@ -94,30 +112,46 @@ class MTBenchEvaluator(AbstractMTBenchEvaluator):
94112

95113
name = "mt_bench"
96114

97-
def gen_answers(self, server_url, api_key: str | None = None) -> None:
115+
def gen_answers(
116+
self,
117+
server_url,
118+
api_key: str | None = None,
119+
max_workers: int | str | None = None,
120+
serving_gpus: int | None = None,
121+
) -> None:
98122
"""
99123
Asks questions to model
100124
101125
Attributes
102126
server_url Model server endpoint (Ex: http://localhost:8000/v1) for the model being evaluated
103127
api_key API token for authenticating with model server
128+
max_workers Max parallel workers to run the evaluation with (int or "auto"). None indicates to use value specified in constructor.
129+
serving_gpus Number of gpus allocated for serving. Used to tune with max_workers=auto. None indicates to use value specified in constructor.
104130
"""
105131
logger.debug(locals())
106132
mt_bench_answers.generate_answers(
107133
self.model_name,
108134
server_url,
109135
api_key=api_key,
110136
output_dir=self.output_dir,
111-
max_workers=self.max_workers,
137+
max_workers=self._get_effective_max_workers(max_workers, serving_gpus),
112138
)
113139

114-
def judge_answers(self, server_url, api_key: str | None = None) -> tuple:
140+
def judge_answers(
141+
self,
142+
server_url,
143+
api_key: str | None = None,
144+
max_workers: int | str | None = None,
145+
serving_gpus: int | None = None,
146+
) -> tuple:
115147
"""
116148
Runs MT-Bench judgment
117149
118150
Attributes
119151
server_url Model server endpoint (Ex: http://localhost:8000/v1) for the judge model
120152
api_key API token for authenticating with model server
153+
max_workers Max parallel workers to run the evaluation with (int or "auto"). None indicates to use value specified in constructor.
154+
serving_gpus Number of gpus allocated for serving. Used to tune with max_workers=auto. None indicates to use value specified in constructor.
121155
122156
Returns:
123157
overall_score MT-Bench score for the overall model evaluation
@@ -130,7 +164,7 @@ def judge_answers(self, server_url, api_key: str | None = None) -> tuple:
130164
self.judge_model_name,
131165
server_url,
132166
api_key=api_key,
133-
max_workers=self.max_workers,
167+
max_workers=self._get_effective_max_workers(max_workers, serving_gpus),
134168
output_dir=self.output_dir,
135169
merge_system_user_message=self.merge_system_user_message,
136170
)
@@ -175,13 +209,21 @@ def __init__(
175209
self.taxonomy_git_repo_path = taxonomy_git_repo_path
176210
self.branch = branch
177211

178-
def gen_answers(self, server_url, api_key: str | None = None) -> None:
212+
def gen_answers(
213+
self,
214+
server_url,
215+
api_key: str | None = None,
216+
max_workers: int | str | None = None,
217+
serving_gpus: int | None = None,
218+
) -> None:
179219
"""
180220
Asks questions to model
181221
182222
Attributes
183223
server_url Model server endpoint (Ex: http://localhost:8000/v1) for the model being evaluated
184224
api_key API token for authenticating with model server
225+
max_workers Max parallel workers to run the evaluation with (int or "auto"). None indicates to use value specified in constructor.
226+
serving_gpus Number of gpus allocated for serving. Used to tune with max_workers=auto. None indicates to use value specified in constructor.
185227
"""
186228
logger.debug(locals())
187229
mt_bench_branch_generator.generate(
@@ -197,17 +239,25 @@ def gen_answers(self, server_url, api_key: str | None = None) -> None:
197239
branch=self.branch,
198240
output_dir=self.output_dir,
199241
data_dir=self.output_dir,
200-
max_workers=self.max_workers,
242+
max_workers=self._get_effective_max_workers(max_workers, serving_gpus),
201243
bench_name="mt_bench_branch",
202244
)
203245

204-
def judge_answers(self, server_url, api_key: str | None = None) -> tuple:
246+
def judge_answers(
247+
self,
248+
server_url,
249+
api_key: str | None = None,
250+
max_workers: int | str | None = None,
251+
serving_gpus: int | None = None,
252+
) -> tuple:
205253
"""
206254
Runs MT-Bench-Branch judgment. Judgments can be compared across runs with consistent question_id -> qna file name.
207255
208256
Attributes
209257
server_url Model server endpoint (Ex: http://localhost:8000/v1) for the judge model
210258
api_key API token for authenticating with model server
259+
max_workers Max parallel workers to run the evaluation with (int or "auto"). None indicates to use value specified in constructor.
260+
serving_gpus Number of gpus allocated for serving. Used to tune with max_workers=auto. None indicates to use value specified in constructor.
211261
212262
Returns:
213263
qa_pairs Question and answer pairs (with scores) from the evaluation
@@ -219,7 +269,7 @@ def judge_answers(self, server_url, api_key: str | None = None) -> tuple:
219269
server_url,
220270
api_key=api_key,
221271
branch=self.branch,
222-
max_workers=self.max_workers,
272+
max_workers=self._get_effective_max_workers(max_workers, serving_gpus),
223273
output_dir=self.output_dir,
224274
data_dir=self.output_dir,
225275
bench_name="mt_bench_branch",

0 commit comments

Comments (0)