
Commit bd95672

enable users to override the default vLLM + HF settings, as well as options for the simple_evaluate function
Signed-off-by: Oleg Silkin <97077423+RobotSail@users.noreply.github.com>
1 parent: b43f697

1 file changed: src/instructlab/eval/leaderboard.py (152 additions, 24 deletions)
@@ -5,6 +5,7 @@
 import json
 import os
 import typing as t
+from copy import deepcopy

 # Third Party
 from accelerate import Accelerator
@@ -46,12 +47,17 @@ class LeaderboardV2Tasks(StrEnum):
     BBH = "leaderboard_bbh"


-class LeaderboardArgs(t.TypedDict):
+class LeaderboardArgsRequired(t.TypedDict):
     model_path: str
     num_gpus: int
     tasks: t.List[str]


+class LeaderboardArgs(LeaderboardArgsRequired, total=False):
+    eval_config: t.Dict[str, t.Any]
+    backend_config: t.Dict[str, t.Any]
+
+
 class TaskGrouping(t.TypedDict):
     """
     Class used to group the tasks by their optimal runtime.
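
Splitting the TypedDict into a required base plus a `total=False` subclass is the standard idiom for mixing required and optional keys: `model_path`, `num_gpus`, and `tasks` stay mandatory, while the two new config keys may be omitted entirely. A minimal sketch of what now type-checks (the path and task name are made-up placeholders):

    # Both dicts satisfy the new LeaderboardArgs definition.
    minimal: LeaderboardArgs = {
        "model_path": "/models/example",  # placeholder path
        "num_gpus": 2,
        "tasks": ["leaderboard_bbh"],
    }
    with_overrides: LeaderboardArgs = {
        "model_path": "/models/example",
        "num_gpus": 2,
        "tasks": ["leaderboard_bbh"],
        "eval_config": {"batch_size": 8},                    # optional
        "backend_config": {"gpu_memory_utilization": 0.9},  # optional
    }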
@@ -61,6 +67,30 @@ class TaskGrouping(t.TypedDict):
     vllm: t.List[str]


+# Default configuration parameters for evaluation
+DEFAULT_EVAL_CONFIG = {
+    "batch_size": "auto",
+    "apply_chat_template": True,
+    "fewshot_as_multiturn": True,
+    "confirm_run_unsafe_code": True,
+    "max_model_len": 32768,
+    "system_instruction": None,
+}
+
+# Default backend-specific configuration parameters
+DEFAULT_VLLM_CONFIG = {
+    "dtype": "float16",
+    "gpu_memory_utilization": 0.8,
+    "disable_custom_all_reduce": True,
+    "enforce_eager": False,
+}
+
+DEFAULT_HF_CONFIG = {
+    "dtype": "float16",
+    "trust_remote_code": True,
+    "cache_requests": True,
+}
+
 # generative tasks go here
 LEADERBOARD_V2_GENERATIVE_TASKS = [
     LeaderboardV2Tasks.MATH_HARD.value,
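
These module-level defaults are merged with user overrides via `deepcopy` plus `dict.update` (see the two evaluate paths below): user-supplied keys replace the defaults, untouched defaults survive, and unrecognized keys flow through to `simple_evaluate` unchanged. In miniature, with an illustrative override:

    from copy import deepcopy

    eval_config = deepcopy(DEFAULT_EVAL_CONFIG)
    eval_config.update({"batch_size": 8})
    # batch_size is now 8; apply_chat_template, max_model_len, etc. keep their defaults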
@@ -77,22 +107,36 @@ class TaskGrouping(t.TypedDict):


 def evaluate_with_vllm(args: LeaderboardArgs) -> t.Dict[str, t.Any]:
+    # Start with default configurations
+    eval_config = deepcopy(DEFAULT_EVAL_CONFIG)
+    backend_config = deepcopy(DEFAULT_VLLM_CONFIG)
+
+    # Override with user-provided configurations
+    if "eval_config" in args and args["eval_config"]:
+        eval_config.update(args["eval_config"])
+    if "backend_config" in args and args["backend_config"]:
+        backend_config.update(args["backend_config"])
+
+    # Prepare model_args
+    model_args = {
+        "pretrained": args["model_path"],
+        "data_parallel_size": args["num_gpus"],
+        **backend_config,
+    }
+
+    # Set max_model_len if provided in eval_config
+    if "max_model_len" in eval_config:
+        model_args["max_model_len"] = eval_config.pop("max_model_len")
+
+    # Extract system_instruction if provided
+    system_instruction = eval_config.pop("system_instruction", None)
+
     results = simple_evaluate(
         tasks=args["tasks"],
         model="vllm",
-        model_args={
-            "pretrained": args["model_path"],
-            "dtype": "float16",
-            "data_parallel_size": args["num_gpus"],
-            "gpu_memory_utilization": 0.8,
-            "max_model_len": 32768,
-            "disable_custom_all_reduce": True,
-            "enforce_eager": False,
-        },
-        apply_chat_template=True,
-        fewshot_as_multiturn=True,
-        batch_size="auto",
-        confirm_run_unsafe_code=True,
+        model_args=model_args,
+        system_instruction=system_instruction,
+        **eval_config,
     )
     return results
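
A sketch of a call site for the rewritten function, using a hypothetical model path; note that `max_model_len` supplied through `eval_config` ends up inside `model_args` rather than as a `simple_evaluate` keyword:

    args: LeaderboardArgs = {
        "model_path": "/models/example",          # hypothetical path
        "num_gpus": 4,
        "tasks": [LeaderboardV2Tasks.MATH_HARD.value],
        "eval_config": {"max_model_len": 16384},  # popped into model_args
        "backend_config": {"enforce_eager": True},
    }
    results = evaluate_with_vllm(args)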

@@ -108,20 +152,33 @@ def worker(rank, world_size, args: LeaderboardArgs, result_queue: mp.Queue):
     device = accelerator.device
     assert device.type == "cuda", f"device is not a cuda device: {device}"

+    # Start with default configurations
+    eval_config = deepcopy(DEFAULT_EVAL_CONFIG)
+    backend_config = deepcopy(DEFAULT_HF_CONFIG)
+
+    # Override with user-provided configurations
+    if "eval_config" in args and args["eval_config"]:
+        eval_config.update(args["eval_config"])
+    if "backend_config" in args and args["backend_config"]:
+        backend_config.update(args["backend_config"])
+
+    # Prepare model_args
+    model_args = {"pretrained": args["model_path"], **backend_config}
+
+    # Set max_model_len if provided in eval_config
+    if "max_model_len" in eval_config:
+        model_args["max_model_len"] = eval_config.pop("max_model_len")
+
+    # Extract system_instruction if provided
+    system_instruction = eval_config.pop("system_instruction", None)
+
     results = simple_evaluate(
         model="hf",
-        model_args={
-            "pretrained": args["model_path"],
-            "dtype": "float16",
-            "trust_remote_code": True,
-        },
+        model_args=model_args,
         tasks=args["tasks"],
-        apply_chat_template=True,
-        fewshot_as_multiturn=True,
-        batch_size="auto",
         device=f"cuda:{device.index}",
-        cache_requests=True,
-        confirm_run_unsafe_code=True,
+        system_instruction=system_instruction,
+        **eval_config,
     )

     result_queue.put((rank, results))
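
Both branches use `pop` rather than a plain lookup for a reason: `system_instruction` is already passed explicitly, so leaving it in `eval_config` would make the `**eval_config` splat raise a duplicate-keyword TypeError, and `max_model_len` is a model argument rather than a `simple_evaluate` parameter. A self-contained illustration with a stand-in function:

    def f(system_instruction=None, **kwargs):  # stand-in for simple_evaluate
        return system_instruction, kwargs

    cfg = {"system_instruction": "hi", "batch_size": "auto"}
    si = cfg.pop("system_instruction", None)
    f(system_instruction=si, **cfg)  # fine: the key was removed before the splat
    # Without the pop: TypeError: f() got multiple values for keyword argument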
@@ -483,7 +540,40 @@ def __init__(
         tasks: t.Optional[t.List[str]] = None,
         num_gpus: t.Optional[int] = None,
         output_file: t.Optional[str] = None,
+        eval_config: t.Optional[t.Dict[str, t.Any]] = None,
+        vllm_config: t.Optional[t.Dict[str, t.Any]] = None,
+        hf_config: t.Optional[t.Dict[str, t.Any]] = None,
     ):
+        """
+        Initialize the evaluator.
+
+        Args:
+            model_path: Path to the model to evaluate.
+            tasks: List of tasks to evaluate on.
+            num_gpus: Number of GPUs to use.
+            output_file: Path to save results to.
+            eval_config: Configuration for general evaluation parameters that apply to both backends.
+                Default values (can be overridden):
+                - batch_size: "auto" - Batch size for evaluation, or "auto" for automatic batching
+                - apply_chat_template: True - Whether to apply chat template formatting
+                - fewshot_as_multiturn: True - Whether to format few-shot examples as multi-turn conversations
+                - confirm_run_unsafe_code: True - Whether to run potentially unsafe code without confirmation
+                - max_model_len: 32768 - Maximum sequence length for the model
+                - system_instruction: None - Optional system instruction to prepend to prompts
+            vllm_config: Configuration for vLLM-specific parameters.
+                Default values (can be overridden):
+                - dtype: "float16" - Data type for model weights
+                - gpu_memory_utilization: 0.8 - Fraction of GPU memory to use
+                - disable_custom_all_reduce: True - Whether to disable the custom all-reduce implementation
+                - enforce_eager: False - Whether to enforce eager execution
+                Any other vLLM parameter accepted by simple_evaluate may also be passed.
+            hf_config: Configuration for HuggingFace-specific parameters.
+                Default values (can be overridden):
+                - dtype: "float16" - Data type for model weights
+                - trust_remote_code: True - Whether to trust remote code when loading the model
+                - cache_requests: True - Whether to cache requests
+                Any other HuggingFace parameter accepted by simple_evaluate may also be passed.
+        """
         self.model_path = model_path
         if not cuda.is_available():
             raise ValueError(
@@ -494,6 +584,11 @@ def __init__(
         self.num_gpus = num_gpus
         self.tasks = tasks

+        # Store evaluation configurations
+        self.eval_config = eval_config or {}
+        self.vllm_config = vllm_config or {}
+        self.hf_config = hf_config or {}
+
         # validate output file
         self.output_file = output_file
         self._results: t.Optional[LeaderboardV2EvalResult] = None
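
From the caller's side, the new constructor parameters compose like this. A sketch only: the enclosing class is not visible in these hunks, so the name `LeaderboardV2Evaluator` and the model path are assumptions:

    evaluator = LeaderboardV2Evaluator(        # class name assumed, not shown in the diff
        model_path="/models/example",          # hypothetical path
        num_gpus=8,
        eval_config={"max_model_len": 16384},  # applies to both backends
        vllm_config={"enforce_eager": True},   # vLLM-run tasks only
        hf_config={"dtype": "bfloat16"},       # HF-run tasks only
    )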
@@ -546,6 +641,9 @@ def run(
         tasks: t.Optional[t.List[str]] = None,
         num_gpus: t.Optional[int] = None,
         output_file: t.Optional[str] = None,
+        eval_config: t.Optional[t.Dict[str, t.Any]] = None,
+        vllm_config: t.Optional[t.Dict[str, t.Any]] = None,
+        hf_config: t.Optional[t.Dict[str, t.Any]] = None,
     ) -> LeaderboardV2EvalResult:
         """
         Run the Open LLM Leaderboard v2 evaluation.
@@ -558,6 +656,27 @@
             tasks: The tasks to evaluate.
             num_gpus: The number of GPUs to use.
             output_file: The path to the file to save the results to.
+            eval_config: Configuration for general evaluation parameters that apply to both backends.
+                Default values (can be overridden):
+                - batch_size: "auto" - Batch size for evaluation, or "auto" for automatic batching
+                - apply_chat_template: True - Whether to apply chat template formatting
+                - fewshot_as_multiturn: True - Whether to format few-shot examples as multi-turn conversations
+                - confirm_run_unsafe_code: True - Whether to run potentially unsafe code without confirmation
+                - max_model_len: 32768 - Maximum sequence length for the model
+                - system_instruction: None - Optional system instruction to prepend to prompts
+            vllm_config: Configuration for vLLM-specific parameters.
+                Default values (can be overridden):
+                - dtype: "float16" - Data type for model weights
+                - gpu_memory_utilization: 0.8 - Fraction of GPU memory to use
+                - disable_custom_all_reduce: True - Whether to disable the custom all-reduce implementation
+                - enforce_eager: False - Whether to enforce eager execution
+                Any other vLLM parameter accepted by simple_evaluate may also be passed.
+            hf_config: Configuration for HuggingFace-specific parameters.
+                Default values (can be overridden):
+                - dtype: "float16" - Data type for model weights
+                - trust_remote_code: True - Whether to trust remote code when loading the model
+                - cache_requests: True - Whether to cache requests
+                Any other HuggingFace parameter accepted by simple_evaluate may also be passed.

         Returns:
             LeaderboardV2EvalResult: A dict containing the overall leaderboard score and the breakdown per subtask.
@@ -567,6 +686,11 @@ def run(
         num_gpus = self.num_gpus if not num_gpus else num_gpus
         output_file = self.output_file if not output_file else output_file

+        # Merge configurations with instance configurations, with run-time configs taking precedence
+        final_eval_config = {**self.eval_config, **(eval_config or {})}
+        final_vllm_config = {**self.vllm_config, **(vllm_config or {})}
+        final_hf_config = {**self.hf_config, **(hf_config or {})}
+
         if not tasks:
             tasks = LEADERBOARD_V2_MCQ_TASKS + LEADERBOARD_V2_GENERATIVE_TASKS
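
Because later entries win in a dict literal, these merges give run-time arguments precedence over constructor arguments, which in turn override the module-level defaults inside the evaluate functions. A tiny check of that precedence:

    instance_cfg = {"batch_size": "auto", "max_model_len": 16384}  # as if set in __init__
    runtime_cfg = {"batch_size": 4}                                # as if passed to run()
    final = {**instance_cfg, **(runtime_cfg or {})}
    assert final == {"batch_size": 4, "max_model_len": 16384}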

@@ -598,6 +722,8 @@ def run(
             "model_path": model_path,
             "num_gpus": num_gpus,
             "tasks": vllm_tasks,
+            "eval_config": final_eval_config,
+            "backend_config": final_vllm_config,
         }
         vllm_results = evaluate_with_vllm(args_vllm)
         self._lm_eval_results.append(vllm_results)
@@ -606,6 +732,8 @@ def run(
             "model_path": model_path,
             "num_gpus": num_gpus,
             "tasks": hf_tasks,
+            "eval_config": final_eval_config,
+            "backend_config": final_hf_config,
         }
         hf_results = evaluate_with_hf(args_hf)
         self._lm_eval_results.append(hf_results)
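
End to end, a per-call override now reaches whichever backend runs each task group; continuing the hypothetical constructor example above:

    results = evaluator.run(
        vllm_config={"gpu_memory_utilization": 0.9},  # wins over the 0.8 default and any __init__ value
    )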
