55import json
66import os
77import typing as t
8+ from copy import deepcopy
89
910# Third Party
1011from accelerate import Accelerator
@@ -46,12 +47,17 @@ class LeaderboardV2Tasks(StrEnum):
4647 BBH = "leaderboard_bbh"
4748
4849
class LeaderboardArgsRequired(t.TypedDict):
    """Required arguments for a single leaderboard evaluation run."""

    # Path to the model checkpoint/directory to evaluate.
    model_path: str
    # Number of GPUs available for the run.
    num_gpus: int
    # lm-eval task names to evaluate.
    tasks: t.List[str]


class LeaderboardArgs(LeaderboardArgsRequired, total=False):
    """
    Full argument dict for the evaluation backends.

    The two config keys are optional (``total=False``); when present they are
    layered on top of the module-level default config dicts.
    """

    # Overrides merged over DEFAULT_EVAL_CONFIG (shared evaluation knobs).
    eval_config: t.Dict[str, t.Any]
    # Overrides merged over the backend defaults (vLLM or HF model_args).
    backend_config: t.Dict[str, t.Any]
60+
5561class TaskGrouping (t .TypedDict ):
5662 """
5763 Class used to group the tasks by their optimal runtime.
@@ -61,6 +67,30 @@ class TaskGrouping(t.TypedDict):
6167 vllm : t .List [str ]
6268
6369
# Default configuration parameters for evaluation, shared by both backends.
# Any key may be overridden by a caller-supplied eval_config dict.
DEFAULT_EVAL_CONFIG = {
    "batch_size": "auto",  # let lm-eval pick the batch size
    "apply_chat_template": True,
    "fewshot_as_multiturn": True,
    "confirm_run_unsafe_code": True,
    "max_model_len": 32768,  # routed into model_args, not simple_evaluate
    "system_instruction": None,  # optional system prompt; None means "no instruction"
}

# Default backend-specific configuration parameters (vLLM model_args).
DEFAULT_VLLM_CONFIG = {
    "dtype": "float16",
    "gpu_memory_utilization": 0.8,
    "disable_custom_all_reduce": True,
    "enforce_eager": False,
}

# Default backend-specific configuration parameters (HuggingFace model_args).
DEFAULT_HF_CONFIG = {
    "dtype": "float16",
    "trust_remote_code": True,
    "cache_requests": True,
}
93+
6494# generative tasks go here
6595LEADERBOARD_V2_GENERATIVE_TASKS = [
6696 LeaderboardV2Tasks .MATH_HARD .value ,
@@ -77,22 +107,36 @@ class TaskGrouping(t.TypedDict):
77107
78108
def evaluate_with_vllm(args: LeaderboardArgs) -> t.Dict[str, t.Any]:
    """
    Evaluate the model on the given tasks using the vLLM backend.

    Args:
        args: Evaluation arguments. Required keys: ``model_path``,
            ``num_gpus``, ``tasks``. Optional keys: ``eval_config`` (merged
            over DEFAULT_EVAL_CONFIG) and ``backend_config`` (merged over
            DEFAULT_VLLM_CONFIG).

    Returns:
        The raw results dict produced by lm-eval's ``simple_evaluate``.
    """
    # Start from the defaults; deepcopy so the module-level dicts are never
    # mutated by the pops/updates below.
    eval_config = deepcopy(DEFAULT_EVAL_CONFIG)
    backend_config = deepcopy(DEFAULT_VLLM_CONFIG)

    # Layer user-provided overrides on top of the defaults. Use .get() rather
    # than `"key" in args and args["key"]` — single lookup, same truthiness.
    user_eval_config = args.get("eval_config")
    if user_eval_config:
        eval_config.update(user_eval_config)
    user_backend_config = args.get("backend_config")
    if user_backend_config:
        backend_config.update(user_backend_config)

    # Arguments handed to the vLLM model constructor.
    model_args = {
        "pretrained": args["model_path"],
        "data_parallel_size": args["num_gpus"],
        **backend_config,
    }

    # max_model_len belongs in model_args, not simple_evaluate's kwargs; move
    # it over (it is always present in the defaults unless a caller removed it).
    if "max_model_len" in eval_config:
        model_args["max_model_len"] = eval_config.pop("max_model_len")

    # system_instruction is a dedicated simple_evaluate kwarg; pop it so it is
    # not passed a second time via **eval_config.
    system_instruction = eval_config.pop("system_instruction", None)

    results = simple_evaluate(
        tasks=args["tasks"],
        model="vllm",
        model_args=model_args,
        system_instruction=system_instruction,
        **eval_config,
    )
    return results
98142
@@ -108,20 +152,33 @@ def worker(rank, world_size, args: LeaderboardArgs, result_queue: mp.Queue):
108152 device = accelerator .device
109153 assert device .type == "cuda" , f"device is not a cuda device: { device } "
110154
155+ # Start with default configurations
156+ eval_config = deepcopy (DEFAULT_EVAL_CONFIG )
157+ backend_config = deepcopy (DEFAULT_HF_CONFIG )
158+
159+ # Override with user-provided configurations
160+ if "eval_config" in args and args ["eval_config" ]:
161+ eval_config .update (args ["eval_config" ])
162+ if "backend_config" in args and args ["backend_config" ]:
163+ backend_config .update (args ["backend_config" ])
164+
165+ # Prepare model_args
166+ model_args = {"pretrained" : args ["model_path" ], ** backend_config }
167+
168+ # Set max_model_len if provided in eval_config
169+ if "max_model_len" in eval_config :
170+ model_args ["max_model_len" ] = eval_config .pop ("max_model_len" )
171+
172+ # Extract system_instruction if provided
173+ system_instruction = eval_config .pop ("system_instruction" , None )
174+
111175 results = simple_evaluate (
112176 model = "hf" ,
113- model_args = {
114- "pretrained" : args ["model_path" ],
115- "dtype" : "float16" ,
116- "trust_remote_code" : True ,
117- },
177+ model_args = model_args ,
118178 tasks = args ["tasks" ],
119- apply_chat_template = True ,
120- fewshot_as_multiturn = True ,
121- batch_size = "auto" ,
122179 device = f"cuda:{ device .index } " ,
123- cache_requests = True ,
124- confirm_run_unsafe_code = True ,
180+ system_instruction = system_instruction ,
181+ ** eval_config ,
125182 )
126183
127184 result_queue .put ((rank , results ))
@@ -483,7 +540,40 @@ def __init__(
483540 tasks : t .Optional [t .List [str ]] = None ,
484541 num_gpus : t .Optional [int ] = None ,
485542 output_file : t .Optional [str ] = None ,
543+ eval_config : t .Optional [t .Dict [str , t .Any ]] = None ,
544+ vllm_config : t .Optional [t .Dict [str , t .Any ]] = None ,
545+ hf_config : t .Optional [t .Dict [str , t .Any ]] = None ,
486546 ):
547+ """
548+ Initialize the evaluator.
549+
550+ Args:
551+ model_path: Path to the model to evaluate.
552+ tasks: List of tasks to evaluate on.
553+ num_gpus: Number of GPUs to use.
554+ output_file: Path to save results to.
555+ eval_config: Configuration for general evaluation parameters that apply to both backends.
556+ Default values (can be overridden):
557+ - batch_size: "auto" - Batch size for evaluation, or "auto" for automatic batching
558+ - apply_chat_template: True - Whether to apply chat template formatting
559+ - fewshot_as_multiturn: True - Whether to format few-shot examples as multi-turn conversations
560+ - confirm_run_unsafe_code: True - Whether to run potentially unsafe code without confirmation
561+ - max_model_len: 32768 - Maximum sequence length for the model
562+ - system_instruction: None - Optional system instruction to prepend to prompts
563+ vllm_config: Configuration for vLLM-specific parameters.
564+ Default values (can be overridden):
565+ - dtype: "float16" - Data type for model weights
566+ - gpu_memory_utilization: 0.8 - Fraction of GPU memory to use
567+ - disable_custom_all_reduce: True - Whether to disable custom all-reduce implementation
568+ - enforce_eager: False - Whether to enforce eager execution
569+ And any other vLLM parameters supported by simple_evaluate.
570+ hf_config: Configuration for HuggingFace-specific parameters.
571+ Default values (can be overridden):
572+ - dtype: "float16" - Data type for model weights
573+ - trust_remote_code: True - Whether to trust remote code in model loading
574+ - cache_requests: True - Whether to cache requests
575+ And any other HuggingFace parameters supported by simple_evaluate.
576+ """
487577 self .model_path = model_path
488578 if not cuda .is_available ():
489579 raise ValueError (
@@ -494,6 +584,11 @@ def __init__(
494584 self .num_gpus = num_gpus
495585 self .tasks = tasks
496586
587+ # Store evaluation configurations
588+ self .eval_config = eval_config or {}
589+ self .vllm_config = vllm_config or {}
590+ self .hf_config = hf_config or {}
591+
497592 # validate output file
498593 self .output_file = output_file
499594 self ._results : t .Optional [LeaderboardV2EvalResult ] = None
@@ -546,6 +641,9 @@ def run(
546641 tasks : t .Optional [t .List [str ]] = None ,
547642 num_gpus : t .Optional [int ] = None ,
548643 output_file : t .Optional [str ] = None ,
644+ eval_config : t .Optional [t .Dict [str , t .Any ]] = None ,
645+ vllm_config : t .Optional [t .Dict [str , t .Any ]] = None ,
646+ hf_config : t .Optional [t .Dict [str , t .Any ]] = None ,
549647 ) -> LeaderboardV2EvalResult :
550648 """
551649 Run the Open LLM Leaderboard v2 evaluation.
@@ -558,6 +656,27 @@ def run(
558656 tasks: The tasks to evaluate.
559657 num_gpus: The number of GPUs to use.
560658 output_file: The path to the file to save the results to.
659+ eval_config: Configuration for general evaluation parameters that apply to both backends.
660+ Default values (can be overridden):
661+ - batch_size: "auto" - Batch size for evaluation, or "auto" for automatic batching
662+ - apply_chat_template: True - Whether to apply chat template formatting
663+ - fewshot_as_multiturn: True - Whether to format few-shot examples as multi-turn conversations
664+ - confirm_run_unsafe_code: True - Whether to run potentially unsafe code without confirmation
665+ - max_model_len: 32768 - Maximum sequence length for the model
666+ - system_instruction: None - Optional system instruction to prepend to prompts
667+ vllm_config: Configuration for vLLM-specific parameters.
668+ Default values (can be overridden):
669+ - dtype: "float16" - Data type for model weights
670+ - gpu_memory_utilization: 0.8 - Fraction of GPU memory to use
671+ - disable_custom_all_reduce: True - Whether to disable custom all-reduce implementation
672+ - enforce_eager: False - Whether to enforce eager execution
673+ And any other vLLM parameters supported by simple_evaluate.
674+ hf_config: Configuration for HuggingFace-specific parameters.
675+ Default values (can be overridden):
676+ - dtype: "float16" - Data type for model weights
677+ - trust_remote_code: True - Whether to trust remote code in model loading
678+ - cache_requests: True - Whether to cache requests
679+ And any other HuggingFace parameters supported by simple_evaluate.
561680
562681 Returns:
563682 LeaderboardV2EvalResult: A dict containing the overall leaderboard score and the breakdown per subtask.
@@ -567,6 +686,11 @@ def run(
567686 num_gpus = self .num_gpus if not num_gpus else num_gpus
568687 output_file = self .output_file if not output_file else output_file
569688
689+ # Merge configurations with instance configurations, with run-time configs taking precedence
690+ final_eval_config = {** self .eval_config , ** (eval_config or {})}
691+ final_vllm_config = {** self .vllm_config , ** (vllm_config or {})}
692+ final_hf_config = {** self .hf_config , ** (hf_config or {})}
693+
570694 if not tasks :
571695 tasks = LEADERBOARD_V2_MCQ_TASKS + LEADERBOARD_V2_GENERATIVE_TASKS
572696
@@ -598,6 +722,8 @@ def run(
598722 "model_path" : model_path ,
599723 "num_gpus" : num_gpus ,
600724 "tasks" : vllm_tasks ,
725+ "eval_config" : final_eval_config ,
726+ "backend_config" : final_vllm_config ,
601727 }
602728 vllm_results = evaluate_with_vllm (args_vllm )
603729 self ._lm_eval_results .append (vllm_results )
@@ -606,6 +732,8 @@ def run(
606732 "model_path" : model_path ,
607733 "num_gpus" : num_gpus ,
608734 "tasks" : hf_tasks ,
735+ "eval_config" : final_eval_config ,
736+ "backend_config" : final_hf_config ,
609737 }
610738 hf_results = evaluate_with_hf (args_hf )
611739 self ._lm_eval_results .append (hf_results )
0 commit comments