
Commit aa573d9 (1 parent: a257e92)

enable leaderboard to run with a remote openai provider

Signed-off-by: Oleg Silkin <97077423+RobotSail@users.noreply.github.com>

1 file changed: src/instructlab/eval/leaderboard.py (153 additions, 59 deletions)
@@ -65,6 +65,7 @@ class TaskGrouping(t.TypedDict):

     huggingface: t.List[str]
     vllm: t.List[str]
+    openai: t.List[str]


 # Default configuration parameters for evaluation
@@ -73,7 +74,6 @@ class TaskGrouping(t.TypedDict):
     "apply_chat_template": True,
     "fewshot_as_multiturn": True,
     "confirm_run_unsafe_code": True,
-    "max_model_len": 32768,
     "system_instruction": None,
     "cache_requests": False,
 }
@@ -84,11 +84,20 @@ class TaskGrouping(t.TypedDict):
     "gpu_memory_utilization": 0.8,
     "disable_custom_all_reduce": True,
     "enforce_eager": False,
+    "max_model_len": 32768,
 }

 DEFAULT_HF_CONFIG = {
     "dtype": "float16",
     "trust_remote_code": True,
+    "max_length": 32768,
+}
+
+# 1. Add OpenAI configuration defaults
+DEFAULT_OPENAI_CONFIG = {
+    "max_tokens": 768,
+    "temperature": 0.0,
+    "seed": 1337,
 }

 # generative tasks go here
@@ -124,10 +133,6 @@ def evaluate_with_vllm(args: LeaderboardArgs) -> t.Dict[str, t.Any]:
         **backend_config,
     }

-    # Set max_model_len if provided in eval_config
-    if "max_model_len" in eval_config:
-        model_args["max_model_len"] = eval_config.pop("max_model_len")
-
     # Extract system_instruction if provided
     system_instruction = eval_config.pop("system_instruction", None)

@@ -165,10 +170,6 @@ def worker(rank, world_size, args: LeaderboardArgs, result_queue: mp.Queue):
     # Prepare model_args
     model_args = {"pretrained": args["model_path"], **backend_config}

-    # Set max_model_len if provided in eval_config
-    if "max_model_len" in eval_config:
-        model_args["max_model_len"] = eval_config.pop("max_model_len")
-
     # Extract system_instruction if provided
     system_instruction = eval_config.pop("system_instruction", None)

@@ -504,14 +505,24 @@ def validate_leaderboard_v2_tasks(tasks: t.List[str]):
         )


-def get_task_groupings(tasks: t.List[str]) -> TaskGrouping:
+def get_task_groupings(
+    tasks: t.List[str], api_endpoint: t.Optional[str] = None
+) -> TaskGrouping:
     """
     Given a list of tasks, bucket them per their optimal runtime.
+    When an API endpoint is provided, all tasks are routed to OpenAI.
     """
+    if api_endpoint:
+        # When using an API endpoint, route all tasks to OpenAI
+        return {"vllm": [], "huggingface": [], "openai": tasks}
+
+    # Default behavior when no API endpoint is provided
     task_grouping: TaskGrouping = {
         "vllm": [task for task in tasks if task in LEADERBOARD_V2_GENERATIVE_TASKS],
         "huggingface": [task for task in tasks if task in LEADERBOARD_V2_MCQ_TASKS],
+        "openai": [],
     }
+
     overlapping_tasks = set(task_grouping["vllm"]) & set(task_grouping["huggingface"])
     assert not overlapping_tasks
     return task_grouping
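
For readers tracing the new routing logic, here is a minimal sketch of how the updated get_task_groupings behaves. The task names are illustrative placeholders and are assumed to be members of the module's LEADERBOARD_V2_GENERATIVE_TASKS and LEADERBOARD_V2_MCQ_TASKS constants:

# Sketch of the routing behavior introduced in this hunk (task names assumed).
from instructlab.eval.leaderboard import get_task_groupings

tasks = ["leaderboard_ifeval", "leaderboard_mmlu_pro"]

# No endpoint: tasks split between vLLM (generative) and HF (MCQ); openai stays empty.
local_groups = get_task_groupings(tasks)
assert local_groups["openai"] == []

# With an endpoint: every task is routed to the OpenAI backend.
remote_groups = get_task_groupings(tasks, api_endpoint="http://localhost:8000/v1")
assert remote_groups == {"vllm": [], "huggingface": [], "openai": tasks}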
@@ -543,51 +554,63 @@ def __init__(
         eval_config: t.Optional[t.Dict[str, t.Any]] = None,
         vllm_config: t.Optional[t.Dict[str, t.Any]] = None,
         hf_config: t.Optional[t.Dict[str, t.Any]] = None,
+        openai_config: t.Optional[t.Dict[str, t.Any]] = None,
+        api_endpoint: t.Optional[str] = None,
     ):
         """
         Initialize the evaluator.

         Args:
-            model_path: Path to the model to evaluate.
+            model_path: Path to the model to evaluate or model name for OpenAI API.
             tasks: List of tasks to evaluate on.
-            num_gpus: Number of GPUs to use.
+            num_gpus: Number of GPUs to use (ignored when using API endpoint).
             output_file: Path to save results to.
-            eval_config: Configuration for general evaluation parameters that apply to both backends.
+            eval_config: Configuration for general evaluation parameters that apply to all backends.
                 Default values (can be overridden):
                 - batch_size: "auto" - Batch size for evaluation, or "auto" for automatic batching
                 - apply_chat_template: True - Whether to apply chat template formatting
                 - fewshot_as_multiturn: True - Whether to format few-shot examples as multi-turn conversations
                 - confirm_run_unsafe_code: True - Whether to run potentially unsafe code without confirmation
-                - max_model_len: 32768 - Maximum sequence length for the model
                 - system_instruction: None - Optional system instruction to prepend to prompts
-                - cache_requests: False - Whether to cache requests for the dataset
+                - cache_requests: False - Whether to cache requests
             vllm_config: Configuration for vLLM-specific parameters.
                 Default values (can be overridden):
                 - dtype: "float16" - Data type for model weights
                 - gpu_memory_utilization: 0.8 - Fraction of GPU memory to use
                 - disable_custom_all_reduce: True - Whether to disable custom all-reduce implementation
                 - enforce_eager: False - Whether to enforce eager execution
+                - max_model_len: 32768 - Maximum sequence length for the model
                 And any other vLLM parameters supported by simple_evaluate.
             hf_config: Configuration for HuggingFace-specific parameters.
                 Default values (can be overridden):
                 - dtype: "float16" - Data type for model weights
                 - trust_remote_code: True - Whether to trust remote code in model loading
+                - max_length: 32768 - Maximum sequence length for the model
                 And any other HuggingFace parameters supported by simple_evaluate.
+            openai_config: Configuration for OpenAI-specific parameters.
+                Default values (can be overridden):
+                - max_tokens: 768 - Maximum tokens to generate
+                - temperature: 0.0 - Temperature for sampling
+                - seed: 1337 - Seed for reproducibility
+            api_endpoint: Optional OpenAI-compatible API endpoint.
+                When provided, tasks are evaluated using the OpenAI API instead of local models.
         """
         self.model_path = model_path
-        if not cuda.is_available():
+        if not api_endpoint and not cuda.is_available():
             raise ValueError(
-                "Running without CUDA is currently unsupported. Contributions are welcome."
+                "Running without CUDA is currently unsupported unless using an API endpoint."
             )

         # set whatever we need here
         self.num_gpus = num_gpus
         self.tasks = tasks
+        self.api_endpoint = api_endpoint

         # Store evaluation configurations
         self.eval_config = eval_config or {}
         self.vllm_config = vllm_config or {}
         self.hf_config = hf_config or {}
+        self.openai_config = openai_config or {}

         # validate output file
         self.output_file = output_file
@@ -644,38 +667,49 @@ def run(
         eval_config: t.Optional[t.Dict[str, t.Any]] = None,
         vllm_config: t.Optional[t.Dict[str, t.Any]] = None,
         hf_config: t.Optional[t.Dict[str, t.Any]] = None,
+        openai_config: t.Optional[t.Dict[str, t.Any]] = None,
+        api_endpoint: t.Optional[str] = None,
     ) -> LeaderboardV2EvalResult:
         """
         Run the Open LLM Leaderboard v2 evaluation.

-        This function will use both HF transformers and inline vLLM to run the evaluation.
-        It will then parse the results and save them to a file.
+        This function will use the appropriate backend based on the provided parameters:
+        - With api_endpoint: Uses the OpenAI API for all tasks
+        - Without api_endpoint: Uses both HF transformers and vLLM for optimal performance

         Args:
-            model_path: The path to the model to evaluate.
+            model_path: The path to the model to evaluate or model name for API.
             tasks: The tasks to evaluate.
-            num_gpus: The number of GPUs to use.
+            num_gpus: The number of GPUs to use (ignored when using API).
             output_file: The path to the file to save the results to.
-            eval_config: Configuration for general evaluation parameters that apply to both backends.
+            eval_config: Configuration for general evaluation parameters that apply to all backends.
                 Default values (can be overridden):
                 - batch_size: "auto" - Batch size for evaluation, or "auto" for automatic batching
                 - apply_chat_template: True - Whether to apply chat template formatting
                 - fewshot_as_multiturn: True - Whether to format few-shot examples as multi-turn conversations
                 - confirm_run_unsafe_code: True - Whether to run potentially unsafe code without confirmation
-                - max_model_len: 32768 - Maximum sequence length for the model
                 - system_instruction: None - Optional system instruction to prepend to prompts
+                - cache_requests: False - Whether to cache requests
             vllm_config: Configuration for vLLM-specific parameters.
                 Default values (can be overridden):
                 - dtype: "float16" - Data type for model weights
                 - gpu_memory_utilization: 0.8 - Fraction of GPU memory to use
                 - disable_custom_all_reduce: True - Whether to disable custom all-reduce implementation
                 - enforce_eager: False - Whether to enforce eager execution
+                - max_model_len: 32768 - Maximum sequence length for the model
                 And any other vLLM parameters supported by simple_evaluate.
             hf_config: Configuration for HuggingFace-specific parameters.
                 Default values (can be overridden):
                 - dtype: "float16" - Data type for model weights
                 - trust_remote_code: True - Whether to trust remote code in model loading
+                - max_length: 32768 - Maximum sequence length for the model
                 And any other HuggingFace parameters supported by simple_evaluate.
+            openai_config: Configuration for OpenAI-specific parameters.
+                Default values (can be overridden):
+                - max_tokens: 768 - Maximum tokens to generate
+                - temperature: 0.0 - Temperature for sampling
+                - seed: 1337 - Seed for reproducibility
+            api_endpoint: Optional OpenAI-compatible API endpoint.

         Returns:
             LeaderboardV2EvalResult: A dict containing the overall leaderboard score and the breakdown per subtask.
@@ -684,60 +718,76 @@ def run(
         tasks = self.tasks if not tasks else tasks
         num_gpus = self.num_gpus if not num_gpus else num_gpus
         output_file = self.output_file if not output_file else output_file
+        api_endpoint = self.api_endpoint if api_endpoint is None else api_endpoint

         # Merge configurations with instance configurations, with run-time configs taking precedence
         final_eval_config = {**self.eval_config, **(eval_config or {})}
         final_vllm_config = {**self.vllm_config, **(vllm_config or {})}
         final_hf_config = {**self.hf_config, **(hf_config or {})}
+        final_openai_config = {**self.openai_config, **(openai_config or {})}
+
+        # If API endpoint is provided, add it to the OpenAI config
+        if api_endpoint and "base_url" not in final_openai_config:
+            final_openai_config["base_url"] = api_endpoint

         if not tasks:
             tasks = LEADERBOARD_V2_MCQ_TASKS + LEADERBOARD_V2_GENERATIVE_TASKS

         # validation logic
-        # no need to validate model path -- the inference libraries will either be able to
-        # load it, or they won't
-
         validate_leaderboard_v2_tasks(tasks)
-        if not num_gpus:
-            num_gpus = cuda.device_count()
-        if num_gpus <= 0 or num_gpus > cuda.device_count():
-            raise ValueError(
-                f"invalid value for num_gpus, must be between 1 and {cuda.device_count()}; got: {num_gpus}"
-            )
+
+        # Only validate GPU requirements when not using an API endpoint
+        if not api_endpoint:
+            if not num_gpus:
+                num_gpus = cuda.device_count()
+            if num_gpus <= 0 or num_gpus > cuda.device_count():
+                raise ValueError(
+                    f"invalid value for num_gpus, must be between 1 and {cuda.device_count()}; got: {num_gpus}"
+                )
+
         if output_file:
             validate_output_path(output_file)

-        # now we just have to run the task group in their most appropriate runtime
-        # this is important because certain tasks like MCQ are better-suited to be
-        # excuted in raw transformers due to the lack of KV-Cache overhead,
-        # whereas generative tasks are better suited for vLLM due to their need for
-        # accessing previous tokens
-
-        grouped_tasks = get_task_groupings(tasks)
+        # Group tasks by optimal runtime
+        grouped_tasks = get_task_groupings(tasks, api_endpoint)
         self._lm_eval_results = []
-        vllm_results, hf_results = None, None
-        if vllm_tasks := grouped_tasks["vllm"]:
-            args_vllm: LeaderboardArgs = {
-                "model_path": model_path,
-                "num_gpus": num_gpus,
-                "tasks": vllm_tasks,
-                "eval_config": final_eval_config,
-                "backend_config": final_vllm_config,
-            }
-            vllm_results = evaluate_with_vllm(args_vllm)
-            self._lm_eval_results.append(vllm_results)
-        if hf_tasks := grouped_tasks["huggingface"]:
-            args_hf: LeaderboardArgs = {
+
+        # Execute tasks using the appropriate backends
+        if openai_tasks := grouped_tasks["openai"]:
+            args_openai: LeaderboardArgs = {
                 "model_path": model_path,
-                "num_gpus": num_gpus,
-                "tasks": hf_tasks,
+                "num_gpus": 1,  # Not used for API calls but required by the type
+                "tasks": openai_tasks,
                 "eval_config": final_eval_config,
-                "backend_config": final_hf_config,
+                "backend_config": final_openai_config,
             }
-            hf_results = evaluate_with_hf(args_hf)
-            self._lm_eval_results.append(hf_results)
-
-        # convert the output of lm-eval into something that's already parsed
+            openai_results = evaluate_with_openai(args_openai)
+            self._lm_eval_results.append(openai_results)
+        else:
+            # Only run local evaluation if not using OpenAI API
+            if vllm_tasks := grouped_tasks["vllm"]:
+                args_vllm: LeaderboardArgs = {
+                    "model_path": model_path,
+                    "num_gpus": num_gpus,
+                    "tasks": vllm_tasks,
+                    "eval_config": final_eval_config,
+                    "backend_config": final_vllm_config,
+                }
+                vllm_results = evaluate_with_vllm(args_vllm)
+                self._lm_eval_results.append(vllm_results)
+
+            if hf_tasks := grouped_tasks["huggingface"]:
+                args_hf: LeaderboardArgs = {
+                    "model_path": model_path,
+                    "num_gpus": num_gpus,
+                    "tasks": hf_tasks,
+                    "eval_config": final_eval_config,
+                    "backend_config": final_hf_config,
+                }
+                hf_results = evaluate_with_hf(args_hf)
+                self._lm_eval_results.append(hf_results)
+
+        # Convert the output of lm-eval into something that's already parsed
         results: LeaderboardV2EvalResult = get_scores_from_result_dicts(
             *self._lm_eval_results
         )
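
The merge logic in this hunk gives run-time keyword arguments precedence over instance-level configuration, and an explicit base_url in openai_config wins over the api_endpoint argument. A small standalone illustration of that precedence (plain dicts, no evaluator instance required):

# Illustration of the config-merge precedence implemented above.
instance_openai_config = {"temperature": 0.0, "base_url": "http://a:8000/v1"}
runtime_openai_config = {"temperature": 0.7}

final_openai_config = {**instance_openai_config, **runtime_openai_config}
assert final_openai_config["temperature"] == 0.7  # run-time value wins

# api_endpoint only fills base_url when the merged config lacks one:
api_endpoint = "http://b:8000/v1"
if api_endpoint and "base_url" not in final_openai_config:
    final_openai_config["base_url"] = api_endpoint
assert final_openai_config["base_url"] == "http://a:8000/v1"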
@@ -746,3 +796,47 @@ def run(
         if output_file:
             self.save_to_file(output_file)
         return results
+
+
+def evaluate_with_openai(args: LeaderboardArgs) -> t.Dict[str, t.Any]:
+    # Start with default configurations
+    eval_config = deepcopy(DEFAULT_EVAL_CONFIG)
+    backend_config = deepcopy(DEFAULT_OPENAI_CONFIG)
+
+    # Override with user-provided configurations
+    if "eval_config" in args and args["eval_config"]:
+        eval_config.update(args["eval_config"])
+    if "backend_config" in args and args["backend_config"]:
+        backend_config.update(args["backend_config"])
+
+    # Extract base_url and api_key from backend_config if provided
+    base_url = backend_config.pop("base_url", None)
+    api_key = backend_config.pop("api_key", None)
+
+    # Build model_args for lm-eval's OpenAI client
+    model_args = {
+        "model": args["model_path"],  # model name as recognized by the API
+    }
+
+    # Add base_url if provided
+    if base_url:
+        model_args["base_url"] = base_url
+
+    # Add API key if provided
+    if api_key:
+        model_args["api_key"] = api_key
+
+    # Add any remaining backend config options
+    model_args.update(backend_config)
+
+    # Extract system_instruction if provided
+    system_instruction = eval_config.pop("system_instruction", None)
+
+    results = simple_evaluate(
+        tasks=args["tasks"],
+        model="openai",
+        model_args=model_args,
+        system_instruction=system_instruction,
+        **eval_config,
+    )
+    return results
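
Putting the pieces together, remote evaluation might be driven as in the sketch below. The class name LeaderboardV2Evaluator, the model name, and the endpoint URL are assumptions for illustration; the keyword arguments mirror the signature shown in this diff, and the endpoint can be any OpenAI-compatible server (for example, vLLM's OpenAI-compatible server):

# Hypothetical driver for the remote OpenAI-provider path added in this commit.
# Assumes the evaluator class is LeaderboardV2Evaluator and that a server is
# already serving the model at the URL below.
from instructlab.eval.leaderboard import LeaderboardV2Evaluator

evaluator = LeaderboardV2Evaluator(
    model_path="my-served-model",             # model name as the API reports it
    tasks=["leaderboard_ifeval"],
    output_file="leaderboard_results.json",
    api_endpoint="http://localhost:8000/v1",  # routes all tasks to the OpenAI backend
    openai_config={"api_key": "EMPTY", "max_tokens": 768},
)

# In this mode the CUDA check is skipped and num_gpus is ignored.
results = evaluator.run()
print(results)  # overall leaderboard score plus per-subtask breakdown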
