@@ -65,6 +65,7 @@ class TaskGrouping(t.TypedDict):
 
     huggingface: t.List[str]
     vllm: t.List[str]
+    openai: t.List[str]
 
 
 # Default configuration parameters for evaluation
@@ -73,7 +74,6 @@ class TaskGrouping(t.TypedDict):
7374 "apply_chat_template" : True ,
7475 "fewshot_as_multiturn" : True ,
7576 "confirm_run_unsafe_code" : True ,
76- "max_model_len" : 32768 ,
7777 "system_instruction" : None ,
7878 "cache_requests" : False ,
7979}
@@ -84,11 +84,20 @@ class TaskGrouping(t.TypedDict):
8484 "gpu_memory_utilization" : 0.8 ,
8585 "disable_custom_all_reduce" : True ,
8686 "enforce_eager" : False ,
87+ "max_model_len" : 32768 ,
8788}
8889
8990DEFAULT_HF_CONFIG = {
9091 "dtype" : "float16" ,
9192 "trust_remote_code" : True ,
93+ "max_length" : 32768 ,
94+ }
95+
96+ # 1. Add OpenAI configuration defaults
97+ DEFAULT_OPENAI_CONFIG = {
98+ "max_tokens" : 768 ,
99+ "temperature" : 0.0 ,
100+ "seed" : 1337 ,
92101}
93102
94103# generative tasks go here
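Since the maximum sequence length now lives in the backend-specific defaults (`max_model_len` for vLLM, `max_length` for HF), per-run overrides flow through plain dict merges later in `run()`. A minimal sketch of that precedence, assuming only the dicts defined above (the `user_openai_config` name is illustrative):

```python
from copy import deepcopy

# Hypothetical user overrides; anything not listed falls back to the defaults.
user_openai_config = {"temperature": 0.2, "max_tokens": 1024}

# Later unpacks win, so user values shadow DEFAULT_OPENAI_CONFIG entries.
merged = {**deepcopy(DEFAULT_OPENAI_CONFIG), **user_openai_config}
assert merged["temperature"] == 0.2  # override applied
assert merged["seed"] == 1337        # default preserved
```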
@@ -124,10 +133,6 @@ def evaluate_with_vllm(args: LeaderboardArgs) -> t.Dict[str, t.Any]:
         **backend_config,
     }
 
-    # Set max_model_len if provided in eval_config
-    if "max_model_len" in eval_config:
-        model_args["max_model_len"] = eval_config.pop("max_model_len")
-
     # Extract system_instruction if provided
     system_instruction = eval_config.pop("system_instruction", None)
 
@@ -165,10 +170,6 @@ def worker(rank, world_size, args: LeaderboardArgs, result_queue: mp.Queue):
     # Prepare model_args
     model_args = {"pretrained": args["model_path"], **backend_config}
 
-    # Set max_model_len if provided in eval_config
-    if "max_model_len" in eval_config:
-        model_args["max_model_len"] = eval_config.pop("max_model_len")
-
     # Extract system_instruction if provided
     system_instruction = eval_config.pop("system_instruction", None)
 
@@ -504,14 +505,24 @@ def validate_leaderboard_v2_tasks(tasks: t.List[str]):
         )
 
 
-def get_task_groupings(tasks: t.List[str]) -> TaskGrouping:
+def get_task_groupings(
+    tasks: t.List[str], api_endpoint: t.Optional[str] = None
+) -> TaskGrouping:
     """
     Given a list of tasks, bucket them per their optimal runtime.
+    When an API endpoint is provided, all tasks are routed to OpenAI.
     """
+    if api_endpoint:
+        # When using an API endpoint, route all tasks to OpenAI
+        return {"vllm": [], "huggingface": [], "openai": tasks}
+
+    # Default behavior when no API endpoint is provided
     task_grouping: TaskGrouping = {
         "vllm": [task for task in tasks if task in LEADERBOARD_V2_GENERATIVE_TASKS],
         "huggingface": [task for task in tasks if task in LEADERBOARD_V2_MCQ_TASKS],
+        "openai": [],
     }
+
     overlapping_tasks = set(task_grouping["vllm"]) & set(task_grouping["huggingface"])
     assert not overlapping_tasks
     return task_grouping
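A quick sketch of the routing behavior this change introduces. Task names and the endpoint URL below are illustrative; the actual bucket membership depends on `LEADERBOARD_V2_GENERATIVE_TASKS` and `LEADERBOARD_V2_MCQ_TASKS`:

```python
tasks = ["leaderboard_ifeval", "leaderboard_mmlu_pro"]  # illustrative names

# Without an endpoint, tasks split between vLLM (generative) and HF (MCQ).
local = get_task_groupings(tasks)

# With an endpoint, every task is routed to the OpenAI backend.
remote = get_task_groupings(tasks, api_endpoint="http://localhost:8000/v1")
assert remote == {"vllm": [], "huggingface": [], "openai": tasks}
```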
@@ -543,51 +554,63 @@ def __init__(
         eval_config: t.Optional[t.Dict[str, t.Any]] = None,
         vllm_config: t.Optional[t.Dict[str, t.Any]] = None,
         hf_config: t.Optional[t.Dict[str, t.Any]] = None,
+        openai_config: t.Optional[t.Dict[str, t.Any]] = None,
+        api_endpoint: t.Optional[str] = None,
     ):
         """
         Initialize the evaluator.
 
         Args:
-            model_path: Path to the model to evaluate.
+            model_path: Path to the model to evaluate, or the model name for the OpenAI API.
             tasks: List of tasks to evaluate on.
-            num_gpus: Number of GPUs to use.
+            num_gpus: Number of GPUs to use (ignored when using an API endpoint).
             output_file: Path to save results to.
-            eval_config: Configuration for general evaluation parameters that apply to both backends.
+            eval_config: Configuration for general evaluation parameters that apply to all backends.
                 Default values (can be overridden):
                 - batch_size: "auto" - Batch size for evaluation, or "auto" for automatic batching
                 - apply_chat_template: True - Whether to apply chat template formatting
                 - fewshot_as_multiturn: True - Whether to format few-shot examples as multi-turn conversations
                 - confirm_run_unsafe_code: True - Whether to run potentially unsafe code without confirmation
-                - max_model_len: 32768 - Maximum sequence length for the model
                 - system_instruction: None - Optional system instruction to prepend to prompts
-                - cache_requests: False - Whether to cache requests for the dataset
+                - cache_requests: False - Whether to cache requests
             vllm_config: Configuration for vLLM-specific parameters.
                 Default values (can be overridden):
                 - dtype: "float16" - Data type for model weights
                 - gpu_memory_utilization: 0.8 - Fraction of GPU memory to use
                 - disable_custom_all_reduce: True - Whether to disable custom all-reduce implementation
                 - enforce_eager: False - Whether to enforce eager execution
+                - max_model_len: 32768 - Maximum sequence length for the model
                 And any other vLLM parameters supported by simple_evaluate.
             hf_config: Configuration for HuggingFace-specific parameters.
                 Default values (can be overridden):
                 - dtype: "float16" - Data type for model weights
                 - trust_remote_code: True - Whether to trust remote code in model loading
+                - max_length: 32768 - Maximum sequence length for the model
                 And any other HuggingFace parameters supported by simple_evaluate.
+            openai_config: Configuration for OpenAI-specific parameters.
+                Default values (can be overridden):
+                - max_tokens: 768 - Maximum tokens to generate
+                - temperature: 0.0 - Temperature for sampling
+                - seed: 1337 - Seed for reproducibility
+            api_endpoint: Optional OpenAI-compatible API endpoint.
+                When provided, tasks are evaluated using the OpenAI API instead of local models.
         """
         self.model_path = model_path
-        if not cuda.is_available():
+        if not api_endpoint and not cuda.is_available():
             raise ValueError(
-                "Running without CUDA is currently unsupported. Contributions are welcome."
+                "Running without CUDA is currently unsupported unless using an API endpoint."
             )
 
         # set whatever we need here
         self.num_gpus = num_gpus
         self.tasks = tasks
+        self.api_endpoint = api_endpoint
 
         # Store evaluation configurations
         self.eval_config = eval_config or {}
         self.vllm_config = vllm_config or {}
         self.hf_config = hf_config or {}
+        self.openai_config = openai_config or {}
 
         # validate output file
         self.output_file = output_file
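For illustration, constructing the evaluator against an OpenAI-compatible server might look like the sketch below. The class name `LeaderboardV2Evaluator` is assumed (this hunk only shows `__init__`), and the model name, endpoint, and key are placeholders:

```python
# All values below are placeholders for a hypothetical vLLM-served endpoint.
evaluator = LeaderboardV2Evaluator(  # assumed class name; not shown in this hunk
    model_path="my-model",           # model name as the API knows it
    tasks=["leaderboard_ifeval"],
    output_file="results.json",
    api_endpoint="http://localhost:8000/v1",
    openai_config={"api_key": "EMPTY", "temperature": 0.0},
)
```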
@@ -644,38 +667,49 @@ def run(
         eval_config: t.Optional[t.Dict[str, t.Any]] = None,
         vllm_config: t.Optional[t.Dict[str, t.Any]] = None,
         hf_config: t.Optional[t.Dict[str, t.Any]] = None,
+        openai_config: t.Optional[t.Dict[str, t.Any]] = None,
+        api_endpoint: t.Optional[str] = None,
     ) -> LeaderboardV2EvalResult:
         """
         Run the Open LLM Leaderboard v2 evaluation.
 
-        This function will use both HF transformers and inline vLLM to run the evaluation.
-        It will then parse the results and save them to a file.
+        This function will use the appropriate backend based on the provided parameters:
+        - With api_endpoint: Uses the OpenAI API for all tasks
+        - Without api_endpoint: Uses both HF transformers and vLLM for optimal performance
 
         Args:
-            model_path: The path to the model to evaluate.
+            model_path: The path to the model to evaluate, or the model name for the API.
             tasks: The tasks to evaluate.
-            num_gpus: The number of GPUs to use.
+            num_gpus: The number of GPUs to use (ignored when using the API).
             output_file: The path to the file to save the results to.
-            eval_config: Configuration for general evaluation parameters that apply to both backends.
+            eval_config: Configuration for general evaluation parameters that apply to all backends.
                 Default values (can be overridden):
                 - batch_size: "auto" - Batch size for evaluation, or "auto" for automatic batching
                 - apply_chat_template: True - Whether to apply chat template formatting
                 - fewshot_as_multiturn: True - Whether to format few-shot examples as multi-turn conversations
                 - confirm_run_unsafe_code: True - Whether to run potentially unsafe code without confirmation
-                - max_model_len: 32768 - Maximum sequence length for the model
                 - system_instruction: None - Optional system instruction to prepend to prompts
+                - cache_requests: False - Whether to cache requests
             vllm_config: Configuration for vLLM-specific parameters.
                 Default values (can be overridden):
                 - dtype: "float16" - Data type for model weights
                 - gpu_memory_utilization: 0.8 - Fraction of GPU memory to use
                 - disable_custom_all_reduce: True - Whether to disable custom all-reduce implementation
                 - enforce_eager: False - Whether to enforce eager execution
+                - max_model_len: 32768 - Maximum sequence length for the model
                 And any other vLLM parameters supported by simple_evaluate.
             hf_config: Configuration for HuggingFace-specific parameters.
                 Default values (can be overridden):
                 - dtype: "float16" - Data type for model weights
                 - trust_remote_code: True - Whether to trust remote code in model loading
+                - max_length: 32768 - Maximum sequence length for the model
                 And any other HuggingFace parameters supported by simple_evaluate.
+            openai_config: Configuration for OpenAI-specific parameters.
+                Default values (can be overridden):
+                - max_tokens: 768 - Maximum tokens to generate
+                - temperature: 0.0 - Temperature for sampling
+                - seed: 1337 - Seed for reproducibility
+            api_endpoint: Optional OpenAI-compatible API endpoint.
 
         Returns:
             LeaderboardV2EvalResult: A dict containing the overall leaderboard score and the breakdown per subtask.
@@ -684,60 +718,76 @@ def run(
         tasks = self.tasks if not tasks else tasks
         num_gpus = self.num_gpus if not num_gpus else num_gpus
         output_file = self.output_file if not output_file else output_file
+        api_endpoint = self.api_endpoint if api_endpoint is None else api_endpoint
 
         # Merge configurations with instance configurations, with run-time configs taking precedence
         final_eval_config = {**self.eval_config, **(eval_config or {})}
         final_vllm_config = {**self.vllm_config, **(vllm_config or {})}
         final_hf_config = {**self.hf_config, **(hf_config or {})}
+        final_openai_config = {**self.openai_config, **(openai_config or {})}
+
+        # If API endpoint is provided, add it to the OpenAI config
+        if api_endpoint and "base_url" not in final_openai_config:
+            final_openai_config["base_url"] = api_endpoint
 
         if not tasks:
             tasks = LEADERBOARD_V2_MCQ_TASKS + LEADERBOARD_V2_GENERATIVE_TASKS
 
         # validation logic
-        # no need to validate model path -- the inference libraries will either be able to
-        # load it, or they won't
-
         validate_leaderboard_v2_tasks(tasks)
-        if not num_gpus:
-            num_gpus = cuda.device_count()
-        if num_gpus <= 0 or num_gpus > cuda.device_count():
-            raise ValueError(
-                f"invalid value for num_gpus, must be between 1 and {cuda.device_count()}; got: {num_gpus}"
-            )
+
+        # Only validate GPU requirements when not using an API endpoint
+        if not api_endpoint:
+            if not num_gpus:
+                num_gpus = cuda.device_count()
+            if num_gpus <= 0 or num_gpus > cuda.device_count():
+                raise ValueError(
+                    f"invalid value for num_gpus, must be between 1 and {cuda.device_count()}; got: {num_gpus}"
+                )
+
         if output_file:
             validate_output_path(output_file)
 
-        # now we just have to run the task group in their most appropriate runtime
-        # this is important because certain tasks like MCQ are better-suited to be
-        # excuted in raw transformers due to the lack of KV-Cache overhead,
-        # whereas generative tasks are better suited for vLLM due to their need for
-        # accessing previous tokens
-
-        grouped_tasks = get_task_groupings(tasks)
+        # Group tasks by optimal runtime
+        grouped_tasks = get_task_groupings(tasks, api_endpoint)
         self._lm_eval_results = []
-        vllm_results, hf_results = None, None
-        if vllm_tasks := grouped_tasks["vllm"]:
-            args_vllm: LeaderboardArgs = {
-                "model_path": model_path,
-                "num_gpus": num_gpus,
-                "tasks": vllm_tasks,
-                "eval_config": final_eval_config,
-                "backend_config": final_vllm_config,
-            }
-            vllm_results = evaluate_with_vllm(args_vllm)
-            self._lm_eval_results.append(vllm_results)
-        if hf_tasks := grouped_tasks["huggingface"]:
-            args_hf: LeaderboardArgs = {
+
+        # Execute tasks using the appropriate backends
+        if openai_tasks := grouped_tasks["openai"]:
+            args_openai: LeaderboardArgs = {
                 "model_path": model_path,
-                "num_gpus": num_gpus,
-                "tasks": hf_tasks,
+                "num_gpus": 1,  # Not used for API calls but required by the type
+                "tasks": openai_tasks,
                 "eval_config": final_eval_config,
-                "backend_config": final_hf_config,
+                "backend_config": final_openai_config,
             }
-            hf_results = evaluate_with_hf(args_hf)
-            self._lm_eval_results.append(hf_results)
-
-        # convert the output of lm-eval into something that's already parsed
+            openai_results = evaluate_with_openai(args_openai)
+            self._lm_eval_results.append(openai_results)
+        else:
+            # Only run local evaluation if not using OpenAI API
+            if vllm_tasks := grouped_tasks["vllm"]:
+                args_vllm: LeaderboardArgs = {
+                    "model_path": model_path,
+                    "num_gpus": num_gpus,
+                    "tasks": vllm_tasks,
+                    "eval_config": final_eval_config,
+                    "backend_config": final_vllm_config,
+                }
+                vllm_results = evaluate_with_vllm(args_vllm)
+                self._lm_eval_results.append(vllm_results)
+
+            if hf_tasks := grouped_tasks["huggingface"]:
+                args_hf: LeaderboardArgs = {
+                    "model_path": model_path,
+                    "num_gpus": num_gpus,
+                    "tasks": hf_tasks,
+                    "eval_config": final_eval_config,
+                    "backend_config": final_hf_config,
+                }
+                hf_results = evaluate_with_hf(args_hf)
+                self._lm_eval_results.append(hf_results)
+
+        # Convert the output of lm-eval into something that's already parsed
        results: LeaderboardV2EvalResult = get_scores_from_result_dicts(
             *self._lm_eval_results
         )
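Because run-time arguments are merged over the instance configuration, the same evaluator instance can switch between local and API-backed evaluation. A sketch, assuming `evaluator` was constructed without an `api_endpoint` (names and values illustrative):

```python
# First call: no endpoint, so tasks split across vLLM and HF locally.
local_scores = evaluator.run(num_gpus=2)

# Second call: the run-time endpoint overrides the instance value (None),
# and max_tokens is merged over the instance/default OpenAI config.
remote_scores = evaluator.run(
    api_endpoint="http://localhost:8000/v1",
    openai_config={"max_tokens": 1024},
)
```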
@@ -746,3 +796,47 @@ def run(
         if output_file:
             self.save_to_file(output_file)
         return results
+
+
+def evaluate_with_openai(args: LeaderboardArgs) -> t.Dict[str, t.Any]:
+    # Start with default configurations
+    eval_config = deepcopy(DEFAULT_EVAL_CONFIG)
+    backend_config = deepcopy(DEFAULT_OPENAI_CONFIG)
+
+    # Override with user-provided configurations
+    if "eval_config" in args and args["eval_config"]:
+        eval_config.update(args["eval_config"])
+    if "backend_config" in args and args["backend_config"]:
+        backend_config.update(args["backend_config"])
+
+    # Extract base_url and api_key from backend_config if provided
+    base_url = backend_config.pop("base_url", None)
+    api_key = backend_config.pop("api_key", None)
+
+    # Build model_args for lm-eval's OpenAI client
+    model_args = {
+        "model": args["model_path"],  # model name as recognized by the API
+    }
+
+    # Add base_url if provided
+    if base_url:
+        model_args["base_url"] = base_url
+
+    # Add API key if provided
+    if api_key:
+        model_args["api_key"] = api_key
+
+    # Add any remaining backend config options
+    model_args.update(backend_config)
+
+    # Extract system_instruction if provided
+    system_instruction = eval_config.pop("system_instruction", None)
+
+    results = simple_evaluate(
+        tasks=args["tasks"],
+        model="openai",
+        model_args=model_args,
+        system_instruction=system_instruction,
+        **eval_config,
+    )
+    return results
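A sketch of how `evaluate_with_openai` might be invoked directly; all field values are placeholders. Note that `base_url` and `api_key` ride along in `backend_config` and are popped out before the remaining keys are forwarded as `model_args`:

```python
args: LeaderboardArgs = {
    "model_path": "my-model",   # model name as recognized by the API
    "num_gpus": 1,              # unused for API calls, required by the type
    "tasks": ["leaderboard_ifeval"],
    "eval_config": {"batch_size": "auto"},
    "backend_config": {
        "base_url": "http://localhost:8000/v1",
        "api_key": "EMPTY",
        "max_tokens": 1024,     # merged over DEFAULT_OPENAI_CONFIG
    },
}
results = evaluate_with_openai(args)
```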