Skip to content

Commit b896d0a

Browse files
author
Dan McPherson
committed
Remove task logic with lm_eval 0.4.4 for agg_score
lm_eval used to return an extra entry that corresponded to the tasks requested. Ex: mmlu_pr. As of 0.4.4 the entries are now the same whether the tasks are custom or not, and the extra entry is removed. So the agg score now needs to be calculated from the individual task scores returned, so the logic can be shared with MMLUEvaluator. Signed-off-by: Dan McPherson <dmcphers@redhat.com>
1 parent 40cc370 commit b896d0a

1 file changed

Lines changed: 31 additions & 64 deletions

File tree

src/instructlab/eval/mmlu.py

Lines changed: 31 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,37 @@ def __init__(
122122
self.batch_size = batch_size
123123
self.device = device
124124

def run(self, server_url: str | None = None) -> tuple:
    """
    Run the evaluation and aggregate the per-task results.

    Attributes
        server_url   Model server endpoint (Ex: http://localhost:8000/v1) for the model being evaluated

    Returns:
        overall_score       Average score for the task group
        individual_scores   Individual scores for each task in the task group
    """
    logger.debug(locals())

    # TODO: make this a parameter for class?
    os.environ["TOKENIZERS_PARALLELISM"] = "true"

    results = self._run_mmlu(server_url)

    # Accumulate each task's accuracy; the running total feeds the average below.
    individual_scores: dict = {}
    agg_score: float = 0.0
    for task_name, task_result in results.items():
        task_score = float(task_result["acc,none"])
        agg_score += task_score
        individual_scores[task_name] = {
            "score": task_score,
            "stderr": float(task_result["acc_stderr,none"]),
        }

    overall_score = float(agg_score / len(self.tasks))

    return overall_score, individual_scores
125156
def _run_mmlu(self, server_url: str | None = None) -> dict:
126157
if server_url is not None:
127158
# Requires lm_eval >= 0.4.4
@@ -205,36 +236,6 @@ def __init__(
205236
model_path, None, tasks, model_dtype, few_shots, batch_size, device
206237
)
207238

208-
def run(self, server_url: str | None = None) -> tuple:
209-
"""
210-
Runs MMLU evaluation
211-
212-
Attributes
213-
server_url Model server endpoint (Ex: http://localhost:8000/v1) for the model being evaluated
214-
215-
Returns:
216-
overall_score MMLU score for the overall model evaluation
217-
individual_scores Individual MMLU score for each task
218-
"""
219-
logger.debug(locals())
220-
# TODO: make this a parameter for class?
221-
os.environ["TOKENIZERS_PARALLELISM"] = "true"
222-
223-
individual_scores: dict = {}
224-
agg_score: float = 0.0
225-
226-
results = self._run_mmlu(server_url)
227-
228-
for task in self.tasks:
229-
mmlu_res = results[task]
230-
agg_score += float(mmlu_res["acc,none"])
231-
individual_scores[task] = {}
232-
individual_scores[task]["score"] = float(mmlu_res["acc,none"])
233-
individual_scores[task]["stderr"] = float(mmlu_res["acc_stderr,none"])
234-
235-
overall_score = float(agg_score / len(self.tasks))
236-
return overall_score, individual_scores
237-
238239

239240
class MMLUBranchEvaluator(AbstractMMLUEvaluator):
240241
"""
@@ -251,37 +252,3 @@ class MMLUBranchEvaluator(AbstractMMLUEvaluator):
251252
"""
252253

253254
name = "mmlu_branch"
254-
255-
def run(self, server_url: str | None = None) -> tuple:
256-
"""
257-
Runs MMLUBranch evaluation
258-
259-
Attributes
260-
server_url Model server endpoint (Ex: http://localhost:8000/v1) for the model being evaluated
261-
262-
Returns:
263-
overall_score Average MMLUBranch score for the task group
264-
individual_scores Individual MMLUBranch scores for each task in the task group
265-
"""
266-
logger.debug(locals())
267-
268-
# TODO: make this a parameter for class?
269-
os.environ["TOKENIZERS_PARALLELISM"] = "true"
270-
271-
individual_scores: dict = {}
272-
agg_score: float = 0.0
273-
274-
results = self._run_mmlu(server_url)
275-
276-
for task, result in results.items():
277-
if task in self.tasks:
278-
agg_score += float(result["acc,none"])
279-
else:
280-
individual_scores[task] = {
281-
"score": float(result["acc,none"]),
282-
"stderr": float(result["acc_stderr,none"]),
283-
}
284-
285-
overall_score = float(agg_score / len(self.tasks))
286-
287-
return overall_score, individual_scores

0 commit comments

Comments
 (0)