Skip to content

Commit 219bca1

Browse files
committed
Return overall_score from the MT-Bench branch judge_answers() method (the value was previously discarded from mt_bench_judgment.generate_judgment())
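This allows callers of the library to show the overall score along with the QA pairs and the error rate. This commit changes what a function in the library returns — judge_answers() now returns (overall_score, qa_pairs, error_rate) instead of (qa_pairs, error_rate) — and thus is not backwards compatible; existing callers must unpack three values. Signed-off-by: Ali Maredia <amaredia@redhat.com>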
1 parent eba1962 commit 219bca1

2 files changed

Lines changed: 9 additions & 3 deletions

File tree

src/instructlab/eval/mt_bench.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -260,10 +260,12 @@ def judge_answers(
260260
serving_gpus Number of gpus allocated for serving. Used to tune with max_workers=auto. None indicates to use value specified in constructor.
261261
262262
Returns:
263+
overall_score overall score from the evaluation
263264
qa_pairs Question and answer pairs (with scores) from the evaluation
265+
error_rate percentage of questions dropped due to errors during evaluation
264266
"""
265267
logger.debug(locals())
266-
_, qa_pairs, _, error_rate = mt_bench_judgment.generate_judgment(
268+
overall_score, qa_pairs, _, error_rate = mt_bench_judgment.generate_judgment(
267269
self.model_name,
268270
self.judge_model_name,
269271
server_url,
@@ -275,4 +277,4 @@ def judge_answers(
275277
bench_name="mt_bench_branch",
276278
merge_system_user_message=self.merge_system_user_message,
277279
)
278-
return qa_pairs, error_rate
280+
return overall_score, qa_pairs, error_rate

tests/test_branch_judge_answers.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,11 @@
1010
"../taxonomy",
1111
"main",
1212
)
13-
qa_pairs, error_rate = mt_bench_branch.judge_answers("http://localhost:8000/v1")
13+
overall_score, qa_pairs, error_rate = mt_bench_branch.judge_answers(
14+
"http://localhost:8000/v1"
15+
)
16+
17+
print(f"Overall Score: {overall_score}")
1418
print(f"Error Rate: {error_rate}")
1519
print(f"QA Pair 0:")
1620
pprint.pprint(qa_pairs[0])

0 commit comments

Comments
 (0)