
Commit 2db611f

Merge pull request #769 from mlcommons/scoring_fix
cosmetic and functional fixes to scoring code
2 parents dc6f189 + be6560e

2 files changed: 100 additions & 35 deletions

scoring/performance_profile.py
Lines changed: 51 additions & 23 deletions
@@ -26,14 +26,17 @@
   the dictionary of submissions.
 """
 import itertools
+import logging
 import operator
 import os
 import re
 
 from absl import logging
+import matplotlib as mpl
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
+from tabulate import tabulate
 
 from algorithmic_efficiency.workloads.workloads import get_base_workload_name
 import algorithmic_efficiency.workloads.workloads as workloads_registry
@@ -63,6 +66,37 @@
 
 MAX_EVAL_METRICS = ['mean_average_precision', 'ssim', 'accuracy', 'bleu']
 
+# MPL params
+mpl.rcParams['figure.figsize'] = (16, 10)  # Width, height in inches
+mpl.rcParams['font.family'] = 'serif'
+mpl.rcParams['font.serif'] = [
+    'Times New Roman'
+] + mpl.rcParams['font.serif']  # Add Times New Roman as first choice
+mpl.rcParams['font.size'] = 22
+mpl.rcParams['savefig.dpi'] = 300  # Set resolution for saved figures
+
+# Plot Elements
+mpl.rcParams['lines.linewidth'] = 3  # Adjust line thickness if needed
+mpl.rcParams['lines.markersize'] = 6  # Adjust marker size if needed
+mpl.rcParams['axes.prop_cycle'] = mpl.cycler(
+    color=["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728",
+           "#9467bd"])  # Example color cycle (consider ColorBrewer or viridis)
+mpl.rcParams['axes.labelsize'] = 22  # Axis label font size
+mpl.rcParams['xtick.labelsize'] = 20  # Tick label font size
+mpl.rcParams['ytick.labelsize'] = 20
+
+# Legends and Gridlines
+mpl.rcParams['legend.fontsize'] = 20  # Legend font size
+mpl.rcParams[
+    'legend.loc'] = 'best'  # Let matplotlib decide the best legend location
+mpl.rcParams['axes.grid'] = True  # Enable grid
+mpl.rcParams['grid.alpha'] = 0.4  # Gridline transparency
+
+
+def print_dataframe(df):
+  tabulated_df = tabulate(df.T, headers='keys', tablefmt='psql')
+  logging.info(tabulated_df)
+
 
 def generate_eval_cols(metrics):
   splits = ['train', 'validation']
@@ -177,11 +211,13 @@ def get_workloads_time_to_target(submission,
       num_trials = len(group)
       if num_trials != NUM_TRIALS and not self_tuning_ruleset:
         if strict:
-          raise ValueError(f'Expecting {NUM_TRIALS} trials for workload '
-                           f'{workload} but found {num_trials} trials.')
+          raise ValueError(
+              f'In Study {study}: Expecting {NUM_TRIALS} trials for workload '
+              f'{workload} but found {num_trials} trials.')
         else:
-          logging.warning(f'Expecting {NUM_TRIALS} trials for workload '
-                          f'{workload} but found {num_trials} trials.')
+          logging.warning(
+              f'In Study {study}: Expecting {NUM_TRIALS} trials for workload '
+              f'{workload} but found {num_trials} trials.')
 
       # Get trial and time index that reaches target
       trial_idx, time_idx = get_best_trial_index(
@@ -194,13 +230,12 @@ def get_workloads_time_to_target(submission,
 
    workloads.append({
        'submission': submission_name,
-       'workload': workload,
+       'workload': re.sub(r'_(jax|pytorch)$', '', workload),
        time_col: np.median(time_vals_per_study),
    })
 
  df = pd.DataFrame.from_records(workloads)
  df = df.pivot(index='submission', columns='workload', values=time_col)
-
  return df
 
 
@@ -276,19 +311,13 @@ def compute_performance_profiles(submissions,
   # For each held-out workload if variant target was not hit set submission to inf
   framework = None
   for workload in df.keys():
-    # Check if this is a variant
-    framework = workload.split('_')[-1]
-    workload_ = workload.split(f'_{framework}')[0]
-    if workload_ not in BASE_WORKLOADS:
+    if workload not in BASE_WORKLOADS:
       # If variants do not have finite score set base_workload score to inf
-      base_workload = get_base_workload_name(workload_)
+      base_workload = get_base_workload_name(workload)
       df[base_workload] = df.apply(
-          variant_criteria_filter(base_workload + f'_{framework}', workload),
-          axis=1)
+          variant_criteria_filter(base_workload, workload), axis=1)
 
-  base_workloads = [w + f'_{framework}' for w in BASE_WORKLOADS]
-  df = df[base_workloads]
-  print(df)
+  df = df[BASE_WORKLOADS]
 
   if verbosity > 0:
     logging.info('\n`{time_col}` to reach target:')
@@ -375,8 +404,7 @@ def plot_performance_profiles(perf_df,
                               df_col,
                               scale='linear',
                               save_dir=None,
-                              figsize=(30, 10),
-                              font_size=18):
+                              figsize=(30, 10)):
   """Plot performance profiles.
 
   Args:
@@ -396,12 +424,12 @@ def plot_performance_profiles(perf_df,
   Returns:
     None. If a valid save_dir is provided, save both the plot and perf_df.
   """
-  fig = perf_df.T.plot(figsize=figsize)
+  fig = perf_df.T.plot(figsize=figsize, alpha=0.7)
   df_col_display = f'log10({df_col})' if scale == 'log' else df_col
-  fig.set_xlabel(
-      f'Ratio of `{df_col_display}` to best submission', size=font_size)
-  fig.set_ylabel('Proportion of workloads', size=font_size)
-  fig.legend(prop={'size': font_size}, bbox_to_anchor=(1.0, 1.0))
+  fig.set_xlabel(f'Ratio of `{df_col_display}` to best submission')
+  fig.set_ylabel('Proportion of workloads')
+  fig.legend(bbox_to_anchor=(1.0, 1.0))
+  plt.tight_layout()
   maybe_save_figure(save_dir, f'performance_profile_by_{df_col_display}')
   maybe_save_df_to_csv(save_dir,
                        perf_df,
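
For context: the functional core of this file's change is the workload-name normalization in `get_workloads_time_to_target`, which lets `compute_performance_profiles` index `df[BASE_WORKLOADS]` directly instead of appending a per-framework suffix. A minimal sketch (not part of the commit; the workload IDs below are hypothetical) of what the `re.sub` call does:

    import re

    # Strip a trailing framework suffix so JAX and PyTorch runs of the same
    # workload collapse into one column of the pivoted DataFrame; names
    # without a suffix pass through unchanged.
    for workload in ['imagenet_resnet_jax', 'wmt_pytorch', 'ogbg']:
      print(re.sub(r'_(jax|pytorch)$', '', workload))
    # -> imagenet_resnet
    # -> wmt
    # -> ogbg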

scoring/score_submissions.py
Lines changed: 49 additions & 12 deletions
@@ -12,19 +12,20 @@
   --compute_performance_profiles
 """
 
+import json
 import operator
 import os
+import pickle
 
 from absl import app
 from absl import flags
 from absl import logging
 import numpy as np
 import pandas as pd
+import performance_profile
 import scoring_utils
 from tabulate import tabulate
 
-from scoring import performance_profile
-
 flags.DEFINE_string(
     'submission_directory',
     None,
@@ -45,6 +46,16 @@
     'self_tuning_ruleset',
     False,
     'Whether to score on self-tuning ruleset or externally tuned ruleset')
+flags.DEFINE_string(
+    'save_results_to_filename',
+    None,
+    'Filename to save the processed results that are fed into the performance profile functions.'
+)
+flags.DEFINE_string(
+    'load_results_from_filename',
+    None,
+    'Filename to load processed results from that are fed into the performance profile functions.'
+)
 FLAGS = flags.FLAGS
 
 
@@ -101,8 +112,13 @@ def get_summary_df(workload, workload_df, include_test_split=False):
   return summary_df
 
 
-def print_submission_summary(df, include_test_split=True):
+def get_submission_summary(df, include_test_split=True):
+  """Summarizes the submission results into metric and time tables
+  organized by workload.
+  """
+
   dfs = []
+  print(df)
   for workload, group in df.groupby('workload'):
     summary_df = get_summary_df(
         workload, group, include_test_split=include_test_split)
@@ -115,15 +131,36 @@ def print_submission_summary(df, include_test_split=True):
 
 def main(_):
   results = {}
-
-  for submission in os.listdir(FLAGS.submission_directory):
-    experiment_path = os.path.join(FLAGS.submission_directory, submission)
-    df = scoring_utils.get_experiment_df(experiment_path)
-    results[submission] = df
-    summary_df = print_submission_summary(df)
-    with open(os.path.join(FLAGS.output_dir, f'{submission}_summary.csv'),
-              'w') as fout:
-      summary_df.to_csv(fout)
+  os.makedirs(FLAGS.output_dir, exist_ok=True)
+
+  # Optionally read processed results from file
+  if FLAGS.load_results_from_filename:
+    with open(
+        os.path.join(FLAGS.output_dir, FLAGS.load_results_from_filename),
+        'rb') as f:
+      results = pickle.load(f)
+  else:
+    for team in os.listdir(FLAGS.submission_directory):
+      for submission in os.listdir(
+          os.path.join(FLAGS.submission_directory, team)):
+        print(submission)
+        experiment_path = os.path.join(FLAGS.submission_directory,
+                                       team,
+                                       submission)
+        df = scoring_utils.get_experiment_df(experiment_path)
+        results[submission] = df
+        summary_df = get_submission_summary(df)
+        with open(
+            os.path.join(FLAGS.output_dir, f'{submission}_summary.csv'),
+            'w') as fout:
+          summary_df.to_csv(fout)
+
+  # Optionally save processed results to file
+  if FLAGS.save_results_to_filename:
+    with open(
+        os.path.join(FLAGS.output_dir, FLAGS.save_results_to_filename),
+        'wb') as f:
+      pickle.dump(results, f)
 
   if not FLAGS.strict:
     logging.warning(
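
As a usage note (paths and filenames here are placeholders, not from the commit): the new `save_results_to_filename` / `load_results_from_filename` flags let `main` cache the per-submission DataFrames with pickle, so repeated scoring runs can skip re-parsing the raw experiment logs. A minimal sketch of the round-trip the flags enable:

    import os
    import pickle

    output_dir = '/tmp/scoring_output'  # stand-in for FLAGS.output_dir
    os.makedirs(output_dir, exist_ok=True)

    # Stand-in for the real {submission_name: experiment DataFrame} mapping.
    results = {'example_submission': None}

    with open(os.path.join(output_dir, 'results.pkl'), 'wb') as f:
      pickle.dump(results, f)  # what --save_results_to_filename=results.pkl does

    with open(os.path.join(output_dir, 'results.pkl'), 'rb') as f:
      results = pickle.load(f)  # what --load_results_from_filename=results.pkl does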
