
Commit 262a9e6

Merge branch 'main' into scoring

2 parents: a0e4502 + a23b5ea

6 files changed: 281 additions & 55 deletions

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -20,6 +20,7 @@ algorithmic_efficiency/workloads/librispeech_conformer/work_dir
 *.vocab
 wandb/
 *.txt
+scoring/plots/
 
 !scoring/test_data/experiment_dir/study_0/mnist_jax/trial_0/eval_measurements.csv
 !scoring/test_data/experiment_dir/study_0/mnist_jax/trial_1/eval_measurements.csv

CALL_FOR_SUBMISSIONS.md

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@ Submissions can compete under two hyperparameter tuning rulesets (with separate
 - **Registration deadline to express non-binding intent to submit: February 28th, 2024**.\
   Please fill out the (mandatory but non-binding) [**registration form**](https://forms.gle/K7ty8MaYdi2AxJ4N8).
 - **Submission deadline: April 04th, 2024** *(moved by a week from the initial March 28th, 2024)*
-- [tentative] Announcement of all results: July 15th, 2024
+- [Announcement of all results](https://mlcommons.org/2024/08/mlc-algoperf-benchmark-competition/): August 1st, 2024
 
 For a detailed and up-to-date timeline see the [Competition Rules](/COMPETITION_RULES.md).

README.md

Lines changed: 1 addition & 4 deletions
@@ -27,10 +27,7 @@
 ---
 
 > [!IMPORTANT]
-> Submitters are no longer required to self-report results.
-> We are currently in the process of evaluating and scoring received submissions.
-> We are aiming to release results by July 15th 2024.
-> For other key dates please see [Call for Submissions](CALL_FOR_SUBMISSIONS.md).
+> The results of the inaugural AlgoPerf: Training Algorithms benchmark competition have been announced. See the [MLCommons blog post](https://mlcommons.org/2024/08/mlc-algoperf-benchmark-competition/) for an overview and the [results page](https://mlcommons.org/benchmarks/algorithms/) for more details on the results. We are currently preparing an in-depth analysis of the results in the form of a paper and plan the next iteration of the benchmark competition.
 
 
 ## Table of Contents <!-- omit from toc -->

scoring/compute_speedups.py

Lines changed: 112 additions & 0 deletions
@@ -0,0 +1,112 @@
+"""File to compute speedups (i.e. geometric means between runtimes)."""
+
+import pickle
+
+from absl import app
+from absl import flags
+import numpy as np
+import pandas as pd
+from performance_profile import BASE_WORKLOADS
+from performance_profile import get_workloads_time_to_target
+from scipy import stats
+
+flags.DEFINE_string('results_txt', None, 'Path to full scoring results file.')
+flags.DEFINE_string(
+    'base',
+    'prize_qualification_baseline',
+    'Base submission to compare to. Defaults to the `prize_qualification_baseline`.'
+)
+flags.DEFINE_string('comparison', None, 'Submission to compute the speedup of.')
+flags.DEFINE_boolean('self_tuning_ruleset',
+                     False,
+                     'Whether the self-tuning ruleset is being scored.')
+flags.DEFINE_boolean('save_results',
+                     False,
+                     'Whether to save the results to disk.')
+FLAGS = flags.FLAGS
+
+MAX_BUDGETS = {
+    'criteo1tb': 7703,
+    'fastmri': 8859,
+    'imagenet_resnet': 63_008,
+    'imagenet_vit': 77_520,
+    'librispeech_conformer': 61_068,
+    'librispeech_deepspeech': 55_506,
+    'ogbg': 18_477,
+    'wmt': 48_151,
+}
+
+
+def replace_inf(row):
+  """Replace infs with maximum runtime budget (+1 second).
+
+  Args:
+    row (pd.Series): The original row.
+
+  Returns:
+    pd.Series: The row with infs replaced.
+  """
+  workload_name = row.name
+  # Factor of 3 for self-tuning ruleset
+  factor = 3 if FLAGS.self_tuning_ruleset else 1
+  max_runtime_workload = factor * MAX_BUDGETS[workload_name]
+  row.replace(np.inf, max_runtime_workload + 1, inplace=True)
+  return row
+
+
+def compute_speedup():
+  """Compute speedup between two algorithms."""
+  # Load results from disk
+  with open(FLAGS.results_txt, 'rb') as f:
+    results = pickle.load(f)
+
+  # Compute median over runtimes for both training algorithms
+  base_results = get_workloads_time_to_target(
+      results[FLAGS.base],
+      FLAGS.base,
+      time_col="score",
+      self_tuning_ruleset=FLAGS.self_tuning_ruleset,
+  )
+  comparison_results = get_workloads_time_to_target(
+      results[FLAGS.comparison],
+      FLAGS.comparison,
+      time_col="score",
+      self_tuning_ruleset=FLAGS.self_tuning_ruleset,
+  )
+
+  # Merge results
+  merged_results = pd.concat([base_results, comparison_results]).transpose()
+
+  # Ignore workload variants (only consider base workloads) for speedup
+  merged_results = merged_results.loc[merged_results.index.isin(BASE_WORKLOADS)]
+
+  # Replace infs with maximum runtime budget (+1 second)
+  merged_results = merged_results.apply(replace_inf, axis=1)
+
+  # Compute speedup
+  merged_results['speedup'] = merged_results[
+      f'{FLAGS.comparison}'] / merged_results[f'{FLAGS.base}']
+  speedups = merged_results['speedup'].to_numpy()
+  mean_speedup = stats.gmean(speedups)  # Geometric mean over workload speedups
+
+  print(merged_results, end='\n\n')
+  print(
+      f"Average speedup of {FLAGS.comparison} compared to {FLAGS.base}: {mean_speedup} or roughly {(1-mean_speedup):.1%}"
+  )
+
+  if FLAGS.save_results:
+    # Optionally save results to disk
+    print("Saving results to disk...")
+    filename = f'{FLAGS.comparison}_vs_{FLAGS.base}_speedup_{(1-mean_speedup):.1%}.csv'
+    merged_results.to_csv(filename)
+
+
+def main(_):
+  """Main function to compute speedup between two algorithms."""
+  compute_speedup()
+
+
+if __name__ == '__main__':
+  flags.mark_flag_as_required('results_txt')
+  flags.mark_flag_as_required('comparison')
+  app.run(main)
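
For context: the statistic this new script reports is the geometric mean of the per-workload runtime ratios between the comparison submission and the base. A minimal, self-contained sketch of that aggregation (the submission names and runtimes below are illustrative, not actual results):

import pandas as pd
from scipy import stats

# Hypothetical median times-to-target in seconds, one row per base workload.
runtimes = pd.DataFrame(
    {
        'prize_qualification_baseline': [7000.0, 8000.0, 60000.0],
        'my_submission': [3500.0, 6000.0, 45000.0],  # hypothetical comparison
    },
    index=['criteo1tb', 'fastmri', 'imagenet_resnet'])

# Per-workload ratio comparison/base; values below 1 mean the comparison is faster.
ratios = runtimes['my_submission'] / runtimes['prize_qualification_baseline']

# Geometric mean over workloads, as in compute_speedups.py above.
mean_speedup = stats.gmean(ratios.to_numpy())
print(f'{mean_speedup:.3f}, i.e. roughly {(1 - mean_speedup):.1%} faster')
# -> 0.655, i.e. roughly 34.5% faster

Note that in the script itself, a missed target (an inf entry) is first replaced by the per-workload runtime budget plus one second (tripled under the self-tuning ruleset, see replace_inf) before the ratios are formed, so no workload drops out of the mean.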

scoring/performance_profile.py

Lines changed: 83 additions & 32 deletions
@@ -26,14 +26,17 @@
   the dictionary of submissions.
 """
 import itertools
+import json
 import operator
 import os
 import re
 
 from absl import logging
+import matplotlib as mpl
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
+from tabulate import tabulate
 
 from algorithmic_efficiency.workloads.workloads import get_base_workload_name
 import algorithmic_efficiency.workloads.workloads as workloads_registry
@@ -43,6 +46,10 @@
 BASE_WORKLOADS = workloads_registry.BASE_WORKLOADS
 WORKLOAD_NAME_PATTERN = '(.*)(_jax|_pytorch)'
 BASE_WORKLOADS_DIR = 'algorithmic_efficiency/workloads/'
+# Open json file to read heldout workloads
+# TODO: This probably shouldn't be hardcoded but passed as an argument.
+with open("held_out_workloads_algoperf_v05.json", "r") as f:
+  HELDOUT_WORKLOADS = json.load(f)
 # These global variables have to be set according to the current set of
 # workloads and rules for the scoring to be correct.
 # We do not use the workload registry since it contains test and development
@@ -63,6 +70,37 @@
 
 MAX_EVAL_METRICS = ['mean_average_precision', 'ssim', 'accuracy', 'bleu']
 
+# MPL params
+mpl.rcParams['figure.figsize'] = (16, 10)  # Width, height in inches
+mpl.rcParams['font.family'] = 'serif'
+mpl.rcParams['font.serif'] = [
+    'Times New Roman'
+] + mpl.rcParams['font.serif']  # Add Times New Roman as first choice
+mpl.rcParams['font.size'] = 22
+mpl.rcParams['savefig.dpi'] = 300  # Set resolution for saved figures
+
+# Plot Elements
+mpl.rcParams['lines.linewidth'] = 3  # Adjust line thickness if needed
+mpl.rcParams['lines.markersize'] = 6  # Adjust marker size if needed
+mpl.rcParams['axes.prop_cycle'] = mpl.cycler(
+    color=["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728",
+           "#9467bd"])  # Example color cycle (consider ColorBrewer or viridis)
+mpl.rcParams['axes.labelsize'] = 22  # Axis label font size
+mpl.rcParams['xtick.labelsize'] = 20  # Tick label font size
+mpl.rcParams['ytick.labelsize'] = 20
+
+# Legends and Gridlines
+mpl.rcParams['legend.fontsize'] = 20  # Legend font size
+mpl.rcParams[
+    'legend.loc'] = 'best'  # Let matplotlib decide the best legend location
+mpl.rcParams['axes.grid'] = True  # Enable grid
+mpl.rcParams['grid.alpha'] = 0.4  # Gridline transparency
+
+
+def print_dataframe(df):
+  tabulated_df = tabulate(df.T, headers='keys', tablefmt='psql')
+  logging.info(tabulated_df)
+
 
 def generate_eval_cols(metrics):
   splits = ['train', 'validation']
@@ -150,10 +188,10 @@ def get_workloads_time_to_target(submission,
     if strict:
       raise ValueError(
           f'Expecting {NUM_BASE_WORKLOADS + NUM_VARIANT_WORKLOADS} workloads '
-          f'but found {num_workloads} workloads.')
+          f'but found {num_workloads} workloads for {submission_name}.')
     logging.warning(
         f'Expecting {NUM_BASE_WORKLOADS + NUM_VARIANT_WORKLOADS} workloads '
-        f'but found {num_workloads} workloads.')
+        f'but found {num_workloads} workloads for {submission_name}.')
 
   # For each workload, get the submission times to target.
   for workload, group in submission.groupby('workload'):
@@ -164,11 +202,13 @@
     num_studies = len(group.groupby('study'))
     if num_studies != NUM_STUDIES:
       if strict:
-        raise ValueError(f'Expecting {NUM_STUDIES} trials for workload '
-                         f'{workload} but found {num_studies} trials.')
+        raise ValueError(f'Expecting {NUM_STUDIES} studies for workload '
+                         f'{workload} but found {num_studies} studies '
+                         f'for {submission_name}.')
       else:
-        logging.warning(f'Expecting {NUM_STUDIES} trials for workload '
-                        f'{workload} but found {num_studies} trials.')
+        logging.warning(f'Expecting {NUM_STUDIES} studies for workload '
+                        f'{workload} but found {num_studies} studies '
+                        f'for {submission_name}.')
 
     # For each study check trials
     for study, group in group.groupby('study'):
@@ -177,11 +217,15 @@
       num_trials = len(group)
       if num_trials != NUM_TRIALS and not self_tuning_ruleset:
        if strict:
-          raise ValueError(f'Expecting {NUM_TRIALS} trials for workload '
-                           f'{workload} but found {num_trials} trials.')
+          raise ValueError(
+              f'In Study {study}: Expecting {NUM_TRIALS} trials for workload '
+              f'{workload} but found {num_trials} trials '
+              f'for {submission_name}.')
        else:
-          logging.warning(f'Expecting {NUM_TRIALS} trials for workload '
-                          f'{workload} but found {num_trials} trials.')
+          logging.warning(
+              f'In Study {study}: Expecting {NUM_TRIALS} trials for workload '
+              f'{workload} but found {num_trials} trials '
+              f'for {submission_name}.')
 
       # Get trial and time index that reaches target
       trial_idx, time_idx = get_best_trial_index(
@@ -194,13 +238,12 @@
 
     workloads.append({
         'submission': submission_name,
-        'workload': workload,
+        'workload': re.sub(r'_(jax|pytorch)$', '', workload),
        time_col: np.median(time_vals_per_study),
     })
 
   df = pd.DataFrame.from_records(workloads)
   df = df.pivot(index='submission', columns='workload', values=time_col)
-
   return df
 
 
@@ -210,6 +253,9 @@ def filter(x):
     try:
       if x[variant_workload] == np.inf:
         return np.inf
+      # Also check for nan values (e.g. OOMs)
+      elif np.isnan(x[variant_workload]):
+        return np.inf
       else:
         return x[base_workload]
     except KeyError as e:
@@ -268,27 +314,33 @@ def compute_performance_profiles(submissions,
         self_tuning_ruleset,
         strict))
   df = pd.concat(dfs)
+  # Restrict to base and sampled held-out workloads
+  # (ignore the additional workload variants of the baseline
+  # as they cause issues when checking for nans in workload variants).
+  df = df[BASE_WORKLOADS + HELDOUT_WORKLOADS]
+  # Sort workloads alphabetically (for better display)
+  df = df.reindex(sorted(df.columns), axis=1)
+
+  # For each held-out workload, set the score to inf if the base workload is inf or nan
+  for workload in df.keys():
+    if workload not in BASE_WORKLOADS:
+      # If the base does not have a finite score, set the variant score to inf
+      base_workload = get_base_workload_name(workload)
+      df[workload] = df.apply(
+          variant_criteria_filter(workload, base_workload), axis=1)
 
   # Set score to inf if not within 4x of fastest submission
   best_scores = df.min(axis=0)
   df[df.apply(lambda x: x > 4 * best_scores, axis=1)] = np.inf
 
-  # For each held-out workload if variant target was not hit set submission to inf
-  framework = None
+  # For each base workload, if the variant target was not hit, set the submission to inf
   for workload in df.keys():
-    # Check if this is a variant
-    framework = workload.split('_')[-1]
-    workload_ = workload.split(f'_{framework}')[0]
-    if workload_ not in BASE_WORKLOADS:
+    if workload not in BASE_WORKLOADS:
       # If the variant does not have a finite score, set the base_workload score to inf
-      base_workload = get_base_workload_name(workload_)
+      base_workload = get_base_workload_name(workload)
       df[base_workload] = df.apply(
-          variant_criteria_filter(base_workload + f'_{framework}', workload),
-          axis=1)
-
-  base_workloads = [w + f'_{framework}' for w in BASE_WORKLOADS]
-  df = df[base_workloads]
-  print(df)
+          variant_criteria_filter(base_workload, workload), axis=1)
+  df = df[BASE_WORKLOADS]
 
   if verbosity > 0:
     logging.info('\n`{time_col}` to reach target:')
@@ -375,8 +427,7 @@ def plot_performance_profiles(perf_df,
                               df_col,
                               scale='linear',
                               save_dir=None,
-                              figsize=(30, 10),
-                              font_size=18):
+                              figsize=(30, 10)):
   """Plot performance profiles.
 
   Args:
@@ -396,12 +447,12 @@
   Returns:
     None. If a valid save_dir is provided, save both the plot and perf_df.
   """
-  fig = perf_df.T.plot(figsize=figsize)
+  fig = perf_df.T.plot(figsize=figsize, alpha=0.7)
   df_col_display = f'log10({df_col})' if scale == 'log' else df_col
-  fig.set_xlabel(
-      f'Ratio of `{df_col_display}` to best submission', size=font_size)
-  fig.set_ylabel('Proportion of workloads', size=font_size)
-  fig.legend(prop={'size': font_size}, bbox_to_anchor=(1.0, 1.0))
+  fig.set_xlabel(f'Ratio of `{df_col_display}` to best submission')
+  fig.set_ylabel('Proportion of workloads')
+  fig.legend(bbox_to_anchor=(1.0, 1.0))
+  plt.tight_layout()
   maybe_save_figure(save_dir, f'performance_profile_by_{df_col_display}')
   maybe_save_df_to_csv(save_dir,
                        perf_df,
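
For context: the filtering introduced above couples each held-out variant with its base workload, so a missed (inf) or missing (nan) score on the variant invalidates the base workload score. A minimal sketch of that behavior on a toy score table (the workload and submission names are hypothetical, and variant_criteria_filter is re-implemented here for illustration):

import numpy as np
import pandas as pd

BASE_WORKLOADS = ['fastmri']

def variant_criteria_filter(target_workload, check_workload):
  """Row filter: keep row[target_workload] unless row[check_workload] is inf/nan."""
  def filter(row):
    if row[check_workload] == np.inf or np.isnan(row[check_workload]):
      return np.inf
    return row[target_workload]
  return filter

# Rows: submissions; columns: workloads; values: median time-to-target
# (inf = target missed). 'fastmri_variant' stands in for a held-out variant.
df = pd.DataFrame(
    {'fastmri': [8000.0, 7000.0], 'fastmri_variant': [np.inf, 7500.0]},
    index=['submission_a', 'submission_b'])

# A missed variant target invalidates the base workload score.
df['fastmri'] = df.apply(
    variant_criteria_filter('fastmri', 'fastmri_variant'), axis=1)
df = df[BASE_WORKLOADS]
print(df)  # submission_a -> inf (variant missed), submission_b -> 7000.0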
