
Commit b21be29

Merge pull request #921 from mlcommons/dev
Dev -> main
2 parents f364a0b + d77c538

10 files changed

Lines changed: 377 additions & 121 deletions

docker/Dockerfile

Lines changed: 2 additions & 2 deletions
@@ -83,8 +83,8 @@ RUN if [ "$framework" = "jax" ] ; then \
 RUN cd /algorithmic-efficiency && git fetch origin
 RUN cd /algorithmic-efficiency && git pull
 
-# Todo: remove this, this is temporary for developing
-COPY scripts/startup.sh /algorithmic-efficiency/docker/scripts/startup.sh
+# Uncomment this for developing purposes
+# COPY scripts/startup.sh /algorithmic-efficiency/docker/scripts/startup.sh
 RUN chmod a+x /algorithmic-efficiency/docker/scripts/startup.sh
 
 ENTRYPOINT ["bash", "/algorithmic-efficiency/docker/scripts/startup.sh"]

docker/build_docker_images.sh

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@ done
 
 # Artifact repostiory
 if [ "$PROJECT" = "mlcommons-algoperf" ]; then
-  ARTIFACT_REPO="europe-west-4-docker.pkg.dev/mlcommons-algoperf/algoperf-docker-repo"
+  ARTIFACT_REPO="europe-west4-docker.pkg.dev/mlcommons-algoperf/algoperf-docker-repo"
 else
   ARTIFACT_REPO="us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo"
 fi

pyproject.toml

Lines changed: 2 additions & 1 deletion
@@ -46,7 +46,8 @@ dependencies = [
   "clu==0.0.12",
   "matplotlib>=3.9.2",
   "tabulate==0.9.0",
-  "wandb==0.21.0"
+  "wandb==0.21.0",
+  "importlib_resources"
 ]
 
 [build-system]
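
The newly added `importlib_resources` is the PyPI backport of the standard library's `importlib.resources` API. A generic usage sketch, with a hypothetical package and resource name (not actual files in this repo):

```python
# Hypothetical example of the importlib_resources backport; the
# 'algoperf' package and 'defaults.json' resource are placeholders.
import importlib_resources

resource = importlib_resources.files('algoperf') / 'defaults.json'
print(resource.read_text())
```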

scoring/performance_profile.py

Lines changed: 1 addition & 1 deletion
@@ -59,7 +59,7 @@
 # workloads and rules for the scoring to be correct.
 # We do not use the workload registry since it contains test and development
 # workloads as well.
-NUM_BASE_WORKLOADS = 8
+NUM_BASE_WORKLOADS = 9
 NUM_VARIANT_WORKLOADS = 0
 NUM_TRIALS = 5
 NUM_STUDIES = 3

scoring/score_submissions.py

Lines changed: 11 additions & 41 deletions
@@ -75,10 +75,10 @@
 FLAGS = flags.FLAGS
 
 
-def get_summary_df(workload, workload_df, include_test_split=False):
+def get_summary_df(workload, workload_df):
   print(f' WORKLOAD: {workload}')
   validation_metric, validation_target = (
-    scoring_utils.get_workload_metrics_and_targets(workload, split='validation')
+    scoring_utils.get_workload_metrics_and_targets(workload)
   )
 
   is_minimized = performance_profile.check_if_minimized(validation_metric)
@@ -127,7 +127,7 @@ def get_summary_df(workload, workload_df, include_test_split=False):
 
   # compute the step times
   def delta(series):
-    return series.shift(1, fill_value=0) - series
+    return series.apply(lambda x: np.diff(x, prepend=0))
 
   accumulated_time_intervals = delta(workload_df['accumulated_submission_time'])
   step_intervals = delta(workload_df['global_step'])
@@ -136,57 +136,27 @@ def delta(series):
       f'WARNING: The number of evals may be too low to calculate reliable step time for {workload}'
     )
 
-  summary_df['step_time (s)'] = np.median(
-    (accumulated_time_intervals / step_intervals).iloc[0]
-  )
-
-  summary_df['step_hint'] = scoring_utils.get_workload_stephint(workload)
-
-  # test metrics
-  if include_test_split:
-    test_metric, test_target = scoring_utils.get_workload_metrics_and_targets(
-      workload, split='test'
+  # Flatten all intervals from all trials and take the global median
+  with np.errstate(divide='ignore', invalid='ignore'):
+    all_ratios = np.concatenate(
+      (accumulated_time_intervals / step_intervals).values
     )
+  summary_df['step_time (s)'] = np.nanmedian(all_ratios)
 
-  summary_df['test target metric name'] = test_metric
-  summary_df['test target metric value'] = test_target
-
-  summary_df['test target reached'] = (
-    workload_df[test_metric]
-    .apply(lambda x: target_op(x, test_target))
-    .apply(np.any)
-  )
-  summary_df['best metric value on test'] = workload_df[test_metric].apply(
-    lambda x: best_op(x)
-  )
-  workload_df['index best eval on test'] = workload_df[test_metric].apply(
-    lambda x: idx_op(x)
-  )
-  summary_df['time to best eval on test (s)'] = workload_df.apply(
-    lambda x: x['accumulated_submission_time'][x['index best eval on test']],
-    axis=1,
-  )
-  summary_df['time to target on test (s)'] = summary_df.apply(
-    lambda x: x['time to best eval on test (s)']
-    if x['test target reached']
-    else np.inf,
-    axis=1,
-  )
+  summary_df['step_hint'] = scoring_utils.get_workload_stephint(workload)
 
   return summary_df
 
 
-def get_submission_summary(df, include_test_split=False):
+def get_submission_summary(df):
   """Summarizes the submission results into metric and time tables
   organized by workload.
   """
 
   dfs = []
   print(df)
   for workload, group in df.groupby('workload'):
-    summary_df = get_summary_df(
-      workload, group, include_test_split=include_test_split
-    )
+    summary_df = get_summary_df(workload, group)
     dfs.append(summary_df)
 
   df = pd.concat(dfs)
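
For context on the step-time change above: the old `delta` shifted the whole per-trial pandas Series, whereas the new one diffs within each trial's array of cumulative values, and the median is now taken over intervals pooled from all trials instead of only the first trial. A minimal sketch of the new behavior, with made-up toy values:

```python
# Toy illustration of the new per-trial interval computation; the
# DataFrame contents are invented, not real scoring logs.
import numpy as np
import pandas as pd

# One row per trial; each cell is the cumulative value at each eval.
workload_df = pd.DataFrame({
  'accumulated_submission_time': [np.array([30.0, 70.0, 120.0])],
  'global_step': [np.array([100, 200, 300])],
})

def delta(series):
  # Differences between consecutive evals within each trial, anchored at 0.
  return series.apply(lambda x: np.diff(x, prepend=0))

accumulated_time_intervals = delta(workload_df['accumulated_submission_time'])
step_intervals = delta(workload_df['global_step'])

# Pool every trial's per-interval step times, then take a NaN-safe median.
with np.errstate(divide='ignore', invalid='ignore'):
  all_ratios = np.concatenate(
    (accumulated_time_intervals / step_intervals).values
  )
print(np.nanmedian(all_ratios))  # -> 0.4 seconds per step for this toy trial
```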

scoring/scoring_utils.py

Lines changed: 3 additions & 7 deletions
@@ -214,7 +214,7 @@ def get_experiment_df(experiment_dir):
 
 
 ## Get workload properties
-def get_workload_metrics_and_targets(workload, split='validation'):
+def get_workload_metrics_and_targets(workload):
   """Returns workload target metric name and value."""
   workload_name = re.match(WORKLOAD_NAME_PATTERN, workload).group(1)
   framework = re.match(WORKLOAD_NAME_PATTERN, workload).group(2)
@@ -233,12 +233,8 @@ def get_workload_metrics_and_targets(workload, split='validation'):
     workload_init_kwargs=workload_init_kwargs,
   )
   metric_name = workload_obj.target_metric_name
-  if split == 'validation':
-    metric = f'validation/{metric_name}'
-    target = workload_obj.validation_target_value
-  elif split == 'test':
-    metric = f'test/{metric_name}'
-    target = workload_obj.test_target_value
+  metric = f'validation/{metric_name}'
+  target = workload_obj.validation_target_value
   return metric, target
 
 
scoring/utils/slurm/README.md

Lines changed: 23 additions & 0 deletions
@@ -48,6 +48,29 @@ LOGS_BUCKET="algoperf-runs-internal"
 sbatch run_jobs.sh
 ```
 
+## Convenient bash script to launch SLURM jobs
+
+The run_submission.sh script does all of the steps above for you. It is intended to be used on a SLURM login node. However, it expects a very specific directory structure: run it from your $HOME directory, with the algorithmic-efficiency and submissions_algorithms git repos checked out there.
+
+```
+$USER$@$USER$:~/$ tree -L 1
+.
+├── algorithmic-efficiency
+└── submissions_algorithms
+```
+
+You then run the script with a command like:
+
+```
+./algorithmic-efficiency/scoring/utils/slurm/run_submission.sh \
+  --submission_path submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2 \
+  --dry_run false
+```
+
+The submission path points to the directory where the submission lives (in the submissions git repo). `dry_run` is set to true by default (which limits max global steps to 10) to prevent accidental commands from wasting resources; explicitly set it to false for full runs.
+
+The script figures out the rest and runs everything for you: it creates the config, saves it to a path with a reasonable name, and invokes the sbatch script with the right flags.
+
 # Set up new SLURM cluster
 
 If you are setting up a new cluster, we recommend using the [HPC toolkit to set up a SLURM cluster](https://cloud.google.com/cluster-toolkit/docs/quickstarts/slurm-cluster).

scoring/utils/slurm/make_job_config.py

Lines changed: 27 additions & 14 deletions
@@ -9,6 +9,7 @@
 
 import json
 import os
+import struct
 
 import jax
 from absl import app, flags
@@ -17,8 +18,6 @@
 TUNING_SEARCH_SPACE = (
   'reference_algorithms/paper_baselines/adamw/tuning_search_space.json'
 )
-NUM_TUNING_TRIALS = 3  # For external tuning ruleset
-NUM_STUDIES = 3
 
 flags.DEFINE_string(
   'submission_path',
@@ -35,11 +34,6 @@
   'experiments',
   'Path to experiment dir where logs will be saved.',
 )
-flags.DEFINE_string(
-  'experiment_dir',
-  'experiments/',
-  'Path to experiment dir where logs will be saved.',
-)
 flags.DEFINE_enum(
   'framework',
   'jax',
@@ -56,14 +50,13 @@
 flags.DEFINE_string(
   'workloads', None, help='Comma seperated list of workloads to run.'
 )
-flags.DEFINE_integer('num_studies', NUM_STUDIES, help='Number of studies.')
+flags.DEFINE_integer('num_studies', None, help='Number of studies.')
+flags.DEFINE_integer('num_tuning_trials', None, help='Number of tuning trials.')
 
 FLAGS = flags.FLAGS
 
 MIN_INT = -(2 ** (31))
 MAX_INT = 2 ** (31) - 1
-NUM_TUNING_TRIALS = 5  # For external tuning ruleset
-NUM_STUDIES = 3
 
 WORKLOADS = {
   'imagenet_resnet': {'dataset': 'imagenet'},
@@ -74,6 +67,12 @@
   'librispeech_deepspeech': {'dataset': 'librispeech'},
   'criteo1tb': {'dataset': 'criteo1tb'},
   'librispeech_conformer': {'dataset': 'librispeech'},
+  'finewebedu_lm': {'dataset': 'fineweb_edu_10B'},
+}
+
+RULESET_CONFIGS = {
+  'self': {'num_studies': 3, 'num_tuning_trials': 1},
+  'external': {'num_studies': 3, 'num_tuning_trials': 5},
 }
 
@@ -83,17 +82,31 @@ def main(_):
   else:
     workloads = FLAGS.workloads.split(',')
 
-  key = jax.random.key(FLAGS.seed)
+  if not FLAGS.seed:
+    FLAGS.seed = struct.unpack('I', os.urandom(4))[0]
+
+  # Set defaults based on tuning_ruleset if not provided by user
+  num_studies = FLAGS.num_studies
+  if num_studies is None:
+    num_studies = RULESET_CONFIGS[FLAGS.tuning_ruleset]['num_studies']
+
+  num_tuning_trials = FLAGS.num_tuning_trials
+  if num_tuning_trials is None:
+    num_tuning_trials = RULESET_CONFIGS[FLAGS.tuning_ruleset][
+      'num_tuning_trials'
+    ]
+
+  key = jax.random.PRNGKey(FLAGS.seed)
 
   jobs = []
 
   for workload in workloads:
     # Fold in hash(workload) mod(max(uint32))
     workload_key = jax.random.fold_in(key, hash(workload) % (2**32 - 1))
-    for study_index in range(NUM_STUDIES):
+    for study_index in range(num_studies):
       study_key = jax.random.fold_in(workload_key, study_index)
       if FLAGS.tuning_ruleset == 'external':
-        for hparam_index in range(NUM_TUNING_TRIALS):
+        for hparam_index in range(num_tuning_trials):
           run_key = jax.random.fold_in(study_key, hparam_index)
           seed = jax.random.randint(run_key, (1,), MIN_INT, MAX_INT)[0].item()
           print(seed)
@@ -107,7 +120,7 @@ def main(_):
           job['experiment_dir'] = study_dir
           job['rng_seed'] = seed
           job['tuning_ruleset'] = FLAGS.tuning_ruleset
-          job['num_tuning_trials'] = NUM_TUNING_TRIALS
+          job['num_tuning_trials'] = num_tuning_trials
           job['hparam_start_index'] = hparam_index
           job['hparam_end_index'] = hparam_index + 1
           job['tuning_search_space'] = FLAGS.tuning_search_space