Commit 698e945

Merge pull request #757 from mlcommons/dev
Dev -> Main

2 parents 5b4914f + 04ea6e1

24 files changed: 99 additions & 20 deletions

DOCUMENTATION.md

Lines changed: 9 additions & 1 deletion

```diff
@@ -360,6 +360,14 @@ Valid submissions must rely on new algorithmic or mathematical ideas and should
 
 </details>
 
+##### Submissions vs. Baselines
+
+Submitters may also submit algorithms marked as *baselines*. These baseline algorithms are not eligible for winning the competition or prize money, but they are also not required to be "substantially different" from other submissions by the same submitters. Baseline algorithms will still appear on the leaderboard but will be clearly marked as such. We highly encourage the submission of baselines for educational purposes.
+
+Baseline algorithms might, for example, include existing algorithms with different search spaces or learning rate schedules.
+Another example involves porting submissions to different frameworks. For instance, a participant may wish to assess their algorithm in both JAX and PyTorch to demonstrate the impact of the framework. In such cases, one of these submissions must be designated as eligible for prize consideration, while the other is marked as a baseline. This prevents circumventing the tuning rules and the spirit of the benchmark by creating additional "lottery tickets".
+Baselines might not be prioritized when the benchmark sponsors allocate compute resources.
+
 ##### Software dependencies
 
 We require submissions to use specific versions of `PyTorch`/`JAX` as well as additional dependencies in order to facilitate fair comparisons. Submitters must build on top of these provided software packages, which might be provided as a `Docker` container. Additional dependencies can be added as long as they include a comment describing what was added and why. Submitters are free to add dependencies that support new algorithmic and mathematical ideas, but they should not circumvent the intention of the benchmark to measure training speedups due to new training methods. For example, software engineering techniques that merely lead to faster implementations of existing software, e.g. using newer versions of `PyTorch` or `JAX`, are not allowed; these are described in more detail in the [Disallowed submissions](#disallowed-submissions) section.
@@ -545,7 +553,7 @@ new Compute Instance with the "Deep Learning on Linux" Image in Boot disk option
 
 Our benchmark allows multiple submissions by the same team of submitters as long as they are substantially different. We disallow submitters from circumventing the purpose of the benchmark by, for example, submitting dozens of copies of the same submission with slightly different hyperparameters. Such a bulk submission would result in an unfair advantage on the randomized workloads and is not in the spirit of the benchmark.
 
-Submitters may submit algorithms marked as *baselines*. These might include existing algorithms with different search spaces or learning rate schedules. These baseline algorithms are not eligible for winning the competition or prize money, but they are also not required to be "substantially different" from other submissions by the same submitters.
+Submitters may submit algorithms marked as *baselines*. These might include existing algorithms with different search spaces or learning rate schedules. These baseline algorithms are not eligible for winning the competition or prize money, but they are also not required to be "substantially different" from other submissions by the same submitters. See the [Submissions vs. Baselines](#submissions-vs-baselines) section.
 
 #### Can my submission be structured using multiple files?
```

algorithmic_efficiency/workloads/wmt/wmt_jax/workload.py

Lines changed: 2 additions & 2 deletions

```diff
@@ -315,7 +315,7 @@ class WmtWorkloadAttentionTemp(WmtWorkload):
 
   @property
   def validation_target_value(self) -> float:
-    return 29.8611
+    return 29.3379
 
   @property
   def test_target_value(self) -> float:
@@ -331,7 +331,7 @@ class WmtWorkloadGLUTanH(WmtWorkload):
 
   @property
   def validation_target_value(self) -> float:
-    return 29.6517
+    return 29.5779
 
   @property
   def test_target_value(self) -> float:
```
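The target updates above follow the workload override pattern: each WMT variant exposes its BLEU targets as `@property` methods, and a variant class overrides only the values that change. A minimal sketch of that pattern (the base class body and its target value here are illustrative placeholders, not the repository's actual implementation):

```python
class WmtWorkload:
    """Stand-in base workload; the real class defines many more properties."""

    @property
    def validation_target_value(self) -> float:
        return 30.8491  # illustrative base BLEU target, not from this commit


class WmtWorkloadAttentionTemp(WmtWorkload):
    """Attention-temperature variant with its own, re-measured target."""

    @property
    def validation_target_value(self) -> float:
        return 29.3379  # updated validation target from this commit
```

Callers read `workload.validation_target_value` without knowing which variant they hold, so retuning a target is a one-line change in the variant class.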

algorithmic_efficiency/workloads/wmt/wmt_pytorch/workload.py

Lines changed: 2 additions & 2 deletions

```diff
@@ -371,7 +371,7 @@ class WmtWorkloadAttentionTemp(WmtWorkload):
 
   @property
   def validation_target_value(self) -> float:
-    return 29.8611
+    return 29.3379
 
   @property
   def test_target_value(self) -> float:
@@ -387,7 +387,7 @@ class WmtWorkloadGLUTanH(WmtWorkload):
 
   @property
   def validation_target_value(self) -> float:
-    return 29.6517
+    return 29.5779
 
   @property
   def test_target_value(self) -> float:
```

prize_qualification_baselines/external_tuning/jax_nadamw_full_budget.py

Lines changed: 4 additions & 0 deletions

```diff
@@ -307,6 +307,10 @@ def get_batch_size(workload_name):
     return 32
   elif workload_name == 'imagenet_resnet':
     return 1024
+  elif workload_name == 'imagenet_resnet_silu':
+    return 512
+  elif workload_name == 'imagenet_resnet_gelu':
+    return 512
   elif workload_name == 'imagenet_vit':
     return 1024
   elif workload_name == 'librispeech_conformer':
```

prize_qualification_baselines/external_tuning/jax_nadamw_target_setting.py

Lines changed: 4 additions & 0 deletions

```diff
@@ -307,6 +307,10 @@ def get_batch_size(workload_name):
     return 32
   elif workload_name == 'imagenet_resnet':
     return 1024
+  elif workload_name == 'imagenet_resnet_silu':
+    return 512
+  elif workload_name == 'imagenet_resnet_gelu':
+    return 512
   elif workload_name == 'imagenet_vit':
     return 1024
   elif workload_name == 'librispeech_conformer':
```

prize_qualification_baselines/external_tuning/pytorch_nadamw_full_budget.py

Lines changed: 4 additions & 0 deletions

```diff
@@ -309,6 +309,10 @@ def get_batch_size(workload_name):
     return 32
   elif workload_name == 'imagenet_resnet':
     return 1024
+  elif workload_name == 'imagenet_resnet_silu':
+    return 512
+  elif workload_name == 'imagenet_resnet_gelu':
+    return 512
   elif workload_name == 'imagenet_vit':
     return 1024
   elif workload_name == 'librispeech_conformer':
```

prize_qualification_baselines/external_tuning/pytorch_nadamw_target_setting.py

Lines changed: 4 additions & 0 deletions

```diff
@@ -309,6 +309,10 @@ def get_batch_size(workload_name):
     return 32
   elif workload_name == 'imagenet_resnet':
     return 1024
+  elif workload_name == 'imagenet_resnet_silu':
+    return 512
+  elif workload_name == 'imagenet_resnet_gelu':
+    return 512
   elif workload_name == 'imagenet_vit':
     return 1024
   elif workload_name == 'librispeech_conformer':
```

prize_qualification_baselines/self_tuning/jax_nadamw_full_budget.py

Lines changed: 4 additions & 0 deletions

```diff
@@ -322,6 +322,10 @@ def get_batch_size(workload_name):
     return 32
   elif workload_name == 'imagenet_resnet':
     return 1024
+  elif workload_name == 'imagenet_resnet_silu':
+    return 512
+  elif workload_name == 'imagenet_resnet_gelu':
+    return 512
   elif workload_name == 'imagenet_vit':
     return 1024
   elif workload_name == 'librispeech_conformer':
```

prize_qualification_baselines/self_tuning/jax_nadamw_target_setting.py

Lines changed: 4 additions & 0 deletions

```diff
@@ -322,6 +322,10 @@ def get_batch_size(workload_name):
     return 32
   elif workload_name == 'imagenet_resnet':
     return 1024
+  elif workload_name == 'imagenet_resnet_silu':
+    return 512
+  elif workload_name == 'imagenet_resnet_gelu':
+    return 512
   elif workload_name == 'imagenet_vit':
     return 1024
   elif workload_name == 'librispeech_conformer':
```

prize_qualification_baselines/self_tuning/pytorch_nadamw_full_budget.py

Lines changed: 4 additions & 0 deletions

```diff
@@ -324,6 +324,10 @@ def get_batch_size(workload_name):
     return 32
   elif workload_name == 'imagenet_resnet':
     return 1024
+  elif workload_name == 'imagenet_resnet_silu':
+    return 512
+  elif workload_name == 'imagenet_resnet_gelu':
+    return 512
   elif workload_name == 'imagenet_vit':
     return 1024
   elif workload_name == 'librispeech_conformer':
```
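The same four-line addition is applied to every baseline file above: the two new ResNet activation variants (SiLU, GELU) get their own batch size of 512, half the base ResNet's 1024. A compact sketch of the resulting dispatch; the dict-based form is my own restructuring (the baselines use an `elif` chain), and only workloads appearing in this diff are included:

```python
# Per-workload batch sizes after this commit (subset shown in the diff).
_BATCH_SIZES = {
    'imagenet_resnet': 1024,
    'imagenet_resnet_silu': 512,  # added in this commit
    'imagenet_resnet_gelu': 512,  # added in this commit
    'imagenet_vit': 1024,
}


def get_batch_size(workload_name):
    """Dict-lookup equivalent of the baselines' elif chain."""
    try:
        return _BATCH_SIZES[workload_name]
    except KeyError:
        raise ValueError(f'Unsupported workload: {workload_name}')
```

A lookup table keeps the eight copies of this function easy to diff and makes an unknown workload name fail loudly instead of falling through the chain.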
