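Change the FineWebEdu dataset name from `fineweb_edu_10B` to `finewebedu` everywhere (CI workflow, dataset setup, Docker startup script, and scoring configs) to maintain uniformity; the workload name `finewebedu_lm` is unchanged.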

aahladc · aahladc · commit 852811808e1c · 2026-03-30T10:47:46.000-07:00
diff --git a/.github/workflows/regression_tests.yml b/.github/workflows/regression_tests.yml
@@ -116,7 +116,7 @@ jobs:
     - name: Run containerized workload
       run: |
         docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_${{ github.head_ref || github.ref_name }}  
-        docker run  -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_${{ github.head_ref || github.ref_name }}  -d fineweb_edu_10B -f jax -s algorithms/archived_paper_baselines/adamw/jax/submission.py -w finewebedu_lm -t algorithms/archived_paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false --data_bucket mlcommons-data --logs_bucket mlcommons-runs --data_bucket mlcommons-data --logs_bucket mlcommons-runs    
+        docker run  -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_${{ github.head_ref || github.ref_name }}  -d finewebedu -f jax -s algorithms/archived_paper_baselines/adamw/jax/submission.py -w finewebedu_lm -t algorithms/archived_paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false --data_bucket mlcommons-data --logs_bucket mlcommons-runs --data_bucket mlcommons-data --logs_bucket mlcommons-runs    
   fastmri_pytorch:
     runs-on: self-hosted
     needs: build_and_push_pytorch_docker_image
@@ -198,4 +198,4 @@ jobs:
     - name: Run containerized workload
       run: |
         docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }}  
-        docker run  -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }}  -d fineweb_edu_10B -f pytorch -s algorithms/archived_paper_baselines/adamw/pytorch/submission.py -w finewebedu_lm -t algorithms/archived_paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false --data_bucket mlcommons-data --logs_bucket mlcommons-runs --data_bucket mlcommons-data --logs_bucket mlcommons-runs     
+        docker run  -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }}  -d finewebedu -f pytorch -s algorithms/archived_paper_baselines/adamw/pytorch/submission.py -w finewebedu_lm -t algorithms/archived_paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false --data_bucket mlcommons-data --logs_bucket mlcommons-runs --data_bucket mlcommons-data --logs_bucket mlcommons-runs     
diff --git a/dataset/dataset_setup.py b/dataset/dataset_setup.py
@@ -782,7 +782,7 @@ def download_finewebedu(
 ):
   """Download FineWebEdu-10B."""
 
-  data_dir = os.path.join(data_dir, 'fineweb_edu_10B')
+  data_dir = os.path.join(data_dir, 'finewebedu')
   _maybe_mkdir(data_dir)
   _maybe_mkdir(tmp_dir)
 
diff --git a/docker/scripts/startup.sh b/docker/scripts/startup.sh
@@ -174,7 +174,7 @@ fi
 
 # Check if arguments are valid
 VALID_DATASETS=("criteo1tb" "imagenet"  "fastmri" "ogbg" "librispeech" \
-                "wmt" "mnist" "fineweb_edu_10B")
+                "wmt" "mnist" "finewebedu")
 VALID_WORKLOADS=("criteo1tb" "imagenet_resnet" "imagenet_resnet_silu" "imagenet_resnet_gelu" \
                  "imagenet_resnet_large_bn_init" "imagenet_vit" "imagenet_vit_glu" \
                  "imagenet_vit_post_ln" "imagenet_vit_map" "fastmri" "ogbg" \
diff --git a/scoring/utils/slurm/make_job_config.py b/scoring/utils/slurm/make_job_config.py
@@ -67,7 +67,7 @@
   'librispeech_deepspeech': {'dataset': 'librispeech'},
   'criteo1tb': {'dataset': 'criteo1tb'},
   'librispeech_conformer': {'dataset': 'librispeech'},
-  'finewebedu_lm': {'dataset': 'fineweb_edu_10B'},
+  'finewebedu_lm': {'dataset': 'finewebedu'},
 }
 
 RULESET_CONFIGS = {
diff --git a/scoring/utils/workload_metadata_external_tuning.json b/scoring/utils/workload_metadata_external_tuning.json
@@ -33,6 +33,6 @@
   },
   "finewebedu_lm" : {
     "max_steps": 55000,
-    "dataset":"fineweb_edu_10B"
+    "dataset":"finewebedu"
   }
 }

Original file line number	Diff line number	Diff line change
`@@ -67,7 +67,7 @@`
`67`	`67`	`'librispeech_deepspeech': {'dataset': 'librispeech'},`
`68`	`68`	`'criteo1tb': {'dataset': 'criteo1tb'},`
`69`	`69`	`'librispeech_conformer': {'dataset': 'librispeech'},`
`70`		`- 'finewebedu_lm': {'dataset': 'fineweb_edu_10B'},`
	`70`	`+ 'finewebedu_lm': {'dataset': 'finewebedu'},`
`71`	`71`	`}`
`72`	`72`
`73`	`73`	`RULESET_CONFIGS = {`
Original file line number	Diff line number	Diff line change
`@@ -33,6 +33,6 @@`
`33`	`33`	`},`
`34`	`34`	`"finewebedu_lm" : {`
`35`	`35`	`"max_steps": 55000,`
`36`		`- "dataset":"fineweb_edu_10B"`
	`36`	`+ "dataset":"finewebedu"`
`37`	`37`	`}`
`38`	`38`	`}`