Skip to content

Commit fb6cbe5

Browse files
committed
WIP: Testing scoring
1 parent ec5eae0 commit fb6cbe5

4 files changed

Lines changed: 409 additions & 15 deletions

File tree

pytorch_scoring_config_1.json

Lines changed: 299 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,299 @@
1+
{
2+
"0": {
3+
"framework": "pytorch",
4+
"workload": "imagenet_resnet",
5+
"dataset": "imagenet",
6+
"submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py",
7+
"experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_0",
8+
"rng_seed": -1447200680,
9+
"tuning_ruleset": "self",
10+
"num_tuning_trials": 1,
11+
"max_global_steps": 10
12+
},
13+
"1": {
14+
"framework": "pytorch",
15+
"workload": "imagenet_resnet",
16+
"dataset": "imagenet",
17+
"submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py",
18+
"experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_1",
19+
"rng_seed": -1977906563,
20+
"tuning_ruleset": "self",
21+
"num_tuning_trials": 1,
22+
"max_global_steps": 10
23+
},
24+
"2": {
25+
"framework": "pytorch",
26+
"workload": "imagenet_resnet",
27+
"dataset": "imagenet",
28+
"submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py",
29+
"experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_2",
30+
"rng_seed": 666869491,
31+
"tuning_ruleset": "self",
32+
"num_tuning_trials": 1,
33+
"max_global_steps": 10
34+
},
35+
"3": {
36+
"framework": "pytorch",
37+
"workload": "imagenet_vit",
38+
"dataset": "imagenet",
39+
"submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py",
40+
"experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_0",
41+
"rng_seed": -796448826,
42+
"tuning_ruleset": "self",
43+
"num_tuning_trials": 1,
44+
"max_global_steps": 10
45+
},
46+
"4": {
47+
"framework": "pytorch",
48+
"workload": "imagenet_vit",
49+
"dataset": "imagenet",
50+
"submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py",
51+
"experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_1",
52+
"rng_seed": -557820510,
53+
"tuning_ruleset": "self",
54+
"num_tuning_trials": 1,
55+
"max_global_steps": 10
56+
},
57+
"5": {
58+
"framework": "pytorch",
59+
"workload": "imagenet_vit",
60+
"dataset": "imagenet",
61+
"submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py",
62+
"experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_2",
63+
"rng_seed": -1307522002,
64+
"tuning_ruleset": "self",
65+
"num_tuning_trials": 1,
66+
"max_global_steps": 10
67+
},
68+
"6": {
69+
"framework": "pytorch",
70+
"workload": "fastmri",
71+
"dataset": "fastmri",
72+
"submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py",
73+
"experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_0",
74+
"rng_seed": 1083014187,
75+
"tuning_ruleset": "self",
76+
"num_tuning_trials": 1,
77+
"max_global_steps": 10
78+
},
79+
"7": {
80+
"framework": "pytorch",
81+
"workload": "fastmri",
82+
"dataset": "fastmri",
83+
"submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py",
84+
"experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_1",
85+
"rng_seed": -1077277636,
86+
"tuning_ruleset": "self",
87+
"num_tuning_trials": 1,
88+
"max_global_steps": 10
89+
},
90+
"8": {
91+
"framework": "pytorch",
92+
"workload": "fastmri",
93+
"dataset": "fastmri",
94+
"submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py",
95+
"experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_2",
96+
"rng_seed": -397959160,
97+
"tuning_ruleset": "self",
98+
"num_tuning_trials": 1,
99+
"max_global_steps": 10
100+
},
101+
"9": {
102+
"framework": "pytorch",
103+
"workload": "ogbg",
104+
"dataset": "ogbg",
105+
"submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py",
106+
"experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_0",
107+
"rng_seed": 1662399765,
108+
"tuning_ruleset": "self",
109+
"num_tuning_trials": 1,
110+
"max_global_steps": 10
111+
},
112+
"10": {
113+
"framework": "pytorch",
114+
"workload": "ogbg",
115+
"dataset": "ogbg",
116+
"submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py",
117+
"experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_1",
118+
"rng_seed": 486196682,
119+
"tuning_ruleset": "self",
120+
"num_tuning_trials": 1,
121+
"max_global_steps": 10
122+
},
123+
"11": {
124+
"framework": "pytorch",
125+
"workload": "ogbg",
126+
"dataset": "ogbg",
127+
"submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py",
128+
"experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_2",
129+
"rng_seed": 1039483369,
130+
"tuning_ruleset": "self",
131+
"num_tuning_trials": 1,
132+
"max_global_steps": 10
133+
},
134+
"12": {
135+
"framework": "pytorch",
136+
"workload": "wmt",
137+
"dataset": "wmt",
138+
"submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py",
139+
"experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_0",
140+
"rng_seed": -811149048,
141+
"tuning_ruleset": "self",
142+
"num_tuning_trials": 1,
143+
"max_global_steps": 10
144+
},
145+
"13": {
146+
"framework": "pytorch",
147+
"workload": "wmt",
148+
"dataset": "wmt",
149+
"submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py",
150+
"experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_1",
151+
"rng_seed": -1485236731,
152+
"tuning_ruleset": "self",
153+
"num_tuning_trials": 1,
154+
"max_global_steps": 10
155+
},
156+
"14": {
157+
"framework": "pytorch",
158+
"workload": "wmt",
159+
"dataset": "wmt",
160+
"submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py",
161+
"experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_2",
162+
"rng_seed": -439753961,
163+
"tuning_ruleset": "self",
164+
"num_tuning_trials": 1,
165+
"max_global_steps": 10
166+
},
167+
"15": {
168+
"framework": "pytorch",
169+
"workload": "librispeech_deepspeech",
170+
"dataset": "librispeech",
171+
"submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py",
172+
"experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_0",
173+
"rng_seed": -1459326687,
174+
"tuning_ruleset": "self",
175+
"num_tuning_trials": 1,
176+
"max_global_steps": 10
177+
},
178+
"16": {
179+
"framework": "pytorch",
180+
"workload": "librispeech_deepspeech",
181+
"dataset": "librispeech",
182+
"submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py",
183+
"experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_1",
184+
"rng_seed": 1889675898,
185+
"tuning_ruleset": "self",
186+
"num_tuning_trials": 1,
187+
"max_global_steps": 10
188+
},
189+
"17": {
190+
"framework": "pytorch",
191+
"workload": "librispeech_deepspeech",
192+
"dataset": "librispeech",
193+
"submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py",
194+
"experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_2",
195+
"rng_seed": -1297403039,
196+
"tuning_ruleset": "self",
197+
"num_tuning_trials": 1,
198+
"max_global_steps": 10
199+
},
200+
"18": {
201+
"framework": "pytorch",
202+
"workload": "criteo1tb",
203+
"dataset": "criteo1tb",
204+
"submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py",
205+
"experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_0",
206+
"rng_seed": -1790695410,
207+
"tuning_ruleset": "self",
208+
"num_tuning_trials": 1,
209+
"max_global_steps": 10
210+
},
211+
"19": {
212+
"framework": "pytorch",
213+
"workload": "criteo1tb",
214+
"dataset": "criteo1tb",
215+
"submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py",
216+
"experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_1",
217+
"rng_seed": -816806699,
218+
"tuning_ruleset": "self",
219+
"num_tuning_trials": 1,
220+
"max_global_steps": 10
221+
},
222+
"20": {
223+
"framework": "pytorch",
224+
"workload": "criteo1tb",
225+
"dataset": "criteo1tb",
226+
"submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py",
227+
"experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_2",
228+
"rng_seed": 1704852417,
229+
"tuning_ruleset": "self",
230+
"num_tuning_trials": 1,
231+
"max_global_steps": 10
232+
},
233+
"21": {
234+
"framework": "pytorch",
235+
"workload": "librispeech_conformer",
236+
"dataset": "librispeech",
237+
"submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py",
238+
"experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_0",
239+
"rng_seed": 1605670948,
240+
"tuning_ruleset": "self",
241+
"num_tuning_trials": 1,
242+
"max_global_steps": 10
243+
},
244+
"22": {
245+
"framework": "pytorch",
246+
"workload": "librispeech_conformer",
247+
"dataset": "librispeech",
248+
"submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py",
249+
"experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_1",
250+
"rng_seed": -1323816683,
251+
"tuning_ruleset": "self",
252+
"num_tuning_trials": 1,
253+
"max_global_steps": 10
254+
},
255+
"23": {
256+
"framework": "pytorch",
257+
"workload": "librispeech_conformer",
258+
"dataset": "librispeech",
259+
"submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py",
260+
"experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_2",
261+
"rng_seed": -1881486829,
262+
"tuning_ruleset": "self",
263+
"num_tuning_trials": 1,
264+
"max_global_steps": 10
265+
},
266+
"24": {
267+
"framework": "pytorch",
268+
"workload": "finewebedu_lm",
269+
"dataset": "fineweb_edu_10B",
270+
"submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py",
271+
"experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_0",
272+
"rng_seed": -304430747,
273+
"tuning_ruleset": "self",
274+
"num_tuning_trials": 1,
275+
"max_global_steps": 10
276+
},
277+
"25": {
278+
"framework": "pytorch",
279+
"workload": "finewebedu_lm",
280+
"dataset": "fineweb_edu_10B",
281+
"submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py",
282+
"experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_1",
283+
"rng_seed": -912336586,
284+
"tuning_ruleset": "self",
285+
"num_tuning_trials": 1,
286+
"max_global_steps": 10
287+
},
288+
"26": {
289+
"framework": "pytorch",
290+
"workload": "finewebedu_lm",
291+
"dataset": "fineweb_edu_10B",
292+
"submission_path": "submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2/submission.py",
293+
"experiment_dir": "submissions_a100/schedule_free_adamw_v2/study_2",
294+
"rng_seed": 1970089239,
295+
"tuning_ruleset": "self",
296+
"num_tuning_trials": 1,
297+
"max_global_steps": 10
298+
}
299+
}

run_pytorch_scoring.sh

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
#!/bin/bash
2+
3+
#SBATCH --nodes=1 # give it a full node
4+
#SBATCH --ntasks-per-node=1
5+
#SBATCH --array=0-26
6+
#SBATCH --partition=a100
7+
#SBATCH --gpus-per-node=4
8+
#SBATCH --exclusive #this will not allow other jobs to run on this cluster
9+
#SBATCH --output=experiments/tests/updated_schedule_free/job_%A_%a.out
10+
#SBATCH --error=experiments/tests/updated_schedule_free/job_%A_%a.err
11+
12+
# Usage: sbatch <this file>.sh
13+
# This script reads config.json and launches a sbatch job using task
14+
# arrays where each job in the array corresponds to a training run
15+
# for a workload given a random seed and tuning trial index.
16+
# To generate the config.json use make_job_config.py.
17+
18+
set -x
19+
20+
# Pull docker image (ATTENTION: you may want to modify this)
21+
REPO="europe-west4-docker.pkg.dev"
22+
IMAGE="europe-west4-docker.pkg.dev/mlcommons-algoperf/algoperf-docker-repo/algoperf_pytorch_main:latest"
23+
yes | gcloud auth configure-docker $REPO
24+
docker pull $IMAGE
25+
# Job config (ATTENTION: you may want to modify this)
26+
config_file="$HOME/algorithmic-efficiency/pytorch_scoring_config_1.json" # Replace with your config file path
27+
LOGS_BUCKET="algoperf-runs" # replace with your bucket used for logging
28+
29+
30+
# Read a single field from the job-config JSON file.
# Arguments:
#   $1 - path to the JSON config file
#   $2 - task index (top-level key, e.g. "0", "1", ...)
#   $3 - field name inside that task's object (e.g. "workload")
# Outputs: the field's value on stdout; jq prints the string "null" when the
#          index or key is absent.
# Returns: jq's exit status (non-zero if the file is unreadable or invalid JSON).
read_json_value() {
  local json_file="$1"
  local index="$2"
  local key="$3"
  # Declare and assign separately so jq's exit status is not masked (SC2155).
  local value
  value=$(jq -r ".[\"$index\"].$key" "$json_file") || return
  echo "$value"
}
38+
39+
# Check if jq is installed (required by read_json_value).
if ! command -v jq &> /dev/null
then
  echo "jq could not be found. Please install it." >&2
  exit 1
fi

# Each Slurm array task picks its own entry from the config file.
TASK="$SLURM_ARRAY_TASK_ID"
FRAMEWORK=$(read_json_value "$config_file" "$TASK" "framework")
DATASET=$(read_json_value "$config_file" "$TASK" "dataset")
SUBMISSION_PATH=$(read_json_value "$config_file" "$TASK" "submission_path")
# NOTE(review): pytorch_scoring_config_1.json does not define
# tuning_search_space, hparam_start_index, or hparam_end_index, so jq emits the
# literal string "null" for them — confirm the container entrypoint tolerates
# "null" for these flags, or add the keys to the config.
TUNING_SEARCH_SPACE=$(read_json_value "$config_file" "$TASK" "tuning_search_space")
EXPERIMENT_DIR=$(read_json_value "$config_file" "$TASK" "experiment_dir")
RNG_SEED=$(read_json_value "$config_file" "$TASK" "rng_seed")
WORKLOAD=$(read_json_value "$config_file" "$TASK" "workload")
HPARAM_START_INDEX=$(read_json_value "$config_file" "$TASK" "hparam_start_index")
HPARAM_END_INDEX=$(read_json_value "$config_file" "$TASK" "hparam_end_index")
NUM_TUNING_TRIALS=$(read_json_value "$config_file" "$TASK" "num_tuning_trials")
TUNING_RULESET=$(read_json_value "$config_file" "$TASK" "tuning_ruleset")
MAX_GLOBAL_STEPS=$(read_json_value "$config_file" "$TASK" "max_global_steps")

# Launch the training run inside the container; flags after $IMAGE are parsed
# by the image's entrypoint (docker/scripts/startup.sh).
docker run \
  -v /opt/data/:/data/ \
  -v "$HOME/experiment_runs":/experiment_runs \
  -v "$HOME/submissions_algorithms/":/algorithmic-efficiency/submissions_algorithms \
  -v "$HOME/algorithmic-efficiency/docker/scripts/startup.sh":/algorithmic-efficiency/docker/scripts/startup.sh \
  --gpus all \
  --ipc=host \
  "$IMAGE" \
  -d "$DATASET" \
  -f "$FRAMEWORK" \
  -s "$SUBMISSION_PATH" \
  -w "$WORKLOAD" \
  -t "$TUNING_SEARCH_SPACE" \
  -e "$EXPERIMENT_DIR" \
  -c False \
  -o True \
  --rng_seed "$RNG_SEED" \
  --hparam_start_index "$HPARAM_START_INDEX" \
  --hparam_end_index "$HPARAM_END_INDEX" \
  --num_tuning_trials "$NUM_TUNING_TRIALS" \
  --tuning_ruleset "$TUNING_RULESET" \
  -i true \
  -r false \
  --logs_bucket "$LOGS_BUCKET" \
  -m "$MAX_GLOBAL_STEPS"
87+

0 commit comments

Comments
 (0)