#SBATCH --output=experiments/tests/updated_schedule_free/job_%A_%a.out
#SBATCH --error=experiments/tests/updated_schedule_free/job_%A_%a.err

# Usage: sbatch <this file>.sh [options]
# This script reads config.json and launches a sbatch job using task
# arrays where each job in the array corresponds to a training run
# for a workload given a random seed and tuning trial index.
# To generate the config.json use make_job_config.py.
# NOTE(review): "# SBATCH" (with a space) is ignored by SLURM; the
# directives above use the required "#SBATCH" form.

# Trace every command to the job log for easier debugging.
set -x
# NOTE(review): the lines below are residue of an older revision shown as
# deleted in a diff (leading "NN-" markers); they are superseded by the
# flag-parsing / config-fallback code later in this file and should be
# removed when the paste is cleaned up.
20- # Pull docker image (ATTENTION: you may want to modify this)
21- REPO=" europe-west4-docker.pkg.dev"
22- IMAGE=" europe-west4-docker.pkg.dev/mlcommons-algoperf/algoperf-docker-repo/algoperf_pytorch_main:latest"
23- yes | gcloud auth configure-docker $REPO
24- docker pull $IMAGE
25- # Job config (ATTENTION: you may want to modify this)
26- config_file=" $HOME /algorithmic-efficiency/pytorch_scoring_config_1.json" # Replace with your config file path
27- LOGS_BUCKET=" algoperf-runs" # replace with your bucket used for logging
28-
29-
# NOTE(review): read_json_value's body is cut off here by the "@@" diff
# hunk header — the function is incomplete in this view, so it is left
# untouched. Presumably it extracts config[task][key] from the JSON file
# (likely via jq) — confirm against the full file.
3020# Function to read a JSON file and extract a value by key
3121read_json_value () {
3222 local json_file=" $1 "
@@ -43,45 +33,137 @@ then
4333 exit 1
4434fi
4535
# NOTE(review): more deleted old-revision lines; the per-task config reads
# now live in the "${VAR:-$(read_json_value ...)}" fallback block below.
# Observed drift worth checking in the new version: FRAMEWORK was read
# twice here, MAX_STEPS was read but never used, and MAX_GLOBAL_STEPS had
# an unconditional config fallback that the new code no longer performs.
46- TASK=" $SLURM_ARRAY_TASK_ID "
47- FRAMEWORK=$( read_json_value " $config_file " " $TASK " " framework" )
48- DATASET=$( read_json_value " $config_file " " $TASK " " dataset" )
49- SUBMISSION_PATH=$( read_json_value " $config_file " " $TASK " " submission_path" )
50- FRAMEWORK=$( read_json_value " $config_file " " $TASK " " framework" )
51- TUNING_SEARCH_SPACE=$( read_json_value " $config_file " " $TASK " " tuning_search_space" )
52- EXPERIMENT_DIR=$( read_json_value " $config_file " " $TASK " " experiment_dir" )
53- MAX_STEPS=$( read_json_value " $config_file " " $TASK " " max_steps" )
54- RNG_SEED=$( read_json_value " $config_file " " $TASK " " rng_seed" )
55- WORKLOAD=$( read_json_value " $config_file " " $TASK " " workload" )
56- HPARAM_START_INDEX=$( read_json_value " $config_file " " $TASK " " hparam_start_index" )
57- HPARAM_END_INDEX=$( read_json_value " $config_file " " $TASK " " hparam_end_index" )
58- NUM_TUNING_TRIALS=$( read_json_value " $config_file " " $TASK " " num_tuning_trials" )
59- TUNING_RULESET=$( read_json_value " $config_file " " $TASK " " tuning_ruleset" )
60- MAX_GLOBAL_STEPS=$( read_json_value " $config_file " " $TASK " " max_global_steps" )
# Default values — each can be overridden by a command-line flag below.
# (The pasted diff had stray spaces inside every quoted value, which
# corrupted the repo name, config path, bucket name, and task id.)
REPO="europe-west4-docker.pkg.dev"
IMAGE="europe-west4-docker.pkg.dev/mlcommons-algoperf/algoperf-docker-repo/algoperf_pytorch_main:latest"
CONFIG_FILE="$HOME/algorithmic-efficiency/pytorch_scoring_config_1.json"
LOGS_BUCKET="algoperf-runs"
# Fall back to task 0 when not running inside a SLURM array job.
TASK_ID="${SLURM_ARRAY_TASK_ID:-0}"
62- docker run \
63- -v /opt/data/:/data/ \
64- -v $HOME /experiment_runs:/experiment_runs \
65- -v $HOME /submissions_algorithms/:/algorithmic-efficiency/submissions_algorithms \
66- -v $HOME /algorithmic-efficiency/docker/scripts/startup.sh:/algorithmic-efficiency/docker/scripts/startup.sh \
67- --gpus all \
68- --ipc=host \
69- $IMAGE \
70- -d $DATASET \
71- -f $FRAMEWORK \
72- -s $SUBMISSION_PATH \
73- -w $WORKLOAD \
74- -t $TUNING_SEARCH_SPACE \
75- -e $EXPERIMENT_DIR \
76- -c False \
77- -o True \
78- --rng_seed $RNG_SEED \
79- --hparam_start_index $HPARAM_START_INDEX \
80- --hparam_end_index $HPARAM_END_INDEX \
81- --num_tuning_trials $NUM_TUNING_TRIALS \
82- --tuning_ruleset $TUNING_RULESET \
83- -i true \
84- -r false \
85- --logs_bucket $LOGS_BUCKET \
86- -m $MAX_GLOBAL_STEPS
#######################################
# Parse command-line flags, overriding the defaults above by assigning
# the corresponding global variables.
# Arguments: the script's "$@"
# Outputs:   error messages to stderr
# Returns:   exits 1 on an unknown flag or a flag missing its value
#######################################
parse_args() {
  local var
  while [[ $# -gt 0 ]]; do
    # Map each known flag to the global variable it sets.
    case "$1" in
      --repo)                var=REPO ;;
      --image)               var=IMAGE ;;
      --config_file)         var=CONFIG_FILE ;;
      --logs_bucket)         var=LOGS_BUCKET ;;
      --task_id)             var=TASK_ID ;;
      --framework)           var=FRAMEWORK ;;
      --dataset)             var=DATASET ;;
      --submission_path)     var=SUBMISSION_PATH ;;
      --tuning_search_space) var=TUNING_SEARCH_SPACE ;;
      --experiment_dir)      var=EXPERIMENT_DIR ;;
      --rng_seed)            var=RNG_SEED ;;
      --workload)            var=WORKLOAD ;;
      --hparam_start_index)  var=HPARAM_START_INDEX ;;
      --hparam_end_index)    var=HPARAM_END_INDEX ;;
      --num_tuning_trials)   var=NUM_TUNING_TRIALS ;;
      --tuning_ruleset)      var=TUNING_RULESET ;;
      --max_global_steps)    var=MAX_GLOBAL_STEPS ;;
      *)
        echo "Unknown option $1" >&2
        exit 1
        ;;
    esac
    # Every flag takes a value. Without this guard, a trailing flag made
    # "shift 2" fail without consuming anything, looping forever.
    if [[ $# -lt 2 ]]; then
      echo "Option $1 requires a value" >&2
      exit 1
    fi
    printf -v "$var" '%s' "$2"
    shift 2
  done
}

parse_args "$@"
120+
# Authenticate to the Artifact Registry host and pre-pull the image so
# the docker run below does not spend training time downloading layers.
# (Quotes fixed: the pasted version had stray spaces inside "$REPO"/"$IMAGE".)
yes | gcloud auth configure-docker "$REPO"
docker pull "$IMAGE"
124+
# Any value not supplied via a flag is read from the job config file,
# indexed by this array task's id (uses read_json_value defined above).
# NOTE(review): MAX_GLOBAL_STEPS deliberately has no config fallback —
# it is optional and only forwarded when non-empty (see below); confirm
# that dropping the old unconditional config read was intended.
FRAMEWORK="${FRAMEWORK:-$(read_json_value "$CONFIG_FILE" "$TASK_ID" "framework")}"
DATASET="${DATASET:-$(read_json_value "$CONFIG_FILE" "$TASK_ID" "dataset")}"
SUBMISSION_PATH="${SUBMISSION_PATH:-$(read_json_value "$CONFIG_FILE" "$TASK_ID" "submission_path")}"
TUNING_SEARCH_SPACE="${TUNING_SEARCH_SPACE:-$(read_json_value "$CONFIG_FILE" "$TASK_ID" "tuning_search_space")}"
EXPERIMENT_DIR="${EXPERIMENT_DIR:-$(read_json_value "$CONFIG_FILE" "$TASK_ID" "experiment_dir")}"
RNG_SEED="${RNG_SEED:-$(read_json_value "$CONFIG_FILE" "$TASK_ID" "rng_seed")}"
WORKLOAD="${WORKLOAD:-$(read_json_value "$CONFIG_FILE" "$TASK_ID" "workload")}"
HPARAM_START_INDEX="${HPARAM_START_INDEX:-$(read_json_value "$CONFIG_FILE" "$TASK_ID" "hparam_start_index")}"
HPARAM_END_INDEX="${HPARAM_END_INDEX:-$(read_json_value "$CONFIG_FILE" "$TASK_ID" "hparam_end_index")}"
NUM_TUNING_TRIALS="${NUM_TUNING_TRIALS:-$(read_json_value "$CONFIG_FILE" "$TASK_ID" "num_tuning_trials")}"
TUNING_RULESET="${TUNING_RULESET:-$(read_json_value "$CONFIG_FILE" "$TASK_ID" "tuning_ruleset")}"
137+
# Assemble the docker invocation as an array so each argument remains a
# single word even when a value contains spaces.
# (Quotes fixed: the pasted version had spaces after "$HOME" that broke
# every bind-mount path.)
DOCKER_CMD=(
  docker run
  -v /opt/data/:/data/
  -v "$HOME/experiment_runs:/experiment_runs"
  -v "$HOME/submissions_algorithms/:/algorithmic-efficiency/submissions_algorithms"
  -v "$HOME/algorithmic-efficiency/docker/scripts/startup.sh:/algorithmic-efficiency/docker/scripts/startup.sh"
  --gpus all
  --ipc=host
  "$IMAGE"
  -d "$DATASET"
  -f "$FRAMEWORK"
  -s "$SUBMISSION_PATH"
  -w "$WORKLOAD"
  -t "$TUNING_SEARCH_SPACE"
  -e "$EXPERIMENT_DIR"
  -c False
  -o True
  --rng_seed "$RNG_SEED"
  --hparam_start_index "$HPARAM_START_INDEX"
  --hparam_end_index "$HPARAM_END_INDEX"
  --num_tuning_trials "$NUM_TUNING_TRIALS"
  --tuning_ruleset "$TUNING_RULESET"
  -i true
  -r false
  --logs_bucket "$LOGS_BUCKET"
)

# --max_global_steps is optional: forward -m only when a value was given.
# (The pasted test [ -n " $MAX_GLOBAL_STEPS " ] was always true because of
# the spaces inside the quotes, appending -m with an empty value.)
if [[ -n "${MAX_GLOBAL_STEPS:-}" ]]; then
  DOCKER_CMD+=(-m "$MAX_GLOBAL_STEPS")
fi

"${DOCKER_CMD[@]}"