#SBATCH --nodes=1 # give it a full node
#SBATCH --ntasks-per-node=1
#SBATCH --array=0-26
#SBATCH --partition=a100
#SBATCH --gpus-per-node=4
#SBATCH --exclusive # this will not allow other jobs to run on this cluster
#SBATCH --output=experiments/tests/updated_schedule_free/job_%A_%a.out
#SBATCH --error=experiments/tests/updated_schedule_free/job_%A_%a.err

# Usage: sbatch <this file>.sh [options]
# This script reads config.json and launches a sbatch job using task
# arrays where each job in the array corresponds to a training run
# for a workload given a random seed and tuning trial index.
# To generate the config.json use make_job_config.py.

# Echo every command as it runs so array-job logs are easy to debug.
set -x
3020# Function to read a JSON file and extract a value by key
3121read_json_value () {
3222 local json_file=" $1 "
4333 exit 1
4434fi
4535
# Default values (each can be overridden by the flags parsed below).
REPO="europe-west4-docker.pkg.dev"
IMAGE="europe-west4-docker.pkg.dev/mlcommons-algoperf/algoperf-docker-repo/algoperf_pytorch_main:latest"
CONFIG_FILE="$HOME/algorithmic-efficiency/config.json"
LOGS_BUCKET="algoperf-runs"
# Which entry of the job config this array task runs; defaults to 0 so the
# script also works outside a SLURM array (e.g. a manual smoke test).
TASK_ID="${SLURM_ARRAY_TASK_ID:-0}"
# Optional cap on training steps; only forwarded to the container when set.
MAX_GLOBAL_STEPS=""

# Parse command-line flags (all optional; see defaults above).
while [[ $# -gt 0 ]]; do
  case $1 in
    --repo)
      REPO="$2"
      shift 2
      ;;
    --image)
      IMAGE="$2"
      shift 2
      ;;
    --config_file)
      CONFIG_FILE="$2"
      shift 2
      ;;
    --logs_bucket)
      LOGS_BUCKET="$2"
      shift 2
      ;;
    --max_global_steps)
      MAX_GLOBAL_STEPS="$2"
      shift 2
      ;;
    *)
      echo "Unknown option $1" >&2
      exit 1
      ;;
  esac
done
# Authenticate docker against the registry and pull the training image.
yes | gcloud auth configure-docker "$REPO"
docker pull "$IMAGE"

# Extract this array task's run configuration from the job config file.
# read_json_value is defined above: (config_file, task_index, key) -> value.
FRAMEWORK=$(read_json_value "$CONFIG_FILE" "$TASK_ID" "framework")
DATASET=$(read_json_value "$CONFIG_FILE" "$TASK_ID" "dataset")
SUBMISSION_PATH=$(read_json_value "$CONFIG_FILE" "$TASK_ID" "submission_path")
TUNING_SEARCH_SPACE=$(read_json_value "$CONFIG_FILE" "$TASK_ID" "tuning_search_space")
EXPERIMENT_DIR=$(read_json_value "$CONFIG_FILE" "$TASK_ID" "experiment_dir")
RNG_SEED=$(read_json_value "$CONFIG_FILE" "$TASK_ID" "rng_seed")
WORKLOAD=$(read_json_value "$CONFIG_FILE" "$TASK_ID" "workload")
HPARAM_START_INDEX=$(read_json_value "$CONFIG_FILE" "$TASK_ID" "hparam_start_index")
HPARAM_END_INDEX=$(read_json_value "$CONFIG_FILE" "$TASK_ID" "hparam_end_index")
NUM_TUNING_TRIALS=$(read_json_value "$CONFIG_FILE" "$TASK_ID" "num_tuning_trials")
TUNING_RULESET=$(read_json_value "$CONFIG_FILE" "$TASK_ID" "tuning_ruleset")
# Build the docker invocation as an array so arguments survive quoting
# and the optional -m flag can be appended conditionally below.
DOCKER_CMD=(
  docker run
  -v /opt/data/:/data/
  -v "$HOME/experiment_runs:/experiment_runs"
  -v "$HOME/submissions_algorithms/:/algorithmic-efficiency/submissions_algorithms"
  -v "$HOME/algorithmic-efficiency/docker/scripts/startup.sh:/algorithmic-efficiency/docker/scripts/startup.sh"
  --gpus all
  --ipc=host
  "$IMAGE"
  -d "$DATASET"
  -f "$FRAMEWORK"
  -s "$SUBMISSION_PATH"
  -w "$WORKLOAD"
  -t "$TUNING_SEARCH_SPACE"
  -e "$EXPERIMENT_DIR"
  -c False
  -o True
  --rng_seed "$RNG_SEED"
  --hparam_start_index "$HPARAM_START_INDEX"
  --hparam_end_index "$HPARAM_END_INDEX"
  --num_tuning_trials "$NUM_TUNING_TRIALS"
  --tuning_ruleset "$TUNING_RULESET"
  -i true
  -r false
  --logs_bucket "$LOGS_BUCKET"
)

# Forward the step cap only when --max_global_steps was supplied.
if [ -n "$MAX_GLOBAL_STEPS" ]; then
  DOCKER_CMD+=(-m "$MAX_GLOBAL_STEPS")
fi
# Launch the training container for this array task.
"${DOCKER_CMD[@]}"