Skip to content

Commit 8a9d659

Browse files
committed
Update run_jobs.sh script for a100 submissions.
1. Ensure any variable can be passed in via flags. Folks shouldn't have to edit the file and hardcode variables for any reason. 2. Pass max global steps via a flag. 3. Update some default values for the new submission (repo/image/config file/logs bucket)
1 parent 8ed4856 commit 8a9d659

1 file changed

Lines changed: 92 additions & 54 deletions

File tree

scoring/utils/slurm/run_jobs.sh

Lines changed: 92 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -2,31 +2,21 @@
22

33
#SBATCH --nodes=1 # give it a full node
44
#SBATCH --ntasks-per-node=1
5-
#SBATCH --array=<fill with range of items in config, e.g 0-7 >
6-
#SBATCH --partition=v100
7-
#SBATCH --gpus-per-node=8
5+
#SBATCH --array=0-26
6+
#SBATCH --partition=a100
7+
#SBATCH --gpus-per-node=4
88
#SBATCH --exclusive #this will not allow other jobs to run on this cluster
9-
#SBATCH --output=experiments/tests/jit_debug_deepspeech_old_stephint_nadamw/job_%A_%a.out
10-
#SBATCH --error=experiments/tests/jit_debug_deepspeech_old_stephint_nadamw/job_%A_%a.err
9+
#SBATCH --output=experiments/tests/updated_schedule_free/job_%A_%a.out
10+
#SBATCH --error=experiments/tests/updated_schedule_free/job_%A_%a.err
1111

12-
# Usage: sbatch <this file>.sh
12+
# Usage: sbatch <this file>.sh [options]
1313
# This script reads config.json and launches a sbatch job using task
14-
# arrays where each job in the array corresponds to a training run
14+
# arrays where each job in the array corresponds to a training run
1515
# for a workload given a random seed and tuning trial index.
1616
# To generate the config.json use make_job_config.py.
1717

1818
set -x
1919

20-
# Pull docker image (ATTENTION: you may want to modify this)
21-
REPO=""
22-
IMAGE=""
23-
y | gcloud auth configure-docker $REPO
24-
docker pull $IMAGE
25-
# Job config (ATTENTION: you may want to modify this)
26-
config_file="" # Replace with your config file path
27-
LOGS_BUCKET="" # replace with your bucket used for logging
28-
29-
3020
# Function to read a JSON file and extract a value by key
3121
read_json_value() {
3222
local json_file="$1"
@@ -43,41 +33,89 @@ then
4333
exit 1
4434
fi
4535

46-
TASK="$SLURM_ARRAY_TASK_ID"
47-
FRAMEWORK=$(read_json_value "$config_file" "$TASK" "framework")
48-
DATASET=$(read_json_value "$config_file" "$TASK" "dataset")
49-
SUBMISSION_PATH=$(read_json_value "$config_file" "$TASK" "submission_path")
50-
FRAMEWORK=$(read_json_value "$config_file" "$TASK" "framework")
51-
TUNING_SEARCH_SPACE=$(read_json_value "$config_file" "$TASK" "tuning_search_space")
52-
EXPERIMENT_DIR=$(read_json_value "$config_file" "$TASK" "experiment_dir")
53-
MAX_STEPS=$(read_json_value "$config_file" "$TASK" "max_steps")
54-
RNG_SEED=$(read_json_value "$config_file" "$TASK" "rng_seed")
55-
WORKLOAD=$(read_json_value "$config_file" "$TASK" "workload")
56-
HPARAM_START_INDEX=$(read_json_value "$config_file" "$TASK" "hparam_start_index")
57-
HPARAM_END_INDEX=$(read_json_value "$config_file" "$TASK" "hparam_end_index")
58-
NUM_TUNING_TRIALS=$(read_json_value "$config_file" "$TASK" "num_tuning_trials")
59-
TUNING_RULESET=$(read_json_value "$config_file" "$TASK" "tuning_ruleset")
60-
MAX_GLOBAL_STEPS=$(read_json_value "$config_file" "$MAX_GLOBAL_STEPS" "max_global_steps")
36+
# Default values — every one of these can be overridden via the flags
# parsed below, so nothing needs to be hardcoded in this file.
REPO="europe-west4-docker.pkg.dev"
IMAGE="europe-west4-docker.pkg.dev/mlcommons-algoperf/algoperf-docker-repo/algoperf_pytorch_main:latest"
CONFIG_FILE="$HOME/algorithmic-efficiency/config.json"
LOGS_BUCKET="algoperf-runs"
# Fall back to task 0 so the script is also runnable outside a Slurm array.
TASK_ID="${SLURM_ARRAY_TASK_ID:-0}"
# Initialize explicitly: sbatch exports the submission environment, so an
# inherited MAX_GLOBAL_STEPS could otherwise leak in and silently cap runs.
MAX_GLOBAL_STEPS=""
42+
43+
# Parse command-line flags. Every option takes exactly one value; an
# unknown option or a flag with a missing value aborts the submission.
while [[ $# -gt 0 ]]; do
  case $1 in
    --repo|--image|--config_file|--logs_bucket|--max_global_steps)
      # Guard against a trailing flag with no value (e.g. "... --repo"):
      # an unchecked "shift 2" would otherwise misbehave or assign "".
      if [[ $# -lt 2 ]]; then
        echo "Option $1 requires a value" >&2
        exit 1
      fi
      ;;
    *)
      echo "Unknown option $1" >&2
      exit 1
      ;;
  esac
  case $1 in
    --repo)
      REPO="$2"
      shift 2
      ;;
    --image)
      IMAGE="$2"
      shift 2
      ;;
    --config_file)
      CONFIG_FILE="$2"
      shift 2
      ;;
    --logs_bucket)
      LOGS_BUCKET="$2"
      shift 2
      ;;
    --max_global_steps)
      MAX_GLOBAL_STEPS="$2"
      shift 2
      ;;
  esac
done
72+
73+
# Authenticate to the registry and pull the image up front; abort on
# failure so the job does not silently run against a stale/missing image.
yes | gcloud auth configure-docker "$REPO" || { echo "gcloud auth configure-docker failed for $REPO" >&2; exit 1; }
docker pull "$IMAGE" || { echo "docker pull failed for $IMAGE" >&2; exit 1; }
76+
77+
# Set per-task variables from the job config (one JSON entry per array task).
# NOTE(review): the file-existence check earlier in this script appears to run
# before the flag parser can override CONFIG_FILE — re-validate here, after
# parsing, so a bad --config_file path fails loudly instead of producing
# empty values for every field below.
if [[ ! -f "$CONFIG_FILE" ]]; then
  echo "Config file not found: $CONFIG_FILE" >&2
  exit 1
fi
FRAMEWORK=$(read_json_value "$CONFIG_FILE" "$TASK_ID" "framework")
DATASET=$(read_json_value "$CONFIG_FILE" "$TASK_ID" "dataset")
SUBMISSION_PATH=$(read_json_value "$CONFIG_FILE" "$TASK_ID" "submission_path")
TUNING_SEARCH_SPACE=$(read_json_value "$CONFIG_FILE" "$TASK_ID" "tuning_search_space")
EXPERIMENT_DIR=$(read_json_value "$CONFIG_FILE" "$TASK_ID" "experiment_dir")
RNG_SEED=$(read_json_value "$CONFIG_FILE" "$TASK_ID" "rng_seed")
WORKLOAD=$(read_json_value "$CONFIG_FILE" "$TASK_ID" "workload")
HPARAM_START_INDEX=$(read_json_value "$CONFIG_FILE" "$TASK_ID" "hparam_start_index")
HPARAM_END_INDEX=$(read_json_value "$CONFIG_FILE" "$TASK_ID" "hparam_end_index")
NUM_TUNING_TRIALS=$(read_json_value "$CONFIG_FILE" "$TASK_ID" "num_tuning_trials")
TUNING_RULESET=$(read_json_value "$CONFIG_FILE" "$TASK_ID" "tuning_ruleset")
89+
90+
# Assemble the docker invocation as an array so every argument stays a
# single, correctly quoted word even if a value contains spaces.
DOCKER_CMD=(
  docker run
  # Host mounts: dataset, experiment output, submissions, startup script.
  -v /opt/data/:/data/
  -v "$HOME/experiment_runs:/experiment_runs"
  -v "$HOME/submissions_algorithms/:/algorithmic-efficiency/submissions_algorithms"
  -v "$HOME/algorithmic-efficiency/docker/scripts/startup.sh:/algorithmic-efficiency/docker/scripts/startup.sh"
  --gpus all
  --ipc=host
  "$IMAGE"
  # Everything below is forwarded to the container entrypoint.
  -d "$DATASET"
  -f "$FRAMEWORK"
  -s "$SUBMISSION_PATH"
  -w "$WORKLOAD"
  -t "$TUNING_SEARCH_SPACE"
  -e "$EXPERIMENT_DIR"
  -c False
  -o True
  --rng_seed "$RNG_SEED"
  --hparam_start_index "$HPARAM_START_INDEX"
  --hparam_end_index "$HPARAM_END_INDEX"
  --num_tuning_trials "$NUM_TUNING_TRIALS"
  --tuning_ruleset "$TUNING_RULESET"
  -i true
  -r false
  --logs_bucket "$LOGS_BUCKET"
)

# Only cap global steps when the caller provided --max_global_steps.
# ${VAR:-} guards the expansion so this also works when the variable was
# never set (e.g. under a future `set -u`, or when flags were skipped).
if [[ -n "${MAX_GLOBAL_STEPS:-}" ]]; then
  DOCKER_CMD+=(-m "$MAX_GLOBAL_STEPS")
fi
61120

62-
docker run \
63-
-v /opt/data/:/data/ \
64-
-v $HOME/submissions_algorithms/:/algorithmic-efficiency/submissions_algorithms \
65-
--gpus all \
66-
--ipc=host \
67-
$IMAGE \
68-
-d $DATASET \
69-
-f $FRAMEWORK \
70-
-s $SUBMISSION_PATH \
71-
-w $WORKLOAD \
72-
-t $TUNING_SEARCH_SPACE \
73-
-e $EXPERIMENT_DIR \
74-
-c False \
75-
-o True \
76-
--rng_seed $RNG_SEED \
77-
--hparam_start_index $HPARAM_START_INDEX \
78-
--hparam_end_index $HPARAM_END_INDEX \
79-
--num_tuning_trials $NUM_TUNING_TRIALS \
80-
--tuning_ruleset $TUNING_RULESET \
81-
--logs_bucket $LOGS_BUCKET \
82-
-i true \
83-
-r false
121+
# Launch the run: expand the assembled array so each element is one argv word.
"${DOCKER_CMD[@]}"

0 commit comments

Comments
 (0)