Skip to content

Commit 97e9975

Browse files
committed
Make a script to easily run sbatch with minimal args
1 parent fb6cbe5 commit 97e9975

3 files changed

Lines changed: 342 additions & 64 deletions

File tree

scoring/utils/slurm/make_job_config.py

Lines changed: 5 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,6 @@
1818
TUNING_SEARCH_SPACE = (
1919
'reference_algorithms/paper_baselines/adamw/tuning_search_space.json'
2020
)
21-
NUM_TUNING_TRIALS = 3 # For external tuning ruleset
22-
NUM_STUDIES = 3
2321

2422
flags.DEFINE_string(
2523
'submission_path',
@@ -43,7 +41,6 @@
4341
help='Can be either pytorch or jax.',
4442
)
4543
flags.DEFINE_integer('seed', 0, 'RNG seed to generate study seeds from.')
46-
flags.DEFINE_integer('max_global_steps', None, 'Number of steps to run each workload for')
4744
flags.DEFINE_enum(
4845
'tuning_ruleset',
4946
'self',
@@ -53,14 +50,13 @@
5350
flags.DEFINE_string(
5451
'workloads', None, help='Comma separated list of workloads to run.'
5552
)
56-
flags.DEFINE_integer('num_studies', NUM_STUDIES, help='Number of studies.')
53+
flags.DEFINE_integer('num_studies', 3, help='Number of studies.')
54+
flags.DEFINE_integer('num_tuning_trials', 5, help='Number of tuning trials for external ruleset.')
5755

5856
FLAGS = flags.FLAGS
5957

6058
MIN_INT = -(2 ** (31))
6159
MAX_INT = 2 ** (31) - 1
62-
NUM_TUNING_TRIALS = 5 # For external tuning ruleset
63-
NUM_STUDIES = 3
6460

6561
WORKLOADS = {
6662
'imagenet_resnet': {'dataset': 'imagenet'},
@@ -91,10 +87,10 @@ def main(_):
9187
for workload in workloads:
9288
# Fold in hash(workload) mod(max(uint32))
9389
workload_key = jax.random.fold_in(key, hash(workload) % (2**32 - 1))
94-
for study_index in range(NUM_STUDIES):
90+
for study_index in range(FLAGS.num_studies):
9591
study_key = jax.random.fold_in(workload_key, study_index)
9692
if FLAGS.tuning_ruleset == 'external':
97-
for hparam_index in range(NUM_TUNING_TRIALS):
93+
for hparam_index in range(FLAGS.num_tuning_trials):
9894
run_key = jax.random.fold_in(study_key, hparam_index)
9995
seed = jax.random.randint(run_key, (1,), MIN_INT, MAX_INT)[0].item()
10096
print(seed)
@@ -108,13 +104,11 @@ def main(_):
108104
job['experiment_dir'] = study_dir
109105
job['rng_seed'] = seed
110106
job['tuning_ruleset'] = FLAGS.tuning_ruleset
111-
job['num_tuning_trials'] = NUM_TUNING_TRIALS
107+
job['num_tuning_trials'] = FLAGS.num_tuning_trials
112108
job['hparam_start_index'] = hparam_index
113109
job['hparam_end_index'] = hparam_index + 1
114110
job['tuning_search_space'] = FLAGS.tuning_search_space
115111
job['tuning_ruleset'] = FLAGS.tuning_ruleset
116-
if FLAGS.max_global_steps:
117-
job['max_global_steps'] = FLAGS.max_global_steps
118112
jobs.append(job)
119113
print(job)
120114

@@ -133,8 +127,6 @@ def main(_):
133127
job['rng_seed'] = seed
134128
job['tuning_ruleset'] = FLAGS.tuning_ruleset
135129
job['num_tuning_trials'] = 1
136-
if FLAGS.max_global_steps:
137-
job['max_global_steps'] = FLAGS.max_global_steps
138130

139131
jobs.append(job)
140132
print(job)

scoring/utils/slurm/run_jobs.sh

Lines changed: 133 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -9,24 +9,14 @@
99
#SBATCH --output=experiments/tests/updated_schedule_free/job_%A_%a.out
1010
#SBATCH --error=experiments/tests/updated_schedule_free/job_%A_%a.err
1111

12-
# Usage: sbatch <this file>.sh
12+
# Usage: sbatch <this file>.sh [options]
1313
# This script reads config.json and launches a sbatch job using task
1414
# arrays where each job in the array corresponds to a training run
1515
# for a workload given a random seed and tuning trial index.
1616
# To generate the config.json use make_job_config.py.
1717

1818
set -x
1919

20-
# Pull docker image (ATTENTION: you may want to modify this)
21-
REPO="europe-west4-docker.pkg.dev"
22-
IMAGE="europe-west4-docker.pkg.dev/mlcommons-algoperf/algoperf-docker-repo/algoperf_pytorch_main:latest"
23-
yes | gcloud auth configure-docker $REPO
24-
docker pull $IMAGE
25-
# Job config (ATTENTION: you may want to modify this)
26-
config_file="$HOME/algorithmic-efficiency/pytorch_scoring_config_1.json" # Replace with your config file path
27-
LOGS_BUCKET="algoperf-runs" # replace with your bucket used for logging
28-
29-
3020
# Function to read a JSON file and extract a value by key
3121
read_json_value() {
3222
local json_file="$1"
@@ -43,45 +33,137 @@ then
4333
exit 1
4434
fi
4535

46-
TASK="$SLURM_ARRAY_TASK_ID"
47-
FRAMEWORK=$(read_json_value "$config_file" "$TASK" "framework")
48-
DATASET=$(read_json_value "$config_file" "$TASK" "dataset")
49-
SUBMISSION_PATH=$(read_json_value "$config_file" "$TASK" "submission_path")
50-
FRAMEWORK=$(read_json_value "$config_file" "$TASK" "framework")
51-
TUNING_SEARCH_SPACE=$(read_json_value "$config_file" "$TASK" "tuning_search_space")
52-
EXPERIMENT_DIR=$(read_json_value "$config_file" "$TASK" "experiment_dir")
53-
MAX_STEPS=$(read_json_value "$config_file" "$TASK" "max_steps")
54-
RNG_SEED=$(read_json_value "$config_file" "$TASK" "rng_seed")
55-
WORKLOAD=$(read_json_value "$config_file" "$TASK" "workload")
56-
HPARAM_START_INDEX=$(read_json_value "$config_file" "$TASK" "hparam_start_index")
57-
HPARAM_END_INDEX=$(read_json_value "$config_file" "$TASK" "hparam_end_index")
58-
NUM_TUNING_TRIALS=$(read_json_value "$config_file" "$TASK" "num_tuning_trials")
59-
TUNING_RULESET=$(read_json_value "$config_file" "$TASK" "tuning_ruleset")
60-
MAX_GLOBAL_STEPS=$(read_json_value "$config_file" "$TASK" "max_global_steps")
36+
# Default values
37+
REPO="europe-west4-docker.pkg.dev"
38+
IMAGE="europe-west4-docker.pkg.dev/mlcommons-algoperf/algoperf-docker-repo/algoperf_pytorch_main:latest"
39+
CONFIG_FILE="$HOME/algorithmic-efficiency/pytorch_scoring_config_1.json"
40+
LOGS_BUCKET="algoperf-runs"
41+
TASK_ID="${SLURM_ARRAY_TASK_ID:-0}"
6142

62-
docker run \
63-
-v /opt/data/:/data/ \
64-
-v $HOME/experiment_runs:/experiment_runs \
65-
-v $HOME/submissions_algorithms/:/algorithmic-efficiency/submissions_algorithms \
66-
-v $HOME/algorithmic-efficiency/docker/scripts/startup.sh:/algorithmic-efficiency/docker/scripts/startup.sh \
67-
--gpus all \
68-
--ipc=host \
69-
$IMAGE \
70-
-d $DATASET \
71-
-f $FRAMEWORK \
72-
-s $SUBMISSION_PATH \
73-
-w $WORKLOAD \
74-
-t $TUNING_SEARCH_SPACE \
75-
-e $EXPERIMENT_DIR \
76-
-c False \
77-
-o True \
78-
--rng_seed $RNG_SEED \
79-
--hparam_start_index $HPARAM_START_INDEX \
80-
--hparam_end_index $HPARAM_END_INDEX \
81-
--num_tuning_trials $NUM_TUNING_TRIALS \
82-
--tuning_ruleset $TUNING_RULESET \
83-
-i true \
84-
-r false \
85-
--logs_bucket $LOGS_BUCKET \
86-
-m $MAX_GLOBAL_STEPS
43+
# Parse flags
44+
while [[ $# -gt 0 ]]; do
45+
case $1 in
46+
--repo)
47+
REPO="$2"
48+
shift 2
49+
;;
50+
--image)
51+
IMAGE="$2"
52+
shift 2
53+
;;
54+
--config_file)
55+
CONFIG_FILE="$2"
56+
shift 2
57+
;;
58+
--logs_bucket)
59+
LOGS_BUCKET="$2"
60+
shift 2
61+
;;
62+
--task_id)
63+
TASK_ID="$2"
64+
shift 2
65+
;;
66+
--framework)
67+
FRAMEWORK="$2"
68+
shift 2
69+
;;
70+
--dataset)
71+
DATASET="$2"
72+
shift 2
73+
;;
74+
--submission_path)
75+
SUBMISSION_PATH="$2"
76+
shift 2
77+
;;
78+
--tuning_search_space)
79+
TUNING_SEARCH_SPACE="$2"
80+
shift 2
81+
;;
82+
--experiment_dir)
83+
EXPERIMENT_DIR="$2"
84+
shift 2
85+
;;
86+
--rng_seed)
87+
RNG_SEED="$2"
88+
shift 2
89+
;;
90+
--workload)
91+
WORKLOAD="$2"
92+
shift 2
93+
;;
94+
--hparam_start_index)
95+
HPARAM_START_INDEX="$2"
96+
shift 2
97+
;;
98+
--hparam_end_index)
99+
HPARAM_END_INDEX="$2"
100+
shift 2
101+
;;
102+
--num_tuning_trials)
103+
NUM_TUNING_TRIALS="$2"
104+
shift 2
105+
;;
106+
--tuning_ruleset)
107+
TUNING_RULESET="$2"
108+
shift 2
109+
;;
110+
--max_global_steps)
111+
MAX_GLOBAL_STEPS="$2"
112+
shift 2
113+
;;
114+
*)
115+
echo "Unknown option $1"
116+
exit 1
117+
;;
118+
esac
119+
done
120+
121+
# Pull docker image
122+
yes | gcloud auth configure-docker "$REPO"
123+
docker pull "$IMAGE"
124+
125+
# Set variables from config file if not already set by flags
126+
FRAMEWORK="${FRAMEWORK:-$(read_json_value "$CONFIG_FILE" "$TASK_ID" "framework")}"
127+
DATASET="${DATASET:-$(read_json_value "$CONFIG_FILE" "$TASK_ID" "dataset")}"
128+
SUBMISSION_PATH="${SUBMISSION_PATH:-$(read_json_value "$CONFIG_FILE" "$TASK_ID" "submission_path")}"
129+
TUNING_SEARCH_SPACE="${TUNING_SEARCH_SPACE:-$(read_json_value "$CONFIG_FILE" "$TASK_ID" "tuning_search_space")}"
130+
EXPERIMENT_DIR="${EXPERIMENT_DIR:-$(read_json_value "$CONFIG_FILE" "$TASK_ID" "experiment_dir")}"
131+
RNG_SEED="${RNG_SEED:-$(read_json_value "$CONFIG_FILE" "$TASK_ID" "rng_seed")}"
132+
WORKLOAD="${WORKLOAD:-$(read_json_value "$CONFIG_FILE" "$TASK_ID" "workload")}"
133+
HPARAM_START_INDEX="${HPARAM_START_INDEX:-$(read_json_value "$CONFIG_FILE" "$TASK_ID" "hparam_start_index")}"
134+
HPARAM_END_INDEX="${HPARAM_END_INDEX:-$(read_json_value "$CONFIG_FILE" "$TASK_ID" "hparam_end_index")}"
135+
NUM_TUNING_TRIALS="${NUM_TUNING_TRIALS:-$(read_json_value "$CONFIG_FILE" "$TASK_ID" "num_tuning_trials")}"
136+
TUNING_RULESET="${TUNING_RULESET:-$(read_json_value "$CONFIG_FILE" "$TASK_ID" "tuning_ruleset")}"
137+
138+
DOCKER_CMD=(
139+
docker run
140+
-v /opt/data/:/data/
141+
-v "$HOME/experiment_runs:/experiment_runs"
142+
-v "$HOME/submissions_algorithms/:/algorithmic-efficiency/submissions_algorithms"
143+
-v "$HOME/algorithmic-efficiency/docker/scripts/startup.sh:/algorithmic-efficiency/docker/scripts/startup.sh"
144+
--gpus all
145+
--ipc=host
146+
"$IMAGE"
147+
-d "$DATASET"
148+
-f "$FRAMEWORK"
149+
-s "$SUBMISSION_PATH"
150+
-w "$WORKLOAD"
151+
-t "$TUNING_SEARCH_SPACE"
152+
-e "$EXPERIMENT_DIR"
153+
-c False
154+
-o True
155+
--rng_seed "$RNG_SEED"
156+
--hparam_start_index "$HPARAM_START_INDEX"
157+
--hparam_end_index "$HPARAM_END_INDEX"
158+
--num_tuning_trials "$NUM_TUNING_TRIALS"
159+
--tuning_ruleset "$TUNING_RULESET"
160+
-i true
161+
-r false
162+
--logs_bucket "$LOGS_BUCKET"
163+
)
164+
165+
if [ -n "$MAX_GLOBAL_STEPS" ]; then
166+
DOCKER_CMD+=(-m "$MAX_GLOBAL_STEPS")
167+
fi
87168

169+
"${DOCKER_CMD[@]}"

0 commit comments

Comments
 (0)