Skip to content

Commit 4956a31

Browse files
committed
add flag for step budget
1 parent 078b5fa commit 4956a31

1 file changed

Lines changed: 67 additions & 21 deletions

File tree

scoring/run_workloads.py

Lines changed: 67 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
"""
1+
"""
22
Example Usage:
33
python run_workloads.py --framework jax \
44
--experiment_name my_first_experiment \
@@ -17,6 +17,8 @@
1717
from absl import app
1818
from absl import flags
1919
from absl import logging
20+
import datetime
21+
import subprocess
2022

2123
from algorithmic_efficiency import random_utils as prng
2224
from algorithmic_efficiency.workloads.workloads import get_base_workload_name
@@ -28,7 +30,8 @@
2830
'URL to docker image')
2931
# Percentage of each workload's max step count to run when a step budget is
# enabled via --enable_step_budget.
flags.DEFINE_integer(
    'run_percentage',
    100,
    # Adjacent string literals are joined verbatim, so the space separating
    # the two sentences has to live inside the first literal.
    'Percentage of max num steps to run for. '
    'Must set the flag enable_step_budget to True for this to take effect.')
3235
# Name of the top-level sub directory created under the experiment dir.
flags.DEFINE_string(
    name='experiment_name',
    default='my_experiment',
    help='Name of top sub directory in experiment dir.')
@@ -83,14 +86,26 @@
8386
'If your algorithm has a smaller per step time than our baselines '
8487
'you may want to increase the number of steps per workload.')
8588
# Optional comma separated subset of workloads to run; main() splits the value
# on ',' and rejects names that are not known workloads.
flags.DEFINE_string(
    'workloads',
    None,
    # Adjacent string literals are joined verbatim, so each fragment carries
    # its own trailing space.
    'String representing a comma separated list of workload names. '
    'If not None, only run these workloads, else run all workloads in '
    'workload_metadata_path.')
94+
# Optional requirements file; when set, main() forwards it to the container
# command as `--additional_requirements_path <path>`.
flags.DEFINE_string(
    name='additional_requirements_path',
    default=None,
    help='Path to requirements.txt if any.')
99+
# Absolute step budget. When set (and --enable_step_budget is on) main() uses
# this value instead of the run_percentage-derived step count.
flags.DEFINE_integer(
    'max_steps',
    None,
    # Adjacent string literals are joined verbatim, so the space separating
    # the two sentences has to live inside the first literal.
    'Maximum number of steps to run. Must set flag enable_step_budget. '
    'This flag takes precedence over the run_percentage flag.')
90105
# Opt-in switch: main() only passes a max-steps argument (`-m`) to the
# container when this flag is true.
flags.DEFINE_bool(
    name='enable_step_budget',
    default=False,
    help='Flag that has to be explicitly set to override time budgets to step budget percentage.')
95110

96111
FLAGS = flags.FLAGS
@@ -110,13 +125,31 @@ def container_running():
110125
else:
111126
return True
112127

128+
def kill_containers():
  """Kill every container listed by the environment-configured Docker client.

  Uses `containers.list()` with its default arguments and calls `kill()` on
  each returned container.
  """
  client = docker.from_env()
  for listed in client.containers.list():
    listed.kill()
133+
134+
def gpu_is_active():
  """Return True if nvidia-smi reports non-zero utilization on any GPU.

  Runs `nvidia-smi` with a header-less, unit-less CSV query for
  `utilization.gpu` and parses each output line as an integer.

  Raises:
    subprocess.CalledProcessError: if nvidia-smi exits with a non-zero status.
    ValueError: if an output line is not an integer.
  """
  query = [
      'nvidia-smi',
      '--query-gpu=utilization.gpu',
      '--format=csv,noheader,nounits',
  ]
  readings = subprocess.check_output(query).decode().splitlines()
  for reading in readings:
    if int(reading) > 0:
      return True
  return False
137+
113138

114139
def wait_until_container_not_running(sleep_interval=5 * 60,
                                     gpu_idle_timeout=45 * 60):
  """Block until no container is running, killing containers on idle GPUs.

  Polls `container_running()` every `sleep_interval` seconds. On each poll the
  GPU utilization is sampled; if no GPU activity has been observed for more
  than `gpu_idle_timeout` seconds, all containers are killed so that a hung
  job does not block the remaining workloads.

  Args:
    sleep_interval: Seconds to sleep between polls.
    gpu_idle_timeout: Seconds of continuous GPU inactivity after which the
      containers are killed. Note that activity is only sampled once per
      poll, so the effective resolution is `sleep_interval`.
  """
  # A monotonic clock is used because this is an elapsed-time measurement and
  # must not be affected by wall-clock adjustments.
  gpu_last_active = time.monotonic()

  while container_running():
    now = time.monotonic()
    if gpu_is_active():
      gpu_last_active = now
    elif now - gpu_last_active > gpu_idle_timeout:
      # kill_containers() takes no arguments; the message is logged here
      # rather than passed to it.
      logging.warning(
          'Killing container: GPUs have been inactive > %.0f minutes...',
          gpu_idle_timeout / 60)
      kill_containers()
    time.sleep(sleep_interval)
118152

119-
120153
def main(_):
121154
framework = FLAGS.framework
122155
experiment_name = FLAGS.experiment_name
@@ -136,7 +169,13 @@ def main(_):
136169
study_end_index = FLAGS.study_end_index
137170
else:
138171
study_end_index = num_studies - 1
172+
173+
additional_requirements_path_flag = ''
174+
if FLAGS.additional_requirements_path:
175+
additional_requirements_path_flag = f'--additional_requirements_path {FLAGS.additional_requirements_path} '
176+
139177
submission_id = FLAGS.submission_id
178+
140179
rng_seed = FLAGS.seed
141180

142181
if not rng_seed:
@@ -148,17 +187,21 @@ def main(_):
148187
with open(FLAGS.workload_metadata_path) as f:
149188
workload_metadata = json.load(f)
150189

190+
# Get list of all possible workloads
151191
workloads = [w for w in workload_metadata.keys()]
152192

153-
# Read held-out workloads
193+
# Read heldout workloads
154194
if FLAGS.held_out_workloads_config_path:
155195
held_out_workloads = read_held_out_workloads(
156196
FLAGS.held_out_workloads_config_path)
157197
workloads = workloads + held_out_workloads
158198

159-
# Filter for single workload
160-
if FLAGS.workload and (FLAGS.workload in workloads):
161-
workloads = [FLAGS.workload]
199+
# Filter workloads if explicit workloads specified
200+
if FLAGS.workloads is not None:
201+
workloads = list(filter(lambda x: x in FLAGS.workloads.split(','), workloads))
202+
if len(workloads) != len(FLAGS.workloads.split(',')):
203+
unmatched_workloads = set(FLAGS.workloads.split(',')) - set(workloads)
204+
raise ValueError(f'Invalid workload name {unmatched_workloads}')
162205

163206
rng_subkeys = prng.split(rng_key, num_studies)
164207

@@ -178,20 +221,22 @@ def main(_):
178221
"sudo sh -c 'echo 3 > /proc/sys/vm/drop_caches'") # clear caches
179222
print('=' * 100)
180223
dataset = workload_metadata[base_workload_name]['dataset']
181-
182224
max_steps_flag = ''
183-
if FLAGS.enable_step_percentage:
184-
run_fraction = FLAGS.run_percentage / 100.
225+
if FLAGS.enable_step_budget:
226+
run_fraction = FLAGS.run_percentage / 100.
227+
if FLAGS.max_steps is None:
185228
max_steps = int(workload_metadata[base_workload_name]['max_steps'] *
186-
run_fraction)
187-
max_steps_flag = f'-m {max_steps}'
188-
229+
run_fraction)
230+
else:
231+
max_steps = FLAGS.max_steps
232+
max_steps_flag = f'-m {max_steps}'
233+
189234
mount_repo_flag = ''
190235
if FLAGS.local:
191-
mount_repo_flag = '-v $HOME/algorithmic-efficiency:/algorithmic-efficiency '
192-
command = ('docker run -t -d -v $HOME/data/:/data/ '
193-
'-v $HOME/experiment_runs/:/experiment_runs '
194-
'-v $HOME/experiment_runs/logs:/logs '
236+
mount_repo_flag = '-v /home/kasimbeg/algorithmic-efficiency:/algorithmic-efficiency '
237+
command = ('docker run -t -d -v /home/kasimbeg/data/:/data/ '
238+
'-v /home/kasimbeg/experiment_runs/:/experiment_runs '
239+
'-v /home/kasimbeg/experiment_runs/logs:/logs '
195240
f'{mount_repo_flag}'
196241
'--gpus all --ipc=host '
197242
f'{docker_image_url} '
@@ -203,6 +248,7 @@ def main(_):
203248
f'{max_steps_flag} '
204249
f'--num_tuning_trials {num_tuning_trials} '
205250
f'--rng_seed {run_seed} '
251+
f'{additional_requirements_path_flag}'
206252
'-c false '
207253
'-o true '
208254
'-i true ')

0 commit comments

Comments
 (0)