Skip to content

Commit 11ea68f

Browse files
committed
add safety flag to enforce explicitly enabling step budgets
1 parent f30ce4f commit 11ea68f

1 file changed

Lines changed: 23 additions & 59 deletions

File tree

scoring/run_workloads.py

Lines changed: 23 additions & 59 deletions
Original file line number | Diff line number | Diff line change
@@ -17,8 +17,6 @@
1717
from absl import app
1818
from absl import flags
1919
from absl import logging
20-
import datetime
21-
import subprocess
2220

2321
from algorithmic_efficiency import random_utils as prng
2422
from algorithmic_efficiency.workloads.workloads import get_base_workload_name
@@ -30,7 +28,7 @@
3028
'URL to docker image')
3129
flags.DEFINE_integer('run_percentage',
3230
100,
33-
'Percentage of max num steps to run for.')
31+
'Percentage of max num steps to run for. Must set enable_step_percentage to true for this to take effect.')
3432
flags.DEFINE_string('experiment_name',
3533
'my_experiment',
3634
'Name of top sub directory in experiment dir.')
@@ -85,21 +83,14 @@
8583
'If your algorithm has a smaller per step time than our baselines '
8684
'you may want to increase the number of steps per workload.')
8785
flags.DEFINE_string(
88-
'workloads',
86+
'workload',
8987
None,
90-
'String representing a comma separated list of workload names.'
9188
'If not None, only run this workload, else run all workloads in workload_metadata_path.'
9289
)
9390
flags.DEFINE_string(
94-
'additional_requirements_path',
95-
None,
96-
'Path to requirements.txt if any.'
97-
)
98-
flags.DEFINE_integer(
99-
'max_steps',
100-
None,
101-
'Maximum number of steps to run. If run_fraction results in greater number of steps '
102-
'than the max_steps, the run will be cut to max_steps.'
91+
'enable_step_percentage',
92+
False,
93+
'By default ignore step_fraction such that scoring is bounded by time budget.'
10394
)
10495

10596
FLAGS = flags.FLAGS
@@ -119,34 +110,15 @@ def container_running():
119110
else:
120111
return True
121112

122-
def kill_containers():
123-
docker_client = docker.from_env()
124-
containers = docker_client.containers.list()
125-
for container in containers:
126-
container.kill()
127-
128-
def gpu_is_active():
129-
output = subprocess.check_output(['nvidia-smi', '--query-gpu=utilization.gpu', '--format=csv,noheader,nounits'])
130-
return any(int(x) > 0 for x in output.decode().splitlines())
131-
132113

133114
def wait_until_container_not_running(sleep_interval=5 * 60):
134-
# check gpu util
135-
# if the gpu has not been utilized for 30 minutes kill the
136-
gpu_last_active = datetime.datetime.now().timestamp()
137-
138115
while container_running():
139-
# check if gpus have been inactive > 45 min and if so terminate container
140-
if gpu_is_active():
141-
gpu_last_active = datetime.datetime.now().timestamp()
142-
if (datetime.datetime.now().timestamp() - gpu_last_active) > 45 * 60:
143-
kill_containers("Killing container: GPUs have been inactive > 45 minutes...")
144116
time.sleep(sleep_interval)
145117
return
146118

119+
147120
def main(_):
148121
framework = FLAGS.framework
149-
run_fraction = FLAGS.run_percentage / 100.
150122
experiment_name = FLAGS.experiment_name
151123
docker_image_url = FLAGS.docker_image_url
152124
submission_path = FLAGS.submission_path
@@ -164,13 +136,7 @@ def main(_):
164136
study_end_index = FLAGS.study_end_index
165137
else:
166138
study_end_index = num_studies - 1
167-
168-
additional_requirements_path_flag = ''
169-
if FLAGS.additional_requirements_path:
170-
additional_requirements_path_flag = f'--additional_requirements_path {FLAGS.additional_requirements_path} '
171-
172139
submission_id = FLAGS.submission_id
173-
174140
rng_seed = FLAGS.seed
175141

176142
if not rng_seed:
@@ -182,21 +148,17 @@ def main(_):
182148
with open(FLAGS.workload_metadata_path) as f:
183149
workload_metadata = json.load(f)
184150

185-
# Get list of all possible workloads
186151
workloads = [w for w in workload_metadata.keys()]
187152

188-
# Read heldout workloads
153+
# Read held-out workloads
189154
if FLAGS.held_out_workloads_config_path:
190155
held_out_workloads = read_held_out_workloads(
191156
FLAGS.held_out_workloads_config_path)
192157
workloads = workloads + held_out_workloads
193158

194-
# Filter workloads if explicit workloads specified
195-
if FLAGS.workloads is not None:
196-
workloads = list(filter(lambda x: x in FLAGS.workloads.split(','), workloads))
197-
if len(workloads) != len(FLAGS.workloads.split(',')):
198-
unmatched_workloads = set(FLAGS.workloads.split(',')) - set(workloads)
199-
raise ValueError(f'Invalid workload name {unmatched_workloads}')
159+
# Filter for single workload
160+
if FLAGS.workload and (FLAGS.workload in workloads):
161+
workloads = [FLAGS.workload]
200162

201163
rng_subkeys = prng.split(rng_key, num_studies)
202164

@@ -216,17 +178,20 @@ def main(_):
216178
"sudo sh -c 'echo 3 > /proc/sys/vm/drop_caches'") # clear caches
217179
print('=' * 100)
218180
dataset = workload_metadata[base_workload_name]['dataset']
219-
if FLAGS.max_steps is None:
220-
max_steps = int(workload_metadata[base_workload_name]['max_steps'] *
221-
run_fraction)
222-
else:
223-
max_steps = FLAGS.max_steps
181+
182+
max_steps_flag = ''
183+
if FLAGS.enable_step_percentage:
184+
run_fraction = FLAGS.run_percentage / 100.
185+
max_steps = int(workload_metadata[base_workload_name]['max_steps'] *
186+
run_fraction)
187+
max_steps_flag = f'-m {max_steps}'
188+
224189
mount_repo_flag = ''
225190
if FLAGS.local:
226-
mount_repo_flag = '-v /home/kasimbeg/algorithmic-efficiency:/algorithmic-efficiency '
227-
command = ('docker run -t -d -v /home/kasimbeg/data/:/data/ '
228-
'-v /home/kasimbeg/experiment_runs/:/experiment_runs '
229-
'-v /home/kasimbeg/experiment_runs/logs:/logs '
191+
mount_repo_flag = '-v $HOME/algorithmic-efficiency:/algorithmic-efficiency '
192+
command = ('docker run -t -d -v $HOME/data/:/data/ '
193+
'-v $HOME/experiment_runs/:/experiment_runs '
194+
'-v $HOME/experiment_runs/logs:/logs '
230195
f'{mount_repo_flag}'
231196
'--gpus all --ipc=host '
232197
f'{docker_image_url} '
@@ -235,10 +200,9 @@ def main(_):
235200
f'-s {submission_path} '
236201
f'-w {workload} '
237202
f'-e {study_dir} '
238-
f'-m {max_steps} '
203+
f'{max_steps_flag} '
239204
f'--num_tuning_trials {num_tuning_trials} '
240205
f'--rng_seed {run_seed} '
241-
f'{additional_requirements_path_flag}'
242206
'-c false '
243207
'-o true '
244208
'-i true ')

0 commit comments

Comments
 (0)