1- """
1+ ] """
22Example Usage:
33python run_workloads.py --framework jax \
44 --experiment_name my_first_experiment \
1717from absl import app
1818from absl import flags
1919from absl import logging
20+ import datetime
21+ import subprocess
2022
2123from algorithmic_efficiency import random_utils as prng
2224from algorithmic_efficiency .workloads .workloads import get_base_workload_name
2830 'URL to docker image' )
# Fraction of each workload's step budget to run. Adjacent string literals are
# concatenated verbatim, so the first literal ends with an explicit space to
# keep the rendered help text readable.
flags.DEFINE_integer(
    'run_percentage',
    100,
    'Percentage of max num steps to run for. '
    'Must set the flag enable_step_budget to True for this to take effect.')
flags.DEFINE_string('experiment_name',
                    'my_experiment',
                    'Name of top sub directory in experiment dir.')
8386 'If your algorithm has a smaller per step time than our baselines '
8487 'you may want to increase the number of steps per workload.' )
# Comma separated subset of workloads to run; main() validates every name
# against the known workloads. Help literals end with an explicit space because
# adjacent string literals are concatenated verbatim.
flags.DEFINE_string(
    'workloads',
    None,
    'String representing a comma separated list of workload names. '
    'If not None, only run these workloads, else run all workloads in '
    'workload_metadata_path.')
# Optional requirements file forwarded to the container start-up command.
flags.DEFINE_string('additional_requirements_path',
                    None,
                    'Path to requirements.txt if any.')
# Absolute step budget; only consulted when enable_step_budget is set.
flags.DEFINE_integer(
    'max_steps',
    None,
    'Maximum number of steps to run. Must set flag enable_step_budget. '
    'This flag takes precedence over the run_percentage flag.')
# Opt-in switch: without it neither run_percentage nor max_steps is applied.
flags.DEFINE_bool(
    'enable_step_budget',
    False,
    'Flag that has to be explicitly set to override time budgets to step '
    'budget percentage.')

FLAGS = flags.FLAGS
@@ -110,13 +125,31 @@ def container_running():
110125 else :
111126 return True
112127
def kill_containers(message=None):
  """Kill every container returned by the local Docker client's list call.

  Args:
    message: Optional human-readable reason. When given it is logged before
      any container is killed. Callers in this file pass the reason as the
      first positional argument, so the parameter must exist; it defaults to
      None so zero-argument calls keep working.
  """
  if message:
    logging.info(message)
  docker_client = docker.from_env()
  # containers.list() is called with its defaults, so only the containers the
  # Docker SDK reports by default are affected.
  for container in docker_client.containers.list():
    container.kill()
133+
def gpu_is_active():
  """Return True if nvidia-smi reports non-zero utilization for any GPU.

  Runs `nvidia-smi` once, asking for one bare utilization value per line
  (no header, no units), and parses each line as an integer.

  Returns:
    True as soon as one reported value is greater than zero, else False.

  Raises:
    subprocess.CalledProcessError: if nvidia-smi exits with a non-zero status.
    ValueError: if a reported line is not an integer.
  """
  query = [
      'nvidia-smi',
      '--query-gpu=utilization.gpu',
      '--format=csv,noheader,nounits',
  ]
  report = subprocess.check_output(query).decode()
  for utilization in report.splitlines():
    if int(utilization) > 0:
      return True
  return False
137+
113138
def wait_until_container_not_running(sleep_interval=5 * 60,
                                     gpu_inactivity_timeout=45 * 60):
  """Block until container_running() is false, killing stalled containers.

  On every poll the GPU utilization is sampled. If no GPU has shown activity
  for longer than `gpu_inactivity_timeout`, the containers are killed so that
  a hung run cannot block the remaining workloads.

  Args:
    sleep_interval: Seconds to sleep between polls.
    gpu_inactivity_timeout: Seconds the GPUs may stay idle before the
      containers are killed.
  """
  # Use a monotonic clock: only elapsed time matters here, and wall-clock
  # adjustments must not trigger (or suppress) the watchdog.
  gpu_last_active = time.monotonic()

  while container_running():
    now = time.monotonic()
    if gpu_is_active():
      gpu_last_active = now
    elif now - gpu_last_active > gpu_inactivity_timeout:
      # Log here and call kill_containers() without arguments so this works
      # regardless of whether kill_containers accepts a message.
      logging.info(
          'Killing container: GPUs have been inactive > %.0f minutes...',
          gpu_inactivity_timeout / 60)
      kill_containers()
    time.sleep(sleep_interval)
118152
119-
120153def main (_ ):
121154 framework = FLAGS .framework
122155 experiment_name = FLAGS .experiment_name
@@ -136,7 +169,13 @@ def main(_):
136169 study_end_index = FLAGS .study_end_index
137170 else :
138171 study_end_index = num_studies - 1
172+
173+ additional_requirements_path_flag = ''
174+ if FLAGS .additional_requirements_path :
175+ additional_requirements_path_flag = f'--additional_requirements_path { FLAGS .additional_requirements_path } '
176+
139177 submission_id = FLAGS .submission_id
178+
140179 rng_seed = FLAGS .seed
141180
142181 if not rng_seed :
@@ -148,17 +187,21 @@ def main(_):
148187 with open (FLAGS .workload_metadata_path ) as f :
149188 workload_metadata = json .load (f )
150189
190+ # Get list of all possible workloads
151191 workloads = [w for w in workload_metadata .keys ()]
152192
153- # Read held-out workloads
193+ # Read heldout workloads
154194 if FLAGS .held_out_workloads_config_path :
155195 held_out_workloads = read_held_out_workloads (
156196 FLAGS .held_out_workloads_config_path )
157197 workloads = workloads + held_out_workloads
158198
159- # Filter for single workload
160- if FLAGS .workload and (FLAGS .workload in workloads ):
161- workloads = [FLAGS .workload ]
199+ # Filter workloads if explicit workloads specified
200+ if FLAGS .workloads is not None :
201+ workloads = list (filter (lambda x : x in FLAGS .workloads .split (',' ), workloads ))
202+ if len (workloads ) != len (FLAGS .workloads .split (',' )):
203+ unmatched_workloads = set (FLAGS .workloads .split (',' )) - set (workloads )
204+ raise ValueError (f'Invalid workload name { unmatched_workloads } ' )
162205
163206 rng_subkeys = prng .split (rng_key , num_studies )
164207
@@ -178,20 +221,22 @@ def main(_):
178221 "sudo sh -c 'echo 3 > /proc/sys/vm/drop_caches'" ) # clear caches
179222 print ('=' * 100 )
180223 dataset = workload_metadata [base_workload_name ]['dataset' ]
181-
182224 max_steps_flag = ''
183- if FLAGS .enable_step_percentage :
184- run_fraction = FLAGS .run_percentage / 100.
225+ if FLAGS .enable_step_budget :
226+ run_fraction = FLAGS .run_percentage / 100.
227+ if FLAGS .max_steps is None :
185228 max_steps = int (workload_metadata [base_workload_name ]['max_steps' ] *
186- run_fraction )
187- max_steps_flag = f'-m { max_steps } '
188-
229+ run_fraction )
230+ else :
231+ max_steps = FLAGS .max_steps
232+ max_steps_flag = f'-m { max_steps } '
233+
189234 mount_repo_flag = ''
190235 if FLAGS .local :
191- mount_repo_flag = '-v $HOME /algorithmic-efficiency:/algorithmic-efficiency '
192- command = ('docker run -t -d -v $HOME /data/:/data/ '
193- '-v $HOME /experiment_runs/:/experiment_runs '
194- '-v $HOME /experiment_runs/logs:/logs '
236+ mount_repo_flag = '-v /home/kasimbeg /algorithmic-efficiency:/algorithmic-efficiency '
237+ command = ('docker run -t -d -v /home/kasimbeg /data/:/data/ '
238+ '-v /home/kasimbeg /experiment_runs/:/experiment_runs '
239+ '-v /home/kasimbeg /experiment_runs/logs:/logs '
195240 f'{ mount_repo_flag } '
196241 '--gpus all --ipc=host '
197242 f'{ docker_image_url } '
@@ -203,6 +248,7 @@ def main(_):
203248 f'{ max_steps_flag } '
204249 f'--num_tuning_trials { num_tuning_trials } '
205250 f'--rng_seed { run_seed } '
251+ f'{ additional_requirements_path_flag } '
206252 '-c false '
207253 '-o true '
208254 '-i true ' )
0 commit comments