1717from absl import app
1818from absl import flags
1919from absl import logging
20- import datetime
21- import subprocess
2220
2321from algorithmic_efficiency import random_utils as prng
2422from algorithmic_efficiency .workloads .workloads import get_base_workload_name
3028 'URL to docker image' )
# Step-budget percentage for each run and the experiment sub-directory name.
flags.DEFINE_integer(
    name='run_percentage',
    default=100,
    help='Percentage of max num steps to run for. Must set enable_step_percentage to true for this to take effect. ')
flags.DEFINE_string(
    name='experiment_name',
    default='my_experiment',
    help='Name of top sub directory in experiment dir.')
8583 'If your algorithm has a smaller per step time than our baselines '
8684 'you may want to increase the number of steps per workload.' )
# Optional single-workload selector; None (the default) keeps every workload.
flags.DEFINE_string(
    name='workload',
    default=None,
    help='If not None, only run this workload, else run all workloads in workload_metadata_path.')
# This is an on/off switch, so it must be a boolean flag: declared as a string
# flag, any value given on the command line (including 'false') would be a
# non-empty string and therefore truthy where main() tests it.
flags.DEFINE_boolean(
    'enable_step_percentage',
    False,
    'If true, limit each run to run_percentage percent of the workload max '
    'steps. By default run_percentage is ignored such that scoring is bounded '
    'by time budget.')
10495
10596FLAGS = flags .FLAGS
@@ -119,34 +110,15 @@ def container_running():
119110 else :
120111 return True
121112
122- def kill_containers ():
123- docker_client = docker .from_env ()
124- containers = docker_client .containers .list ()
125- for container in containers :
126- container .kill ()
127-
128- def gpu_is_active ():
129- output = subprocess .check_output (['nvidia-smi' , '--query-gpu=utilization.gpu' , '--format=csv,noheader,nounits' ])
130- return any (int (x ) > 0 for x in output .decode ().splitlines ())
131-
132113
def wait_until_container_not_running(sleep_interval=5 * 60):
  """Block until `container_running()` reports that no container is running.

  Args:
    sleep_interval: Seconds to sleep between successive checks
      (defaults to 5 minutes).
  """
  while True:
    if not container_running():
      break
    # Still running: wait before polling again.
    time.sleep(sleep_interval)
146118
119+
147120def main (_ ):
148121 framework = FLAGS .framework
149- run_fraction = FLAGS .run_percentage / 100.
150122 experiment_name = FLAGS .experiment_name
151123 docker_image_url = FLAGS .docker_image_url
152124 submission_path = FLAGS .submission_path
@@ -164,13 +136,7 @@ def main(_):
164136 study_end_index = FLAGS .study_end_index
165137 else :
166138 study_end_index = num_studies - 1
167-
168- additional_requirements_path_flag = ''
169- if FLAGS .additional_requirements_path :
170- additional_requirements_path_flag = f'--additional_requirements_path { FLAGS .additional_requirements_path } '
171-
172139 submission_id = FLAGS .submission_id
173-
174140 rng_seed = FLAGS .seed
175141
176142 if not rng_seed :
@@ -182,21 +148,17 @@ def main(_):
182148 with open (FLAGS .workload_metadata_path ) as f :
183149 workload_metadata = json .load (f )
184150
185- # Get list of all possible workloads
186151 workloads = [w for w in workload_metadata .keys ()]
187152
188- # Read heldout workloads
153+ # Read held-out workloads
189154 if FLAGS .held_out_workloads_config_path :
190155 held_out_workloads = read_held_out_workloads (
191156 FLAGS .held_out_workloads_config_path )
192157 workloads = workloads + held_out_workloads
193158
194- # Filter workloads if explicit workloads specified
195- if FLAGS .workloads is not None :
196- workloads = list (filter (lambda x : x in FLAGS .workloads .split (',' ), workloads ))
197- if len (workloads ) != len (FLAGS .workloads .split (',' )):
198- unmatched_workloads = set (FLAGS .workloads .split (',' )) - set (workloads )
199- raise ValueError (f'Invalid workload name { unmatched_workloads } ' )
159+ # Filter for single workload
160+ if FLAGS .workload and (FLAGS .workload in workloads ):
161+ workloads = [FLAGS .workload ]
200162
201163 rng_subkeys = prng .split (rng_key , num_studies )
202164
@@ -216,17 +178,20 @@ def main(_):
216178 "sudo sh -c 'echo 3 > /proc/sys/vm/drop_caches'" ) # clear caches
217179 print ('=' * 100 )
218180 dataset = workload_metadata [base_workload_name ]['dataset' ]
219- if FLAGS .max_steps is None :
220- max_steps = int (workload_metadata [base_workload_name ]['max_steps' ] *
221- run_fraction )
222- else :
223- max_steps = FLAGS .max_steps
181+
182+ max_steps_flag = ''
183+ if FLAGS .enable_step_percentage :
184+ run_fraction = FLAGS .run_percentage / 100.
185+ max_steps = int (workload_metadata [base_workload_name ]['max_steps' ] *
186+ run_fraction )
187+ max_steps_flag = f'-m { max_steps } '
188+
224189 mount_repo_flag = ''
225190 if FLAGS .local :
226- mount_repo_flag = '-v /home/kasimbeg /algorithmic-efficiency:/algorithmic-efficiency '
227- command = ('docker run -t -d -v /home/kasimbeg /data/:/data/ '
228- '-v /home/kasimbeg /experiment_runs/:/experiment_runs '
229- '-v /home/kasimbeg /experiment_runs/logs:/logs '
191+ mount_repo_flag = '-v $HOME /algorithmic-efficiency:/algorithmic-efficiency '
192+ command = ('docker run -t -d -v $HOME /data/:/data/ '
193+ '-v $HOME /experiment_runs/:/experiment_runs '
194+ '-v $HOME /experiment_runs/logs:/logs '
230195 f'{ mount_repo_flag } '
231196 '--gpus all --ipc=host '
232197 f'{ docker_image_url } '
@@ -235,10 +200,9 @@ def main(_):
235200 f'-s { submission_path } '
236201 f'-w { workload } '
237202 f'-e { study_dir } '
238- f'-m { max_steps } '
203+ f'{ max_steps_flag } '
239204 f'--num_tuning_trials { num_tuning_trials } '
240205 f'--rng_seed { run_seed } '
241- f'{ additional_requirements_path_flag } '
242206 '-c false '
243207 '-o true '
244208 '-i true ' )
0 commit comments