@@ -236,6 +236,35 @@ def download_file(s3, url, output_dir, force):
236236
237237
238238
239+ ###================================================================================================================
240+
241+
def download_files_concurrently(s3, dls, download_dir, n_workers, force=False):
    """
    Download files using a thread pool with up to n_workers threads.

    Each entry of `dls` is passed to `download_file` together with the same
    `s3`, `download_dir` and `force` values. A tqdm progress bar is written
    to stdout while the downloads run, and the elapsed wall-clock time is
    printed when they finish.

    Parameters
    ----------
    s3: object passed unchanged as the first argument of `download_file`.

    dls: iterable of items to download; each one is passed as the second
         argument of `download_file`.

    download_dir: passed unchanged as the third argument of `download_file`.

    n_workers: int, maximum number of worker threads in the pool.

    force: bool, passed unchanged as the fourth argument of `download_file`.

    Returns
    -------
    downloaded_files: list with the return value of `download_file` for each
                      entry of `dls`, in the same order as `dls`.
    """

    # Imported at function scope so this helper does not rely on names that
    # are only imported inside other functions of this module.
    import sys
    import time
    from concurrent.futures import ThreadPoolExecutor
    from itertools import repeat
    from tqdm import tqdm

    # Materialize once so len() below also works when a generator is passed.
    dls = list(dls)

    start_time = time.time()

    # use thread pool for concurrent downloads
    with ThreadPoolExecutor(max_workers=n_workers) as executor:

        # executor.map yields results in input order; wrapping it in tqdm
        # advances the progress bar as each result is consumed
        results = executor.map(download_file, repeat(s3), dls,
                               repeat(download_dir), repeat(force))
        downloaded_files = list(tqdm(results,
                                     total=len(dls), desc='DL Progress',
                                     ascii=True, ncols=75, file=sys.stdout))

    # calculate total time spent in the download
    total_time_download = time.time() - start_time

    print('\n =====================================')
    print('Time spent = ' + str(total_time_download) + ' seconds')
    print('\n ')

    return downloaded_files
265+
266+
267+
239268###================================================================================================================
240269
241270
@@ -272,6 +301,7 @@ def download_files_wrapper(s3, s3_files_list, download_dir, n_workers, force_red
272301
273302 print ('\n =====================================' )
274303 print ('Time spent = ' + str (total_time_download ) + ' seconds' )
304+ print ('\n ' )
275305
276306 return downloaded_files
277307
@@ -349,6 +379,10 @@ def ecco_podaac_s3_get(ShortName,StartDate,EndDate,download_root_dir=None,n_work
349379 For 'SNAPSHOT' datasets, an additional day is added to EndDate to enable closed budgets
350380 within the specified date range.
351381
382+ download_root_dir: str, defines parent directory to download files to.
383+ Files will be downloaded to directory download_root_dir/ShortName/.
384+ If not specified, parent directory defaults to '~/Downloads/ECCO_V4r4_PODAAC/'.
385+
352386 n_workers: int, number of workers to use in concurrent downloads. Benefits typically taper off above 5-6.
353387
354388 force_redownload: bool, if True, existing files will be redownloaded and replaced;
@@ -371,28 +405,6 @@ def ecco_podaac_s3_get(ShortName,StartDate,EndDate,download_root_dir=None,n_work
371405 from concurrent .futures import ThreadPoolExecutor
372406
373407
374-
375- ### Helper subroutine to download all urls in the list `dls`
376- def download_files_concurrently (s3 , dls , download_dir , n_workers , force = False ):
377- start_time = time .time ()
378-
379- # use 3 threads for concurrent downloads
380- with ThreadPoolExecutor (max_workers = n_workers ) as executor :
381-
382- # tqdm makes a cool progress bar
383- downloaded_files = list (tqdm (executor .map (download_file , repeat (s3 ), dls , repeat (download_dir ), repeat (force )),\
384- total = len (dls ), desc = 'DL Progress' ,\
385- ascii = True , ncols = 75 , file = sys .stdout ))
386-
387- # calculate total time spent in the download
388- total_time_download = time .time () - start_time
389-
390- print ('\n =====================================' )
391- print ('Time spent = ' + str (total_time_download ) + ' seconds' )
392-
393- return downloaded_files
394-
395-
396408 # set default download parent directory
397409 if download_root_dir == None :
398410 download_root_dir = join (expanduser ('~' ),'Downloads' ,'ECCO_V4r4_PODAAC' )
@@ -428,7 +440,7 @@ def download_files_concurrently(s3, dls, download_dir, n_workers, force=False):
428440###================================================================================================================
429441
430442
431- def ecco_podaac_s3_get_diskaware (ShortNames ,StartDate ,EndDate ,max_avail_frac = 0.5 ,snapshot_opt = None ,download_root_dir = None ,n_workers = 6 ,\
443+ def ecco_podaac_s3_get_diskaware (ShortNames ,StartDate ,EndDate ,max_avail_frac = 0.5 ,snapshot_interval = None ,download_root_dir = None ,n_workers = 6 ,\
432444 force_redownload = False ):
433445
434446 """
@@ -454,9 +466,13 @@ def ecco_podaac_s3_get_diskaware(ShortNames,StartDate,EndDate,max_avail_frac=0.5
454466 This determines whether the dataset files are stored on the current instance, or opened on S3.
455467 Valid range is [0,0.9]. If number provided is outside this range, it is replaced by the closer endpoint of the range.
456468
457- snapshot_opt: ('monthly', 'daily', or None), if snapshot datasets are included in ShortNames, this determines whether
458- snapshots are included for only the beginning/end of each month ('monthly'), or for every day ('daily').
459- If None or not specified, defaults to 'daily' if any daily mean ShortNames are included and 'monthly' otherwise.
469+ snapshot_interval: ('monthly', 'daily', or None), if snapshot datasets are included in ShortNames, this determines whether
470+ snapshots are included for only the beginning/end of each month ('monthly'), or for every day ('daily').
471+ If None or not specified, defaults to 'daily' if any daily mean ShortNames are included and 'monthly' otherwise.
472+
473+ download_root_dir: str, defines parent directory to download files to.
474+ Files will be downloaded to directory download_root_dir/ShortName/.
475+ If not specified, parent directory defaults to '~/Downloads/ECCO_V4r4_PODAAC/'.
460476
461477 n_workers: int, number of workers to use in concurrent downloads. Benefits typically taper off above 5-6.
462478 Applies only if files are downloaded.
@@ -483,14 +499,18 @@ def ecco_podaac_s3_get_diskaware(ShortNames,StartDate,EndDate,max_avail_frac=0.5
483499 # initiate S3 access
484500 s3 = init_S3FileSystem ()
485501
486- # determine value of snapshot_opt if None or not specified
487- if snapshot_opt == None :
488- snapshot_opt = 'monthly'
502+ # determine value of snapshot_interval if None or not specified
503+ if snapshot_interval == None :
504+ snapshot_interval = 'monthly'
489505 for curr_shortname in ShortNames :
490506 if 'DAILY' in curr_shortname :
491- snapshot_opt = 'daily'
507+ snapshot_interval = 'daily'
492508 break
493509
510+ # set default download parent directory
511+ if download_root_dir == None :
512+ download_root_dir = join (expanduser ('~' ),'Downloads' ,'ECCO_V4r4_PODAAC' )
513+
494514 # add up total size of files that would be downloaded
495515 dataset_sizes = np .array ([])
496516 s3_files_list_all = []
@@ -499,8 +519,8 @@ def ecco_podaac_s3_get_diskaware(ShortNames,StartDate,EndDate,max_avail_frac=0.5
499519 # get list of files
500520 s3_files_list = ecco_podaac_s3_query (curr_shortname ,StartDate ,EndDate )
501521
502- # for snapshot datasets with monthly snapshot_opt , only include snapshots at beginning/end of months
503- if (('SNAPSHOT' in curr_shortname ) and (snapshot_opt == 'monthly' )):
522+ # for snapshot datasets with monthly snapshot_interval , only include snapshots at beginning/end of months
523+ if (('SNAPSHOT' in curr_shortname ) and (snapshot_interval == 'monthly' )):
504524 s3_files_list_copy = list (tuple (s3_files_list ))
505525 for s3_file in s3_files_list :
506526 snapshot_date = re .findall ("_[0-9]{4}-[0-9]{2}-[0-9]{2}" ,url )[0 ][1 :]
@@ -509,6 +529,7 @@ def ecco_podaac_s3_get_diskaware(ShortNames,StartDate,EndDate,max_avail_frac=0.5
509529 s3_files_list = s3_files_list_copy
510530
511531 # compute size of current dataset
532+ download_dir = Path (download_root_dir ) / curr_shortname
512533 curr_dataset_size = 0
513534 for s3_file in s3_files_list :
514535 if isfile (join (download_dir ,basename (s3_file ))) == False :
@@ -527,17 +548,17 @@ def ecco_podaac_s3_get_diskaware(ShortNames,StartDate,EndDate,max_avail_frac=0.5
527548 query_disk_completed = True
528549 except :
529550 try :
530- query_dir = join (os .path .split (query_dir )[:- 1 ])
531- except :
551+ query_dir = join (* os .path .split (query_dir )[:- 1 ])
552+ except :
532553 print ('Error: can not detect available disk space for download_root_dir: ' + download_root_dir )
533554 return - 1
534555
535556 # fraction of available storage that would be occupied by downloads
536557 sizes_sum = np .sum (dataset_sizes )
537558 avail_frac = sizes_sum / avail_storage
538559
539- print (f'Size of files to be downloaded to instance is { sizes_sum / (2 ** 30 )} GB,\n ' \
540- + f'which is { .01 * np .round ((1.e4 )* avail_frac )} % of the { avail_storage / (2 ** 30 )} GB available storage.' )
560+ print (f'Size of files to be downloaded to instance is { ( 1.e-3 ) * np . round (( 1.e3 ) * sizes_sum / (2 ** 30 ) )} GB,\n ' \
561+ + f'which is { .01 * np .round ((1.e4 )* avail_frac )} % of the { ( 1.e-3 ) * np . round (( 1.e3 ) * avail_storage / (2 ** 30 ) )} GB available storage.' )
541562
542563 retrieved_files = {}
543564 if avail_frac <= max_avail_frac :
0 commit comments