Skip to content

Commit fcc03a2

Browse files
committed
updates to ECCO access modules
1 parent b968323 commit fcc03a2

2 files changed

Lines changed: 65 additions & 37 deletions

File tree

ECCO-ACCESS/ecco_download.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,10 @@ def ecco_podaac_download(ShortName,StartDate,EndDate,download_root_dir=None,n_wo
2020
ECCOv4r4 date range is '1992-01-01' to '2017-12-31'.
2121
For 'SNAPSHOT' datasets, an additional day is added to EndDate to enable closed budgets
2222
within the specified date range.
23+
24+
download_root_dir: str, defines parent directory to download files to.
25+
Files will be downloaded to directory download_root_dir/ShortName/.
26+
If not specified, parent directory defaults to '~/Downloads/ECCO_V4r4_PODAAC/'.
2327
2428
n_workers: int, number of workers to use in concurrent downloads. Benefits typically taper off above 5-6.
2529
@@ -168,6 +172,7 @@ def download_files_concurrently(dls, download_dir, n_workers, force=False):
168172
print(f'total downloaded: {np.round(total_download_size_in_bytes/1e6,2)} Mb')
169173
print(f'avg download speed: {np.round(total_download_size_in_bytes/1e6/total_time_download,2)} Mb/s')
170174
print('Time spent = ' + str(total_time_download) + ' seconds')
175+
print('\n')
171176

172177
# return list of downloaded files
173178
downloaded_files = []
@@ -309,6 +314,7 @@ def download_files_concurrently(dls, download_dir, n_workers, force=False):
309314
print(f'total downloaded: {np.round(total_download_size_in_bytes/1e6,2)} Mb')
310315
print(f'avg download speed: {np.round(total_download_size_in_bytes/1e6/total_time_download,2)} Mb/s')
311316
print('Time spent = ' + str(total_time_download) + ' seconds')
317+
print('\n')
312318

313319
if return_downloaded_files == True:
314320
if len(downloaded_files) == 1:
@@ -1007,7 +1013,8 @@ def download_wrapper(url: str, url_append: str, download_dir: str, subset_file_i
10071013
print('\n=====================================')
10081014
print(f'total downloaded: {np.round(total_download_size_in_bytes/1e6,2)} Mb')
10091015
print(f'avg download speed: {np.round(total_download_size_in_bytes/1e6/total_time_download,2)} Mb/s')
1010-
print('Time spent = ' + str(total_time_download) + ' seconds')
1016+
print('Time spent = ' + str(total_time_download) + ' seconds')
1017+
print('\n')
10111018

10121019
# Display dates of granules that were not downloaded successfully
10131020
status_codes_bad = (status_codes < 0).nonzero()[0]

ECCO-ACCESS/ecco_s3_retrieve.py

Lines changed: 57 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,35 @@ def download_file(s3, url, output_dir, force):
236236

237237

238238

239+
###================================================================================================================
240+
241+
242+
def download_files_concurrently(s3, dls, download_dir, n_workers, force=False):
    """
    Download files using a thread pool with up to n_workers threads.

    Parameters
    ----------
    s3: S3 file system object (e.g., as returned by init_S3FileSystem()),
        passed unchanged to download_file for every file.

    dls: list, the files (URLs/paths) to download. Must support len(),
         since its length sets the total of the progress bar.

    download_dir: str, directory passed to download_file as the output location.

    n_workers: int, maximum number of worker threads used for concurrent downloads.

    force: bool, passed unchanged to download_file for every file. Default is False.

    Returns
    -------
    downloaded_files: list, the values returned by download_file,
                      in the same order as the entries of dls.
    """

    # Imported locally so this module-level helper is self-contained and does
    # not rely on imports made inside the scope of another function.
    import sys
    import time
    from concurrent.futures import ThreadPoolExecutor
    from itertools import repeat
    from tqdm import tqdm

    start_time = time.time()

    # use thread pool for concurrent downloads
    with ThreadPoolExecutor(max_workers=n_workers) as executor:

        # executor.map yields results in input order; the constant arguments
        # are supplied to every call through repeat()
        results = executor.map(download_file, repeat(s3), dls,
                               repeat(download_dir), repeat(force))

        # tqdm wraps the result iterator to display a progress bar on stdout
        downloaded_files = list(tqdm(results, total=len(dls), desc='DL Progress',
                                     ascii=True, ncols=75, file=sys.stdout))

    # calculate total time spent in the download
    total_time_download = time.time() - start_time

    print('\n=====================================')
    print('Time spent = ' + str(total_time_download) + ' seconds')
    print('\n')

    return downloaded_files
265+
266+
267+
239268
###================================================================================================================
240269

241270

@@ -272,6 +301,7 @@ def download_files_wrapper(s3, s3_files_list, download_dir, n_workers, force_red
272301

273302
print('\n=====================================')
274303
print('Time spent = ' + str(total_time_download) + ' seconds')
304+
print('\n')
275305

276306
return downloaded_files
277307

@@ -349,6 +379,10 @@ def ecco_podaac_s3_get(ShortName,StartDate,EndDate,download_root_dir=None,n_work
349379
For 'SNAPSHOT' datasets, an additional day is added to EndDate to enable closed budgets
350380
within the specified date range.
351381
382+
download_root_dir: str, defines parent directory to download files to.
383+
Files will be downloaded to directory download_root_dir/ShortName/.
384+
If not specified, parent directory defaults to '~/Downloads/ECCO_V4r4_PODAAC/'.
385+
352386
n_workers: int, number of workers to use in concurrent downloads. Benefits typically taper off above 5-6.
353387
354388
force_redownload: bool, if True, existing files will be redownloaded and replaced;
@@ -371,28 +405,6 @@ def ecco_podaac_s3_get(ShortName,StartDate,EndDate,download_root_dir=None,n_work
371405
from concurrent.futures import ThreadPoolExecutor
372406

373407

374-
375-
### Helper subroutine to download all urls in the list `dls`
376-
def download_files_concurrently(s3, dls, download_dir, n_workers, force=False):
377-
start_time = time.time()
378-
379-
# use 3 threads for concurrent downloads
380-
with ThreadPoolExecutor(max_workers=n_workers) as executor:
381-
382-
# tqdm makes a cool progress bar
383-
downloaded_files = list(tqdm(executor.map(download_file, repeat(s3), dls, repeat(download_dir), repeat(force)),\
384-
total=len(dls), desc='DL Progress',\
385-
ascii=True, ncols=75, file=sys.stdout))
386-
387-
# calculate total time spent in the download
388-
total_time_download = time.time() - start_time
389-
390-
print('\n=====================================')
391-
print('Time spent = ' + str(total_time_download) + ' seconds')
392-
393-
return downloaded_files
394-
395-
396408
# set default download parent directory
397409
if download_root_dir==None:
398410
download_root_dir = join(expanduser('~'),'Downloads','ECCO_V4r4_PODAAC')
@@ -428,7 +440,7 @@ def download_files_concurrently(s3, dls, download_dir, n_workers, force=False):
428440
###================================================================================================================
429441

430442

431-
def ecco_podaac_s3_get_diskaware(ShortNames,StartDate,EndDate,max_avail_frac=0.5,snapshot_opt=None,download_root_dir=None,n_workers=6,\
443+
def ecco_podaac_s3_get_diskaware(ShortNames,StartDate,EndDate,max_avail_frac=0.5,snapshot_interval=None,download_root_dir=None,n_workers=6,\
432444
force_redownload=False):
433445

434446
"""
@@ -454,9 +466,13 @@ def ecco_podaac_s3_get_diskaware(ShortNames,StartDate,EndDate,max_avail_frac=0.5
454466
This determines whether the dataset files are stored on the current instance, or opened on S3.
455467
Valid range is [0,0.9]. If number provided is outside this range, it is replaced by the closer endpoint of the range.
456468
457-
snapshot_opt: ('monthly', 'daily', or None), if snapshot datasets are included in ShortNames, this determines whether
458-
snapshots are included for only the beginning/end of each month ('monthly'), or for every day ('daily').
459-
If None or not specified, defaults to 'daily' if any daily mean ShortNames are included and 'monthly' otherwise.
469+
snapshot_interval: ('monthly', 'daily', or None), if snapshot datasets are included in ShortNames, this determines whether
470+
snapshots are included for only the beginning/end of each month ('monthly'), or for every day ('daily').
471+
If None or not specified, defaults to 'daily' if any daily mean ShortNames are included and 'monthly' otherwise.
472+
473+
download_root_dir: str, defines parent directory to download files to.
474+
Files will be downloaded to directory download_root_dir/ShortName/.
475+
If not specified, parent directory defaults to '~/Downloads/ECCO_V4r4_PODAAC/'.
460476
461477
n_workers: int, number of workers to use in concurrent downloads. Benefits typically taper off above 5-6.
462478
Applies only if files are downloaded.
@@ -483,14 +499,18 @@ def ecco_podaac_s3_get_diskaware(ShortNames,StartDate,EndDate,max_avail_frac=0.5
483499
# initiate S3 access
484500
s3 = init_S3FileSystem()
485501

486-
# determine value of snapshot_opt if None or not specified
487-
if snapshot_opt == None:
488-
snapshot_opt = 'monthly'
502+
# determine value of snapshot_interval if None or not specified
503+
if snapshot_interval == None:
504+
snapshot_interval = 'monthly'
489505
for curr_shortname in ShortNames:
490506
if 'DAILY' in curr_shortname:
491-
snapshot_opt = 'daily'
507+
snapshot_interval = 'daily'
492508
break
493509

510+
# set default download parent directory
511+
if download_root_dir==None:
512+
download_root_dir = join(expanduser('~'),'Downloads','ECCO_V4r4_PODAAC')
513+
494514
# add up total size of files that would be downloaded
495515
dataset_sizes = np.array([])
496516
s3_files_list_all = []
@@ -499,8 +519,8 @@ def ecco_podaac_s3_get_diskaware(ShortNames,StartDate,EndDate,max_avail_frac=0.5
499519
# get list of files
500520
s3_files_list = ecco_podaac_s3_query(curr_shortname,StartDate,EndDate)
501521

502-
# for snapshot datasets with monthly snapshot_opt, only include snapshots at beginning/end of months
503-
if (('SNAPSHOT' in curr_shortname) and (snapshot_opt == 'monthly')):
522+
# for snapshot datasets with monthly snapshot_interval, only include snapshots at beginning/end of months
523+
if (('SNAPSHOT' in curr_shortname) and (snapshot_interval == 'monthly')):
504524
s3_files_list_copy = list(tuple(s3_files_list))
505525
for s3_file in s3_files_list:
506526
snapshot_date = re.findall("_[0-9]{4}-[0-9]{2}-[0-9]{2}",url)[0][1:]
@@ -509,6 +529,7 @@ def ecco_podaac_s3_get_diskaware(ShortNames,StartDate,EndDate,max_avail_frac=0.5
509529
s3_files_list = s3_files_list_copy
510530

511531
# compute size of current dataset
532+
download_dir = Path(download_root_dir) / curr_shortname
512533
curr_dataset_size = 0
513534
for s3_file in s3_files_list:
514535
if isfile(join(download_dir,basename(s3_file))) == False:
@@ -527,17 +548,17 @@ def ecco_podaac_s3_get_diskaware(ShortNames,StartDate,EndDate,max_avail_frac=0.5
527548
query_disk_completed = True
528549
except:
529550
try:
530-
query_dir = join(os.path.split(query_dir)[:-1])
531-
except:
551+
query_dir = join(*os.path.split(query_dir)[:-1])
552+
except:
532553
print('Error: can not detect available disk space for download_root_dir: '+download_root_dir)
533554
return -1
534555

535556
# fraction of available storage that would be occupied by downloads
536557
sizes_sum = np.sum(dataset_sizes)
537558
avail_frac = sizes_sum/avail_storage
538559

539-
print(f'Size of files to be downloaded to instance is {sizes_sum/(2**30)} GB,\n'\
540-
+f'which is {.01*np.round((1.e4)*avail_frac)}% of the {avail_storage/(2**30)} GB available storage.')
560+
print(f'Size of files to be downloaded to instance is {(1.e-3)*np.round((1.e3)*sizes_sum/(2**30))} GB,\n'\
561+
+f'which is {.01*np.round((1.e4)*avail_frac)}% of the {(1.e-3)*np.round((1.e3)*avail_storage/(2**30))} GB available storage.')
541562

542563
retrieved_files = {}
543564
if avail_frac <= max_avail_frac:

0 commit comments

Comments
 (0)