Skip to content

Commit 1bf70b0

Browse files
committed
setting up capability to open using fsspec jsons
1 parent 8f1ca78 commit 1bf70b0

3 files changed

Lines changed: 92 additions & 9 deletions

File tree

ecco_access/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,11 @@
88

99
from .ecco_s3_retrieve import ecco_podaac_s3_query
1010
from .ecco_s3_retrieve import ecco_podaac_s3_open
11+
from .ecco_s3_retrieve import ecco_podaac_s3_open_fsspec
1112
from .ecco_s3_retrieve import ecco_podaac_s3_get
1213
from .ecco_s3_retrieve import ecco_podaac_s3_get_diskaware
1314

15+
1416
__all__ = ['ecco_access',
1517
'ecco_download',
1618
'ecco_s3_retrieve']

ecco_access/ecco_access.py

Lines changed: 28 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,11 @@
99

1010
from .ecco_s3_retrieve import ecco_podaac_s3_query
1111
from .ecco_s3_retrieve import ecco_podaac_s3_open
12+
from .ecco_s3_retrieve import ecco_podaac_s3_open_fsspec
1213
from .ecco_s3_retrieve import ecco_podaac_s3_get
1314
from .ecco_s3_retrieve import ecco_podaac_s3_get_diskaware
1415

1516

16-
1717
def ecco_podaac_access(query,version='v4r4',grid=None,time_res='all',\
1818
StartDate=None,EndDate=None,\
1919
mode='download_ifspace',download_root_dir=None,**kwargs):
@@ -54,7 +54,7 @@ def ecco_podaac_access(query,version='v4r4',grid=None,time_res='all',\
5454
StartDate,EndDate: str, in 'YYYY', 'YYYY-MM', or 'YYYY-MM-DD' format,
5555
define date range [StartDate,EndDate] for download.
5656
EndDate is included in the time range (unlike typical Python ranges).
57-
ECCOv4r4 date range is '1992-01-01' to '2017-12-31'.
57+
Full ECCOv4r4 date range (default) is '1992-01-01' to '2017-12-31'.
5858
For 'SNAPSHOT' datasets, an additional day is added to EndDate to enable closed budgets
5959
within the specified date range.
6060
@@ -72,13 +72,13 @@ def ecco_podaac_access(query,version='v4r4',grid=None,time_res='all',\
7272
to see keyword arguments that can be used in this mode.
7373
The following modes work within the AWS cloud only:
7474
's3_open': Access datasets on S3 without downloading.
75+
's3_open_fsspec': Use json files (generated with `fsspec` and `kerchunk`)
76+
for expedited opening of datasets.
7577
's3_get': Download from S3 (to AWS EC2 instance).
7678
's3_get_ifspace': Check storage availability before downloading;
7779
download if storage footprint
7880
<= max_avail_frac*(available storage).
7981
Otherwise data are opened "remotely" from S3 bucket.
80-
's3_fsspec': Use `fsspec` json files (generated with `kerchunk`)
81-
for expedited loading of datasets.
8282
8383
download_root_dir: str, defines parent directory to download files to.
8484
Files will be downloaded to directory download_root_dir/ShortName/.
@@ -95,6 +95,15 @@ def ecco_podaac_access(query,version='v4r4',grid=None,time_res='all',\
9595
Valid range is [0,0.9]. If number provided is outside this range, it is replaced by the closer
9696
endpoint of the range.
9797
98+
jsons_root_dir: str, for s3_open_fsspec mode only, the root/parent directory where the
99+
fsspec/kerchunk-generated jsons are found.
100+
jsons are generated using the steps described here:
101+
https://medium.com/pangeo/fake-it-until-you-make-it-reading-goes-netcdf4-data-on-aws-s3-as-zarr
102+
-for-rapid-data-access-61e33f8fe685
103+
and stored as {jsons_root_dir}/MZZ_{GRIDTYPE}_{TIME_RES}/{SHORTNAME}.json.
104+
For v4r4, GRIDTYPE is '05DEG' or 'LLC0090GRID'.
105+
TIME_RES is one of: ('MONTHLY','DAILY','SNAPSHOT','GEOMETRY','MIXING_COEFFS').
106+
98107
n_workers: int, number of workers to use in concurrent downloads. Benefits typically taper off above 5-6.
99108
100109
force_redownload: bool, if True, existing files will be redownloaded and replaced;
@@ -153,14 +162,22 @@ def shortnames_find(query_list,grid,time_res):
153162

154163
possible_mode_list = "['ls','query','s3_ls','s3_query','download',\n"\
155164
+"'download_ifspace','download_subset',\n"\
156-
+"'s3_open','s3_get','s3_get_ifspace','s3_fsspec']"
165+
+"'s3_open','s3_get','s3_get_ifspace','s3_open_fsspec']"
157166

158167
# set some default keyword arguments
159168
if (('n_workers' not in kwargs.keys()) and (mode != 'download_subset')):
160169
kwargs['n_workers'] = 6
161170
if 'force_redownload' not in kwargs.keys():
162171
kwargs['force_redownload'] = False
163-
172+
173+
# remove unneeded keyword arguments
174+
if mode == 's3_open_fsspec':
175+
for kwarg in list(kwargs.keys()):
176+
if kwarg != 'jsons_root_dir':
177+
del kwargs[kwarg]
178+
else:
179+
if 'jsons_root_dir' in kwargs.keys():
180+
del kwargs['jsons_root_dir']
164181

165182
# download or otherwise access granules, depending on mode
166183

@@ -206,14 +223,16 @@ def shortnames_find(query_list,grid,time_res):
206223
elif mode == 's3_open':
207224
granule_files[shortname] = ecco_podaac_s3_open(\
208225
shortname,StartDate,EndDate)
226+
elif mode == 's3_open_fsspec':
227+
# granule_files will consist of mapper objects rather than URL/path or file lists
228+
granule_files[shortname] = ecco_podaac_s3_open_fsspec(\
229+
shortname,**kwargs)
209230
elif mode == 's3_get':
210231
kwargs['return_downloaded_files'] = True
211232
granule_files[shortname] = ecco_podaac_s3_get(\
212233
shortname,StartDate,EndDate,\
213234
download_root_dir=download_root_dir,\
214235
**kwargs)
215-
elif mode == 's3_fsspec':
216-
print('Placeholder for jsons')
217236
else:
218237
raise ValueError('Invalid mode specified; please specify one of the following:'\
219238
+'\n'+possible_mode_list)
@@ -225,7 +244,7 @@ def shortnames_find(query_list,grid,time_res):
225244
return_granules = True
226245
if return_granules:
227246
for shortname in granule_files.keys():
228-
if len(granule_files[shortname]) == 1:
247+
if ((len(granule_files[shortname]) == 1) and (mode != 's3_open_fsspec')):
229248
# if only 1 file is downloaded, return a string of filename instead of a list
230249
granule_files[shortname] = granule_files[shortname][0]
231250

ecco_access/ecco_s3_retrieve.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import os.path
1111
from os.path import basename, isfile, isdir, join, expanduser
1212
from pathlib import Path
13+
import glob
1314

1415

1516
def ecco_podaac_s3_query(ShortName,StartDate,EndDate):
@@ -376,6 +377,67 @@ def ecco_podaac_s3_open(ShortName,StartDate,EndDate):
376377

377378

378379

380+
###================================================================================================================
381+
382+
383+
def ecco_podaac_s3_open_fsspec(ShortName,jsons_root_dir):
384+
385+
"""
386+
387+
This routine searches for and opens ECCO datasets from S3 buckets in the PO.DAAC Cloud.
388+
It returns a list of opened file(s) on S3 that can be passed to xarray.
389+
This function is intended to be called from an EC2 instance running in AWS region us-west-2.
390+
391+
Parameters
392+
----------
393+
ShortName: str, the ShortName that identifies the dataset on PO.DAAC.
394+
395+
jsons_root_dir: str, the root/parent directory where the
396+
fsspec/kerchunk-generated jsons are found.
397+
jsons are generated using the steps described here:
398+
https://medium.com/pangeo/fake-it-until-you-make-it-reading-goes-netcdf4-data-on-aws-s3-as-zarr
399+
-for-rapid-data-access-61e33f8fe685
400+
and stored as {jsons_root_dir}/MZZ_{GRIDTYPE}_{TIME_RES}/{SHORTNAME}.json.
401+
For v4r4, GRIDTYPE is '05DEG' or 'LLC0090GRID'.
402+
TIME_RES is one of: ('MONTHLY','DAILY','SNAPSHOT','GEOMETRY','MIXING_COEFFS').
403+
404+
Returns
405+
-------
406+
fsmap_obj: fsspec.mapping.FSMap object, can be passed directly to xarray.open_dataset
407+
(with engine='zarr')
408+
409+
"""
410+
411+
pass
412+
413+
import fsspec
414+
415+
416+
# identify where json file is found
417+
shortname_split = ShortName.split('_')
418+
gridtype = shortname_split[-3]
419+
if 'GEOMETRY' in ShortName:
420+
time_res = 'GEOMETRY'
421+
elif 'MIX_COEFFS' in ShortName:
422+
time_res = 'MIXING_COEFFS'
423+
else:
424+
time_res = shortname_split[-2]
425+
json_subdir = join(jsons_root_dir,"_".join(['MZZ',gridtype,time_res]))
426+
json_file = glob.glob(join(json_subdir,'*.json'))[0]
427+
428+
# generate map object
429+
fs = fsspec.filesystem(\
430+
"reference",
431+
fo=json_file,\
432+
remote_protocol="s3",
433+
remote_options={"anon":True},
434+
skip_instance_cache=True)
435+
fsmap_obj = fs.get_mapper("")
436+
437+
return fsmap_obj
438+
439+
440+
379441
###================================================================================================================
380442

381443

0 commit comments

Comments
 (0)