Skip to content

Commit f9bff8f

Browse files
committed
Added xarray dataset output function and dates module
1 parent 1bf70b0 commit f9bff8f

5 files changed

Lines changed: 246 additions & 127 deletions

File tree

ecco_access/__init__.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from .ecco_access import ecco_podaac_access
2+
from .ecco_access import ecco_podaac_access_to_xrdataset
23

34
from .ecco_download import ecco_podaac_query
45
from .ecco_download import ecco_podaac_download
@@ -12,7 +13,10 @@
1213
from .ecco_s3_retrieve import ecco_podaac_s3_get
1314
from .ecco_s3_retrieve import ecco_podaac_s3_get_diskaware
1415

16+
from .ecco_acc_dates import date_adjustment
17+
1518

1619
__all__ = ['ecco_access',
1720
'ecco_download',
18-
'ecco_s3_retrieve']
21+
'ecco_s3_retrieve',
22+
'ecco_acc_dates']

ecco_access/ecco_acc_dates.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
### This module contains date handling routines used in the other ecco_access modules.
2+
3+
4+
import numpy as np
5+
6+
7+
def date_adjustment(ShortName,StartDate,EndDate,CMR_query=True):
    """
    Adjusts StartDate and EndDate, augmenting where day or month may be missing.
    Returns either strings ready for NASA Earthdata CMR query (CMR_query = True),
    or numpy.datetime64 values.

    Parameters
    ----------
    ShortName: str, dataset ShortName. Only checked for the substrings
               'SNAPSHOT', 'MONTHLY' and 'DAILY'.

    StartDate: str, in 'YYYY', 'YYYY-MM', or 'YYYY-MM-DD' format, or 'yesterday'.
               A missing month/day is filled with the first month/day.

    EndDate: str, in 'YYYY', 'YYYY-MM', or 'YYYY-MM-DD' format, or 'today',
             or the integer -1 (meaning "same as StartDate").
             A missing month/day is filled with the last month/day.

    CMR_query: bool, if True (default) return date strings adjusted for a CMR query,
               plus a single-day flag; if False return numpy.datetime64 values.

    Returns
    -------
    If CMR_query is True: (StartDate, EndDate, SingleDay_flag), with the dates
        as 'YYYY-MM-DD' strings. For ShortNames containing 'MONTHLY' or 'DAILY',
        StartDate is moved one day later when the range is longer than one day;
        otherwise SingleDay_flag is True and StartDate is left unchanged.
    If CMR_query is False: (StartDate, EndDate) as numpy.datetime64[D] values.

    In both cases EndDate is moved one day later when 'SNAPSHOT' is in ShortName.

    Raises
    ------
    SystemExit: if StartDate or EndDate has a length other than 4, 7 or 10
                characters (after keyword substitution).
    """

    # sys is not imported at module level in this file, so import it locally
    import sys

    one_day = np.timedelta64(1,'D')

    # # Resolve relative date keywords
    # NOTE(review): np.datetime64('today') is evaluated in UTC; confirm this
    # matches what the yesterday()/today() helpers were expected to return.
    if StartDate == 'yesterday':
        StartDate = str(np.datetime64('today','D') - one_day)
    if EndDate == -1:
        EndDate = StartDate
    elif EndDate == 'today':
        EndDate = str(np.datetime64('today','D'))

    # # Pad StartDate to the first day of the year/month
    if len(StartDate) == 4:
        StartDate += '-01-01'
    elif len(StartDate) == 7:
        StartDate += '-01'
    elif len(StartDate) != 10:
        # sys.exit (SystemExit) kept for backward compatibility with callers
        sys.exit("\nStart date should be in format 'YYYY', 'YYYY-MM', or 'YYYY-MM-DD'!\n"\
                 +"Program will exit now !\n")

    # # Pad EndDate to the last day of the year/month
    if len(EndDate) == 4:
        EndDate += '-12-31'
    elif len(EndDate) == 7:
        # last day of month = first day of the following month, minus one day
        next_month = np.datetime64(EndDate,'M') + np.timedelta64(1,'M')
        EndDate = str(next_month.astype('datetime64[D]') - one_day)
    elif len(EndDate) != 10:
        sys.exit("\nEnd date should be in format 'YYYY', 'YYYY-MM', or 'YYYY-MM-DD'!\n"\
                 +"Program will exit now !\n")

    # for snapshot datasets, move EndDate one day later
    if 'SNAPSHOT' in ShortName:
        EndDate = str(np.datetime64(EndDate,'D') + one_day)

    if not CMR_query:
        return np.datetime64(StartDate,'D'),np.datetime64(EndDate,'D')

    # CMR request adjustments
    SingleDay_flag = False
    if ('MONTHLY' in ShortName) or ('DAILY' in ShortName):
        if np.datetime64(EndDate,'D') - np.datetime64(StartDate,'D') > one_day:
            # for monthly and daily datasets, do not include the month or day before
            StartDate = str(np.datetime64(StartDate,'D') + one_day)
        else:
            # for single day ranges we need to make the adjustment
            # after the CMR request
            SingleDay_flag = True

    return StartDate,EndDate,SingleDay_flag

ecco_access/ecco_access.py

Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
from .ecco_s3_retrieve import ecco_podaac_s3_get
1414
from .ecco_s3_retrieve import ecco_podaac_s3_get_diskaware
1515

16+
from .ecco_acc_dates import date_adjustment
17+
1618

1719
def ecco_podaac_access(query,version='v4r4',grid=None,time_res='all',\
1820
StartDate=None,EndDate=None,\
@@ -249,3 +251,160 @@ def shortnames_find(query_list,grid,time_res):
249251
granule_files[shortname] = granule_files[shortname][0]
250252

251253
return granule_files
254+
255+
256+
257+
###================================================================================================================
258+
259+
260+
def ecco_podaac_access_to_xrdataset(query,version='v4r4',grid=None,time_res='all',\
                                    StartDate=None,EndDate=None,\
                                    mode='download_ifspace',download_root_dir=None,**kwargs):
    """

    This function queries and accesses ECCO datasets from PO.DAAC. The core query and download functions are adapted from Jupyter notebooks
    created by Jack McNelis and Ian Fenty
    (https://github.com/ECCO-GROUP/ECCO-ACCESS/blob/master/PODAAC/Downloading_ECCO_datasets_from_PODAAC/README.md)
    and modified by Andrew Delman (https://ecco-v4-python-tutorial.readthedocs.io).
    It is similar to ecco_podaac_access, except instead of a list of URLs or files,
    an xarray Dataset with all of the queried ECCO datasets is returned.

    Parameters
    ----------
    query: str, list, or dict, defines datasets or variables to access.
           If query is str, it specifies either a dataset ShortName (which is
           assumed if the string begins with 'ECCO_'), or a text string that
           can be used to search the ShortNames, variable names, and descriptions.
           A query may also be a list of multiple ShortNames and/or text searches,
           or a dict that contains grid,time_res specifiers as keys and ShortNames
           or text searches as values, e.g.,
           {'native,monthly':['ECCO_L4_SSH_LLC0090GRID_MONTHLY_V4R4',
                              'THETA']}
           will query the native grid monthly SSH datasets, and all native grid
           monthly datasets with variables or descriptions matching 'THETA'.

    version: ('v4r4'), specifies ECCO version to query

    grid: ('native','latlon',None), specifies whether to query datasets with output
          on the native grid or the interpolated lat/lon grid.
          The default None will query both types of grids, unless specified
          otherwise in a query dict (e.g., the example above).

    time_res: ('monthly','daily','snapshot','all'), specifies which time resolution
              to include in query and downloads. 'all' includes all time resolutions,
              and datasets that have no time dimension, such as the grid parameter
              and mixing coefficient datasets.


    StartDate,EndDate: str, in 'YYYY', 'YYYY-MM', or 'YYYY-MM-DD' format,
                       define date range [StartDate,EndDate] for download.
                       EndDate is included in the time range (unlike typical Python ranges).
                       Full ECCOv4r4 date range (default) is '1992-01-01' to '2017-12-31'.
                       For 'SNAPSHOT' datasets, an additional day is added to EndDate to enable closed budgets
                       within the specified date range.
                       In 's3_open_fsspec' mode, a bound left as None is not applied
                       when subsetting the opened dataset in time.

    mode: str, one of the following:
          'ls' or 'query': Query dataset ShortNames and variable names/
                           descriptions only; no downloads.
          's3_ls' or 's3_query': Query dataset ShortNames and variable names/
                                 descriptions only; return paths on S3.
          'download': Download datasets using NASA Earthdata URLs
          'download_ifspace': Check storage availability before downloading.
                              Download only if storage footprint of downloads
                              <= max_avail_frac*(available storage)
          'download_subset': Download spatial and temporal subsets of datasets
                             via Opendap; query help(ecco_access.ecco_podaac_download_subset)
                             to see keyword arguments that can be used in this mode.
          The following modes work within the AWS cloud only:
          's3_open': Access datasets on S3 without downloading.
          's3_open_fsspec': Use json files (generated with `fsspec` and `kerchunk`)
                            for expedited opening of datasets.
          's3_get': Download from S3 (to AWS EC2 instance).
          's3_get_ifspace': Check storage availability before downloading;
                            download if storage footprint
                            <= max_avail_frac*(available storage).
                            Otherwise data are opened "remotely" from S3 bucket.

    download_root_dir: str, defines parent directory to download files to.
                       Files will be downloaded to directory download_root_dir/ShortName/.
                       If not specified, parent directory defaults to '~/Downloads/ECCO_V4r4_PODAAC/'.

    Additional keyword arguments*:
    *This is not an exhaustive list, especially for
    'download_subset' mode; use help(ecco_access.ecco_podaac_download_subset) to display
    options specific to that mode

    max_avail_frac: float, maximum fraction of remaining available disk space to
                    use in storing ECCO datasets.
                    If storing the datasets exceeds this fraction, an error is returned.
                    Valid range is [0,0.9]. If number provided is outside this range, it is replaced by the closer
                    endpoint of the range.

    jsons_root_dir: str, for s3_open_fsspec mode only, the root/parent directory where the
                    fsspec/kerchunk-generated jsons are found.
                    jsons are generated using the steps described here:
                    https://medium.com/pangeo/fake-it-until-you-make-it-reading-goes-netcdf4-data-on-aws-s3-as-zarr
                    -for-rapid-data-access-61e33f8fe685
                    and stored as {jsons_root_dir}/MZZ_{GRIDTYPE}_{TIME_RES}/{SHORTNAME}.json.
                    For v4r4, GRIDTYPE is '05DEG' or 'LLC0090GRID'.
                    TIME_RES is one of: ('MONTHLY','DAILY','SNAPSHOT','GEOMETRY','MIXING_COEFFS').

    n_workers: int, number of workers to use in concurrent downloads. Benefits typically taper off above 5-6.

    force_redownload: bool, if True, existing files will be redownloaded and replaced;
                      if False (default), existing files will not be replaced.

    return_granules: bool, if True (default), str or list of queried or
                     downloaded granules/files (including ones that
                     were already on disk and not replaced) is returned.
                     if False, the function returns nothing.
                     NOTE(review): this function iterates over the output of
                     ecco_podaac_access, so passing return_granules=False
                     presumably cannot work here -- confirm.

    Returns
    -------
    ds_out: xarray Dataset or dict of xarray Datasets (with ShortNames as keys),
            containing all of the accessed datasets.
            Does not work with the query modes: 'ls','query','s3_ls','s3_query'.

    Raises
    ------
    ValueError: if mode is one of 'ls','query','s3_ls','s3_query'.
    """

    # raise error if mode is ls/query only
    # (checked before the imports below so that an invalid mode fails fast)
    if mode in ['ls','query','s3_ls','s3_query']:
        raise ValueError("ecco_podaac_access_to_xrdataset does not work with 'ls'/'query' modes. \n"\
                         +"Please use ecco_podaac_access with these modes.")

    import numpy as np
    import xarray as xr

    # submit access query (and download if needed)
    access_output = ecco_podaac_access(query,version,grid,time_res,\
                                       StartDate,EndDate,\
                                       mode,download_root_dir,**kwargs)

    # open xarray datasets
    ds_out = {}
    for shortname,access_out in access_output.items():
        if mode == 's3_open_fsspec':
            ds_curr = xr.open_dataset(access_out,engine='zarr',consolidated=False)
            date_range_given = (StartDate is not None) or (EndDate is not None)
            if ('time' in ds_curr.dims) and date_range_given:
                # isolate time range specified
                # date_adjustment needs both bounds; when one is None, the other
                # is passed as a placeholder and that side is not applied below
                start_in = StartDate if StartDate is not None else EndDate
                end_in = EndDate if EndDate is not None else StartDate
                startdate,enddate = date_adjustment(shortname,\
                                                    start_in,end_in,CMR_query=False)
                time_values = ds_curr.time.values.astype('datetime64[D]')
                in_time_range = np.ones(time_values.shape,dtype=bool)
                if StartDate is not None:
                    in_time_range &= (time_values >= startdate)
                if EndDate is not None:
                    in_time_range &= (time_values <= enddate)
                ds_curr = ds_curr.isel(time=in_time_range.nonzero()[0])
            ds_out[shortname] = ds_curr
        else:
            ds_out[shortname] = xr.open_mfdataset(access_out,\
                                                  compat='override',data_vars='minimal',coords='minimal',\
                                                  parallel=True)

    # if only one ShortName is involved, then extract dataset from dictionary
    if len(ds_out) == 1:
        ds_out = list(ds_out.values())[0]

    return ds_out

ecco_access/ecco_download.py

Lines changed: 7 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
### This module contains routines to download ECCO datasets using Python requests
22

33

4+
from .ecco_acc_dates import date_adjustment
5+
46
## Initialize Python libraries
57
import numpy as np
68
import pandas as pd
@@ -106,51 +108,13 @@ def get_granules(params: dict):
106108
return df
107109

108110

111+
109112
#=====================================================
110113

111114

112115
# # Adjust StartDate and EndDate to CMR query values
113-
114-
if StartDate=='yesterday':
115-
StartDate = yesterday()
116-
if EndDate==-1:
117-
EndDate = StartDate
118-
elif StartDate=='yesterday':
119-
StartDate = yesterday()
120-
elif EndDate=='today':
121-
EndDate = today()
122-
123-
if len(StartDate) == 4:
124-
StartDate += '-01-01'
125-
elif len(StartDate) == 7:
126-
StartDate += '-01'
127-
elif len(StartDate) != 10:
128-
sys.exit('\nStart date should be in format ''YYYY'', ''YYYY-MM'', or ''YYYY-MM-DD''!\n'\
129-
+'Program will exit now !\n')
130-
131-
if len(EndDate) == 4:
132-
EndDate += '-12-31'
133-
elif len(EndDate) == 7:
134-
EndDate = str(np.datetime64(str(np.datetime64(EndDate,'M')+np.timedelta64(1,'M'))+'-01','D')\
135-
-np.timedelta64(1,'D'))
136-
elif len(EndDate) != 10:
137-
sys.exit('\nEnd date should be in format ''YYYY'', ''YYYY-MM'', or ''YYYY-MM-DD''!\n'\
138-
+'Program will exit now !\n')
139-
140-
141-
# for monthly and daily datasets, do not include the month or day before
142-
if (('MONTHLY' in ShortName) or ('DAILY' in ShortName)):
143-
if np.datetime64(EndDate,'D') - np.datetime64(StartDate,'D') \
144-
> np.timedelta64(1,'D'):
145-
StartDate = str(np.datetime64(StartDate,'D') + np.timedelta64(1,'D'))
146-
SingleDay_flag = False
147-
else:
148-
# for single day ranges we need to make the adjustment
149-
# after the CMR request
150-
SingleDay_flag = True
151-
# for snapshot datasets, move EndDate one day later
152-
if 'SNAPSHOT' in ShortName:
153-
EndDate = str(np.datetime64(EndDate,'D') + np.timedelta64(1,'D'))
116+
StartDate,EndDate,SingleDay_flag = date_adjustment(ShortName,\
117+
StartDate,EndDate,CMR_query=True)
154118

155119

156120
## Log into Earthdata using your username and password
@@ -1011,46 +975,8 @@ def download_wrapper(url: str, url_append: str, download_dir: str, subset_file_i
1011975

1012976

1013977
# # Adjust StartDate and EndDate to CMR query values
1014-
1015-
if StartDate=='yesterday':
1016-
StartDate = yesterday()
1017-
if EndDate==-1:
1018-
EndDate = StartDate
1019-
elif StartDate=='yesterday':
1020-
StartDate = yesterday()
1021-
elif EndDate=='today':
1022-
EndDate = today()
1023-
1024-
if len(StartDate) == 4:
1025-
StartDate += '-01-01'
1026-
elif len(StartDate) == 7:
1027-
StartDate += '-01'
1028-
elif len(StartDate) != 10:
1029-
sys.exit('\nStart date should be in format ''YYYY'', ''YYYY-MM'', or ''YYYY-MM-DD''!\n'\
1030-
+'Program will exit now !\n')
1031-
1032-
if len(EndDate) == 4:
1033-
EndDate += '-12-31'
1034-
elif len(EndDate) == 7:
1035-
EndDate = str(np.datetime64(str(np.datetime64(EndDate,'M')+np.timedelta64(1,'M'))+'-01','D')\
1036-
-np.timedelta64(1,'D'))
1037-
elif len(EndDate) != 10:
1038-
sys.exit('\nEnd date should be in format ''YYYY-MM-DD''!\n'\
1039-
+'Program will exit now !\n')
1040-
1041-
# for monthly and daily datasets, do not include the month or day before
1042-
if (('MONTHLY' in ShortName) or ('DAILY' in ShortName)):
1043-
if np.datetime64(EndDate,'D') - np.datetime64(StartDate,'D') \
1044-
> np.timedelta64(1,'D'):
1045-
StartDate = str(np.datetime64(StartDate,'D') + np.timedelta64(1,'D'))
1046-
SingleDay_flag = False
1047-
else:
1048-
# for single day ranges we need to make the adjustment
1049-
# after the CMR request
1050-
SingleDay_flag = True
1051-
# for snapshot datasets, move EndDate one day later
1052-
if 'SNAPSHOT' in ShortName:
1053-
EndDate = str(np.datetime64(EndDate,'D') + np.timedelta64(1,'D'))
978+
StartDate,EndDate,SingleDay_flag = date_adjustment(ShortName,\
979+
StartDate,EndDate,CMR_query=True)
1054980

1055981

1056982
# set default download parent directory

0 commit comments

Comments
 (0)