|
13 | 13 | from .ecco_s3_retrieve import ecco_podaac_s3_get |
14 | 14 | from .ecco_s3_retrieve import ecco_podaac_s3_get_diskaware |
15 | 15 |
|
| 16 | +from .ecco_acc_dates import date_adjustment |
| 17 | + |
16 | 18 |
|
17 | 19 | def ecco_podaac_access(query,version='v4r4',grid=None,time_res='all',\ |
18 | 20 | StartDate=None,EndDate=None,\ |
@@ -249,3 +251,160 @@ def shortnames_find(query_list,grid,time_res): |
249 | 251 | granule_files[shortname] = granule_files[shortname][0] |
250 | 252 |
|
251 | 253 | return granule_files |
| 254 | + |
| 255 | + |
| 256 | + |
| 257 | +###================================================================================================================ |
| 258 | + |
| 259 | + |
def ecco_podaac_access_to_xrdataset(query,version='v4r4',grid=None,time_res='all',\
                                    StartDate=None,EndDate=None,\
                                    mode='download_ifspace',download_root_dir=None,**kwargs):
    """
    
    This function queries and accesses ECCO datasets from PO.DAAC. The core query and download functions are adapted from Jupyter notebooks 
    created by Jack McNelis and Ian Fenty 
    (https://github.com/ECCO-GROUP/ECCO-ACCESS/blob/master/PODAAC/Downloading_ECCO_datasets_from_PODAAC/README.md)
    and modified by Andrew Delman (https://ecco-v4-python-tutorial.readthedocs.io).
    It is similar to ecco_podaac_access, except instead of a list of URLs or files, 
    an xarray Dataset with all of the queried ECCO datasets is returned.
    
    Parameters
    ----------
    query: str, list, or dict, defines datasets or variables to access.
           If query is str, it specifies either a dataset ShortName (which is 
           assumed if the string begins with 'ECCO_'), or a text string that 
           can be used to search the ShortNames, variable names, and descriptions.
           A query may also be a list of multiple ShortNames and/or text searches, 
           or a dict that contains grid,time_res specifiers as keys and ShortNames 
           or text searches as values, e.g.,
           {'native,monthly':['ECCO_L4_SSH_LLC0090GRID_MONTHLY_V4R4',
                              'THETA']}
           will query the native grid monthly SSH datasets, and all native grid 
           monthly datasets with variables or descriptions matching 'THETA'.
    
    version: ('v4r4'), specifies ECCO version to query
    
    grid: ('native','latlon',None), specifies whether to query datasets with output
          on the native grid or the interpolated lat/lon grid.
          The default None will query both types of grids, unless specified 
          otherwise in a query dict (e.g., the example above).
    
    time_res: ('monthly','daily','snapshot','all'), specifies which time resolution 
              to include in query and downloads. 'all' includes all time resolutions, 
              and datasets that have no time dimension, such as the grid parameter 
              and mixing coefficient datasets.
    
    StartDate,EndDate: str, in 'YYYY', 'YYYY-MM', or 'YYYY-MM-DD' format, 
                       define date range [StartDate,EndDate] for download.
                       EndDate is included in the time range (unlike typical Python ranges).
                       Full ECCOv4r4 date range (default) is '1992-01-01' to '2017-12-31'.
                       For 'SNAPSHOT' datasets, an additional day is added to EndDate to enable closed budgets
                       within the specified date range.
    
    mode: str, one of the following:
          'ls' or 'query': Query dataset ShortNames and variable names/
                           descriptions only; no downloads.
          's3_ls' or 's3_query': Query dataset ShortNames and variable names/
                                 descriptions only; return paths on S3.
          'download': Download datasets using NASA Earthdata URLs
          'download_ifspace': Check storage availability before downloading.
                              Download only if storage footprint of downloads 
                              <= max_avail_frac*(available storage)
          'download_subset': Download spatial and temporal subsets of datasets 
                             via Opendap; query help(ecco_access.ecco_podaac_download_subset)
                             to see keyword arguments that can be used in this mode.
          The following modes work within the AWS cloud only:
          's3_open': Access datasets on S3 without downloading.
          's3_open_fsspec': Use json files (generated with `fsspec` and `kerchunk`) 
                            for expedited opening of datasets.
          's3_get': Download from S3 (to AWS EC2 instance).
          's3_get_ifspace': Check storage availability before downloading; 
                            download if storage footprint 
                            <= max_avail_frac*(available storage).
                            Otherwise data are opened "remotely" from S3 bucket.
    
    download_root_dir: str, defines parent directory to download files to.
                       Files will be downloaded to directory download_root_dir/ShortName/.
                       If not specified, parent directory defaults to '~/Downloads/ECCO_V4r4_PODAAC/'.
    
    Additional keyword arguments*:
    *This is not an exhaustive list, especially for 
    'download_subset' mode; use help(ecco_access.ecco_podaac_download_subset) to display 
    options specific to that mode
    
    max_avail_frac: float, maximum fraction of remaining available disk space to 
                    use in storing ECCO datasets.
                    If storing the datasets exceeds this fraction, an error is returned.
                    Valid range is [0,0.9]. If number provided is outside this range, it is replaced by the closer 
                    endpoint of the range.
    
    jsons_root_dir: str, for s3_open_fsspec mode only, the root/parent directory where the 
                    fsspec/kerchunk-generated jsons are found.
                    jsons are generated using the steps described here:
                    https://medium.com/pangeo/fake-it-until-you-make-it-reading-goes-netcdf4-data-on-aws-s3-as-zarr
                    -for-rapid-data-access-61e33f8fe685
                    and stored as {jsons_root_dir}/MZZ_{GRIDTYPE}_{TIME_RES}/{SHORTNAME}.json.
                    For v4r4, GRIDTYPE is '05DEG' or 'LLC0090GRID'.
                    TIME_RES is one of: ('MONTHLY','DAILY','SNAPSHOT','GEOMETRY','MIXING_COEFFS').
    
    n_workers: int, number of workers to use in concurrent downloads. Benefits typically taper off above 5-6.
    
    force_redownload: bool, if True, existing files will be redownloaded and replaced;
                      if False (default), existing files will not be replaced.
    
    return_granules: bool, if True (default), str or list of queried or downloaded granules/files 
                     (including ones that were already on disk and not replaced) is returned.
                     if False, the function returns nothing.
    
    Returns
    -------
    ds_out: xarray Dataset or dict of xarray Datasets (with ShortNames as keys), 
            containing all of the accessed datasets.
            Does not work with the query modes: 'ls','query','s3_ls','s3_query'.
    """
    
    # validate mode before doing any heavy imports or queries:
    # query-only modes return URL/ShortName listings, not openable files,
    # so they cannot produce an xarray Dataset
    if mode in ['ls','query','s3_ls','s3_query']:
        raise ValueError("ecco_podaac_access_to_xrdataset does not work with 'ls'/'query' modes. \n"\
                         +"Please use ecco_podaac_access with these modes.")
    
    import numpy as np
    import xarray as xr
    
    # submit access query (and download if needed)
    access_output = ecco_podaac_access(query,version,grid,time_res,\
                                       StartDate,EndDate,\
                                       mode,download_root_dir,**kwargs)
    
    # open xarray datasets, one per queried ShortName
    ds_out = {}
    for shortname,access_out in access_output.items():
        if mode == 's3_open_fsspec':
            # fsspec/kerchunk json reference files open through the zarr engine
            ds_out[shortname] = xr.open_dataset(access_out,engine='zarr',consolidated=False)
            if 'time' in ds_out[shortname].dims:
                # isolate time range specified
                # (fixed NameError: loop variable is shortname, not ShortName)
                startdate,enddate = date_adjustment(shortname,\
                                        StartDate,EndDate,CMR_query=False)
                time_values = ds_out[shortname].time.values.astype('datetime64[D]')
                in_time_range = np.logical_and(time_values >= startdate,\
                                               time_values <= enddate).nonzero()[0]
                ds_out[shortname] = ds_out[shortname].isel(time=in_time_range)
        else:
            # downloaded/opened granules: combine per-granule files into one dataset
            ds_out[shortname] = xr.open_mfdataset(access_out,\
                                    compat='override',data_vars='minimal',coords='minimal',\
                                    parallel=True)
    
    # if only one ShortName is involved, then extract dataset from dictionary
    if len(ds_out) == 1:
        ds_out = list(ds_out.values())[0]
    
    return ds_out
0 commit comments