Commit 67f1a8a

Merge pull request #37 from andrewdelman/master
Post ECCO download tutorials
2 parents 8a569df + d1119e6

11 files changed: 232 additions & 7 deletions

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -4,3 +4,4 @@ _templates
 *.ipynb_checkpoints*
 /Tutorials_as_Jupyter_Notebooks/.ipynb_checkpoints/*
 .README.md.swp
+/ECCO-ACCESS/Downloading_ECCO_Datasets_from_PODAAC/~/

ECCO-ACCESS/Downloading_ECCO_datasets_from_PODAAC/Tutorial_Python3_Jupyter_Notebook_Downloading_ECCO_Datasets_from_PODAAC.ipynb

Lines changed: 3 additions & 3 deletions
@@ -5,11 +5,11 @@
   "id": "d8999184",
   "metadata": {},
   "source": [
-   "# Tutorial Python 3 Notebook for Downloading ECCO Datasets from PO.DAAC\n",
+   "# Using Python to Download ECCO Datasets\n",
    "\n",
-   "**Note: This notebook is modified from the tutorial on the [ECCO-GROUP Github](https://github.com/ECCO-GROUP/ECCO-ACCESS/blob/master/PODAAC/Downloading_ECCO_datasets_from_PODAAC/Tutorial_Python3_Jupyter_Notebook_Downloading_ECCO_Datasets_from_PODAAC.ipynb) by Jack McNelis and Ian Fenty, Version 1.1 dated 2021-06-25.**\n",
+   "**Note: This notebook was modified by Andrew Delman (updated 2022-12-15) from the tutorial on the [ECCO-GROUP Github](https://github.com/ECCO-GROUP/ECCO-ACCESS/blob/master/PODAAC/Downloading_ECCO_datasets_from_PODAAC/Tutorial_Python3_Jupyter_Notebook_Downloading_ECCO_Datasets_from_PODAAC.ipynb) by Jack McNelis and Ian Fenty, Version 1.1 dated 2021-06-25.**\n",
    "\n",
-   "This notebook provides instructions for downloading a set of granules (files) for an ECCO \"Dataset\" hosted by PO.DAAC. The focus is on downloading datasets in the lat-lon-cap 90 (llc90) native grid of the ECCO v4 simulations, since the tutorials mostly use output on the native grid. If you're new to this grid geometry, don't worry! The ecco_v4_py package discussed in the previous tutorial will help you load the ECCO output, make computations, and plot the results while hardly needing to interact with the model grid.\n",
+   "This Jupyter notebook provides instructions and Python code for downloading a set of granules (files) for an ECCO \"Dataset\" hosted by PO.DAAC. The focus is on downloading datasets in the lat-lon-cap 90 (llc90) native grid of the ECCO v4 simulations, since the tutorials mostly use output on the native grid. If you're new to this grid geometry, don't worry! The ecco_v4_py package discussed in the previous tutorial will help you load the ECCO output, make computations, and plot the results while hardly needing to interact with the model grid.\n",
    "\n",
    "The example ECCO Dataset used in this tutorial is \"ECCO Sea Surface Height - Daily Mean llc90 Grid (Version 4 Release 4)\" which provides daily mean sea surface height on the native llc90 grid ([10.5067/ECL5D-SSH44](https://doi.org/10.5067/ECL5D-SSH44)). \n",
    "\n",

ECCO-ACCESS/Downloading_ECCO_datasets_from_PODAAC/Tutorial_wget_Command_Line_HTTPS_Downloading_ECCO_Datasets_from_PODAAC.md

Lines changed: 2 additions & 2 deletions
@@ -1,4 +1,4 @@
-# Tutorial: Using Command Line _wget_ to Download ECCO Datasets from PO.DAAC
+# Using _wget_ to Download ECCO Datasets from PO.DAAC

 Version 1.0 2021-06-25

@@ -91,4 +91,4 @@ $wget --no-verbose \
 --no-clobber \
 --continue \
 -i 5237392644-download.txt -P data/
-```
+```
(Binary file changed, 6.54 KB; contents not shown.)
Lines changed: 220 additions & 0 deletions
@@ -0,0 +1,220 @@

def ecco_podaac_download(ShortName,StartDate,EndDate,download_root_dir=None,n_workers=6,force_redownload=False):
    """
    This routine downloads ECCO datasets from PO.DAAC. It is adapted from the Jupyter notebooks created by Jack McNelis and Ian Fenty (https://github.com/ECCO-GROUP/ECCO-ACCESS/blob/master/PODAAC/Downloading_ECCO_datasets_from_PODAAC/README.md) and modified by Andrew Delman (https://ecco-v4-python-tutorial.readthedocs.io).

    Parameters
    ----------
    ShortName: the ShortName of the dataset (can be identified from https://search.earthdata.nasa.gov/search?fpj=ECCO by selecting the "i" information button; the ShortName appears in a gray box in the upper-left corner)

    StartDate: the start of the time range to be downloaded, expressed in the format "YYYY-MM-DD"

    EndDate: the end of the time range to be downloaded, expressed in the format "YYYY-MM-DD"

    download_root_dir: path of the parent directory for downloaded ECCO files

    n_workers: number of workers to use in concurrent downloads (benefits typically taper off above 5-6)

    force_redownload: if True, existing files will be re-downloaded and replaced; if False, existing files will not be replaced
    """

    ## Initialize Python libraries
    import numpy as np
    import pandas as pd
    import requests
    import time

    # for concurrent simultaneous downloads
    from concurrent.futures import ThreadPoolExecutor
    from getpass import getpass
    from http.cookiejar import CookieJar
    from io import StringIO
    from itertools import repeat
    from pathlib import Path
    from platform import system
    from netrc import netrc
    from os.path import basename, isfile, isdir, join, expanduser
    # progress bar
    from tqdm import tqdm
    # library to download files
    from urllib import request

    # if no download directory is specified, set a directory under the user's home directory
    if download_root_dir is None:
        user_home_dir = expanduser('~')
        download_root_dir = Path(user_home_dir, 'Downloads', 'ECCO_V4r4_PODAAC')
    else:
        download_root_dir = Path(download_root_dir)

    # predict the path of the netrc file depending on os/platform type
    _netrc = join(expanduser('~'), "_netrc" if system()=="Windows" else ".netrc")

    ## Define helper subroutines

    ### Helper subroutine to log into NASA Earthdata
    def setup_earthdata_login_auth(url: str='urs.earthdata.nasa.gov'):
        # look for the netrc file and use the login/password from it
        try:
            username, _, password = netrc(file=_netrc).authenticators(url)
        # if the file is not found, prompt the user for the login/password
        except (FileNotFoundError, TypeError):
            print('Please provide Earthdata Login credentials for access.')
            username, password = input('Username: '), getpass('Password: ')

        manager = request.HTTPPasswordMgrWithDefaultRealm()
        manager.add_password(None, url, username, password)
        auth = request.HTTPBasicAuthHandler(manager)
        jar = CookieJar()
        processor = request.HTTPCookieProcessor(jar)
        opener = request.build_opener(auth, processor)
        request.install_opener(opener)

    ### Helper subroutines to make the API calls to search CMR and parse the response
    def set_params(params: dict):
        params.update({'scroll': "true", 'page_size': 2000})
        return {par: val for par, val in params.items() if val is not None}

    def get_results(params: dict, headers: dict=None):
        response = requests.get(url="https://cmr.earthdata.nasa.gov/search/granules.csv",
                                params=set_params(params),
                                headers=headers)
        return response, response.headers

    def get_granules(params: dict):
        response, headers = get_results(params=params)
        scroll = headers['CMR-Scroll-Id']
        hits = int(headers['CMR-Hits'])
        if hits == 0:
            raise Exception("No granules matched your input parameters.")
        df = pd.read_csv(StringIO(response.text))
        while hits > df.index.size:
            response, _ = get_results(params=params, headers={'CMR-Scroll-Id': scroll})
            data = pd.read_csv(StringIO(response.text))
            df = pd.concat([df, data])
        return df

    ### Helper subroutine to gracefully download a single file, avoiding re-download if the file already exists
    def download_file(url: str, output_dir: str, force: bool=False):
        """url (str): the HTTPS url from which the file will download
        output_dir (str): the local path into which the file will download
        force (bool): download even if the file exists locally already
        """
        if not isdir(output_dir):
            raise Exception(f"Output directory doesn't exist! ({output_dir})")

        target_file = join(output_dir, basename(url))

        # if the file has already been downloaded, skip it
        if isfile(target_file) and force is False:
            print(f'\n{basename(url)} already exists, and force=False, not re-downloading')
            return 0

        with requests.get(url, stream=True) as r:
            if not r.status_code // 100 == 2:
                raise Exception(r.text)
            with open(target_file, 'wb') as f:
                total_size_in_bytes = int(r.headers.get('content-length', 0))
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)

        return total_size_in_bytes

    ### Helper subroutine to download all urls in the list `dls`
    def download_files_concurrently(dls, download_dir, force=False):
        start_time = time.time()

        # use n_workers threads for concurrent downloads
        with ThreadPoolExecutor(max_workers=n_workers) as executor:
            # tqdm provides a progress bar
            results = list(tqdm(executor.map(download_file, dls, repeat(download_dir), repeat(force)), total=len(dls)))

        # add up the total downloaded file sizes
        total_download_size_in_bytes = np.sum(np.array(results))
        # calculate total time spent in the download
        total_time = time.time() - start_time

        print('\n=====================================')
        print(f'total downloaded: {np.round(total_download_size_in_bytes/1e6,2)} MB')
        print(f'avg download speed: {np.round(total_download_size_in_bytes/1e6/total_time,2)} MB/s')

    # define the directory where the downloaded files will be saved
    download_dir = download_root_dir / ShortName

    # create the download directory
    download_dir.mkdir(exist_ok=True, parents=True)

    print(f'created download directory {download_dir}')

    ## Log into Earthdata using your username and password
    setup_earthdata_login_auth()

    ## Query the NASA Common Metadata Repository (CMR) to find the URL of every granule
    ## associated with the desired ECCO Dataset and date range of interest

    # create a Python dictionary with our search criteria: `ShortName` and `temporal`
    input_search_params = {'ShortName': ShortName,
                           'temporal': ",".join([StartDate, EndDate])}

    print(input_search_params)

    ### Query CMR for the desired ECCO Dataset

    # "grans" means granules, PO.DAAC's term for the individual files in a dataset
    grans = get_granules(input_search_params)

    num_grans = len(grans['Granule UR'])
    print(f'\nTotal number of matching granules: {num_grans}')

    ## Download the granules

    # convert the rows of the 'Online Access URLs' column to a Python list
    dls = grans['Online Access URLs'].tolist()

    try:
        ### Method 1: concurrent downloads
        # attempt concurrent downloads; if an error arises, switch to sequential downloads

        # force redownload (or not) depending on the value of force_redownload
        download_files_concurrently(dls, download_dir, force_redownload)

    except Exception:
        ### Method 2: sequential downloads
        # download each URL sequentially in a for loop
        total_download_size_in_bytes = 0
        start_time = time.time()

        # loop through all urls in dls
        for u in dls:
            u_name = u.split('/')[-1]
            print(f'downloading {u_name}')
            total_download_size_in_bytes += download_file(url=u, output_dir=download_dir, force=force_redownload)

        # calculate total time spent in the download
        total_time = time.time() - start_time

        print('\n=====================================')
        print(f'total downloaded: {np.round(total_download_size_in_bytes/1e6,2)} MB')
        print(f'avg download speed: {np.round(total_download_size_in_bytes/1e6/total_time,2)} MB/s')
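For reference, a minimal usage sketch of the routine above. The ShortName used here is an assumption: it is believed to be the PO.DAAC ShortName for the daily-mean SSH llc90 dataset named in the notebook diff, but it should be verified on Earthdata Search before use. The actual download call is commented out because it requires Earthdata Login credentials and network access.

```python
# Hypothetical usage sketch (assumes ecco_podaac_download from the new file
# above is importable; the ShortName below is an assumed value -- verify it
# via https://search.earthdata.nasa.gov/search?fpj=ECCO).
ShortName = 'ECCO_L4_SSH_LLC0090GRID_DAILY_V4R4'
StartDate, EndDate = '2000-01-01', '2000-01-07'

# This mirrors how the routine builds its CMR search criteria internally:
input_search_params = {'ShortName': ShortName,
                       'temporal': ",".join([StartDate, EndDate])}
print(input_search_params)

# The download itself (commented out: needs Earthdata login and network):
# ecco_podaac_download(ShortName, StartDate, EndDate,
#                      download_root_dir=None,  # defaults to ~/Downloads/ECCO_V4r4_PODAAC
#                      n_workers=6, force_redownload=False)
```

The `temporal` value is simply the start and end dates joined with a comma, which is the range format CMR's granule search accepts.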
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+../ECCO-ACCESS/Downloading_ECCO_datasets_from_PODAAC/Tutorial_Python3_Jupyter_Notebook_Downloading_ECCO_Datasets_from_PODAAC.ipynb
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+../ECCO-ACCESS/Downloading_ECCO_datasets_from_PODAAC/Tutorial_wget_Command_Line_HTTPS_Downloading_ECCO_Datasets_from_PODAAC.md

doc/conf.py

Lines changed: 2 additions & 2 deletions
@@ -91,9 +91,9 @@
 # built documents.
 #
 # The short X.Y version.
-version = u'4.3-20191128'
+version = u'4.4-20221215'
 # The full version, including alpha/beta/rc tags.
-release = u'4.3-20191128'
+release = u'4.4-20221215'

 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
