|
| 1 | +# Retrieve data from the National Water Quality Assessment Program (NAWQA) |
| 2 | + |
| 3 | +import lithops |
| 4 | +import math |
| 5 | +import os |
| 6 | +import pandas as pd |
| 7 | + |
| 8 | +from dataretrieval import nldi, nwis, wqp |
| 9 | + |
| 10 | +DESTINATION_BUCKET = os.environ.get('DESTINATION_BUCKET') |
| 11 | +PROJECT = "National Water Quality Assessment Program (NAWQA)" |
| 12 | + |
| 13 | + |
| 14 | +def map_retrieval(site): |
| 15 | + """Map function to pull data from NWIS and WQP""" |
| 16 | + site_list = find_neighboring_sites(site) |
| 17 | + # reformat for wqp |
| 18 | + site_list = [f"USGS-{site}" for site in site_list] |
| 19 | + |
| 20 | + df, _ = wqp.get_results(siteid=site_list, |
| 21 | + project=PROJECT, |
| 22 | + ) |
| 23 | + |
| 24 | + # merge sites |
| 25 | + df['MonitoringLocationIdentifier'] = f"USGS-{site}" |
| 26 | + |
| 27 | + if len(df) != 0: |
| 28 | + df.astype(str).to_parquet(f's3://{DESTINATION_BUCKET}/nwqn-samples.parquet', |
| 29 | + engine='pyarrow', |
| 30 | + partition_cols=['MonitoringLocationIdentifier'], |
| 31 | + compression='zstd') |
| 32 | + # optionally, `return df` for further processing |
| 33 | + |
| 34 | + |
| 35 | +def find_neighboring_sites(site, search_factor=0.05): |
| 36 | + """Find sites upstream and downstream of the given site within a certain distance. |
| 37 | +
|
| 38 | + Parameters |
| 39 | + ---------- |
| 40 | + site : str |
| 41 | + 8-digit site number. |
| 42 | + search_factor : float, optional |
| 43 | + """ |
| 44 | + site_df, _ = nwis.get_info(sites=site) |
| 45 | + drain_area_sq_mi = site_df["drain_area_va"].values[0] |
| 46 | + length = _estimate_watershed_length_km(drain_area_sq_mi) |
| 47 | + search_distance = length * search_factor |
| 48 | + # clip between 1 and 9999km |
| 49 | + search_distance = max(1.0, min(9999.0, search_distance)) |
| 50 | + |
| 51 | + upstream_gdf = nldi.get_features( |
| 52 | + feature_source="WQP", |
| 53 | + feature_id=f"USGS-{site}", |
| 54 | + navigation_mode="UM", |
| 55 | + distance=search_distance, |
| 56 | + data_source="nwissite", |
| 57 | + ) |
| 58 | + |
| 59 | + downstream_gdf = nldi.get_features( |
| 60 | + feature_source="WQP", |
| 61 | + feature_id=f"USGS-{site}", |
| 62 | + navigation_mode="DM", |
| 63 | + distance=search_distance, |
| 64 | + data_source="nwissite", |
| 65 | + ) |
| 66 | + |
| 67 | + features = pd.concat([upstream_gdf, downstream_gdf], ignore_index=True) |
| 68 | + |
| 69 | + df, _ = nwis.get_info(sites=list(features.identifier.str.strip('USGS-'))) |
| 70 | + # drop sites with disimilar different drainage areas |
| 71 | + df = df.where( |
| 72 | + (df["drain_area_va"] / drain_area_sq_mi) > search_factor, |
| 73 | + ).dropna(how="all") |
| 74 | + |
| 75 | + return df["site_no"].to_list() |
| 76 | + |
| 77 | + |
| 78 | +def _estimate_watershed_length_km(drain_area_sq_mi): |
| 79 | + """Estimate the diameter assuming a circular watershed. |
| 80 | +
|
| 81 | + Parameters |
| 82 | + ---------- |
| 83 | + drain_area_sq_mi : float |
| 84 | + The drainage area in square miles. |
| 85 | +
|
| 86 | + Returns |
| 87 | + ------- |
| 88 | + float |
| 89 | + The diameter of the watershed in kilometers. |
| 90 | + """ |
| 91 | + # assume a circular watershed |
| 92 | + length_miles = 2 * (drain_area_sq_mi / math.pi) ** 0.5 |
| 93 | + # convert to km |
| 94 | + return length_miles * 1.60934 |
| 95 | + |
| 96 | + |
| 97 | +if __name__ == "__main__": |
| 98 | + project = "National Water Quality Assessment Program (NAWQA)" |
| 99 | + |
| 100 | + site_df = pd.read_csv( |
| 101 | + 'NWQN_sites.csv', |
| 102 | + comment='#', |
| 103 | + dtype={'SITE_QW_ID': str, 'SITE_FLOW_ID': str}, |
| 104 | + ) |
| 105 | + |
| 106 | + site_list = site_df['SITE_QW_ID'].to_list() |
| 107 | + # site_list = site_list[:4] # prune for testing |
| 108 | + |
| 109 | + fexec = lithops.FunctionExecutor(config_file="lithops.yaml") |
| 110 | + futures = fexec.map(map_retrieval, site_list) |
| 111 | + |
| 112 | + futures.get_result() |
0 commit comments