Skip to content
This repository was archived by the owner on May 28, 2024. It is now read-only.

Commit 323c62e

Browse files
authored
Merge pull request #169 from lekoenig/format-model-inputs
Format model inputs in p2a_model.R
2 parents 8cdcd1d + ef82f49 commit 323c62e

2 files changed

Lines changed: 79 additions & 53 deletions

File tree

2a_model.R

Lines changed: 35 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ source("2a_model/src/write_model_config_files.R")
33

44
p2a_targets_list <- list(
55

6-
## PREPARE (RENAME, JOIN) INPUT AND OUTPUT FILES ##
6+
## 1) COMBINE AND FORMAT MODEL-READY INPUTS AND OUTPUTS ##
77
# join met data with light input data
88
tar_target(
99
p2a_met_light_data,
@@ -19,26 +19,30 @@ p2a_targets_list <- list(
1919
relocate(date, .after = COMID)
2020
),
2121

22-
# match site_ids to seg_ids
22+
# join met and light data with site_ids (resulting data frame will have
23+
# 16 unique COMIDs, which matches the number of well-observed reaches).
2324
tar_target(
2425
p2a_met_data_w_sites,
2526
match_site_ids_to_segs(p2a_met_light_data, p2_sites_w_segs)
2627
),
2728

28-
# match seg attributes with site_ids
29+
# join segment attributes with site_ids (resulting data frame will have one
30+
# row for each unique COMID x site_id in the lower DRB; n = 10,111).
2931
tar_target(
3032
p2a_seg_attr_w_sites,
3133
match_site_ids_to_segs(p2_seg_attr_data, p2_sites_w_segs)
3234
),
3335

34-
# join the metab data with the DO observations
36+
# join the metabolism data with the DO observations (use full_join to include
37+
# all rows in both the DO data and the metab data).
3538
tar_target(
3639
p2a_do_and_metab,
3740
p2_daily_with_seg_ids %>%
3841
full_join(p2_metab_filtered, by = c("site_id", "date"))
3942
),
4043

41-
## SPLIT SITES INTO (train) and (train and validation) ##
44+
45+
## 2) SPLIT SITES INTO (train) and (train and validation) ##
4246
# char vector of well-observed train sites
4347
tar_target(
4448
p2a_trn_sites,
@@ -87,7 +91,8 @@ p2a_targets_list <- list(
8791
sf::st_as_sf(., coords = c("lon","lat"), crs = unique(.$epsg))
8892
),
8993

90-
## WRITE MODEL CONFIGURATION FILES ##
94+
95+
## 3) WRITE MODEL CONFIGURATION FILES ##
9196
# Write base config file using inputs and parameters defined in _targets.R
9297
tar_target(
9398
p2a_config_base_yml,
@@ -141,35 +146,40 @@ p2a_targets_list <- list(
141146
format = "file"
142147
),
143148

144-
## WRITE OUT PARTITION INPUT AND OUTPUT DATA ##
145-
# write met and seg attribute data for trn/val sites to zarr
146-
# note - I have to subset inputs to only include the train/val sites before
147-
# passing to subset_and_write_zarr or else I get a memory error on the join
148149

149-
## CHANGING X VARIABLES ##
150-
#To change x variables for the model, they have to be added to the
151-
#model specific config.yml file which can be found in
152-
#2a_model/src/model/{model ID}/config.yml
153-
154-
# write trn and val input and output data to zarr
150+
## 4) WRITE OUT PARTITION INPUT AND OUTPUT DATA ##
151+
# Subset trn/val input and output data to well-observed sites and format
152+
# for export. [Jeff]: note - I have to subset inputs to only include the
153+
# train/val sites before passing to subset_and_write_zarr or else I get a
154+
# memory error on the join.
155155
tar_target(
156156
p2a_well_obs_data,
157157
{
158+
# use inner_join to keep sites that are within the set of trn/val sites
159+
# and are represented in both the met data and the seg attr data.
158160
inputs <- p2a_met_data_w_sites %>%
159161
filter(site_id %in% p2a_trn_val_sites) %>%
160162
inner_join(p2a_seg_attr_w_sites, by = c("site_id", "COMID"))
161163

162164
inputs_and_outputs <- inputs %>%
163-
left_join(p2a_do_and_metab, by=c("site_id", "date"))
165+
left_join(p2a_do_and_metab, by = c("site_id", "COMID", "date"))
164166

165-
# note that if the name of well_obs_io.zarr is changed below, this change must
166-
# also be made in 2a_model/src/Snakefile_base.smk (lines 32, 103, and 177) and
167-
# in 2a_model/src/visualize_models.smk (line 6).
168-
write_df_to_zarr(inputs_and_outputs, c("site_id", "date"), "2a_model/out/well_obs_io.zarr")
169-
},
170-
format="file"
167+
inputs_and_outputs
168+
}
171169
),
172170

171+
# Write trn and val input and output data to zarr. Note that if the name of
172+
# well_obs_io.zarr is changed below, this change must also be made in
173+
# 2a_model/src/Snakefile_base.smk (lines 32, 103, and 177) and in
174+
# 2a_model/src/visualize_models.smk (line 6).
175+
tar_target(
176+
p2a_well_obs_data_zarr,
177+
write_df_to_zarr(p2a_well_obs_data, c("site_id", "date"), "2a_model/out/well_obs_io.zarr"),
178+
format = "file"
179+
),
180+
181+
182+
## 5) GATHER MODEL IDS AND KICK OFF SNAKEMAKE WORKFLOW TO MAKE MODEL PREDICTIONS ##
173183
# gather model ids - add to this list when you want to reproduce
174184
# outputs from a new model
175185
tar_target(
@@ -199,7 +209,8 @@ p2a_targets_list <- list(
199209
tar_target(
200210
p2a_metrics_files,
201211
{
202-
#we need these to make the prepped data file
212+
# we need these to make the prepped data file, so force a dependency of this
213+
# target on p2a_well_obs_data.
203214
p2a_well_obs_data
204215

205216
base_dir <- "2a_model/src/models"

2a_model/src/model_ready_data_utils.R

Lines changed: 44 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,18 @@
1-
2-
match_site_ids_to_segs <-
3-
function(seg_data, sites_w_segs) {
4-
#'
5-
#' @description match site ids to segment data (e.g., met or attributes)
6-
#'
7-
#' @param seg_data a data frame of meterological data with either column
8-
#' seg_id_nat' or 'COMID'
9-
#' @param sites_w_segs a dataframe with both segment ids ('segidnat' or 'COMID')
10-
#' and site ids ('site_id')
11-
#'
12-
#' @value A data frame of seg data with site ids
1+
#' @title Match site ids to segment data
2+
#'
3+
#' @description
4+
#' Function to match site ids to segment data, including meteorological data
5+
#' or segment attribute data.
6+
#'
7+
#' @param seg_data a data frame of meteorological data with either column
8+
#' 'seg_id_nat' or 'COMID'.
9+
#' @param sites_w_segs a dataframe with both segment ids ('segidnat' or 'COMID')
10+
#' and site ids ('site_id').
11+
#'
12+
#' @returns
13+
#' Returns a data frame of seg data with site ids.
14+
#'
15+
match_site_ids_to_segs <- function(seg_data, sites_w_segs) {
1316

1417
if(any(grepl('COMID', names(seg_data)))){
1518
seg_data_out <- seg_data %>%
@@ -20,21 +23,27 @@ match_site_ids_to_segs <-
2023
} else {
2124
seg_data_out <- seg_data %>%
2225
left_join(sites_w_segs[,c("site_id","segidnat")],
23-
by=c("seg_id_nat" = "segidnat"))
26+
by = c("seg_id_nat" = "segidnat"))
2427
}
2528

2629
return(seg_data_out)
2730
}
2831

32+
33+
#' @title Write R data frame to zarr
34+
#'
35+
#' @description
36+
#' Function to use reticulate to write an R data frame to a Zarr data store,
37+
#' which is the file format river-dl currently takes.
38+
#'
39+
#' @param df a data frame of data
40+
#' @param index_cols vector of strings - the column(s) that should be the index
41+
#' @param out_zarr where the zarr data will be written
42+
#'
43+
#' @returns
44+
#' Returns the out_zarr path.
45+
#'
2946
write_df_to_zarr <- function(df, index_cols, out_zarr) {
30-
#'
31-
#' @description use reticulate to write an R data frame to a Zarr data store (the file format river-dl currently takes)
32-
#'
33-
#' @param df a data frame of data
34-
#' @param index vector of strings - the column(s) that should be the index
35-
#' @param out_zarr where the zarr data will be written
36-
#'
37-
#' @value the out_zarr path
3847

3948
# convert to a python (pandas) DataFrame so we have access to the object methods (set_index and to_xarray)
4049
py_df <- reticulate::r_to_py(df)
@@ -57,15 +66,21 @@ write_df_to_zarr <- function(df, index_cols, out_zarr) {
5766
}
5867

5968

69+
#' @title Optionally subset and write R data frame to zarr
70+
#'
71+
#' @description
72+
#' Function to write out to zarr and optionally take a subset. This assumes your
73+
#' zarr index names will be "site_id" and "date".
74+
#'
75+
#' @param df a data frame of data
76+
#' @param out_zarr where the zarr data will be written
77+
#' @param sites_subset character vector of sites to subset to (default NULL keeps all sites)
78+
#'
79+
#' @returns
80+
#' Returns the out_zarr path.
81+
#'
6082
subset_and_write_zarr <- function(df, out_zarr, sites_subset = NULL){
61-
#' @description write out to zarr and optionally take a subset. This assumes your zarr index
62-
#' names will be "site_id" and "date"
63-
#'
64-
#' @param df a data frame of data
65-
#' @param out_zarr where the zarr data will be written
66-
#' @param sites_subset - character vector of sites to subset to
67-
#'
68-
#' @value the out_zarr path
83+
6984
if (!is.null(sites_subset)){
7085
df <- df %>% filter(site_id %in% sites_subset)
7186
}

0 commit comments

Comments
 (0)