Skip to content

Commit eb62be5

Browse files
committed
convert checked=NA to 0 and enforce access_level==4 filter
1 parent eadd8d1 commit eb62be5

17 files changed

Lines changed: 22 additions & 30 deletions

data-raw/make-data.R

Lines changed: 22 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,6 @@ log_info("Building betydata package data objects...")
1717

1818
# Create output directories
1919
dir.create("data", showWarnings = FALSE)
20-
dir.create("inst/extdata/parquet", showWarnings = FALSE, recursive = TRUE)
2120

2221
# Column type specifications for stable parsing
2322
traitsview_cols <- cols(
@@ -70,12 +69,28 @@ traitsview <- read_csv(
7069
na = c("", "NA")
7170
)
7271

73-
# Filter out checked = -1
74-
traitsview <- traitsview[is.na(traitsview$checked) | traitsview$checked != -1, ]
72+
# Summarize access_level before filtering -- flag non-public records (loop by position so the NA level, whose name is NA, is logged too)
73+
access_summary <- table(traitsview$access_level, useNA = "ifany")
74+
log_info("access_level distribution:")
75+
for (i in seq_along(access_summary)) {
76+
  log_info(sprintf("access_level = %s: %d records", names(access_summary)[i], access_summary[[i]]))
77+
}
78+
79+
# Keep only public records (access_level == 4); a missing access_level counts as non-public
80+
non_public <- sum(is.na(traitsview$access_level) | traitsview$access_level != 4)
81+
if (non_public > 0) {
82+
  log_info(sprintf("Removing %d non-public records (access_level != 4 or NA)", non_public))
83+
  traitsview <- traitsview[traitsview$access_level %in% 4, ]
84+
}
7585

76-
# Drop access_level column (all records are public, access_level = 4)
86+
# Drop access_level column (all remaining records are public)
7787
traitsview$access_level <- NULL
7888

89+
# Treat unchecked records (checked = NA) as checked = 0, then keep only rows with checked >= 0 (drops failed-QC rows such as checked = -1)
90+
traitsview <- traitsview |>
91+
dplyr::mutate(checked = ifelse(is.na(checked), 0L, checked)) |>
92+
dplyr::filter(checked >= 0)
93+
7994
# Reorder columns: key analytical columns first, IDs and metadata last
8095
col_order <- c(
8196
"trait", "mean", "units", "scientificname", "genus",
@@ -141,30 +156,6 @@ if (!is.null(pfts_priors)) usethis::use_data(pfts_priors, overwrite = TRUE, comp
141156
if (!is.null(managements_treatments)) usethis::use_data(managements_treatments, overwrite = TRUE, compress = "xz")
142157
if (!is.null(cultivars_pfts)) usethis::use_data(cultivars_pfts, overwrite = TRUE, compress = "xz")
143158

144-
145-
log_info("Saving Parquet files to inst/extdata/parquet/...")
146-
if (requireNamespace("arrow", quietly = TRUE)) {
147-
arrow::write_parquet(traitsview, "inst/extdata/parquet/traitsview.parquet")
148-
if (!is.null(species)) arrow::write_parquet(species, "inst/extdata/parquet/species.parquet")
149-
if (!is.null(sites)) arrow::write_parquet(sites, "inst/extdata/parquet/sites.parquet")
150-
if (!is.null(variables)) arrow::write_parquet(variables, "inst/extdata/parquet/variables.parquet")
151-
if (!is.null(citations)) arrow::write_parquet(citations, "inst/extdata/parquet/citations.parquet")
152-
if (!is.null(cultivars)) arrow::write_parquet(cultivars, "inst/extdata/parquet/cultivars.parquet")
153-
if (!is.null(methods)) arrow::write_parquet(methods, "inst/extdata/parquet/methods.parquet")
154-
if (!is.null(treatments)) arrow::write_parquet(treatments, "inst/extdata/parquet/treatments.parquet")
155-
if (!is.null(pfts)) arrow::write_parquet(pfts, "inst/extdata/parquet/pfts.parquet")
156-
if (!is.null(priors)) arrow::write_parquet(priors, "inst/extdata/parquet/priors.parquet")
157-
if (!is.null(managements)) arrow::write_parquet(managements, "inst/extdata/parquet/managements.parquet")
158-
if (!is.null(entities)) arrow::write_parquet(entities, "inst/extdata/parquet/entities.parquet")
159-
if (!is.null(pfts_species)) arrow::write_parquet(pfts_species, "inst/extdata/parquet/pfts_species.parquet")
160-
if (!is.null(pfts_priors)) arrow::write_parquet(pfts_priors, "inst/extdata/parquet/pfts_priors.parquet")
161-
if (!is.null(managements_treatments)) arrow::write_parquet(managements_treatments, "inst/extdata/parquet/managements_treatments.parquet")
162-
if (!is.null(cultivars_pfts)) arrow::write_parquet(cultivars_pfts, "inst/extdata/parquet/cultivars_pfts.parquet")
163-
} else {
164-
log_info("arrow package not available, skipping Parquet export")
165-
}
166-
167-
168159
# --- Generate datapackage.json ---
169160
log_info("Generating inst/metadata/datapackage.json...")
170161
dir.create("inst/metadata", showWarnings = FALSE, recursive = TRUE)
@@ -197,8 +188,9 @@ resources <- lapply(datasets, function(nm) {
197188
df <- get(nm)
198189
base <- list(
199190
name = nm,
200-
path = paste0("data/", nm, ".rda"),
201-
format = "rda"
191+
path = paste0("data-raw/csv/", nm, ".csv"),
192+
format = "csv",
193+
mediatype = "text/csv"
202194
)
203195
if (nm == "traitsview") {
204196
base$title <- "Traits and Yields View"
-172 KB
Binary file not shown.
-24.2 KB
Binary file not shown.
-5.75 KB
Binary file not shown.
-79.8 KB
Binary file not shown.
-117 KB
Binary file not shown.
-149 KB
Binary file not shown.
-9.93 KB
Binary file not shown.

inst/extdata/parquet/pfts.parquet

-29.4 KB
Binary file not shown.
-110 KB
Binary file not shown.

0 commit comments

Comments
 (0)