Skip to content

Commit 55772cf

Browse files
author
brynnz22
committed
add config checker and make example csv smaller
1 parent 59146aa commit 55772cf

2 files changed

Lines changed: 51 additions & 0 deletions

File tree

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
biosample_id,LC-MS filename,DMS Dataset ID,MyEMSL link,Sample Name,Sample Type,env_medium,habitat,ecosystem_category,img_identifiers,name,geo_loc_name,lat_lon: has_raw_value,latitude,longitude,env_local_scale,location,Ecosystem_type,ncbi_taxonomy_name,ecosystem,env_broad_scale,samp_taxon_id:has_raw_value,sample_collection_site,samp_name,ecosystem_subtype,type,description,collection_date:has_raw_value,NMDC Study ID,FTICR_extraction_method,samp_store_temp.has_raw_value,samp_size.has_raw_value,samp_collec_device,elev,size_frac.has_raw_value,air_temp_regm.has_raw_value,biosample_categories,depth_has_numeric_value,depth.has_minimum_numeric_value,depth.has_maximum_numeric_value,depth.has_unit,soil_horizon,light_regm.has_raw_value,soil_type.has_raw_value,instrument_used,eluent_intro,mass_spec_config,chrom_config_name
2+
nmdc:Biosamp1,Lybrand_GHG_01_C_30Aug19_Alder_Infuse_p05_1_01_47965,775133,https://status.my.emsl.pnl.gov/view/1839138,Lybrand_GHG_01_C,soil,ENVO:00001998,,,,Lybrand_GHG_01_C,"USA: Alaska, Healy",63.87980 -149.21539,63.8798,-149.21539,ENVO:01000861,,,,,ENVO:00000446,,HEAL,Lybrand_GHG_01_C,,nmdc:Biosample,Lybrand NEON (EMSL 50267),2016-04-01,nmdc:sty-11-db67n062,chloroform,-20 degree Celsius,,permafrost corer,670,not sieved,"No freeze-thaw|Following freeze-thaw incubation, soils were incubated at 20C for 14 days.",NEON,,16,30,cm,O horizon,Dark,,12T FT-ICR MS,liquid_chromatography,"EMSL NOM reduced profile mass spectrometry, negative",EMSL LC method for non-polar metabolites
3+
nmdc:Biosamp2,Lybrand_GHG_02_C_30Aug19_Alder_Infuse_p05_1_01_47966,775114,https://status.my.emsl.pnl.gov/view/1839103,Lybrand_GHG_02_C,soil,ENVO:00001998,,,,Lybrand_GHG_02_C,"USA: Alaska, Healy",63.87980 -149.21539,63.8798,-149.21539,ENVO:01000861,,,,,ENVO:00000446,,HEAL,Lybrand_GHG_02_C,,nmdc:Biosample,Lybrand NEON (EMSL 50267),2016-04-01,nmdc:sty-11-db67n062,chloroform,-20 degree Celsius,,permafrost corer,670,not sieved,"No freeze-thaw|Following freeze-thaw incubation, soils were incubated at 20C for 14 days.",NEON,,12,24,cm,O horizon,Dark,,12T FT-ICR MS,liquid_chromatography,"EMSL NOM reduced profile mass spectrometry, negative",EMSL LC method for non-polar metabolites
4+
,Lybrand_GHG_03_C_30Aug19_Alder_Infuse_p05_1_01_47967,775105,https://status.my.emsl.pnl.gov/view/1839083,Lybrand_GHG_03_C,soil,ENVO:00001998,,,,Lybrand_GHG_03_C,"USA: Alaska, Healy",63.87980 -149.21539,63.8798,-149.21539,ENVO:01000861,,,,,ENVO:00000446,,HEAL,Lybrand_GHG_03_C,,nmdc:Biosample,Lybrand NEON (EMSL 50267),2016-04-01,nmdc:sty-11-db67n062,chloroform,-20 degree Celsius,,permafrost corer,670,not sieved,"No freeze-thaw|Following freeze-thaw incubation, soils were incubated at 20C for 14 days.",NEON,,40,54,cm,B horizon,Dark,,12T FT-ICR MS,gas_chromatography,"EMSL NOM reduced profile mass spectrometry, negative",EMSL LC method for non-polar metabolites
5+
,Lybrand_GHG_09_C_30Aug19_Alder_Infuse_p05_1_01_47973,775109,https://status.my.emsl.pnl.gov/view/1839094,Lybrand_GHG_09_C,soil,ENVO:00001998,,,,Lybrand_GHG_09_C,"USA: Alaska, Healy",63.87980 -149.21539,63.8798,-149.21539,ENVO:01000861,,,,,ENVO:00000446,,HEAL,Lybrand_GHG_09_C,,nmdc:Biosample,Lybrand NEON (EMSL 50267),2016-04-01,nmdc:sty-11-db67n062,chloroform,-20 degree Celsius,,permafrost corer,670,not sieved,"Freeze-thaw|Following freeze-thaw incubation, soils were incubated at 20C for 14 days.",NEON,,40,54,cm,B horizon,Dark,,12T FT-ICR MS,liquid_chromatography,"EMSL NOM reduced profile mass spectrometry, negative",EMSL LC method for non-polar metabolites
6+
,Lybrand_GHG_1_W_27Aug19_Alder_Infuse_p2_000001,772705,https://status.my.emsl.pnl.gov/view/1834769,Lybrand_GHG_01_W,soil,ENVO:00001998,,,,Lybrand_GHG_01_W,"USA: Alaska, Healy",63.87980 -149.21539,63.8798,-149.21539,ENVO:01000861,,,,,ENVO:00000446,,HEAL,Lybrand_GHG_01_W,,nmdc:Biosample,Lybrand NEON (EMSL 50267),2016-04-01,nmdc:sty-11-db67n062,water,-20 degree Celsius,,permafrost corer,670,not sieved,"No freeze-thaw|Following freeze-thaw incubation, soils were incubated at 20C for 14 days.",NEON,,16,30,cm,O horizon,Dark,,12T FT-ICR MS,liquid_chromatography,"EMSL NOM reduced profile mass spectrometry method, negative",EMSL LC method for non-polar metabolites
7+
,Lybrand_GHG_11_C_30Aug19_Alder_Infuse_p05_1_01_47975,775026,https://status.my.emsl.pnl.gov/view/1838954,Lybrand_GHG_11_C,soil,ENVO:00001998,,,,Lybrand_GHG_11_C,"USA: Alaska, Healy",63.87980 -149.21539,63.8798,-149.21539,ENVO:01000861,,,,,ENVO:00000446,,HEAL,Lybrand_GHG_11_C,,nmdc:Biosample,Lybrand NEON (EMSL 50267),2016-04-01,nmdc:sty-11-db67n062,chloroform,-20 degree Celsius,,permafrost corer,670,not sieved,"Freeze-thaw|Following freeze-thaw incubation, soils were incubated at 20C for 14 days.",NEON,,12,24,cm,O horizon,Dark,,12T FT-ICR MS,liquid_chromatography,"EMSL NOM reduced profile mass spectrometry, negative",EMSL LC method for non-polar metabolites

support_code/nmdc/nom/metadata_parser.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from dataclasses import dataclass
55
from typing import Optional
66
from pathlib import Path
7+
from api_info_retriever import ApiInfoRetriever
78

89
@ dataclass
910
class BiosampleIncludedMetadata:
@@ -128,6 +129,8 @@ def load_metadata_file(self) -> pd.DataFrame:
128129
else:
129130
raise ValueError(f'Unsupported file extension: {metadata_file_path.suffix}')
130131

132+
# Check that all configs are valid
133+
self.check_for_valid_configs(metadata_df)
131134
# Check that the 'chrom_config_name' column value is present if eluent_intro is 'liquid_chromatography'
132135
self.check_chrom_config(metadata_df)
133136

@@ -170,6 +173,47 @@ def check_chrom_config(self, df: pd.DataFrame):
170173
if not invalid_no_config_rows.empty:
171174
raise ValueError(f"'chrom_config_name' should be empty for the following rows where 'eluent_intro' is 'direct_infusion_syringe' or 'direct_infusion_autosampler': {(invalid_no_config_rows.index+2).tolist()}")
172175

176+
def check_for_valid_configs(self, df: pd.DataFrame):
    """
    Verify that every configuration name used in the metadata exists in the API.

    Collects the unique, non-empty values of the 'chrom_config_name' and
    'mass_spec_config' columns and looks each one up by its "name" slot in
    the "configuration_set" collection.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame to analyze. Both config columns must be present;
        indexing a missing column raises KeyError.

    Raises
    ------
    ValueError
        If any config value doesn't exist in the API. The message lists
        every offending column/value pair, not just the first one.
    """
    # Instantiate configuration_set info retriever
    api_config_getter = ApiInfoRetriever(collection_name="configuration_set")

    config_columns = ['chrom_config_name', 'mass_spec_config']
    invalid_configs = []

    for col in config_columns:
        # unique() keeps the number of API lookups to one per distinct value
        for val in df[col].unique():
            # Skip missing (NaN/None) and empty values before touching the API
            if pd.isna(val) or not val:
                continue
            try:
                api_config_getter.get_id_by_slot_from_collection(
                    slot_name="name", slot_field_value=val)
            except ValueError:
                # NOTE(review): presumably the retriever raises ValueError
                # when no record matches the name -- confirm against
                # ApiInfoRetriever.
                invalid_configs.append((col, val))

    # If any invalid configs were found, raise a single error listing all of them
    if invalid_configs:
        error_msg = "The following configurations were not found in the API:\n"
        error_msg += "".join(
            f" Column '{col}': '{val}'\n" for col, val in invalid_configs)
        raise ValueError(error_msg)
216+
173217
def check_for_biosamples(self, row: pd.Series) -> bool:
174218
"""
175219
Check if the biosample_id is not None, NaN, or empty.

0 commit comments

Comments
 (0)