Skip to content

Commit 16c1994

Browse files
author
brynnz22
committed
add metabolomics skeleton functionality
1 parent 50cf2aa commit 16c1994

9 files changed

Lines changed: 668 additions & 234 deletions

File tree

support_code/nmdc/metabolomics/config.toml

Whitespace-only changes.
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
from metab_metadata_generator import MetabolomicsMetadataGenerator
2+
import toml
3+
from pathlib import Path
4+
5+
6+
def main(config_file: str = 'support_code/nmdc/metabolomics/config.toml') -> None:
    """
    Run the MetabolomicsMetadataGenerator using the configuration in a TOML file.

    Parameters
    ----------
    config_file : str, optional
        Path to the TOML configuration file. Defaults to the metabolomics
        configuration that ships next to this script
        ('support_code/nmdc/metabolomics/config.toml'), resolved relative to
        the current working directory.

    The TOML configuration file must include the following fields (each one is
    read with a plain key lookup, so a missing field raises ``KeyError``):

    - metadata_file: str
        The path to the .csv, .xlsx, or .xls file where the biosample metadata is stored (include file extension).
        Example: "metadata_file.csv"
    - data_dir: str
        The directory path where the data lives or will be stored.
        Example: "/path/to/data/"
    - ref_calibration_path: str
        The path to the .ref file where the reference calibration data is stored (include .ref).
        Example: "/path/to/calibration_file.ref"
    - raw_data_object_type: str
        The raw data object type. Must match one of the following options:
            - 'Direct Infusion FT ICR-MS Raw Data'
            - 'LC-DDA-MS/MS Raw Data'
    - processed_data_object_type: str
        The processed data object type. Must match one of the following options:
            - 'FT ICR-MS Analysis Results'
            - 'GC-MS Metabolomics Results'
    - registration_file: str
        The desired name of the output file where the data will be dumped (include .json).
        Example: "output_file.json"
    - execution_resource: str
        The execution resource used for the analysis. Must match one of the following options:
            - 'EMSL-RZR'
            - 'EMSL'
    - field_strength: int
        The field strength for the NOM analysis. Must match one of the following values:
            - 7
            - 12
            - 15
            - 21
    - workflow_version
        Passed through unchanged to the generator as ``workflow_version``.
        NOTE(review): expected type/format is not visible from this script -- confirm
        against MetabolomicsMetadataGenerator.
    - config_path: str
        The path to the NMDC minting client and Bioportal API key configuration yaml file
        (for example 'enviroMS/nmdc_metadata_generation/config.yaml'). This script requires
        the key to be present in the TOML file; it does not supply a default itself.
        Should have client_id and client_secret for the NMDC minting client. And api_key for BioPortal API access. To get a BioPortal api_key register here:
        https://bioportal.bioontology.org/accounts/new and go to settings

    Raises
    ------
    KeyError
        If any of the fields listed above is missing from the TOML file.

    Notes
    -----
    This function assumes:
    - If a biosample_id does not exist for the sample in the metadata_file, a biosample will be generated. On the other hand,
      if a biosample_id does exist in the spreadsheet, no biosample will be generated. And the biosample_id present will be used to
      generate all other metadata.
    - The metadata_file conforms to a predefined structure. See example spreadsheet: https://docs.google.com/spreadsheets/d/1-xHGkkG5Gpw5Pen1iM_JUP2XmphuF19ps2ZYONvtDUs/edit?gid=1112301083#gid=1112301083
    - Necessary configuration and calibrations are already added to MongoDB. If new configurations are needed, they must be added beforehand.
    """

    # Load arguments from TOML file.
    # The original hard-coded the NOM configuration path here; this script
    # drives the metabolomics generator, so it reads the metabolomics config.
    config_data = toml.load(config_file)

    generator = MetabolomicsMetadataGenerator(
        metadata_file=config_data['metadata_file'],
        # Directory and calibration paths are handed over as Path objects.
        data_dir=Path(config_data['data_dir']),
        ref_calibration_path=Path(config_data['ref_calibration_path']),
        raw_data_object_type=config_data['raw_data_object_type'],
        processed_data_object_type=config_data['processed_data_object_type'],
        # The TOML key is 'registration_file'; the generator calls it
        # 'database_dump_json_path'.
        database_dump_json_path=config_data['registration_file'],
        execution_resource=config_data['execution_resource'],
        field_strength=config_data['field_strength'],
        workflow_version=config_data['workflow_version'],
        config_path=config_data['config_path'],
    )

    generator.run()
75+
76+
if __name__ == "__main__":
77+
main()

support_code/nmdc/metabolomics/metab_metadata_generator.py

Lines changed: 313 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
biosample_id,LC-MS filename,DMS Dataset ID,MyEMSL link,Sample Name,Sample Type,env_medium,habitat,ecosystem_category,img_identifiers,name,geo_loc_name,lat_lon: has_raw_value,latitude,longitude,env_local_scale,location,Ecosystem_type,ncbi_taxonomy_name,ecosystem,env_broad_scale,samp_taxon_id:has_raw_value,sample_collection_site,samp_name,ecosystem_subtype,type,description,collection_date:has_raw_value,NMDC Study ID,FTICR_extraction_method,samp_store_temp.has_raw_value,samp_size.has_raw_value,samp_collec_device,elev,size_frac.has_raw_value,air_temp_regm.has_raw_value,biosample_categories,depth_has_numeric_value,depth.has_minimum_numeric_value,depth.has_maximum_numeric_value,depth.has_unit,soil_horizon,light_regm.has_raw_value,soil_type.has_raw_value,instrument_used,eluent_intro,mass_spec_config,chrom_config_name
2+
biosamp:1,Lybrand_GHG_01_C_30Aug19_Alder_Infuse_p05_1_01_47965,775133,https://status.my.emsl.pnl.gov/view/1839138,Lybrand_GHG_01_C,soil,ENVO:00001998,,,,Lybrand_GHG_01_C,"USA: Alaska, Healy",63.87980 -149.21539,63.8798,-149.21539,ENVO:01000861,,,,,ENVO:00000446,,HEAL,Lybrand_GHG_01_C,,nmdc:Biosample,Lybrand NEON (EMSL 50267),2016-04-01,nmdc:sty-11-db67n062,chloroform,-20 degree Celsius,,permafrost corer,670,not sieved,"No freeze-thaw|Following freeze-thaw incubation, soils were incubated at 20C for 14 days.",NEON,,16,30,cm,O horizon,Dark,,12T FT-ICR MS,liquid_chromatography,"EMSL NOM reduced profile mass spectrometry, negative",EMSL LC method for non-polar metabolites
3+
,Lybrand_GHG_02_C_30Aug19_Alder_Infuse_p05_1_01_47966,775114,https://status.my.emsl.pnl.gov/view/1839103,Lybrand_GHG_02_C,soil,ENVO:00001998,,,,Lybrand_GHG_02_C,"USA: Alaska, Healy",63.87980 -149.21539,63.8798,-149.21539,ENVO:01000861,,,,,ENVO:00000446,,HEAL,Lybrand_GHG_02_C,,nmdc:Biosample,Lybrand NEON (EMSL 50267),2016-04-01,nmdc:sty-11-db67n062,chloroform,-20 degree Celsius,,permafrost corer,670,not sieved,"No freeze-thaw|Following freeze-thaw incubation, soils were incubated at 20C for 14 days.",NEON,,12,24,cm,O horizon,Dark,,12T FT-ICR MS,liquid_chromatography,"EMSL NOM reduced profile mass spectrometry, negative",EMSL LC method for non-polar metabolites
4+
,Lybrand_GHG_03_C_30Aug19_Alder_Infuse_p05_1_01_47967,775105,https://status.my.emsl.pnl.gov/view/1839083,Lybrand_GHG_03_C,soil,ENVO:00001998,,,,Lybrand_GHG_03_C,"USA: Alaska, Healy",63.87980 -149.21539,63.8798,-149.21539,ENVO:01000861,,,,,ENVO:00000446,,HEAL,Lybrand_GHG_03_C,,nmdc:Biosample,Lybrand NEON (EMSL 50267),2016-04-01,nmdc:sty-11-db67n062,chloroform,-20 degree Celsius,,permafrost corer,670,not sieved,"No freeze-thaw|Following freeze-thaw incubation, soils were incubated at 20C for 14 days.",NEON,,40,54,cm,B horizon,Dark,,12T FT-ICR MS,gas_chromatography,"EMSL NOM reduced profile mass spectrometry, negative",EMSL LC method for non-polar metabolites
5+
,Lybrand_GHG_09_C_30Aug19_Alder_Infuse_p05_1_01_47973,775109,https://status.my.emsl.pnl.gov/view/1839094,Lybrand_GHG_09_C,soil,ENVO:00001998,,,,Lybrand_GHG_09_C,"USA: Alaska, Healy",63.87980 -149.21539,63.8798,-149.21539,ENVO:01000861,,,,,ENVO:00000446,,HEAL,Lybrand_GHG_09_C,,nmdc:Biosample,Lybrand NEON (EMSL 50267),2016-04-01,nmdc:sty-11-db67n062,chloroform,-20 degree Celsius,,permafrost corer,670,not sieved,"Freeze-thaw|Following freeze-thaw incubation, soils were incubated at 20C for 14 days.",NEON,,40,54,cm,B horizon,Dark,,12T FT-ICR MS,liquid_chromatography,"EMSL NOM reduced profile mass spectrometry, negative",EMSL LC method for non-polar metabolites
6+
,Lybrand_GHG_1_W_27Aug19_Alder_Infuse_p2_000001,772705,https://status.my.emsl.pnl.gov/view/1834769,Lybrand_GHG_01_W,soil,ENVO:00001998,,,,Lybrand_GHG_01_W,"USA: Alaska, Healy",63.87980 -149.21539,63.8798,-149.21539,ENVO:01000861,,,,,ENVO:00000446,,HEAL,Lybrand_GHG_01_W,,nmdc:Biosample,Lybrand NEON (EMSL 50267),2016-04-01,nmdc:sty-11-db67n062,water,-20 degree Celsius,,permafrost corer,670,not sieved,"No freeze-thaw|Following freeze-thaw incubation, soils were incubated at 20C for 14 days.",NEON,,16,30,cm,O horizon,Dark,,12T FT-ICR MS,liquid_chromatography,"EMSL NOM reduced profile mass spectrometry, negative",EMSL LC method for non-polar metabolites
7+
,Lybrand_GHG_11_C_30Aug19_Alder_Infuse_p05_1_01_47975,775026,https://status.my.emsl.pnl.gov/view/1838954,Lybrand_GHG_11_C,soil,ENVO:00001998,,,,Lybrand_GHG_11_C,"USA: Alaska, Healy",63.87980 -149.21539,63.8798,-149.21539,ENVO:01000861,,,,,ENVO:00000446,,HEAL,Lybrand_GHG_11_C,,nmdc:Biosample,Lybrand NEON (EMSL 50267),2016-04-01,nmdc:sty-11-db67n062,chloroform,-20 degree Celsius,,permafrost corer,670,not sieved,"Freeze-thaw|Following freeze-thaw incubation, soils were incubated at 20C for 14 days.",NEON,,12,24,cm,O horizon,Dark,,12T FT-ICR MS,liquid_chromatography,"EMSL NOM reduced profile mass spectrometry, negative",EMSL LC method for non-polar metabolites

support_code/nmdc/nom/main.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from metadata_generator import MetadataGenerator
1+
from nom_metadata_generator import NOMMetadataGenerator
22
import toml
33
from pathlib import Path
44

@@ -58,7 +58,7 @@ def main():
5858
# Load arguments from TOML file
5959
config_data = toml.load('support_code/nmdc/nom/config.toml')
6060

61-
generator = MetadataGenerator(
61+
generator = NOMMetadataGenerator(
6262
metadata_file=config_data['metadata_file'],
6363
data_dir=Path(config_data['data_dir']),
6464
ref_calibration_path=Path(config_data['ref_calibration_path']),

support_code/nmdc/nom/metadata_file_example.csv

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
biosample_id,LC-MS filename,DMS Dataset ID,MyEMSL link,Sample Name,Sample Type,env_medium,habitat,ecosystem_category,img_identifiers,name,geo_loc_name,lat_lon: has_raw_value,latitude,longitude,env_local_scale,location,Ecosystem_type,ncbi_taxonomy_name,ecosystem,env_broad_scale,samp_taxon_id:has_raw_value,sample_collection_site,samp_name,ecosystem_subtype,type,description,collection_date:has_raw_value,NMDC Study ID,FTICR_extraction_method,samp_store_temp.has_raw_value,samp_size.has_raw_value,samp_collec_device,elev,size_frac.has_raw_value,air_temp_regm.has_raw_value,biosample_categories,depth_has_numeric_value,depth.has_minimum_numeric_value,depth.has_maximum_numeric_value,depth.has_unit,soil_horizon,light_regm.has_raw_value,soil_type.has_raw_value,instrument_used,eluent_intro,mass_spec_config,chrom_config_name
2-
nmdc:Biosamp1,Lybrand_GHG_01_C_30Aug19_Alder_Infuse_p05_1_01_47965,775133,https://status.my.emsl.pnl.gov/view/1839138,Lybrand_GHG_01_C,soil,ENVO:00001998,,,,Lybrand_GHG_01_C,"USA: Alaska, Healy",63.87980 -149.21539,63.8798,-149.21539,ENVO:01000861,,,,,ENVO:00000446,,HEAL,Lybrand_GHG_01_C,,nmdc:Biosample,Lybrand NEON (EMSL 50267),2016-04-01,nmdc:sty-11-db67n062,chloroform,-20 degree Celsius,,permafrost corer,670,not sieved,"No freeze-thaw|Following freeze-thaw incubation, soils were incubated at 20C for 14 days.",NEON,,16,30,cm,O horizon,Dark,,12T FT-ICR MS,liquid_chromatography,"EMSL NOM reduced profile mass spectrometry, negative",EMSL LC method for non-polar metabolites
3-
nmdc:Biosamp2,Lybrand_GHG_02_C_30Aug19_Alder_Infuse_p05_1_01_47966,775114,https://status.my.emsl.pnl.gov/view/1839103,Lybrand_GHG_02_C,soil,ENVO:00001998,,,,Lybrand_GHG_02_C,"USA: Alaska, Healy",63.87980 -149.21539,63.8798,-149.21539,ENVO:01000861,,,,,ENVO:00000446,,HEAL,Lybrand_GHG_02_C,,nmdc:Biosample,Lybrand NEON (EMSL 50267),2016-04-01,nmdc:sty-11-db67n062,chloroform,-20 degree Celsius,,permafrost corer,670,not sieved,"No freeze-thaw|Following freeze-thaw incubation, soils were incubated at 20C for 14 days.",NEON,,12,24,cm,O horizon,Dark,,12T FT-ICR MS,liquid_chromatography,"EMSL NOM reduced profile mass spectrometry, negative",EMSL LC method for non-polar metabolites
2+
biosamp:1,Lybrand_GHG_01_C_30Aug19_Alder_Infuse_p05_1_01_47965,775133,https://status.my.emsl.pnl.gov/view/1839138,Lybrand_GHG_01_C,soil,ENVO:00001998,,,,Lybrand_GHG_01_C,"USA: Alaska, Healy",63.87980 -149.21539,63.8798,-149.21539,ENVO:01000861,,,,,ENVO:00000446,,HEAL,Lybrand_GHG_01_C,,nmdc:Biosample,Lybrand NEON (EMSL 50267),2016-04-01,nmdc:sty-11-db67n062,chloroform,-20 degree Celsius,,permafrost corer,670,not sieved,"No freeze-thaw|Following freeze-thaw incubation, soils were incubated at 20C for 14 days.",NEON,,16,30,cm,O horizon,Dark,,12T FT-ICR MS,liquid_chromatography,"EMSL NOM reduced profile mass spectrometry, negative",EMSL LC method for non-polar metabolites
3+
,Lybrand_GHG_02_C_30Aug19_Alder_Infuse_p05_1_01_47966,775114,https://status.my.emsl.pnl.gov/view/1839103,Lybrand_GHG_02_C,soil,ENVO:00001998,,,,Lybrand_GHG_02_C,"USA: Alaska, Healy",63.87980 -149.21539,63.8798,-149.21539,ENVO:01000861,,,,,ENVO:00000446,,HEAL,Lybrand_GHG_02_C,,nmdc:Biosample,Lybrand NEON (EMSL 50267),2016-04-01,nmdc:sty-11-db67n062,chloroform,-20 degree Celsius,,permafrost corer,670,not sieved,"No freeze-thaw|Following freeze-thaw incubation, soils were incubated at 20C for 14 days.",NEON,,12,24,cm,O horizon,Dark,,12T FT-ICR MS,liquid_chromatography,"EMSL NOM reduced profile mass spectrometry, negative",EMSL LC method for non-polar metabolites
44
,Lybrand_GHG_03_C_30Aug19_Alder_Infuse_p05_1_01_47967,775105,https://status.my.emsl.pnl.gov/view/1839083,Lybrand_GHG_03_C,soil,ENVO:00001998,,,,Lybrand_GHG_03_C,"USA: Alaska, Healy",63.87980 -149.21539,63.8798,-149.21539,ENVO:01000861,,,,,ENVO:00000446,,HEAL,Lybrand_GHG_03_C,,nmdc:Biosample,Lybrand NEON (EMSL 50267),2016-04-01,nmdc:sty-11-db67n062,chloroform,-20 degree Celsius,,permafrost corer,670,not sieved,"No freeze-thaw|Following freeze-thaw incubation, soils were incubated at 20C for 14 days.",NEON,,40,54,cm,B horizon,Dark,,12T FT-ICR MS,gas_chromatography,"EMSL NOM reduced profile mass spectrometry, negative",EMSL LC method for non-polar metabolites
55
,Lybrand_GHG_09_C_30Aug19_Alder_Infuse_p05_1_01_47973,775109,https://status.my.emsl.pnl.gov/view/1839094,Lybrand_GHG_09_C,soil,ENVO:00001998,,,,Lybrand_GHG_09_C,"USA: Alaska, Healy",63.87980 -149.21539,63.8798,-149.21539,ENVO:01000861,,,,,ENVO:00000446,,HEAL,Lybrand_GHG_09_C,,nmdc:Biosample,Lybrand NEON (EMSL 50267),2016-04-01,nmdc:sty-11-db67n062,chloroform,-20 degree Celsius,,permafrost corer,670,not sieved,"Freeze-thaw|Following freeze-thaw incubation, soils were incubated at 20C for 14 days.",NEON,,40,54,cm,B horizon,Dark,,12T FT-ICR MS,liquid_chromatography,"EMSL NOM reduced profile mass spectrometry, negative",EMSL LC method for non-polar metabolites
66
,Lybrand_GHG_1_W_27Aug19_Alder_Infuse_p2_000001,772705,https://status.my.emsl.pnl.gov/view/1834769,Lybrand_GHG_01_W,soil,ENVO:00001998,,,,Lybrand_GHG_01_W,"USA: Alaska, Healy",63.87980 -149.21539,63.8798,-149.21539,ENVO:01000861,,,,,ENVO:00000446,,HEAL,Lybrand_GHG_01_W,,nmdc:Biosample,Lybrand NEON (EMSL 50267),2016-04-01,nmdc:sty-11-db67n062,water,-20 degree Celsius,,permafrost corer,670,not sieved,"No freeze-thaw|Following freeze-thaw incubation, soils were incubated at 20C for 14 days.",NEON,,16,30,cm,O horizon,Dark,,12T FT-ICR MS,liquid_chromatography,"EMSL NOM reduced profile mass spectrometry, negative",EMSL LC method for non-polar metabolites

0 commit comments

Comments
 (0)