Skip to content

Commit 48ca1f7

Browse files
committed
Merge branch 'update_nom_metadata_gen' into 'master'
Update nom metadata gen See merge request mass-spectrometry/corems!145
2 parents b453b3d + 86aac16 commit 48ca1f7

7 files changed

Lines changed: 420 additions & 113 deletions

File tree

support_code/nmdc/nom/api_info_retriever.py

Lines changed: 75 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,16 @@
11
import requests
22
from typing import Optional
3+
import yaml
34

45
# TODO: Update og_url to be regular nmdc api url when berkeley is implemented
56

6-
class ApiInfoRetriever:
7+
class NmdcApiInfoRetriever:
78
def __init__(self, collection_name: str):
89
self.collection_name = collection_name
910

1011
def get_id_by_slot_from_collection(self, slot_name: str, slot_field_value: str):
1112
"""
12-
Retrieve the identifier from a specified collection based on a slot name and field value.
13+
Retrieve the NMDC identifier from a specified collection based on a slot name and field value.
1314
1415
Parameters
1516
----------
@@ -34,7 +35,7 @@ def get_id_by_slot_from_collection(self, slot_name: str, slot_field_value: str):
3435
filter = f'{{"{slot_name}": "{slot_field_value}"}}'
3536
field = "id"
3637

37-
og_url = f'https://api-berkeley.microbiomedata.org/nmdcschema/{self.collection_name}?&filter={filter}&projection={field}'
38+
og_url = f'https://api.microbiomedata.org/nmdcschema/{self.collection_name}?&filter={filter}&projection={field}'
3839
resp = requests.get(og_url)
3940

4041
# Check if the response status is 200
@@ -45,9 +46,79 @@ def get_id_by_slot_from_collection(self, slot_name: str, slot_field_value: str):
4546

4647
# Ensure there is at least one resource in the response
4748
if not data['resources']:
48-
raise ValueError(f"No resources found for {slot_name} with value {slot_field_value}")
49+
raise ValueError(f"No resources in Mongo found for '{slot_name}' slot in {self.collection_name} with value {slot_field_value}")
4950

5051
identifier = data['resources'][0]['id']
5152

5253
return identifier
5354

55+
class BioOntologyInfoRetriever:
    """
    Client for retrieving ENVO term information from BioPortal API.

    A class to handle authentication and retrieval of Environmental Ontology (ENVO)
    terms using the BioPortal REST API service.

    Parameters
    ----------
    config_path : str
        Path to YAML configuration file containing BioPortal API credentials

    Notes
    -----
    The configuration file should contain an 'api_key' field with a valid
    BioPortal API key. The file is read on every call to ``get_envo_terms``;
    nothing is cached on the instance besides the path.

    Examples
    --------
    >>> retriever = BioOntologyInfoRetriever('config.yaml')
    >>> envo_terms = retriever.get_envo_terms('ENVO:00002042')
    >>> print(envo_terms)
    {'ENVO:00002042': 'surface water'}
    """

    # Upper bound, in seconds, on how long a single BioPortal request may
    # block. Without a timeout, requests waits indefinitely on a stalled
    # connection.
    REQUEST_TIMEOUT_SECONDS = 30

    def __init__(self, config_path: str):
        # Only the path is stored; the file itself is opened in get_envo_terms.
        self.config = config_path

    def get_envo_terms(self, envo_id: str) -> dict:
        """
        Look up an ENVO term label using BioPortal API.

        Parameters
        ----------
        envo_id : str
            The ENVO identifier to look up (e.g., 'ENVO:00002042')

        Returns
        -------
        dict
            Dictionary with envo_id as key and term label as value
            Example: {'ENVO:00002042': 'surface water'}

        Raises
        ------
        requests.exceptions.RequestException
            If the API request fails, times out, or returns an error status
        KeyError
            If the config file has no 'api_key' entry, or the response
            doesn't contain a 'prefLabel' field
        yaml.YAMLError
            If the config file cannot be parsed
        FileNotFoundError
            If the config file is not found

        Notes
        -----
        Makes an authenticated request to BioPortal API to retrieve the
        preferred label (prefLabel) for the given ENVO term.
        """
        # Use a context manager so the config file handle is closed
        # deterministically instead of being left to the garbage collector.
        with open(self.config) as config_file:
            config = yaml.safe_load(config_file)
        api_key = config['api_key']

        # NOTE(review): this sends the API key over plain http; confirm whether
        # the https endpoint can be used instead.
        url = f"http://data.bioontology.org/ontologies/ENVO/classes/{envo_id}"
        headers = {"Authorization": f"apikey token={api_key}"}

        response = requests.get(
            url,
            headers=headers,
            timeout=self.REQUEST_TIMEOUT_SECONDS,
        )
        # Surface HTTP 4xx/5xx responses as exceptions rather than parsing
        # an error payload.
        response.raise_for_status()

        data = response.json()
        return {envo_id: data['prefLabel']}

support_code/nmdc/nom/config.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,4 @@ registration_file = "path/to/where/database/dump/lives.json"
77
execution_resource = "EMSL-RZR"
88
field_strength = 7
99
workflow_version = "2.2.3"
10-
minting_client_config_path = "enviroMS/nmdc_metadata_generation/config.yaml"
10+
config_path = "enviroMS/nmdc_metadata_generation/config.yaml"

support_code/nmdc/nom/main.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import toml
33
from pathlib import Path
44

5+
56
def main():
67
"""
78
Runs the MetadataGenerator using the configuration provided in a TOML file.
@@ -38,8 +39,10 @@ def main():
3839
- 12
3940
- 15
4041
- 21
41-
- minting_client_config_path: str
42-
The path to the NMDC minting client configuration file. Defaults to 'enviroMS/nmdc_metadata_generation/config.yaml'.
42+
- config_path: str
43+
The path to the YAML configuration file holding the NMDC minting client credentials and the BioPortal API key. Defaults to 'enviroMS/nmdc_metadata_generation/config.yaml'.
44+
Should contain client_id and client_secret for the NMDC minting client, and api_key for BioPortal API access. To get a BioPortal api_key, register here:
45+
https://bioportal.bioontology.org/accounts/new and go to settings
4346
4447
Notes
4548
-----
@@ -53,7 +56,7 @@ def main():
5356

5457

5558
# Load arguments from TOML file
56-
config_data = toml.load('enviroMS/nmdc_metadata_generation/config.toml')
59+
config_data = toml.load('support_code/nmdc/nom/config.toml')
5760

5861
generator = MetadataGenerator(
5962
metadata_file=config_data['metadata_file'],
@@ -65,7 +68,7 @@ def main():
6568
execution_resource=config_data['execution_resource'],
6669
field_strength=config_data['field_strength'],
6770
workflow_version=config_data['workflow_version'],
68-
minting_client_config_path=config_data['minting_client_config_path']
71+
config_path=config_data['config_path']
6972
)
7073

7174
generator.run()
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
biosample_id,LC-MS filename,DMS Dataset ID,MyEMSL link,Sample Name,Sample Type,env_medium,habitat,ecosystem_category,img_identifiers,name,geo_loc_name,lat_lon: has_raw_value,latitude,longitude,env_local_scale,location,Ecosystem_type,ncbi_taxonomy_name,ecosystem,env_broad_scale,samp_taxon_id:has_raw_value,sample_collection_site,samp_name,ecosystem_subtype,type,description,collection_date:has_raw_value,NMDC Study ID,FTICR_extraction_method,samp_store_temp.has_raw_value,samp_size.has_raw_value,samp_collec_device,elev,size_frac.has_raw_value,air_temp_regm.has_raw_value,biosample_categories,depth_has_numeric_value,depth.has_minimum_numeric_value,depth.has_maximum_numeric_value,depth.has_unit,soil_horizon,light_regm.has_raw_value,soil_type.has_raw_value,instrument_used,eluent_intro,mass_spec_config,chrom_config_name
2+
nmdc:Biosamp1,Lybrand_GHG_01_C_30Aug19_Alder_Infuse_p05_1_01_47965,775133,https://status.my.emsl.pnl.gov/view/1839138,Lybrand_GHG_01_C,soil,ENVO:00001998,,,,Lybrand_GHG_01_C,"USA: Alaska, Healy",63.87980 -149.21539,63.8798,-149.21539,ENVO:01000861,,,,,ENVO:00000446,,HEAL,Lybrand_GHG_01_C,,nmdc:Biosample,Lybrand NEON (EMSL 50267),2016-04-01,nmdc:sty-11-db67n062,chloroform,-20 degree Celsius,,permafrost corer,670,not sieved,"No freeze-thaw|Following freeze-thaw incubation, soils were incubated at 20C for 14 days.",NEON,,16,30,cm,O horizon,Dark,,12T FT-ICR MS,liquid_chromatography,"EMSL NOM reduced profile mass spectrometry, negative",EMSL LC method for non-polar metabolites
3+
nmdc:Biosamp2,Lybrand_GHG_02_C_30Aug19_Alder_Infuse_p05_1_01_47966,775114,https://status.my.emsl.pnl.gov/view/1839103,Lybrand_GHG_02_C,soil,ENVO:00001998,,,,Lybrand_GHG_02_C,"USA: Alaska, Healy",63.87980 -149.21539,63.8798,-149.21539,ENVO:01000861,,,,,ENVO:00000446,,HEAL,Lybrand_GHG_02_C,,nmdc:Biosample,Lybrand NEON (EMSL 50267),2016-04-01,nmdc:sty-11-db67n062,chloroform,-20 degree Celsius,,permafrost corer,670,not sieved,"No freeze-thaw|Following freeze-thaw incubation, soils were incubated at 20C for 14 days.",NEON,,12,24,cm,O horizon,Dark,,12T FT-ICR MS,liquid_chromatography,"EMSL NOM reduced profile mass spectrometry, negative",EMSL LC method for non-polar metabolites
4+
,Lybrand_GHG_03_C_30Aug19_Alder_Infuse_p05_1_01_47967,775105,https://status.my.emsl.pnl.gov/view/1839083,Lybrand_GHG_03_C,soil,ENVO:00001998,,,,Lybrand_GHG_03_C,"USA: Alaska, Healy",63.87980 -149.21539,63.8798,-149.21539,ENVO:01000861,,,,,ENVO:00000446,,HEAL,Lybrand_GHG_03_C,,nmdc:Biosample,Lybrand NEON (EMSL 50267),2016-04-01,nmdc:sty-11-db67n062,chloroform,-20 degree Celsius,,permafrost corer,670,not sieved,"No freeze-thaw|Following freeze-thaw incubation, soils were incubated at 20C for 14 days.",NEON,,40,54,cm,B horizon,Dark,,12T FT-ICR MS,gas_chromatography,"EMSL NOM reduced profile mass spectrometry, negative",EMSL LC method for non-polar metabolites
5+
,Lybrand_GHG_09_C_30Aug19_Alder_Infuse_p05_1_01_47973,775109,https://status.my.emsl.pnl.gov/view/1839094,Lybrand_GHG_09_C,soil,ENVO:00001998,,,,Lybrand_GHG_09_C,"USA: Alaska, Healy",63.87980 -149.21539,63.8798,-149.21539,ENVO:01000861,,,,,ENVO:00000446,,HEAL,Lybrand_GHG_09_C,,nmdc:Biosample,Lybrand NEON (EMSL 50267),2016-04-01,nmdc:sty-11-db67n062,chloroform,-20 degree Celsius,,permafrost corer,670,not sieved,"Freeze-thaw|Following freeze-thaw incubation, soils were incubated at 20C for 14 days.",NEON,,40,54,cm,B horizon,Dark,,12T FT-ICR MS,liquid_chromatography,"EMSL NOM reduced profile mass spectrometry, negative",EMSL LC method for non-polar metabolites
6+
,Lybrand_GHG_1_W_27Aug19_Alder_Infuse_p2_000001,772705,https://status.my.emsl.pnl.gov/view/1834769,Lybrand_GHG_01_W,soil,ENVO:00001998,,,,Lybrand_GHG_01_W,"USA: Alaska, Healy",63.87980 -149.21539,63.8798,-149.21539,ENVO:01000861,,,,,ENVO:00000446,,HEAL,Lybrand_GHG_01_W,,nmdc:Biosample,Lybrand NEON (EMSL 50267),2016-04-01,nmdc:sty-11-db67n062,water,-20 degree Celsius,,permafrost corer,670,not sieved,"No freeze-thaw|Following freeze-thaw incubation, soils were incubated at 20C for 14 days.",NEON,,16,30,cm,O horizon,Dark,,12T FT-ICR MS,liquid_chromatography,"EMSL NOM reduced profile mass spectrometry, negative",EMSL LC method for non-polar metabolites
7+
,Lybrand_GHG_11_C_30Aug19_Alder_Infuse_p05_1_01_47975,775026,https://status.my.emsl.pnl.gov/view/1838954,Lybrand_GHG_11_C,soil,ENVO:00001998,,,,Lybrand_GHG_11_C,"USA: Alaska, Healy",63.87980 -149.21539,63.8798,-149.21539,ENVO:01000861,,,,,ENVO:00000446,,HEAL,Lybrand_GHG_11_C,,nmdc:Biosample,Lybrand NEON (EMSL 50267),2016-04-01,nmdc:sty-11-db67n062,chloroform,-20 degree Celsius,,permafrost corer,670,not sieved,"Freeze-thaw|Following freeze-thaw incubation, soils were incubated at 20C for 14 days.",NEON,,12,24,cm,O horizon,Dark,,12T FT-ICR MS,liquid_chromatography,"EMSL NOM reduced profile mass spectrometry, negative",EMSL LC method for non-polar metabolites

0 commit comments

Comments
 (0)