Skip to content

Commit 641013c

Browse files
committed
Merge branch 'add-nmdc-nom-updates' into 'master'
add updated nmdc nom scripts See merge request mass-spectrometry/corems!129
2 parents dffc59e + 82ca90f commit 641013c

9 files changed

Lines changed: 1124 additions & 0 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,3 +82,4 @@ r00*/
8282
!ext_lib/dotnet/*.xml
8383
ext_lib/
8484
*.egg*
85+
config.yaml
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
import requests
2+
from typing import Optional
3+
4+
# TODO: Update og_url to be regular nmdc api url when berkeley is implemented
5+
6+
class ApiInfoRetriever:
    """
    Look up record identifiers in one collection of the NMDC schema API.

    Parameters
    ----------
    collection_name : str
        Name of the collection to query. It is used as the last path segment
        of the request URL.
    """

    # Upper bound (seconds) on how long a single API call may block. Without a
    # timeout, ``requests`` waits indefinitely on an unresponsive server.
    _REQUEST_TIMEOUT_SECONDS = 60

    def __init__(self, collection_name: str):
        self.collection_name = collection_name

    @staticmethod
    def _build_filter(slot_name: str, slot_field_value: str) -> str:
        """Return the JSON filter document ``{slot_name: slot_field_value}`` as a string."""
        # Imported locally so this class does not depend on a module-level import.
        import json

        # json.dumps escapes quotes and backslashes, so arbitrary slot values
        # always produce a valid JSON filter.
        return json.dumps({slot_name: slot_field_value})

    def get_id_by_slot_from_collection(self, slot_name: str, slot_field_value: str) -> str:
        """
        Retrieve the identifier from a specified collection based on a slot name and field value.

        Parameters
        ----------
        slot_name : str
            The name of the slot to filter by.
        slot_field_value : str
            The value of the slot field to filter for. Trailing whitespace will be removed.

        Returns
        -------
        str
            The ``id`` of the first resource returned for the given slot name and field value.

        Raises
        ------
        ValueError
            If the API responds with a status code other than 200, or if no resources
            are found for the given slot name and value.
        requests.exceptions.RequestException
            If the request cannot be completed (e.g. connection error or timeout).
        """
        # trim trailing white spaces
        slot_field_value = slot_field_value.rstrip()

        query = {
            "filter": self._build_filter(slot_name, slot_field_value),
            "projection": "id",
        }

        og_url = f'https://api-berkeley.microbiomedata.org/nmdcschema/{self.collection_name}'
        # Passing the query via ``params`` lets requests URL-encode the JSON filter.
        resp = requests.get(og_url, params=query, timeout=self._REQUEST_TIMEOUT_SECONDS)

        # Check if the response status is 200
        if resp.status_code != 200:
            raise ValueError(f"Failed to retrieve data from {self.collection_name}, response code: {resp.status_code}")

        # Ensure there is at least one resource in the response; a missing
        # "resources" key is treated the same as an empty result.
        resources = resp.json().get('resources')
        if not resources:
            raise ValueError(f"No resources found for {slot_name} with value {slot_field_value}")

        return resources[0]['id']
53+
File renamed without changes.
File renamed without changes.

support_code/nmdc/nom/config.toml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
metadata_file = "path/to/metadata/spreadsheet.csv"
2+
data_dir = "path/to/directory/where/data/lives/"
3+
ref_calibration_path = "path/to/reference/calibration/file.ref"
4+
raw_data_object_type = "LC-DDA-MS/MS Raw Data"
5+
processed_data_object_type = "FT ICR-MS Analysis Results"
6+
registration_file = "path/to/where/database/dump/lives.json"
7+
execution_resource = "EMSL-RZR"
8+
field_strength = 7
9+
workflow_version = "2.2.3"
10+
minting_client_config_path = "enviroMS/nmdc_metadata_generation/config.yaml"

support_code/nmdc/nom/main.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
from metadata_generator import MetadataGenerator
2+
import toml
3+
from pathlib import Path
4+
5+
def main(config_path: str = 'enviroMS/nmdc_metadata_generation/config.toml'):
    """
    Runs the MetadataGenerator using the configuration provided in a TOML file.

    Parameters
    ----------
    config_path : str, optional
        Path to the TOML configuration file. Defaults to
        'enviroMS/nmdc_metadata_generation/config.toml', resolved relative to the
        current working directory.

    The TOML configuration file must include the following fields:

    - metadata_file: str
        The path to the .csv, .xlsx, or .xls file where the biosample metadata is stored (include file extension).
        Example: "metadata_file.csv"
    - data_dir: str
        The directory path where the data lives or will be stored.
        Example: "/path/to/data/"
    - ref_calibration_path: str
        The path to the .ref file where the reference calibration data is stored (include .ref).
        Example: "/path/to/calibration_file.ref"
    - raw_data_object_type: str
        The raw data object type. Must match one of the following options:
            - 'Direct Infusion FT ICR-MS Raw Data'
            - 'LC-DDA-MS/MS Raw Data'
    - processed_data_object_type: str
        The processed data object type. Must match one of the following options:
            - 'FT ICR-MS Analysis Results'
            - 'GC-MS Metabolomics Results'
    - registration_file: str
        The desired name of the output file where the data will be dumped (include .json).
        Example: "output_file.json"
    - execution_resource: str
        The execution resource used for the analysis. Must match one of the following options:
            - 'EMSL-RZR'
            - 'EMSL'
    - field_strength: int
        The field strength for the NOM analysis. Must match one of the following values:
            - 7
            - 12
            - 15
            - 21
    - workflow_version: str
        The workflow version string passed to the MetadataGenerator.
        Example: "2.2.3"

    The following field is optional:

    - minting_client_config_path: str
        The path to the NMDC minting client configuration file. Defaults to 'enviroMS/nmdc_metadata_generation/config.yaml'.

    Raises
    ------
    KeyError
        If one or more required fields are missing from the configuration file.

    Notes
    -----
    This function assumes:
    - If a biosample_id does not exist for the sample in the metadata_file, a biosample will be generated. On the other hand,
      if a biosample_id does exist in the spreadsheet, no biosample will be generated. And the biosample_id present will be used to
      generate all other metadata.
    - The metadata_file conforms to a predefined structure. See example spreadsheet: https://docs.google.com/spreadsheets/d/1-xHGkkG5Gpw5Pen1iM_JUP2XmphuF19ps2ZYONvtDUs/edit?gid=1112301083#gid=1112301083
    - Necessary configuration and calibrations are already added to MongoDB. If new configurations are needed, they must be added beforehand.
    """

    # Load arguments from TOML file
    config_data = toml.load(config_path)

    # Report every missing required field at once instead of failing on the first lookup.
    required_keys = (
        'metadata_file',
        'data_dir',
        'ref_calibration_path',
        'raw_data_object_type',
        'processed_data_object_type',
        'registration_file',
        'execution_resource',
        'field_strength',
        'workflow_version',
    )
    missing_keys = [key for key in required_keys if key not in config_data]
    if missing_keys:
        raise KeyError(
            f"Missing required field(s) in {config_path}: {', '.join(missing_keys)}"
        )

    generator = MetadataGenerator(
        metadata_file=config_data['metadata_file'],
        data_dir=Path(config_data['data_dir']),
        ref_calibration_path=Path(config_data['ref_calibration_path']),
        raw_data_object_type=config_data['raw_data_object_type'],
        processed_data_object_type=config_data['processed_data_object_type'],
        database_dump_json_path=config_data['registration_file'],
        execution_resource=config_data['execution_resource'],
        field_strength=config_data['field_strength'],
        workflow_version=config_data['workflow_version'],
        # Apply the documented default when the field is omitted from the config.
        minting_client_config_path=config_data.get(
            'minting_client_config_path',
            'enviroMS/nmdc_metadata_generation/config.yaml',
        ),
    )

    generator.run()


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)