Skip to content

Commit 56a41e5

Browse files
committed
Merge remote-tracking branch 'origin/master' into asmsbugfixes2024
2 parents 2bcd770 + 5f8bbc0 commit 56a41e5

5 files changed

Lines changed: 67 additions & 38 deletions

File tree

MANIFEST.in

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
include requirements.txt
22
include disclaimer.txt
33
include Dockerfile
4-
include ext_lib/*
4+
include ext_lib/*
5+
include ext_lib/dotnet/*

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
![CoreMS Logo](https://github.com/EMSL-Computing/CoreMS/tree/master/docs/CoreMS.COLOR_small.png?raw=true)
1+
![CoreMS Logo](docs/CoreMS.COLOR_small.png?raw=true)
22

33
<div align="left">
44

support_code/nmdc/nom/nmdc_metadata_gen.py

Lines changed: 30 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,18 @@
1515

1616
env_mediums = {'ENVO_00002042': 'surface water',
1717
'ENVO_00002007': 'sediment',
18+
'ENVO:00001998': 'soil'
1819
}
19-
env_local_scales = {'ENVO_00000022': 'river'}
20-
env_broad_scales = {'ENVO_01000253': 'freshwater river biome'}
20+
env_local_scales = {'ENVO_00000022': 'river',
21+
'ENVO:01000861': 'area of dwarf scrub',
22+
'ENVO:00000516': 'hummock',
23+
'ENVO:01000869': 'area of scrub',
24+
'ENVO:01000887': 'area of sedge- and forb-dominated herbaceous vegetation',
25+
'ENVO:01001370': 'tundra ecosystem'
26+
}
27+
env_broad_scales = {'ENVO_01000253': 'freshwater river biome',
28+
'ENVO:00000446': 'terrestrial biome'
29+
}
2130

2231

2332
@dataclass
@@ -26,13 +35,15 @@ class NomAnalysisActivity:
2635
cluster_name:str = "EMSL-RZR"
2736
nom_21T_instrument_name: str = "21T_Agilent"
2837
nom_12T_instrument_name: str = "12T_FTICR_B"
38+
nom_7T_instrument_name: str = "7T_FT_ICR_MS"
2939

3040
@dataclass
3141
class OmicsProcessing:
3242
nom_omics_processing_type:str = "Organic Matter Characterization"
3343
nom_omics_processing_description:str = "High resolution MS spectra only"
3444
nom_21T_instrument_name: str = "21T Agilent"
3545
nom_12T_instrument_name: str = "12T_FTICR_B"
46+
nom_7T_instrument_name: str = "7T_FT_ICR_MS"
3647

3748
@dataclass
3849
class DataObject:
@@ -42,13 +53,13 @@ class DataObject:
4253
nom_dp_data_object_description:str = "EnviroMS FT ICR-MS natural organic matter workflow molecular formula assignment output details"
4354

4455
@dataclass
45-
class BioSample:
56+
class Biosample:
4657
pass
4758

4859
@dataclass
4960
class NMDC_Types:
5061

51-
BioSample:str = "nmdc:Biosample"
62+
Biosample:str = "nmdc:Biosample"
5263
OmicsProcessing:str = "nmdc:OmicsProcessing"
5364
NomAnalysisActivity:str = "nmdc:NomAnalysisActivity"
5465
DataObject:str = "nmdc:DataObject"
@@ -68,7 +79,7 @@ def __dict__(self):
6879
@property
6980
def json(self):
7081
return dumps(self.__dict__)
71-
82+
7283
def mint_nmdc_id(type:NMDC_Types, how_many:int = 1) -> List[str]:
7384

7485
config = yaml.safe_load(open('./config.yaml','r'))
@@ -93,7 +104,7 @@ def mint_nmdc_id(type:NMDC_Types, how_many:int = 1) -> List[str]:
93104

94105
def get_biosample_object(emsl_metadata:EMSL_Metadata) -> nmdc.Biosample:
95106

96-
nmdc_id = mint_nmdc_id({'id': NMDC_Types.BioSample})[0]
107+
nmdc_id = mint_nmdc_id({'id': NMDC_Types.Biosample})[0]
97108

98109
env_medium = {
99110
'has_raw_value': emsl_metadata.env_medium,
@@ -121,7 +132,7 @@ def get_biosample_object(emsl_metadata:EMSL_Metadata) -> nmdc.Biosample:
121132
"longitude": emsl_metadata.longitude,
122133
}
123134

124-
collection_date = { 'has_raw_value': emsl_metadata.collection_date}
135+
collection_date = {'has_raw_value': emsl_metadata.collection_date}
125136

126137
geo_loc_name = {'has_raw_value': emsl_metadata.geo_loc_name}
127138

@@ -166,7 +177,7 @@ def get_data_object(file_path:Path, base_url:str, was_generated_by:str,
166177
"description": description,
167178
"type": "nmdc:DataObject"
168179
}
169-
180+
170181
data_object = nmdc.DataObject(**data_dict)
171182

172183
return data_object
@@ -228,46 +239,45 @@ def create_nmdc_metadata(raw_data_path:Path, data_product_path:Path, base_url:st
228239

229240
if not biosample_id:
230241

231-
#biosample_id = mint_nmdc_id({'id': NMDC_Types.BioSample})[0]
242+
biosample_id = mint_nmdc_id({'id': NMDC_Types.Biosample})[0]
232243
bioSample = get_biosample_object(emsl_metadata)
233244
biosample_id = bioSample.id
234245

235246
else:
236247

237248
''' needs to finish the logic for creating biosamples, this will fail because it is missing some required fields'''
238-
bioSample = nmdc.BioSample(id=biosample_id)
249+
bioSample = None
239250

240251
omicsProcessing = get_omics_processing(raw_data_path,
241-
OmicsProcessing.nom_12T_instrument_name,
242-
biosample_id, None,
252+
OmicsProcessing.nom_7T_instrument_name,
253+
biosample_id, 'nmdc:placeholder',
243254
OmicsProcessing.nom_omics_processing_type,
244255
OmicsProcessing.nom_omics_processing_description,
245-
emsl_metadata.nmdc_study
256+
emsl_metadata.nmdc_study
246257
)
247258

248-
rawDataObject = get_data_object(raw_data_path, base_url + 'nom/grow/raw/',
259+
rawDataObject = get_data_object(raw_data_path, base_url + 'nom/1000soils/raw/',
249260
was_generated_by=omicsProcessing.id,
250261
data_object_type =DataObject.nom_raw_data_object_type,
251262
description =DataObject.nom_raw_data_object_description)
252263

253264
nomAnalysisActivity = get_nom_analysis_activity(NomAnalysisActivity.cluster_name,
254265
NomAnalysisActivity.codebase_url,
255-
rawDataObject.id, None, False,
266+
rawDataObject.id, 'nmdc:placeholder', False,
256267
omicsProcessing.id,
257-
NomAnalysisActivity.nom_12T_instrument_name)
268+
NomAnalysisActivity.nom_7T_instrument_name)
258269

259-
dataProductDataObject = get_data_object(data_product_path, base_url + 'nom/grow/results/',
270+
dataProductDataObject = get_data_object(data_product_path, base_url + 'nom/1000soils/results/',
260271
was_generated_by=nomAnalysisActivity.id,
261272
data_object_type =DataObject.nom_dp_data_object_type,
262273
description =DataObject.nom_dp_data_object_description)
263274

264-
265275
#circular dependencies : great!
266-
nomAnalysisActivity.has_input = [rawDataObject.id]
267276
nomAnalysisActivity.has_output = [dataProductDataObject.id]
268277
omicsProcessing.has_output = [rawDataObject.id]
269278

270-
nom_metadata_db.biosample_set.append(bioSample)
279+
if bioSample:
280+
nom_metadata_db.biosample_set.append(bioSample)
271281
nom_metadata_db.data_object_set.append(rawDataObject)
272282
nom_metadata_db.nom_analysis_activity_set.append(nomAnalysisActivity)
273283
nom_metadata_db.omics_processing_set.append(omicsProcessing)
@@ -276,4 +286,3 @@ def create_nmdc_metadata(raw_data_path:Path, data_product_path:Path, base_url:st
276286
def dump_nmdc_database(ndmc_database:nmdc.Database, output_filepath:str):
277287

278288
json_dumper.dump(ndmc_database, output_filepath)
279-

support_code/nmdc/nom/nom_grow_workflow.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ class EMSL_Metadata:
3737
description: str
3838
collection_date: str
3939
nmdc_study: str
40+
biosample_id: str
4041

4142
def parse_metadata(metadata_file_path:Path) -> EMSL_Metadata:
4243

@@ -70,6 +71,7 @@ def parse_metadata(metadata_file_path:Path) -> EMSL_Metadata:
7071
description = full_list_worksheet['Z']
7172
collection_date = full_list_worksheet['AA']
7273
nmdc_study = full_list_worksheet['AB']
74+
biosample_id = full_list_worksheet['AC']
7375

7476
for x in range(1, len(full_list_worksheet['A'])):
7577

@@ -96,7 +98,8 @@ def parse_metadata(metadata_file_path:Path) -> EMSL_Metadata:
9698
ecosystem_subtype = ecosystem_subtype[x].value,
9799
description = description[x].value,
98100
collection_date = collection_date[x].value,
99-
nmdc_study = nmdc_study[x].value
101+
nmdc_study = nmdc_study[x].value,
102+
biosample_id = biosample_id[x].value
100103
)
101104

102105
yield metadata
@@ -141,7 +144,7 @@ def run_nom_nmdc_data_processing():
141144
registration_dir.mkdir(parents=True, exist_ok=True)
142145
registration_file = registration_dir / args.registration_file_name
143146

144-
field_strength = 12
147+
field_strength = 7
145148

146149
ref_calibration_path = Path(args.ref_calibration_path)
147150
failed_list = []
@@ -150,7 +153,7 @@ def run_nom_nmdc_data_processing():
150153

151154
for each_data in parse_metadata(metadata_file_path):
152155

153-
raw_file_path = data_dir / each_data.data_path / each_data.data_path.with_suffix(file_ext)
156+
raw_file_path = data_dir / each_data.data_path.with_suffix(file_ext)
154157

155158
print(raw_file_path)
156159

@@ -174,7 +177,7 @@ def run_nom_nmdc_data_processing():
174177
output_file_path,
175178
"https://nmdcdemo.emsl.pnnl.gov/",
176179
nmdc_database,
177-
each_data, biosample_id=None)
180+
each_data, biosample_id=each_data.biosample_id)
178181

179182
else:
180183

support_code/nmdc/nom/nom_workflow.py

Lines changed: 27 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,9 @@ def run_bruker(file_location):
4343

4444
def run_thermo(file_location):
4545

46-
MSParameters.mass_spectrum.noise_threshold_methodmethod = 'log'
46+
MSParameters.mass_spectrum.noise_threshold_method = 'log'
4747
MSParameters.mass_spectrum.noise_threshold_min_std = 3
48+
MSParameters.ms_peak.peak_min_prominence_percent = 0.2
4849

4950
parser = rawFileReader.ImportMassSpectraThermoMSFileReader(file_location)
5051

@@ -58,40 +59,40 @@ def run_thermo(file_location):
5859
def calspec(msobj, refmasslist, order=2):
5960

6061
calfn = MzDomainCalibration(msobj, refmasslist)
61-
ref_mass_list_fmt = calfn.load_ref_mass_list(refmasslist)
62+
ref_mass_list_fmt = calfn.load_ref_mass_list()
6263

63-
imzmeas, mzrefs = calfn.find_calibration_points(msobj, ref_mass_list_fmt,
64+
imzmeas, mzrefs = calfn.find_calibration_points(ref_mass_list_fmt,
6465
calib_ppm_error_threshold=(-1.0, 1.0),
6566
calib_snr_threshold=4)
6667

6768
if len(mzrefs) < 5:
68-
imzmeas, mzrefs = calfn.find_calibration_points(msobj, ref_mass_list_fmt,
69+
imzmeas, mzrefs = calfn.find_calibration_points(ref_mass_list_fmt,
6970
calib_ppm_error_threshold=(-1.5, 1.5),
7071
calib_snr_threshold=4)
7172

7273
if len(mzrefs) < 5:
73-
imzmeas, mzrefs = calfn.find_calibration_points(msobj, ref_mass_list_fmt,
74+
imzmeas, mzrefs = calfn.find_calibration_points(ref_mass_list_fmt,
7475
calib_ppm_error_threshold=(-3, 3),
7576
calib_snr_threshold=4)
7677

7778
if len(mzrefs) < 5:
78-
imzmeas, mzrefs = calfn.find_calibration_points(msobj,ref_mass_list_fmt,
79+
imzmeas, mzrefs = calfn.find_calibration_points(ref_mass_list_fmt,
7980
calib_ppm_error_threshold=(-5, 5),
8081
calib_snr_threshold=4)
8182

8283
if len(mzrefs) < 5:
8384

84-
imzmeas, mzrefs = calfn.find_calibration_points(msobj, ref_mass_list_fmt,
85+
imzmeas, mzrefs = calfn.find_calibration_points(ref_mass_list_fmt,
8586
calib_ppm_error_threshold=(-7, 7),
8687
calib_snr_threshold=4)
8788

8889
if len(mzrefs) < 5:
8990

90-
imzmeas, mzrefs = calfn.find_calibration_points(msobj, ref_mass_list_fmt,
91+
imzmeas, mzrefs = calfn.find_calibration_points(ref_mass_list_fmt,
9192
calib_ppm_error_threshold=(-10, 10),
9293
calib_snr_threshold=4)
9394

94-
calfn.recalibrate_mass_spectrum(msobj, imzmeas, mzrefs, order=order)
95+
calfn.recalibrate_mass_spectrum(imzmeas, mzrefs, order=order)
9596

9697
def set_parameters(mass_spectrum, field_strength=12, pos=False):
9798

@@ -115,6 +116,17 @@ def set_parameters(mass_spectrum, field_strength=12, pos=False):
115116
mass_spectrum.molecular_search_settings.min_ppm_error = -1
116117
mass_spectrum.molecular_search_settings.max_ppm_error = 1
117118

119+
mass_spectrum.settings.calib_sn_threshold = 4
120+
121+
elif field_strength == 7:
122+
123+
mass_spectrum.settings.max_calib_ppm_error = 3
124+
mass_spectrum.settings.min_calib_ppm_error = -3
125+
126+
mass_spectrum.molecular_search_settings.error_method = 'None'
127+
mass_spectrum.molecular_search_settings.min_ppm_error = -1
128+
mass_spectrum.molecular_search_settings.max_ppm_error = 1
129+
118130
mass_spectrum.settings.calib_sn_threshold = 4
119131
# 21T Data
120132
else:
@@ -126,7 +138,7 @@ def set_parameters(mass_spectrum, field_strength=12, pos=False):
126138
mass_spectrum.molecular_search_settings.min_ppm_error = -0.5
127139
mass_spectrum.molecular_search_settings.max_ppm_error = 0.5
128140

129-
# mass_spectrum.molecular_search_settings.url_database = "postgres://coremsdb:coremsmolform@localhost:5432/molformula"
141+
mass_spectrum.molecular_search_settings.url_database = "postgresql+psycopg2://coremsappdb:coremsapppnnl@localhost:5432/coremsapp"
130142
mass_spectrum.molecular_search_settings.min_dbe = 0
131143
mass_spectrum.molecular_search_settings.max_dbe = 40
132144

@@ -200,7 +212,11 @@ def run_nmdc_workflow(args):
200212

201213
return "all_good", mass_spectrum
202214

203-
except:
215+
except Exception as inst:
216+
217+
print(type(inst)) # the exception instance
218+
print(inst.args) # arguments stored in .args
219+
print(inst)
204220
return 'error', None
205221

206222
def monitor(target):

0 commit comments

Comments
 (0)