Skip to content

Commit 13c7e06

Browse files
committed
Merge branch '151_coremshdf5importer2' into 'master'
Fix error on loading CoreMS hdf5 molecular formula embedded in mass spectrum objects See merge request mass-spectrometry/corems!120
2 parents c868a8c + bd70a3b commit 13c7e06

4 files changed

Lines changed: 154 additions & 7 deletions

File tree

corems/mass_spectrum/input/coremsHDF5.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,11 @@ def get_dataframe(self, scan_index=0, time_index=-1):
199199
data_dict = {}
200200
for data_index, data in enumerate(row):
201201
label = columnsLabels[data_index]
202+
# if data starts with a b' it is a byte string, so decode it
203+
if isinstance(data, bytes):
204+
data = data.decode("utf-8")
205+
if data == "nan":
206+
data = None
202207
data_dict[label] = data
203208

204209
list_dict.append(data_dict)

corems/mass_spectrum/input/massList.py

Lines changed: 72 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
11
__author__ = "Yuri E. Corilo"
22
__date__ = "Jun 12, 2019"
33

4+
import numpy as np
5+
6+
from corems.encapsulation.constant import Atoms
47
from corems.mass_spectrum.input.baseClass import MassListBaseClass
58
from corems.mass_spectrum.factory.MassSpectrumClasses import MassSpecProfile, MassSpecCentroid
6-
from corems.molecular_formula.factory.MolecularFormulaFactory import MolecularFormula
9+
from corems.molecular_formula.factory.MolecularFormulaFactory import MolecularFormula, MolecularFormulaIsotopologue
710
from corems.encapsulation.constant import Labels, Atoms
811
from corems.encapsulation.factory.processingSetting import DataInputSetting
912

@@ -101,17 +104,81 @@ def add_molecular_formula(self, mass_spec_obj, dataframe):
101104
atoms = list(formula_df.columns.astype(str))
102105
counts = list(formula_df.iloc[df_index].astype(int))
103106

104-
formula_list = [sub[item] for item in range(len(atoms))
105-
for sub in [atoms, counts]]
107+
formula_dict = dict(zip(atoms, counts))
106108
if sum(counts) > 0:
107109

108110
ion_type = str(Labels.ion_type_translate.get(ion_type_df[df_index]))
109111
if adduct_df is not None:
110112
adduct_atom = str(adduct_df[df_index])
113+
if adduct_atom == 'None':
114+
adduct_atom = None
111115
else:
112116
adduct_atom = None
113-
mfobj = MolecularFormula(formula_list, int(ion_charge_df[df_index]), mspeak_parent=mass_spec_obj[ms_peak_index] , ion_type=ion_type, adduct_atom=adduct_atom)
114-
mfobj.is_isotopologue = bool(is_isotopologue_df[df_index])
117+
118+
# If not isotopologue, cast as MolecularFormula
119+
if not bool(int(is_isotopologue_df[df_index])):
120+
mfobj = MolecularFormula(
121+
formula_dict, int(ion_charge_df[df_index]),
122+
mspeak_parent=mass_spec_obj[ms_peak_index] ,
123+
ion_type=ion_type, adduct_atom=adduct_atom
124+
)
125+
126+
# if is isotopologue, recast as MolecularFormulaIsotopologue
127+
if bool(int(is_isotopologue_df[df_index])):
128+
129+
# First make a MolecularFormula object for the parent so we can get probabilities etc
130+
formula_list_parent = {}
131+
for atom in formula_dict:
132+
if atom in Atoms.isotopes.keys():
133+
formula_list_parent[atom] = formula_dict[atom]
134+
else:
135+
# remove any numbers from the atom name to cast as a mono-isotopic atom
136+
atom_mono = atom.strip('0123456789')
137+
if atom_mono in Atoms.isotopes.keys():
138+
formula_list_parent[atom_mono] = formula_list_parent[atom_mono]+formula_dict[atom]
139+
else:
140+
print(f"Atom {atom} not in Atoms.atoms_order")
141+
mono_index = int(dataframe.iloc[df_index]['Mono Isotopic Index'])
142+
mono_mfobj = MolecularFormula(
143+
formula_list_parent,
144+
int(ion_charge_df[df_index]),
145+
mspeak_parent=mass_spec_obj[mono_index],
146+
ion_type=ion_type,
147+
adduct_atom=adduct_atom
148+
)
149+
150+
# Next, generate isotopologues from the parent
151+
isos = list(
152+
mono_mfobj.isotopologues(
153+
min_abundance = mass_spec_obj[df_index].abundance*0.1,
154+
current_mono_abundance = mass_spec_obj[mono_index].abundance,
155+
dynamic_range = mass_spec_obj.dynamic_range
156+
)
157+
)
158+
159+
# Finally, find the isotopologue that matches the formula_dict
160+
matched_isos = isos
161+
for iso in isos:
162+
if set(iso.atoms) == set(formula_dict.keys()):
163+
# Check the values of the atoms match
164+
if all([iso[atom] == formula_dict[atom] for atom in formula_dict]):
165+
matched_isos = [iso]
166+
if len(matched_isos) > 1:
167+
raise ValueError("More than one isotopologue matched the formula_dict: {matched_isos}")
168+
if len(matched_isos) == 0:
169+
raise ValueError("No isotopologue matched the formula_dict")
170+
mfobj = matched_isos[0]
171+
172+
# Add the mono isotopic index, confidence score and isotopologue similarity
173+
mfobj.mspeak_index_mono_isotopic = int(dataframe.iloc[df_index]['Mono Isotopic Index'])
174+
175+
# Add the confidence score and isotopologue similarity and average MZ error score
176+
if 'm/z Error Score' in dataframe:
177+
mfobj._mass_error_average_score = float(dataframe.iloc[df_index]['m/z Error Score'])
178+
if 'Confidence Score' in dataframe:
179+
mfobj._confidence_score = float(dataframe.iloc[df_index]['Confidence Score'])
180+
if 'Isotopologue Similarity' in dataframe:
181+
mfobj._isotopologue_similarity = float(dataframe.iloc[df_index]['Isotopologue Similarity'])
115182
mass_spec_obj[ms_peak_index].add_molecular_formula(mfobj)
116183

117184

tests/test_input.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,7 @@ def test_import_corems_hdf5():
169169
mass_list_reader = ReadCoreMSHDF_MassSpectrum(file_location)
170170

171171
mass_spectrum = mass_list_reader.get_mass_spectrum()
172+
mass_spectrum.to_dataframe()
172173

173174
for mspeak in mass_spectrum:
174175

@@ -362,11 +363,11 @@ def test_import_thermo_average():
362363
#test_import_lcms_from_transient()
363364
#test_import_thermo_profile_mass_list()
364365
# test_import_transient()
365-
#test_import_corems_hdf5()
366+
test_import_corems_hdf5()
366367
#test_import_corems_mass_list()
367368
#test_import_mass_list()
368369
#test_import_maglab_pks()
369370
#test_andi_netcdf_gcms()
370-
test_import_corems_mass_list()
371+
#test_import_corems_mass_list()
371372
#test_import_thermo_average()
372373

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
import shutil
2+
3+
from corems.mass_spectrum.input.numpyArray import ms_from_array_centroid
4+
from corems.molecular_id.search.molecularFormulaSearch import SearchMolecularFormulas
5+
from corems.encapsulation.factory.parameters import MSParameters
6+
from corems.mass_spectrum.output.export import HighResMassSpecExport
7+
from corems.mass_spectrum.input.coremsHDF5 import ReadCoreMSHDF_MassSpectrum
8+
9+
10+
def prep_mass_spec_obj():
11+
# Test for generating accurate molecular formula from a single mass using the local sql database
12+
# Now also tests that it is handling isotopes correctly (for non-adducts)
13+
mz = [760.58156938877, 761.58548]
14+
abundance = [1000, 400]
15+
rp, s2n = [[1, 1], [10, 10]]
16+
17+
MSParameters.mass_spectrum.noise_threshold_method = "relative_abundance"
18+
MSParameters.mass_spectrum.noise_threshold_absolute_abundance = 0
19+
20+
MSParameters.molecular_search.url_database = ""
21+
MSParameters.molecular_search.error_method = "None"
22+
MSParameters.molecular_search.min_ppm_error = -5
23+
MSParameters.molecular_search.max_ppm_error = 5
24+
MSParameters.molecular_search.mz_error_range = 1
25+
MSParameters.molecular_search.isProtonated = True
26+
MSParameters.molecular_search.isRadical = False
27+
MSParameters.molecular_search.isAdduct = False
28+
29+
usedatoms = {"C": (1, 57), "H": (4, 200), "N": (0, 1)}
30+
MSParameters.molecular_search.usedAtoms = usedatoms
31+
mass_spectrum_obj = ms_from_array_centroid(
32+
mz, abundance, rp, s2n, "single mf search", polarity=1, auto_process=True
33+
)
34+
return mass_spectrum_obj
35+
36+
37+
def run_molecular_formula_search(mass_spectrum_obj):
38+
mass_spectrum_obj.molecular_search_settings.use_min_peaks_filter = False
39+
mass_spectrum_obj.molecular_search_settings.use_isotopologue_filter = False
40+
SearchMolecularFormulas(
41+
mass_spectrum_obj, find_isotopologues=True
42+
).run_worker_ms_peaks([mass_spectrum_obj[0]])
43+
return mass_spectrum_obj
44+
45+
46+
def test_mass_spec_export_import_with_annote():
47+
mass_spectrum_obj = prep_mass_spec_obj()
48+
mass_spectrum_obj = run_molecular_formula_search(mass_spectrum_obj)
49+
ms_df1 = mass_spectrum_obj.to_dataframe()
50+
assert mass_spectrum_obj[0][0].string == "C56 H73 N1"
51+
assert ms_df1.shape == (2, 26)
52+
assert mass_spectrum_obj[1][0].string == "C55 H73 N1 13C1"
53+
54+
exportMS = HighResMassSpecExport("my_mass_spec", mass_spectrum_obj)
55+
exportMS._output_type = "hdf5"
56+
exportMS.save()
57+
58+
parser = ReadCoreMSHDF_MassSpectrum("my_mass_spec.hdf5")
59+
mass_spectrum_obj2 = parser.get_mass_spectrum(auto_process=True, load_settings=True)
60+
61+
ms_df2 = mass_spectrum_obj2.to_dataframe()
62+
assert mass_spectrum_obj2[0][0].string == "C56 H73 N1"
63+
assert ms_df2.shape == (2, 26)
64+
assert mass_spectrum_obj2[1][0].string == "C55 H73 N1 13C1"
65+
66+
# Remove the file
67+
shutil.rmtree(
68+
"my_mass_spec.hdf5",
69+
ignore_errors=True,
70+
)
71+
72+
73+
if __name__ == "__main__":
74+
test_mass_spec_export_import_with_annote()

0 commit comments

Comments
 (0)