Skip to content

Commit d6bf4b4

Browse files
committed
Include the similarity scores on input
1 parent 3b55373 commit d6bf4b4

2 files changed

Lines changed: 75 additions & 7 deletions

File tree

corems/mass_spectrum/input/massList.py

Lines changed: 69 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
11
__author__ = "Yuri E. Corilo"
22
__date__ = "Jun 12, 2019"
33

4+
import numpy as np
5+
6+
from corems.encapsulation.constant import Atoms
47
from corems.mass_spectrum.input.baseClass import MassListBaseClass
58
from corems.mass_spectrum.factory.MassSpectrumClasses import MassSpecProfile, MassSpecCentroid
6-
from corems.molecular_formula.factory.MolecularFormulaFactory import MolecularFormula
9+
from corems.molecular_formula.factory.MolecularFormulaFactory import MolecularFormula, MolecularFormulaIsotopologue
710
from corems.encapsulation.constant import Labels, Atoms
811
from corems.encapsulation.factory.processingSetting import DataInputSetting
912

@@ -101,17 +104,78 @@ def add_molecular_formula(self, mass_spec_obj, dataframe):
101104
atoms = list(formula_df.columns.astype(str))
102105
counts = list(formula_df.iloc[df_index].astype(int))
103106

104-
formula_list = [sub[item] for item in range(len(atoms))
105-
for sub in [atoms, counts]]
107+
formula_dict = dict(zip(atoms, counts))
106108
if sum(counts) > 0:
107109

108110
ion_type = str(Labels.ion_type_translate.get(ion_type_df[df_index]))
109111
if adduct_df is not None:
110112
adduct_atom = str(adduct_df[df_index])
113+
if adduct_atom == 'None':
114+
adduct_atom = None
111115
else:
112116
adduct_atom = None
113-
mfobj = MolecularFormula(formula_list, int(ion_charge_df[df_index]), mspeak_parent=mass_spec_obj[ms_peak_index] , ion_type=ion_type, adduct_atom=adduct_atom)
114-
mfobj.is_isotopologue = bool(int(is_isotopologue_df[df_index]))
117+
118+
# If not isotopologue, cast as MolecularFormula
119+
if not bool(int(is_isotopologue_df[df_index])):
120+
mfobj = MolecularFormula(
121+
formula_dict, int(ion_charge_df[df_index]),
122+
mspeak_parent=mass_spec_obj[ms_peak_index] ,
123+
ion_type=ion_type, adduct_atom=adduct_atom
124+
)
125+
126+
# if is isotopologue, recast as MolecularFormulaIsotopologue
127+
if bool(int(is_isotopologue_df[df_index])):
128+
129+
# First make a MolecularFormula object for the parent so we can get probabilities etc
130+
formula_list_parent = {}
131+
for atom in formula_dict:
132+
if atom in Atoms.isotopes.keys():
133+
formula_list_parent[atom] = formula_dict[atom]
134+
else:
135+
# remove any numbers from the atom name to cast as a mono-isotopic atom
136+
atom_mono = atom.strip('0123456789')
137+
if atom_mono in Atoms.isotopes.keys():
138+
formula_list_parent[atom_mono] = formula_list_parent[atom_mono]+formula_dict[atom]
139+
else:
140+
print(f"Atom {atom} not in Atoms.atoms_order")
141+
mono_index = int(dataframe.iloc[df_index]['Mono Isotopic Index'])
142+
mono_mfobj = MolecularFormula(
143+
formula_list_parent,
144+
int(ion_charge_df[df_index]),
145+
mspeak_parent=mass_spec_obj[mono_index],
146+
ion_type=ion_type,
147+
adduct_atom=adduct_atom
148+
)
149+
150+
# Next, generate isotopologues from the parent
151+
isos = list(
152+
mono_mfobj.isotopologues(
153+
min_abundance = mass_spec_obj[df_index].abundance*0.1,
154+
current_mono_abundance = mass_spec_obj[mono_index].abundance,
155+
dynamic_range = mass_spec_obj.dynamic_range
156+
)
157+
)
158+
159+
# Finally, find the isotopologue that matches the formula_dict
160+
matched_isos = isos
161+
for iso in isos:
162+
if set(iso.atoms) == set(formula_dict.keys()):
163+
# Check the values of the atoms match
164+
if all([iso[atom] == formula_dict[atom] for atom in formula_dict]):
165+
matched_isos = [iso]
166+
if len(matched_isos) > 1:
167+
raise ValueError("More than one isotopologue matched the formula_dict: {matched_isos}")
168+
if len(matched_isos) == 0:
169+
raise ValueError("No isotopologue matched the formula_dict")
170+
mfobj = matched_isos[0]
171+
172+
# Add the mono isotopic index, confidence score and isotopologue similarity
173+
mfobj.mspeak_index_mono_isotopic = int(dataframe.iloc[df_index]['Mono Isotopic Index'])
174+
175+
# Add the confidence score and isotopologue similarity and average MZ error score
176+
mfobj._mass_error_average_score = float(dataframe.iloc[df_index]['m/z Error Score'])
177+
mfobj._confidence_score = float(dataframe.iloc[df_index]['Confidence Score'])
178+
mfobj._isotopologue_similarity = float(dataframe.iloc[df_index]['Isotopologue Similarity'])
115179
mass_spec_obj[ms_peak_index].add_molecular_formula(mfobj)
116180

117181

tests/test_mass_spectra_export_import.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@ def prep_mass_spec_obj():
88
# Test for generating accurate molecular formula from a single mass using the local sql database
99
# Now also tests that it is handling isotopes correctly (for non-adducts)
1010
mz = [760.58156938877, 761.58548]
11-
abundance = [1, 0.4]
12-
rp, s2n = [[1, 1],[1, 1]]
11+
abundance = [1000, 400]
12+
rp, s2n = [[1, 1],[10, 10]]
1313

1414
MSParameters.mass_spectrum.noise_threshold_method = 'relative_abundance'
1515
MSParameters.mass_spectrum.noise_threshold_absolute_abundance = 0
@@ -39,6 +39,7 @@ def run_molecular_formula_search(mass_spectrum_obj):
3939
ms_df1 = mass_spectrum_obj.to_dataframe()
4040
assert mass_spectrum_obj[0][0].string == 'C56 H73 N1'
4141
assert ms_df1.shape == (2, 26)
42+
assert mass_spectrum_obj[1][0].string == 'C55 H73 N1 13C1'
4243

4344
exportMS = HighResMassSpecExport('my_mass_spec', mass_spectrum_obj)
4445
exportMS._output_type = 'hdf5'
@@ -49,4 +50,7 @@ def run_molecular_formula_search(mass_spectrum_obj):
4950

5051
# Below errors out:
5152
ms_df2 = mass_spectrum_obj2.to_dataframe()
53+
assert mass_spectrum_obj2[0][0].string == 'C56 H73 N1'
54+
assert ms_df2.shape == (2, 26)
55+
assert mass_spectrum_obj2[1][0].string == 'C55 H73 N1 13C1'
5256

0 commit comments

Comments
 (0)