Skip to content

Commit 2d3112d

Browse files
committed
Lint part of mass_spectrum module
1 parent 1690a0e commit 2d3112d

11 files changed

Lines changed: 1770 additions & 1369 deletions

corems/mass_spectrum/calc/AutoRecalibration.py

Lines changed: 85 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -8,21 +8,20 @@
88
"""
99

1010
from lmfit.models import GaussianModel
11-
from scipy import stats
1211
import seaborn as sns
1312
import pandas as pd
1413
import numpy as np
1514
import matplotlib.pyplot as plt
16-
from corems.encapsulation.factory.parameters import MSParameters
1715
from corems.molecular_id.search.molecularFormulaSearch import SearchMolecularFormulas
1816
import copy
1917

18+
2019
class HighResRecalibration:
2120
"""
2221
This class is designed for high resolution (FTICR, Orbitrap) data of complex mixture, e.g. Organic matter
2322
2423
The tool first does a broad mass range search for the most commonly expected ion type (i.e. CHO, deprotonated - for negative ESI)
25-
And then the assigned data mass error distribution is searched, with a gaussian fit to the most prominent range.
24+
And then the assigned data mass error distribution is searched, with a gaussian fit to the most prominent range.
2625
This tool works when the data are of sufficient quality, and not outwith the typical expected range of the mass analyzer
2726
It presumes the mean error is out by 0-several ppm, but that the spread of error values is modest (<2ppm)
2827
@@ -34,86 +33,97 @@ class HighResRecalibration:
3433
Whether to plot the error distribution. The default is False.
3534
docker : bool, optional
3635
Whether to use the docker database. The default is True. If not, it uses a dynamically generated sqlite database.
37-
ppmFWHMprior : float, optional
36+
ppmFWHMprior : float, optional
3837
The FWHM of the prior distribution (ppm). The default is 3.
3938
ppmRangeprior : float, optional
4039
The range of the prior distribution (ppm). The default is 15.
4140
4241
Methods
4342
--------
44-
* determine_error_boundaries(). Determine the error boundaries for recalibration space.
43+
* determine_error_boundaries(). Determine the error boundaries for recalibration space.
4544
4645
Notes
4746
-----
48-
This initialisation function creates a copy of the MassSpectrum object to avoid over-writing assignments.
49-
Possible future task is to make the base class copyable.
47+
This initialisation function creates a copy of the MassSpectrum object to avoid over-writing assignments.
48+
Possible future task is to make the base class copyable.
5049
5150
"""
5251

53-
def __init__(self, mass_spectrum, plot : bool=False, docker : bool=True,
54-
ppmFWHMprior : float=3, ppmRangeprior : float=15):
55-
56-
self.mass_spectrum = copy.deepcopy(mass_spectrum)
52+
def __init__(
53+
self,
54+
mass_spectrum,
55+
plot: bool = False,
56+
docker: bool = True,
57+
ppmFWHMprior: float = 3,
58+
ppmRangeprior: float = 15,
59+
):
60+
self.mass_spectrum = copy.deepcopy(mass_spectrum)
5761
self.plot = plot
5862
self.docker = docker
5963
self.ppmFWHMprior = ppmFWHMprior
6064
self.ppmRangeprior = ppmRangeprior
6165

62-
6366
def set_uncal_settings(self):
64-
""" Set uncalibrated formula search settings
67+
"""Set uncalibrated formula search settings
6568
6669
This function serves the uncalibrated data (hence broad error tolerance)
6770
It only allows CHO formula in deprotonated ion type- as most common for SRFA ESI negative mode
6871
6972
This will not work for positive mode data, or for other ion types, or other expected elemental searches.
7073
7174
"""
72-
#TODO rework this.
75+
# TODO rework this.
7376

7477
if self.docker:
7578
self.mass_spectrum.molecular_search_settings.url_database = "postgresql+psycopg2://coremsappdb:coremsapppnnl@localhost:5432/coremsapp"
7679
else:
7780
self.mass_spectrum.molecular_search_settings.url_database = None
7881
self.mass_spectrum.molecular_search_settings.error_method = None
79-
self.mass_spectrum.molecular_search_settings.score_method = 'prob_score'
82+
self.mass_spectrum.molecular_search_settings.score_method = "prob_score"
8083

81-
self.mass_spectrum.molecular_search_settings.min_ppm_error = -1*self.ppmRangeprior/2 #-7.5
82-
self.mass_spectrum.molecular_search_settings.max_ppm_error = self.ppmRangeprior/2 #7.5
84+
self.mass_spectrum.molecular_search_settings.min_ppm_error = (
85+
-1 * self.ppmRangeprior / 2
86+
) # -7.5
87+
self.mass_spectrum.molecular_search_settings.max_ppm_error = (
88+
self.ppmRangeprior / 2
89+
) # 7.5
8390

8491
self.mass_spectrum.molecular_search_settings.min_dbe = 0
8592
self.mass_spectrum.molecular_search_settings.max_dbe = 50
86-
93+
8794
self.mass_spectrum.molecular_search_settings.use_isotopologue_filter = False
8895
self.mass_spectrum.molecular_search_settings.min_abun_error = -30
8996
self.mass_spectrum.molecular_search_settings.max_abun_error = 70
90-
97+
9198
self.mass_spectrum.molecular_search_settings.use_min_peaks_filter = True
92-
self.mass_spectrum.molecular_search_settings.min_peaks_per_class = 10 #default is 15
99+
self.mass_spectrum.molecular_search_settings.min_peaks_per_class = (
100+
10 # default is 15
101+
)
93102

94-
self.mass_spectrum.molecular_search_settings.usedAtoms['C'] = (1,90)
95-
self.mass_spectrum.molecular_search_settings.usedAtoms['H'] = (4,200)
96-
self.mass_spectrum.molecular_search_settings.usedAtoms['O'] = (1,23)
97-
self.mass_spectrum.molecular_search_settings.usedAtoms['N'] = (0,0)
98-
self.mass_spectrum.molecular_search_settings.usedAtoms['S'] = (0,0)
99-
self.mass_spectrum.molecular_search_settings.usedAtoms['P'] = (0,0)
103+
self.mass_spectrum.molecular_search_settings.usedAtoms["C"] = (1, 90)
104+
self.mass_spectrum.molecular_search_settings.usedAtoms["H"] = (4, 200)
105+
self.mass_spectrum.molecular_search_settings.usedAtoms["O"] = (1, 23)
106+
self.mass_spectrum.molecular_search_settings.usedAtoms["N"] = (0, 0)
107+
self.mass_spectrum.molecular_search_settings.usedAtoms["S"] = (0, 0)
108+
self.mass_spectrum.molecular_search_settings.usedAtoms["P"] = (0, 0)
100109

101110
self.mass_spectrum.molecular_search_settings.isProtonated = True
102-
self.mass_spectrum.molecular_search_settings.isRadical= False
111+
self.mass_spectrum.molecular_search_settings.isRadical = False
103112
self.mass_spectrum.molecular_search_settings.isAdduct = False
104113

105114
def positive_search_settings(self):
106-
""" Set the positive mode elemental search settings
107-
"""
115+
"""Set the positive mode elemental search settings"""
108116
self.mass_spectrum.molecular_search_settings.isProtonated = False
109117
self.mass_spectrum.molecular_search_settings.isAdduct = True
110-
self.mass_spectrum.molecular_search_settings.adduct_atoms_pos = ['Na']
118+
self.mass_spectrum.molecular_search_settings.adduct_atoms_pos = ["Na"]
111119

112120
@staticmethod
113-
def get_error_range(errors: list, ppmFWHMprior: float=3, plot_logic: bool=False):
114-
""" Get the error range from the error distribution
121+
def get_error_range(
122+
errors: list, ppmFWHMprior: float = 3, plot_logic: bool = False
123+
):
124+
"""Get the error range from the error distribution
115125
116-
Using lmfit and seaborn kdeplot to extract the error range from the error distribution of assigned species.
126+
Using lmfit and seaborn kdeplot to extract the error range from the error distribution of assigned species.
117127
118128
Parameters
119129
----------
@@ -123,7 +133,7 @@ def get_error_range(errors: list, ppmFWHMprior: float=3, plot_logic: bool=False)
123133
The FWHM of the prior distribution (ppm). The default is 3.
124134
plot_logic : bool, optional
125135
Whether to plot the error distribution. The default is False.
126-
136+
127137
Returns
128138
-------
129139
mean_error : float
@@ -133,49 +143,51 @@ def get_error_range(errors: list, ppmFWHMprior: float=3, plot_logic: bool=False)
133143
ppm_thresh : list
134144
recommended thresholds for the recalibration parameters (ppm)
135145
Consists of [mean_error-fwhm_error,mean_error+fwhm_error]
136-
146+
137147
"""
138-
kde = sns.kdeplot(errors)
148+
kde = sns.kdeplot(errors)
139149

140150
kde_data = kde.get_lines()[0].get_data()
141-
142-
tmpdf = pd.Series(index=kde_data[0],data=kde_data[1])
151+
152+
tmpdf = pd.Series(index=kde_data[0], data=kde_data[1])
143153
kde_apex_ppm = tmpdf.idxmax()
144154
kde_apex_val = tmpdf.max()
145155

146156
plt.close(kde.figure)
147-
plt.close('all')
148-
157+
plt.close("all")
158+
149159
lmmodel = GaussianModel()
150160
lmpars = lmmodel.guess(kde_data[1], x=kde_data[0])
151-
lmpars['sigma'].value = 2.3548/ppmFWHMprior
152-
lmpars['center'].value = kde_apex_ppm
153-
lmpars['amplitude'].value = kde_apex_val
161+
lmpars["sigma"].value = 2.3548 / ppmFWHMprior
162+
lmpars["center"].value = kde_apex_ppm
163+
lmpars["amplitude"].value = kde_apex_val
154164
lmout = lmmodel.fit(kde_data[1], lmpars, x=kde_data[0])
155-
165+
156166
if plot_logic:
157-
fig,ax = plt.subplots(figsize=(8,4))
158-
lmout.plot_fit(ax=ax,data_kws ={'color':'tab:blue'},fit_kws ={'color':'tab:red'})
159-
ax.set_xlabel('$m/z$ Error (ppm)')
160-
ax.set_ylabel('Density')
161-
plt.legend(facecolor='white', framealpha=0)
162-
163-
mean_error = lmout.best_values['center']
164-
std_error = lmout.best_values['sigma']
167+
fig, ax = plt.subplots(figsize=(8, 4))
168+
lmout.plot_fit(
169+
ax=ax, data_kws={"color": "tab:blue"}, fit_kws={"color": "tab:red"}
170+
)
171+
ax.set_xlabel("$m/z$ Error (ppm)")
172+
ax.set_ylabel("Density")
173+
plt.legend(facecolor="white", framealpha=0)
174+
175+
mean_error = lmout.best_values["center"]
176+
std_error = lmout.best_values["sigma"]
165177
# FWHM from Sigma = approx. 2.355*sigma
166-
#fwhm_error = 2*np.sqrt(2*np.log(2))*std_error
167-
fwhm_error = std_error * np.sqrt(8*np.log(2))
168-
169-
ppm_thresh = [mean_error-fwhm_error,mean_error+fwhm_error]
170-
return mean_error,fwhm_error,ppm_thresh
171-
178+
# fwhm_error = 2*np.sqrt(2*np.log(2))*std_error
179+
fwhm_error = std_error * np.sqrt(8 * np.log(2))
180+
181+
ppm_thresh = [mean_error - fwhm_error, mean_error + fwhm_error]
182+
return mean_error, fwhm_error, ppm_thresh
183+
172184
def determine_error_boundaries(self):
173-
""" Determine the error boundaries for recalibration space
185+
"""Determine the error boundaries for recalibration space
174186
175187
This is the main function in this class
176188
Sets the Molecular Formulas search settings, performs the initial formula search
177189
Converts the data to a dataframe, and gets the error range
178-
Returns the error thresholds.
190+
Returns the error thresholds.
179191
180192
Returns
181193
-------
@@ -187,8 +199,8 @@ def determine_error_boundaries(self):
187199
recommended thresholds for the recalibration parameters (ppm)
188200
Consists of [mean_error-fwhm_error,mean_error+fwhm_error]
189201
"""
190-
191-
# Set the search settings
202+
203+
# Set the search settings
192204
self.set_uncal_settings()
193205

194206
# Set the positive mode settings
@@ -197,24 +209,26 @@ def determine_error_boundaries(self):
197209
self.positive_search_settings()
198210

199211
# Search MFs
200-
SearchMolecularFormulas(self.mass_spectrum, first_hit=True).run_worker_mass_spectrum()
201-
202-
212+
SearchMolecularFormulas(
213+
self.mass_spectrum, first_hit=True
214+
).run_worker_mass_spectrum()
215+
203216
# Exporting to a DF is ~30x slower than just getting the errors, so this is fast.
204217
errors = []
205218
for mspeak in self.mass_spectrum.mspeaks:
206-
if len(mspeak.molecular_formulas)>0:
219+
if len(mspeak.molecular_formulas) > 0:
207220
errors.append(mspeak.best_molecular_formula_candidate.mz_error)
208221

209-
210222
# If there are NO assignments, it'll fail on the next step. Need to check for that
211223
nassign = len(errors)
212224
# Here we say at least 5 features assigned are needed - it probably should be greater, but we are just trying to stop it breaking the code
213225
# We want to make sure the spectrum is capture in the database though - so we return the stats entries (0 assignments) and the number of assignments
214-
if nassign <5:
226+
if nassign < 5:
215227
if self.mass_spectrum.parameters.mass_spectrum.verbose_processing:
216228
print("fewer than 5 peaks assigned, cannot determine error range")
217-
return np.nan,np.nan,[np.nan,np.nan]
229+
return np.nan, np.nan, [np.nan, np.nan]
218230
else:
219-
mean_error,fwhm_error,ppm_thresh = self.get_error_range(errors, self.ppmFWHMprior, self.plot)
220-
return mean_error,fwhm_error,ppm_thresh
231+
mean_error, fwhm_error, ppm_thresh = self.get_error_range(
232+
errors, self.ppmFWHMprior, self.plot
233+
)
234+
return mean_error, fwhm_error, ppm_thresh

0 commit comments

Comments
 (0)