88"""
99
1010from lmfit .models import GaussianModel
11- from scipy import stats
1211import seaborn as sns
1312import pandas as pd
1413import numpy as np
1514import matplotlib .pyplot as plt
16- from corems .encapsulation .factory .parameters import MSParameters
1715from corems .molecular_id .search .molecularFormulaSearch import SearchMolecularFormulas
1816import copy
1917
18+
2019class HighResRecalibration :
2120 """
2221 This class is designed for high resolution (FTICR, Orbitrap) data of complex mixture, e.g. Organic matter
2322
2423 The tool first does a broad mass range search for the most commonly expected ion type (i.e. CHO, deprotonated - for negative ESI)
25- And then the assigned data mass error distribution is searched, with a gaussian fit to the most prominent range.
24+ And then the assigned data mass error distribution is searched, with a gaussian fit to the most prominent range.
2625 This tool works when the data are of sufficient quality, and not outwith the typical expected range of the mass analyzer
2726 It presumes the mean error is out by 0-several ppm, but that the spread of error values is modest (<2ppm)
2827
@@ -34,86 +33,97 @@ class HighResRecalibration:
3433 Whether to plot the error distribution. The default is False.
3534 docker : bool, optional
3635 Whether to use the docker database. The default is True. If not, it uses a dynamically generated sqlite database.
37- ppmFWHMprior : float, optional
36+ ppmFWHMprior : float, optional
3837 The FWHM of the prior distribution (ppm). The default is 3.
3938 ppmRangeprior : float, optional
4039 The range of the prior distribution (ppm). The default is 15.
4140
4241 Methods
4342 --------
44- * determine_error_boundaries(). Determine the error boundaries for recalibration space.
43+ * determine_error_boundaries(). Determine the error boundaries for recalibration space.
4544
4645 Notes
4746 -----
48- This initialisation function creates a copy of the MassSpectrum object to avoid over-writing assignments.
49- Possible future task is to make the base class copyable.
47+ This initialisation function creates a copy of the MassSpectrum object to avoid over-writing assignments.
48+ Possible future task is to make the base class copyable.
5049
5150 """
5251
53- def __init__ (self , mass_spectrum , plot : bool = False , docker : bool = True ,
54- ppmFWHMprior : float = 3 , ppmRangeprior : float = 15 ):
55-
56- self .mass_spectrum = copy .deepcopy (mass_spectrum )
52+ def __init__ (
53+ self ,
54+ mass_spectrum ,
55+ plot : bool = False ,
56+ docker : bool = True ,
57+ ppmFWHMprior : float = 3 ,
58+ ppmRangeprior : float = 15 ,
59+ ):
60+ self .mass_spectrum = copy .deepcopy (mass_spectrum )
5761 self .plot = plot
5862 self .docker = docker
5963 self .ppmFWHMprior = ppmFWHMprior
6064 self .ppmRangeprior = ppmRangeprior
6165
62-
6366 def set_uncal_settings (self ):
64- """ Set uncalibrated formula search settings
67+ """Set uncalibrated formula search settings
6568
6669 This function serves the uncalibrated data (hence broad error tolerance)
6770 It only allows CHO formula in deprotonated ion type- as most common for SRFA ESI negative mode
6871
6972 This will not work for positive mode data, or for other ion types, or other expected elemental searches.
7073
7174 """
72- #TODO rework this.
75+ # TODO rework this.
7376
7477 if self .docker :
7578 self .mass_spectrum .molecular_search_settings .url_database = "postgresql+psycopg2://coremsappdb:coremsapppnnl@localhost:5432/coremsapp"
7679 else :
7780 self .mass_spectrum .molecular_search_settings .url_database = None
7881 self .mass_spectrum .molecular_search_settings .error_method = None
79- self .mass_spectrum .molecular_search_settings .score_method = ' prob_score'
82+ self .mass_spectrum .molecular_search_settings .score_method = " prob_score"
8083
81- self .mass_spectrum .molecular_search_settings .min_ppm_error = - 1 * self .ppmRangeprior / 2 #-7.5
82- self .mass_spectrum .molecular_search_settings .max_ppm_error = self .ppmRangeprior / 2 #7.5
84+ self .mass_spectrum .molecular_search_settings .min_ppm_error = (
85+ - 1 * self .ppmRangeprior / 2
86+ ) # -7.5
87+ self .mass_spectrum .molecular_search_settings .max_ppm_error = (
88+ self .ppmRangeprior / 2
89+ ) # 7.5
8390
8491 self .mass_spectrum .molecular_search_settings .min_dbe = 0
8592 self .mass_spectrum .molecular_search_settings .max_dbe = 50
86-
93+
8794 self .mass_spectrum .molecular_search_settings .use_isotopologue_filter = False
8895 self .mass_spectrum .molecular_search_settings .min_abun_error = - 30
8996 self .mass_spectrum .molecular_search_settings .max_abun_error = 70
90-
97+
9198 self .mass_spectrum .molecular_search_settings .use_min_peaks_filter = True
92- self .mass_spectrum .molecular_search_settings .min_peaks_per_class = 10 #default is 15
99+ self .mass_spectrum .molecular_search_settings .min_peaks_per_class = (
100+ 10 # default is 15
101+ )
93102
94- self .mass_spectrum .molecular_search_settings .usedAtoms ['C' ] = (1 ,90 )
95- self .mass_spectrum .molecular_search_settings .usedAtoms ['H' ] = (4 ,200 )
96- self .mass_spectrum .molecular_search_settings .usedAtoms ['O' ] = (1 ,23 )
97- self .mass_spectrum .molecular_search_settings .usedAtoms ['N' ] = (0 ,0 )
98- self .mass_spectrum .molecular_search_settings .usedAtoms ['S' ] = (0 ,0 )
99- self .mass_spectrum .molecular_search_settings .usedAtoms ['P' ] = (0 ,0 )
103+ self .mass_spectrum .molecular_search_settings .usedAtoms ["C" ] = (1 , 90 )
104+ self .mass_spectrum .molecular_search_settings .usedAtoms ["H" ] = (4 , 200 )
105+ self .mass_spectrum .molecular_search_settings .usedAtoms ["O" ] = (1 , 23 )
106+ self .mass_spectrum .molecular_search_settings .usedAtoms ["N" ] = (0 , 0 )
107+ self .mass_spectrum .molecular_search_settings .usedAtoms ["S" ] = (0 , 0 )
108+ self .mass_spectrum .molecular_search_settings .usedAtoms ["P" ] = (0 , 0 )
100109
101110 self .mass_spectrum .molecular_search_settings .isProtonated = True
102- self .mass_spectrum .molecular_search_settings .isRadical = False
111+ self .mass_spectrum .molecular_search_settings .isRadical = False
103112 self .mass_spectrum .molecular_search_settings .isAdduct = False
104113
105114 def positive_search_settings (self ):
106- """ Set the positive mode elemental search settings
107- """
115+ """Set the positive mode elemental search settings"""
108116 self .mass_spectrum .molecular_search_settings .isProtonated = False
109117 self .mass_spectrum .molecular_search_settings .isAdduct = True
110- self .mass_spectrum .molecular_search_settings .adduct_atoms_pos = ['Na' ]
118+ self .mass_spectrum .molecular_search_settings .adduct_atoms_pos = ["Na" ]
111119
112120 @staticmethod
113- def get_error_range (errors : list , ppmFWHMprior : float = 3 , plot_logic : bool = False ):
114- """ Get the error range from the error distribution
121+ def get_error_range (
122+ errors : list , ppmFWHMprior : float = 3 , plot_logic : bool = False
123+ ):
124+ """Get the error range from the error distribution
115125
116- Using lmfit and seaborn kdeplot to extract the error range from the error distribution of assigned species.
126+ Using lmfit and seaborn kdeplot to extract the error range from the error distribution of assigned species.
117127
118128 Parameters
119129 ----------
@@ -123,7 +133,7 @@ def get_error_range(errors: list, ppmFWHMprior: float=3, plot_logic: bool=False)
123133 The FWHM of the prior distribution (ppm). The default is 3.
124134 plot_logic : bool, optional
125135 Whether to plot the error distribution. The default is False.
126-
136+
127137 Returns
128138 -------
129139 mean_error : float
@@ -133,49 +143,51 @@ def get_error_range(errors: list, ppmFWHMprior: float=3, plot_logic: bool=False)
133143 ppm_thresh : list
134144 recommended thresholds for the recalibration parameters (ppm)
135145 Consists of [mean_error-fwhm_error,mean_error+fwhm_error]
136-
146+
137147 """
138- kde = sns .kdeplot (errors )
148+ kde = sns .kdeplot (errors )
139149
140150 kde_data = kde .get_lines ()[0 ].get_data ()
141-
142- tmpdf = pd .Series (index = kde_data [0 ],data = kde_data [1 ])
151+
152+ tmpdf = pd .Series (index = kde_data [0 ], data = kde_data [1 ])
143153 kde_apex_ppm = tmpdf .idxmax ()
144154 kde_apex_val = tmpdf .max ()
145155
146156 plt .close (kde .figure )
147- plt .close (' all' )
148-
157+ plt .close (" all" )
158+
149159 lmmodel = GaussianModel ()
150160 lmpars = lmmodel .guess (kde_data [1 ], x = kde_data [0 ])
151- lmpars [' sigma' ].value = 2.3548 / ppmFWHMprior
152- lmpars [' center' ].value = kde_apex_ppm
153- lmpars [' amplitude' ].value = kde_apex_val
161+ lmpars [" sigma" ].value = 2.3548 / ppmFWHMprior
162+ lmpars [" center" ].value = kde_apex_ppm
163+ lmpars [" amplitude" ].value = kde_apex_val
154164 lmout = lmmodel .fit (kde_data [1 ], lmpars , x = kde_data [0 ])
155-
165+
156166 if plot_logic :
157- fig ,ax = plt .subplots (figsize = (8 ,4 ))
158- lmout .plot_fit (ax = ax ,data_kws = {'color' :'tab:blue' },fit_kws = {'color' :'tab:red' })
159- ax .set_xlabel ('$m/z$ Error (ppm)' )
160- ax .set_ylabel ('Density' )
161- plt .legend (facecolor = 'white' , framealpha = 0 )
162-
163- mean_error = lmout .best_values ['center' ]
164- std_error = lmout .best_values ['sigma' ]
167+ fig , ax = plt .subplots (figsize = (8 , 4 ))
168+ lmout .plot_fit (
169+ ax = ax , data_kws = {"color" : "tab:blue" }, fit_kws = {"color" : "tab:red" }
170+ )
171+ ax .set_xlabel ("$m/z$ Error (ppm)" )
172+ ax .set_ylabel ("Density" )
173+ plt .legend (facecolor = "white" , framealpha = 0 )
174+
175+ mean_error = lmout .best_values ["center" ]
176+ std_error = lmout .best_values ["sigma" ]
165177 # FWHM from Sigma = approx. 2.355*sigma
166- #fwhm_error = 2*np.sqrt(2*np.log(2))*std_error
167- fwhm_error = std_error * np .sqrt (8 * np .log (2 ))
168-
169- ppm_thresh = [mean_error - fwhm_error ,mean_error + fwhm_error ]
170- return mean_error ,fwhm_error ,ppm_thresh
171-
178+ # fwhm_error = 2*np.sqrt(2*np.log(2))*std_error
179+ fwhm_error = std_error * np .sqrt (8 * np .log (2 ))
180+
181+ ppm_thresh = [mean_error - fwhm_error , mean_error + fwhm_error ]
182+ return mean_error , fwhm_error , ppm_thresh
183+
172184 def determine_error_boundaries (self ):
173- """ Determine the error boundaries for recalibration space
185+ """Determine the error boundaries for recalibration space
174186
175187 This is the main function in this class
176188 Sets the Molecular Formulas search settings, performs the initial formula search
177189 Converts the data to a dataframe, and gets the error range
178- Returns the error thresholds.
190+ Returns the error thresholds.
179191
180192 Returns
181193 -------
@@ -187,8 +199,8 @@ def determine_error_boundaries(self):
187199 recommended thresholds for the recalibration parameters (ppm)
188200 Consists of [mean_error-fwhm_error,mean_error+fwhm_error]
189201 """
190-
191- # Set the search settings
202+
203+ # Set the search settings
192204 self .set_uncal_settings ()
193205
194206 # Set the positive mode settings
@@ -197,24 +209,26 @@ def determine_error_boundaries(self):
197209 self .positive_search_settings ()
198210
199211 # Search MFs
200- SearchMolecularFormulas (self .mass_spectrum , first_hit = True ).run_worker_mass_spectrum ()
201-
202-
212+ SearchMolecularFormulas (
213+ self .mass_spectrum , first_hit = True
214+ ).run_worker_mass_spectrum ()
215+
203216 # Exporting to a DF is ~30x slower than just getting the errors, so this is fast.
204217 errors = []
205218 for mspeak in self .mass_spectrum .mspeaks :
206- if len (mspeak .molecular_formulas )> 0 :
219+ if len (mspeak .molecular_formulas ) > 0 :
207220 errors .append (mspeak .best_molecular_formula_candidate .mz_error )
208221
209-
210222 # If there are NO assignments, it'll fail on the next step. Need to check for that
211223 nassign = len (errors )
212224 # Here we say at least 5 features assigned are needed - it probably should be greater, but we are just trying to stop it breaking the code
213225 # We want to make sure the spectrum is capture in the database though - so we return the stats entries (0 assignments) and the number of assignments
214- if nassign < 5 :
226+ if nassign < 5 :
215227 if self .mass_spectrum .parameters .mass_spectrum .verbose_processing :
216228 print ("fewer than 5 peaks assigned, cannot determine error range" )
217- return np .nan ,np .nan ,[np .nan ,np .nan ]
229+ return np .nan , np .nan , [np .nan , np .nan ]
218230 else :
219- mean_error ,fwhm_error ,ppm_thresh = self .get_error_range (errors , self .ppmFWHMprior , self .plot )
220- return mean_error ,fwhm_error ,ppm_thresh
231+ mean_error , fwhm_error , ppm_thresh = self .get_error_range (
232+ errors , self .ppmFWHMprior , self .plot
233+ )
234+ return mean_error , fwhm_error , ppm_thresh
0 commit comments