@@ -116,7 +116,7 @@ def load_ref_mass_list(self):
116116 3 : 'Form2'
117117 }, axis = 1 )
118118
119- df_ref .sort_values (by = 'm/z' , ascending = False )
119+ df_ref .sort_values (by = 'm/z' , ascending = True , inplace = True )
120120 print ("Reference mass list loaded - " + str (len (df_ref )) + " calibration masses loaded." )
121121
122122 return df_ref
@@ -150,7 +150,10 @@ def gen_ref_mass_list_from_assigned(self, min_conf : float=0.7):
150150
151151 def find_calibration_points (self , df_ref ,
152152 calib_ppm_error_threshold : tuple [float , float ]= (- 1 , 1 ),
153- calib_snr_threshold : float = 5 ):
153+ calib_snr_threshold : float = 5 ,
154+ calibration_ref_match_method : str = 'legacy' ,
155+ calibration_ref_match_tolerance : float = 0.003 ,
156+ calibration_ref_match_std_raw_error_limit : float = 1.5 ):
154157 """Function to find calibration points in the mass spectrum
155158
156159 Based on the reference mass list.
@@ -186,23 +189,49 @@ def find_calibration_points(self, df_ref,
186189 peaks_mz .append (x .mz_exp )
187190 peaks_mz = np .asarray (peaks_mz )
188191
189- cal_peaks_mz = []
190- cal_refs_mz = []
191- for mzref in df_ref ['m/z' ]:
192- tmp_peaks_mz = peaks_mz [abs (peaks_mz - mzref )< 1 ]
193- for mzmeas in tmp_peaks_mz :
194- delta_mass = ((mzmeas - mzref )/ mzref )* 1e6
195- if delta_mass < max (calib_ppm_error_threshold ):
196- if delta_mass > min (calib_ppm_error_threshold ):
197- cal_peaks_mz .append (mzmeas )
198- cal_refs_mz .append (mzref )
199-
200- # To remove entries with duplicated indices (reference masses matching multiple peaks)
201- tmpdf = pd .Series (index = cal_refs_mz ,data = cal_peaks_mz ,dtype = float )
202- tmpdf = tmpdf [~ tmpdf .index .duplicated (keep = False )]
203-
204- cal_peaks_mz = list (tmpdf .values )
205- cal_refs_mz = list (tmpdf .index )
192+ if calibration_ref_match_method == 'legacy' :
193+ # This legacy approach iterates through each reference match and finds the entries within 1 mz and within the user defined PPM error threshold
194+ # Then it removes ambiguities - which means the calibration threshold has to be very tight.
195+ cal_peaks_mz = []
196+ cal_refs_mz = []
197+ for mzref in df_ref ['m/z' ]:
198+ tmp_peaks_mz = peaks_mz [abs (peaks_mz - mzref )< 1 ]
199+ for mzmeas in tmp_peaks_mz :
200+ delta_mass = ((mzmeas - mzref )/ mzref )* 1e6
201+ if delta_mass < max (calib_ppm_error_threshold ):
202+ if delta_mass > min (calib_ppm_error_threshold ):
203+ cal_peaks_mz .append (mzmeas )
204+ cal_refs_mz .append (mzref )
205+
206+ # To remove entries with duplicated indices (reference masses matching multiple peaks)
207+ tmpdf = pd .Series (index = cal_refs_mz ,data = cal_peaks_mz ,dtype = float )
208+ tmpdf = tmpdf [~ tmpdf .index .duplicated (keep = False )]
209+
210+ cal_peaks_mz = list (tmpdf .values )
211+ cal_refs_mz = list (tmpdf .index )
212+ elif calibration_ref_match_method == 'merged' :
213+ print ('Using experimental new reference mass list merging' )
214+ # This is a new approach (August 2024) which uses Pandas 'merge_asof' to find the peaks closest in m/z between
215+ # reference and measured masses. This is a quicker way to match, and seems to get more matches.
216+ # It may not work as well when the data are far from the correct initial mass
217+ # e.g. if the correct peak is further from the reference than an incorrect peak.
218+ meas_df = pd .DataFrame (columns = ['meas_m/z' ],data = peaks_mz )
219+ tolerance = calibration_ref_match_tolerance
220+ merged_df = pd .merge_asof (df_ref , meas_df , left_on = 'm/z' , right_on = 'meas_m/z' ,tolerance = tolerance ,direction = 'nearest' )
221+ merged_df .dropna (how = 'any' ,inplace = True )
222+ merged_df ['Error_ppm' ] = ((merged_df ['meas_m/z' ]- merged_df ['m/z' ])/ merged_df ['m/z' ])* 1e6
223+ median_raw_error = merged_df ['Error_ppm' ].median ()
224+ std_raw_error = merged_df ['Error_ppm' ].std ()
225+ if std_raw_error > calibration_ref_match_std_raw_error_limit :
226+ std_raw_error = calibration_ref_match_std_raw_error_limit
227+ self .mass_spectrum .calibration_raw_error_median = median_raw_error
228+ self .mass_spectrum .calibration_raw_error_stdev = std_raw_error
229+ merged_df = merged_df [(merged_df ['Error_ppm' ]> (median_raw_error - 1.5 * std_raw_error ))& (merged_df ['Error_ppm' ]< (median_raw_error + 1.5 * std_raw_error ))]
230+ #merged_df= merged_df[(merged_df['Error_ppm']>min(calib_ppm_error_threshold))&(merged_df['Error_ppm']<max(calib_ppm_error_threshold))]
231+ cal_peaks_mz = list (merged_df ['meas_m/z' ])
232+ cal_refs_mz = list (merged_df ['m/z' ])
233+ else :
234+ print (f'{ calibration_ref_match_method } not allowed.' )
206235
207236 if False :
208237 min_calib_ppm_error = calib_ppm_error_threshold [0 ]
@@ -420,6 +449,9 @@ def run(self):
420449 max_calib_ppm_error = self .mass_spectrum .settings .max_calib_ppm_error
421450 min_calib_ppm_error = self .mass_spectrum .settings .min_calib_ppm_error
422451 calib_pol_order = self .mass_spectrum .settings .calib_pol_order
452+ calibration_ref_match_method = self .mass_spectrum .settings .calibration_ref_match_method
453+ calibration_ref_match_tolerance = self .mass_spectrum .settings .calibration_ref_match_tolerance
454+ calibration_ref_match_std_raw_error_limit = self .mass_spectrum .settings .calibration_ref_match_std_raw_error_limit
423455
424456 # load reference mass list
425457 df_ref = self .load_ref_mass_list ()
@@ -428,7 +460,10 @@ def run(self):
428460 cal_peaks_mz , cal_refs_mz = self .find_calibration_points (df_ref ,
429461 calib_ppm_error_threshold = (min_calib_ppm_error ,
430462 max_calib_ppm_error ),
431- calib_snr_threshold = calib_ppm_error_threshold )
463+ calib_snr_threshold = calib_ppm_error_threshold ,
464+ calibration_ref_match_method = calibration_ref_match_method ,
465+ calibration_ref_match_tolerance = calibration_ref_match_tolerance ,
466+ calibration_ref_match_std_raw_error_limit = calibration_ref_match_std_raw_error_limit )
432467 if len (cal_peaks_mz )== 2 :
433468 self .mass_spectrum .settings .calib_pol_order = 1
434469 calib_pol_order = 1
0 commit comments