Merge branch 'masscalibration2024' into 'master'

corilo · corilo · commit c868a8c23553 · 2024-08-08T19:34:32.000Z
Masscalibration2024

See merge request mass-spectrometry/corems!117
diff --git a/.bumpversion.cfg b/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 2.0.9
+current_version = 2.0.10
 commit = False
 tag = False
 
diff --git a/README.md b/README.md
@@ -50,7 +50,7 @@ CoreMS aims to provide
 
 ## Current Version
 
- `2.0.9`
+ `2.0.10`
 
 ***
 
@@ -323,7 +323,7 @@ UML (unified modeling language) diagrams for Direct Infusion FT-MS and GC-MS cla
 
 If you use CoreMS in your work, please use the following citation:
 
-Version [2.0.9 Release on GitHub](https://github.com/EMSL-Computing/CoreMS/releases/tag/v2.0.9), archived on Zenodo:  
+Version [2.0.10 Release on GitHub](https://github.com/EMSL-Computing/CoreMS/releases/tag/v2.0.10), archived on Zenodo:  
 
 [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.4641552.svg)](https://doi.org/10.5281/zenodo.4641552)
 
diff --git a/corems/__init__.py b/corems/__init__.py
@@ -1,5 +1,5 @@
 __author__ = 'Yuri E. Corilo'
-__version__ = '2.0.9'
+__version__ = '2.0.10'
 __doc__ = '''
 <div align="left">
 
diff --git a/corems/encapsulation/factory/processingSetting.py b/corems/encapsulation/factory/processingSetting.py
@@ -135,7 +135,7 @@ class LiquidChromatographSetting:
         0-100 % used for extracted ion chromatogram peak detection. Default is 0.01.
     """
    
-    scans: list | tuple = (0, 1)
+    scans: list | tuple = (-1,-1)
     
     eic_tolerance_ppm: float = 5
     
@@ -224,6 +224,10 @@ class MassSpectrumSetting:
         Minimum ppm error to use for calibration. Default is -1.0.
     calib_sn_threshold : float, optional
         Signal to noise threshold to use for calibration. Default is 2.0.
+    calibration_ref_match_method : str, optional
+        Method for matching reference masses with measured masses for recalibration, either 'legacy' or 'merged'. Default is 'legacy'.
+    calibration_ref_match_tolerance : float, optional
+        Initial m/z matching tolerance used when calibration_ref_match_method is 'merged'. Default is 0.003.
     do_calibration : bool, optional
         If True, perform calibration. Default is True.    
     """
@@ -259,6 +263,10 @@ class MassSpectrumSetting:
     max_calib_ppm_error: float = 1.0
     min_calib_ppm_error: float = -1.0
     calib_sn_threshold: float = 2.0
+    calibration_ref_match_method: str = 'legacy'
+    calibration_ref_match_method_implemented: tuple = ('legacy', 'merged')
+    calibration_ref_match_tolerance: float = 0.003
+    calibration_ref_match_std_raw_error_limit: float = 1.5
     #calib_ref_mzs: list = [0]
 
     do_calibration: bool = True
diff --git a/corems/mass_spectrum/calc/Calibration.py b/corems/mass_spectrum/calc/Calibration.py
@@ -116,7 +116,7 @@ def load_ref_mass_list(self):
                                 3: 'Form2'
                                 }, axis=1)
 
-        df_ref.sort_values(by='m/z', ascending=False)
+        df_ref.sort_values(by='m/z', ascending=True,inplace=True)
         print("Reference mass list loaded - " + str(len(df_ref)) + " calibration masses loaded.")
 
         return df_ref
@@ -150,7 +150,10 @@ def gen_ref_mass_list_from_assigned(self, min_conf : float=0.7):
 
     def find_calibration_points(self, df_ref,
                                 calib_ppm_error_threshold : tuple[float, float]=(-1, 1),
-                                calib_snr_threshold : float=5):
+                                calib_snr_threshold : float=5,
+                                calibration_ref_match_method : str='legacy',
+                                calibration_ref_match_tolerance : float=0.003,
+                                calibration_ref_match_std_raw_error_limit: float=1.5):
         """Function to find calibration points in the mass spectrum 
         
         Based on the reference mass list.
@@ -186,23 +189,49 @@ def find_calibration_points(self, df_ref,
                     peaks_mz.append(x.mz_exp)
         peaks_mz = np.asarray(peaks_mz)
         
-        cal_peaks_mz = []
-        cal_refs_mz = []
-        for mzref in df_ref['m/z']:
-            tmp_peaks_mz = peaks_mz[abs(peaks_mz-mzref)<1]
-            for mzmeas in tmp_peaks_mz:
-                delta_mass = ((mzmeas-mzref)/mzref)*1e6
-                if delta_mass < max(calib_ppm_error_threshold):
-                    if delta_mass > min(calib_ppm_error_threshold):
-                        cal_peaks_mz.append(mzmeas)
-                        cal_refs_mz.append(mzref)
-
-        # To remove entries with duplicated indices (reference masses matching multiple peaks)
-        tmpdf = pd.Series(index = cal_refs_mz,data = cal_peaks_mz,dtype=float)
-        tmpdf = tmpdf[~tmpdf.index.duplicated(keep=False)]
-
-        cal_peaks_mz = list(tmpdf.values)
-        cal_refs_mz = list(tmpdf.index)
+        if calibration_ref_match_method == 'legacy':
+            # This legacy approach iterates through each reference match and finds the entries within 1 mz and within the user defined PPM error threshold
+            # Then it removes ambiguities - which means the calibration threshold has to be very tight.
+            cal_peaks_mz = []
+            cal_refs_mz = []
+            for mzref in df_ref['m/z']:
+                tmp_peaks_mz = peaks_mz[abs(peaks_mz-mzref)<1]
+                for mzmeas in tmp_peaks_mz:
+                    delta_mass = ((mzmeas-mzref)/mzref)*1e6
+                    if delta_mass < max(calib_ppm_error_threshold):
+                        if delta_mass > min(calib_ppm_error_threshold):
+                            cal_peaks_mz.append(mzmeas)
+                            cal_refs_mz.append(mzref)
+
+            # To remove entries with duplicated indices (reference masses matching multiple peaks)
+            tmpdf = pd.Series(index = cal_refs_mz,data = cal_peaks_mz,dtype=float)
+            tmpdf = tmpdf[~tmpdf.index.duplicated(keep=False)]
+
+            cal_peaks_mz = list(tmpdf.values)
+            cal_refs_mz = list(tmpdf.index)
+        elif calibration_ref_match_method == 'merged':
+            print('Using experimental new reference mass list merging')
+            # This is a new approach (August 2024) which uses pandas 'merge_asof' to find the peaks closest in m/z between
+            # reference and measured masses. This is a quicker way to match, and seems to get more matches.
+            # It may not work as well when the data are far from the correct initial mass,
+            # e.g. if the correct peak is further from the reference than an incorrect peak.
+            meas_df = pd.DataFrame(columns=['meas_m/z'],data = peaks_mz)
+            tolerance = calibration_ref_match_tolerance
+            merged_df = pd.merge_asof(df_ref, meas_df, left_on='m/z', right_on = 'meas_m/z',tolerance=tolerance,direction='nearest')
+            merged_df.dropna(how='any',inplace=True)
+            merged_df['Error_ppm'] = ((merged_df['meas_m/z']-merged_df['m/z'])/merged_df['m/z'])*1e6
+            median_raw_error = merged_df['Error_ppm'].median()
+            std_raw_error = merged_df['Error_ppm'].std()
+            if std_raw_error > calibration_ref_match_std_raw_error_limit:
+                std_raw_error = calibration_ref_match_std_raw_error_limit
+            self.mass_spectrum.calibration_raw_error_median = median_raw_error
+            self.mass_spectrum.calibration_raw_error_stdev = std_raw_error
+            merged_df= merged_df[(merged_df['Error_ppm']>(median_raw_error-1.5*std_raw_error))&(merged_df['Error_ppm']<(median_raw_error+1.5*std_raw_error))]
+            #merged_df= merged_df[(merged_df['Error_ppm']>min(calib_ppm_error_threshold))&(merged_df['Error_ppm']<max(calib_ppm_error_threshold))]
+            cal_peaks_mz = list(merged_df['meas_m/z'])
+            cal_refs_mz = list(merged_df['m/z'])   
+        else:
+            print(f'{calibration_ref_match_method} not allowed.')
 
         if False:
             min_calib_ppm_error = calib_ppm_error_threshold[0]
@@ -420,6 +449,9 @@ def run(self):
         max_calib_ppm_error = self.mass_spectrum.settings.max_calib_ppm_error
         min_calib_ppm_error = self.mass_spectrum.settings.min_calib_ppm_error
         calib_pol_order = self.mass_spectrum.settings.calib_pol_order
+        calibration_ref_match_method = self.mass_spectrum.settings.calibration_ref_match_method
+        calibration_ref_match_tolerance = self.mass_spectrum.settings.calibration_ref_match_tolerance
+        calibration_ref_match_std_raw_error_limit = self.mass_spectrum.settings.calibration_ref_match_std_raw_error_limit
 
         # load reference mass list
         df_ref = self.load_ref_mass_list()
@@ -428,7 +460,10 @@ def run(self):
         cal_peaks_mz, cal_refs_mz = self.find_calibration_points(df_ref,
                                                        calib_ppm_error_threshold=(min_calib_ppm_error,
                                                                                   max_calib_ppm_error),
-                                                       calib_snr_threshold=calib_ppm_error_threshold)
+                                                       calib_snr_threshold=calib_ppm_error_threshold,
+                                                       calibration_ref_match_method = calibration_ref_match_method,
+                                                       calibration_ref_match_tolerance = calibration_ref_match_tolerance,
+                                                       calibration_ref_match_std_raw_error_limit = calibration_ref_match_std_raw_error_limit)
         if len(cal_peaks_mz)==2:
             self.mass_spectrum.settings.calib_pol_order = 1
             calib_pol_order = 1
diff --git a/corems/mass_spectrum/factory/MassSpectrumClasses.py b/corems/mass_spectrum/factory/MassSpectrumClasses.py
@@ -119,6 +119,8 @@ def __init__(self, mz_exp, abundance, d_params, **kwargs):
         self.calibration_points = None
         self.calibration_RMS = None
         self.calibration_segment = None
+        self.calibration_raw_error_median = None
+        self.calibration_raw_error_stdev = None
 
     def _init_settings(self):
         """Initializes the settings for the mass spectrum."""
diff --git a/examples/notebooks/Mass_Recalibration_Example.ipynb b/examples/notebooks/Mass_Recalibration_Example.ipynb
diff --git a/setup.py b/setup.py
@@ -14,7 +14,7 @@
 # This call to setup() does all the work
 setup(
     name="CoreMS",
-    version="2.0.9",
+    version="2.0.10",
     description="Mass Spectrometry Framework for Small Molecules Analysis",
     long_description=long_description,
     long_description_content_type="text/markdown",