EMSL-Computing
diff --git a/‎.bumpversion.cfg‎
Lines changed: 1 addition & 1 deletion b/‎.bumpversion.cfg‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎CONTRIBUTING.md‎
Lines changed: 7 additions & 3 deletions b/‎CONTRIBUTING.md‎
Lines changed: 7 additions & 3 deletions
diff --git a/‎Makefile‎
Lines changed: 3 additions & 0 deletions b/‎Makefile‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 3 additions & 3 deletions b/‎README.md‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎corems/__init__.py‎
Lines changed: 1 addition & 1 deletion b/‎corems/__init__.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎corems/mass_spectrum/input/massList.py‎
Lines changed: 27 additions & 22 deletions b/‎corems/mass_spectrum/input/massList.py‎
Lines changed: 27 additions & 22 deletions
diff --git a/‎corems/molecular_id/search/database_interfaces.py‎
Lines changed: 68 additions & 14 deletions b/‎corems/molecular_id/search/database_interfaces.py‎
Lines changed: 68 additions & 14 deletions
diff --git a/‎corems/molecular_id/search/lcms_spectral_search.py‎
Lines changed: 1 addition & 1 deletion b/‎corems/molecular_id/search/lcms_spectral_search.py‎
Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 3.1.0
+current_version = 3.2.0
 commit = False
 tag = False
 
 
@@ -5,6 +5,7 @@ Thank you for considering contributing to CoreMS! We appreciate your interest in
 ## Table of Contents
 
 - [Getting Started](#getting-started)
+- [Versioning](#versioning)
 - [Merge Request Checklist](#merge-request-checklist)
 - [Code Style](#code-style)
 - [Issue Reporting](#issue-reporting)
@@ -19,9 +20,12 @@ To get started with contributing to CoreMS, please follow these steps:
 3. Install the necessary dependencies. Refer to the [README](./README.md) for detailed installation instructions.
 4. Make your changes or additions.
 5. Test your changes thoroughly.
-6. Re-render documenation using the following `pdoc --o docs --d numpy corems`. Note that pdoc versioning is part of the requirements-dev.txt.
-7. Commit your changes and push them to your forked repository. Reference your original issue in your commits (i.e. closes #23)
-8. Submit a merge request to the main CoreMS repository and select an appropriate reviewer for the changes. Note the merge request checklist below that will be checked before each merge into the master branch. See the merge request checkliist
+6. Commit your changes and push them to your forked repository. Reference your original issue in your commits (e.g. closes #23).
+7. Submit a merge request to the main CoreMS repository and select an appropriate reviewer for the changes. Note the merge request checklist below that will be checked before each merge into the master branch. See the [merge request checklist](#merge-request-checklist).
+
+## Versioning
+
+We strive to use semantic versioning. To bump the version and regenerate the documentation, run one of the following make commands, according to which part of the version number is changing: `make major`, `make minor`, or `make patch`. This should accompany each PyPI release.
 
 ## Merge Request Checklist
 
 
@@ -14,14 +14,17 @@ mem:
 major:
 
 	@bumpversion major --allow-dirty
+	@$(MAKE) docu
 
 minor:
 
 	@bumpversion minor --allow-dirty
+	@$(MAKE) docu
 
 patch:
 
 	@bumpversion patch --allow-dirty
+	@$(MAKE) docu
 
 pypi_test:
 	@rm -rf build dist *.egg-info
 
@@ -49,7 +49,7 @@ CoreMS aims to provide
 
 ## Current Version
 
- `3.1.0`
+ `3.2.0`
 
 ***
 
@@ -335,11 +335,11 @@ UML (unified modeling language) diagrams for Direct Infusion FT-MS and GC-MS cla
 
 If you use CoreMS in your work, please use the following citation:
 
-Version [3.1.0 Release on GitHub](https://github.com/EMSL-Computing/CoreMS/releases/tag/v3.1.0), archived on Zenodo:  
+Version [3.2.0 Release on GitHub](https://github.com/EMSL-Computing/CoreMS/releases/tag/v3.2.0), archived on Zenodo:  
 
 [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.14009575.svg)](https://doi.org/10.5281/zenodo.14009575)
 
-Yuri E. Corilo, William R. Kew, Lee Ann McCue, Katherine R . Heal, James C. Carr (2024, October 29). EMSL-Computing/CoreMS: CoreMS 3.1.0 (Version v3.1.0), as developed on Github. Zenodo. http://doi.org/10.5281/zenodo.14009575
+Yuri E. Corilo, William R. Kew, Lee Ann McCue, Katherine R. Heal, James C. Carr (2024, October 29). EMSL-Computing/CoreMS: CoreMS 3.2.0 (Version v3.2.0), as developed on Github. Zenodo. http://doi.org/10.5281/zenodo.14009575
 
 ```
 
 
@@ -1,5 +1,5 @@
 __author__ = "Yuri E. Corilo"
-__version__ = "3.1.0"
+__version__ = "3.2.0"
 import time
 import os
 import sys
 
@@ -106,6 +106,7 @@ def add_molecular_formula(self, mass_spec_obj, dataframe):
         mass_spec_mz_exp_list = mass_spec_obj.mz_exp
 
         for df_index, mz_exp in enumerate(mz_exp_df):
+            bad_mf = False
             counts = 0
 
             ms_peak_index = list(mass_spec_mz_exp_list).index(float(mz_exp))
@@ -200,28 +201,32 @@ def add_molecular_formula(self, mass_spec_obj, dataframe):
                                     matched_isos.append(iso)
 
                     if len(matched_isos) == 0:
-                        raise ValueError("No isotopologue matched the formula_dict")
-                    mfobj = matched_isos[0]
-
-                    # Add the mono isotopic index, confidence score and isotopologue similarity
-                    mfobj.mspeak_index_mono_isotopic = int(
-                        dataframe.iloc[df_index]["Mono Isotopic Index"]
-                    )
-
-                # Add the confidence score and isotopologue similarity and average MZ error score
-                if "m/z Error Score" in dataframe:
-                    mfobj._mass_error_average_score = float(
-                        dataframe.iloc[df_index]["m/z Error Score"]
-                    )
-                if "Confidence Score" in dataframe:
-                    mfobj._confidence_score = float(
-                        dataframe.iloc[df_index]["Confidence Score"]
-                    )
-                if "Isotopologue Similarity" in dataframe:
-                    mfobj._isotopologue_similarity = float(
-                        dataframe.iloc[df_index]["Isotopologue Similarity"]
-                    )
-                mass_spec_obj[ms_peak_index].add_molecular_formula(mfobj)
+                        # FIXME: This should not occur; see https://code.emsl.pnl.gov/mass-spectrometry/corems/-/issues/190
+                        warnings.warn(f"No isotopologue matched the formula_dict: {formula_dict}")
+                        bad_mf = True
+                    else:
+                        bad_mf = False                   
+                        mfobj = matched_isos[0]
+
+                        # Add the mono isotopic index, confidence score and isotopologue similarity
+                        mfobj.mspeak_index_mono_isotopic = int(
+                            dataframe.iloc[df_index]["Mono Isotopic Index"]
+                        )
+                if not bad_mf:
+                    # Add the confidence score and isotopologue similarity and average MZ error score
+                    if "m/z Error Score" in dataframe:
+                        mfobj._mass_error_average_score = float(
+                            dataframe.iloc[df_index]["m/z Error Score"]
+                        )
+                    if "Confidence Score" in dataframe:
+                        mfobj._confidence_score = float(
+                            dataframe.iloc[df_index]["Confidence Score"]
+                        )
+                    if "Isotopologue Similarity" in dataframe:
+                        mfobj._isotopologue_similarity = float(
+                            dataframe.iloc[df_index]["Isotopologue Similarity"]
+                        )
+                    mass_spec_obj[ms_peak_index].add_molecular_formula(mfobj)
 
 
 class ReadMassList(MassListBaseClass):
 
@@ -646,12 +646,14 @@ def __init__(self):
         super().__init__()
 
         # API endpoint for precursor m/z search
+        # inputs = mz, tolerance (in Da), polarity, page_no, per_page
         self.PRECURSOR_MZ_URL = (
-            "https://metabref.emsl.pnnl.gov/api/precursors/m/{}/t/{}/{}"
+            "https://metabref.emsl.pnnl.gov/api/precursors/m/{}/t/{}/{}?page={}&per_page={}"
         )
 
         # API endpoint for returning full list of precursor m/z values in database
-        self.PRECURSOR_MZ_ALL_URL = "https://metabref.emsl.pnnl.gov/api/precursors/{}"
+        # inputs = polarity, page_no, per_page
+        self.PRECURSOR_MZ_ALL_URL = "https://metabref.emsl.pnnl.gov/api/precursors/{}?page={}&per_page={}"
 
         self.__init_format_map__()
 
@@ -674,7 +676,7 @@ def __init_format_map__(self):
         self.format_map["fe"] = self.format_map["flashentropy"]
         self.format_map["flash-entropy"] = self.format_map["flashentropy"]
 
-    def query_by_precursor(self, mz_list, polarity, mz_tol_ppm, mz_tol_da_api=0.2):
+    def query_by_precursor(self, mz_list, polarity, mz_tol_ppm, mz_tol_da_api=0.2, max_per_page=50):
         """
         Query MetabRef by precursor m/z values.
 
@@ -690,6 +692,8 @@ def query_by_precursor(self, mz_list, polarity, mz_tol_ppm, mz_tol_da_api=0.2):
         mz_tol_da_api : float, optional
             Maximum tolerance between precursor m/z values for API search, in daltons.
             Used to group similar mzs into a single API query for speed. Default is 0.2.
+        max_per_page : int, optional
+            Maximum records to return from MetabRef API query at a time.  Default is 50.
 
         Returns
         -------
@@ -705,7 +709,7 @@ def query_by_precursor(self, mz_list, polarity, mz_tol_ppm, mz_tol_da_api=0.2):
         mz_list.sort()
         mz_groups = [[mz_list[0]]]
         for x in mz_list[1:]:
-            if abs(x - mz_groups[-1][-1]) <= mz_tol_da_api:
+            if abs(x - mz_groups[-1][0]) <= mz_tol_da_api:
                 mz_groups[-1].append(x)
             else:
                 mz_groups.append([x])
@@ -722,32 +726,59 @@ def query_by_precursor(self, mz_list, polarity, mz_tol_ppm, mz_tol_da_api=0.2):
                 tol = (max(mz_group) - min(mz_group)) / 2 + mz_tol_ppm**-6 * max(
                     mz_group
                 )
-            lib = lib + self.get_query(
-                self.PRECURSOR_MZ_URL.format(str(mz), str(tol), polarity)
+            
+            # Get first page of results
+            response = self.get_query(
+                self.PRECURSOR_MZ_URL.format(str(mz), str(tol), polarity, 1, max_per_page)
             )
+            lib = lib + response['results']
+
+            # If there are more pages of results, get them
+            if response['total_pages'] > 1: 
+                for i in np.arange(2, response['total_pages']+1):
+                    lib = lib + self.get_query(
+                        self.PRECURSOR_MZ_URL.format(str(mz), str(tol), polarity, i, max_per_page)
+                        )['results']
 
         return lib
 
-    def request_all_precursors(self, polarity):
+    def request_all_precursors(self, polarity, per_page = 50000):
         """
-        Request all precursor m/z values from MetabRef.
+        Request all precursor m/z values for MS2 spectra from MetabRef.
 
         Parameters
         ----------
         polarity : str
             Ionization polarity, either "positive" or "negative".
+        per_page : int, optional
+            Number of records to fetch per call. Default is 50000
 
         Returns
         -------
         list
-            List of all precursor m/z values.
+            List of all precursor m/z values, sorted.
         """
         # If polarity is anything other than positive or negative, raise error
         if polarity not in ["positive", "negative"]:
             raise ValueError("Polarity must be 'positive' or 'negative'")
 
-        # Query MetabRef for all precursor m/z values
-        return self.get_query(self.PRECURSOR_MZ_ALL_URL.format(polarity))
+        precursors = []    
+
+        # Get first page of results and total number of pages of results
+        response = self.get_query(self.PRECURSOR_MZ_ALL_URL.format(polarity, str(1), str(per_page)))
+        total_pages = response['total_pages']
+        precursors.extend([x['precursor_ion'] for x in response['results']])
+
+        # Go through remaining pages of results
+        for i in np.arange(2, total_pages + 1):
+            response = self.get_query(self.PRECURSOR_MZ_ALL_URL.format(polarity, str(i), str(per_page)))
+            precursors.extend([x['precursor_ion'] for x in response['results']])
+        
+        # Sort precursors from smallest to largest and remove duplicates
+        precursors = list(set(precursors))
+        precursors.sort()
+
+        return precursors
 
     def get_lipid_library(
         self,
@@ -789,14 +820,25 @@ def get_lipid_library(
 
         """
         mz_list.sort()
+        mz_list = np.array(mz_list)
 
         # Get all precursors in the library matching the polarity
         precusors_in_lib = self.request_all_precursors(polarity=polarity)
-        precusors_in_lib.sort()
         precusors_in_lib = np.array(precusors_in_lib)
 
         # Compare the mz_list with the precursors in the library, keep any mzs that are within mz_tol of any precursor in the library
-        mz_list = np.array(mz_list)
+        lib_mz_df = pd.DataFrame(precusors_in_lib, columns=["lib_mz"])
+        lib_mz_df["closest_obs_mz"] = mz_list[
+            find_closest(mz_list, lib_mz_df.lib_mz.values)
+        ]
+        lib_mz_df["mz_diff_ppm"] = np.abs(
+            (lib_mz_df["lib_mz"] - lib_mz_df["closest_obs_mz"])
+            / lib_mz_df["lib_mz"]
+            * 1e6
+        )
+        lib_mz_sub = lib_mz_df[lib_mz_df["mz_diff_ppm"] <= mz_tol_ppm]
+
+        # Do the same in the opposite direction
         mz_df = pd.DataFrame(mz_list, columns=["mass_feature_mz"])
         mz_df["closest_lib_pre_mz"] = precusors_in_lib[
             find_closest(precusors_in_lib, mz_df.mass_feature_mz.values)
@@ -808,9 +850,15 @@ def get_lipid_library(
         )
         mz_df_sub = mz_df[mz_df["mz_diff_ppm"] <= mz_tol_ppm]
 
+        # Evaluate which is fewer mzs - lib_mz_sub or mz_df_sub and use that as the input for next step
+        if len(lib_mz_sub) < len(mz_df_sub):
+            mzs_to_query = lib_mz_sub.lib_mz.values
+        else:
+            mzs_to_query = mz_df_sub.mass_feature_mz.values
+
         # Query the library for the precursors in the mz_list that are in the library to retrieve the spectra and metadata
         lib = self.query_by_precursor(
-            mz_list=mz_df_sub.mass_feature_mz.values,
+            mz_list=mzs_to_query,
             polarity=polarity,
             mz_tol_ppm=mz_tol_ppm,
             mz_tol_da_api=mz_tol_da_api,
@@ -830,6 +878,12 @@ def get_lipid_library(
             {k: v for k, v in x.items() if k not in ["Molecular Data", "Lipid Tree"]}
             for x in lib
         ]
+        # Unpack the 'Lipid Fragments' key and the 'MSO Data' key from each entry
+        for x in lib:
+            if "Lipid Fragments" in x.keys():
+                x.update(x.pop("Lipid Fragments"))
+            if "MSO Data" in x.keys():
+                x.update(x.pop("MSO Data"))
 
         # Format the spectral library
         format_func = self._get_format_func(format)
 
@@ -88,7 +88,7 @@ def get_more_match_quals(
                 )
 
             # Get types of fragments in the lib entry
-            lib_frags = lib_entry["fragment_types"].split(", ")
+            lib_frags = lib_entry["fragment_types"]
             # make list of the fragment types that are present in the query spectrum
             lib_in_query_ids = list(
                 set(
Original file line number	Diff line number	Diff line change
`@@ -88,7 +88,7 @@ def get_more_match_quals(`
`88`	`88`	`)`
`89`	`89`
`90`	`90`	`# Get types of fragments in the lib entry`
`91`		`- lib_frags = lib_entry["fragment_types"].split(", ")`
	`91`	`+ lib_frags = lib_entry["fragment_types"]`
`92`	`92`	`# make list of the fragment types that are present in the query spectrum`
`93`	`93`	`lib_in_query_ids = list(`
`94`	`94`	`set(`