Skip to content

Commit 2d78548

Browse files
committed
Start to add pagination options to MetabRefLCInterface class's methods
1 parent c77d0ae commit 2d78548

2 files changed

Lines changed: 63 additions & 16 deletions

File tree

corems/molecular_id/search/database_interfaces.py

Lines changed: 61 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -646,12 +646,14 @@ def __init__(self):
646646
super().__init__()
647647

648648
# API endpoint for precursor m/z search
649+
# inputs = mz, tolerance (in Da), polarity, page_no, per_page
649650
self.PRECURSOR_MZ_URL = (
650-
"https://metabref.emsl.pnnl.gov/api/precursors/m/{}/t/{}/{}"
651+
"https://metabref.emsl.pnnl.gov/api/precursors/m/{}/t/{}/{}?page={}&per_page={}"
651652
)
652653

653654
# API endpoint for returning full list of precursor m/z values in database
654-
self.PRECURSOR_MZ_ALL_URL = "https://metabref.emsl.pnnl.gov/api/precursors/{}"
655+
# inputs = polarity, page_no, per_page
656+
self.PRECURSOR_MZ_ALL_URL = "https://metabref.emsl.pnnl.gov/api/precursors/{}?page={}&per_page={}"
655657

656658
self.__init_format_map__()
657659

@@ -674,7 +676,7 @@ def __init_format_map__(self):
674676
self.format_map["fe"] = self.format_map["flashentropy"]
675677
self.format_map["flash-entropy"] = self.format_map["flashentropy"]
676678

677-
def query_by_precursor(self, mz_list, polarity, mz_tol_ppm, mz_tol_da_api=0.2):
679+
def query_by_precursor(self, mz_list, polarity, mz_tol_ppm, mz_tol_da_api=0.2, max_per_page=50):
678680
"""
679681
Query MetabRef by precursor m/z values.
680682
@@ -690,6 +692,8 @@ def query_by_precursor(self, mz_list, polarity, mz_tol_ppm, mz_tol_da_api=0.2):
690692
mz_tol_da_api : float, optional
691693
Maximum tolerance between precursor m/z values for API search, in daltons.
692694
Used to group similar mzs into a single API query for speed. Default is 0.2.
695+
max_per_page : int, optional
696+
Maximum records to return from MetabRef API query at a time. Default is 50.
693697
694698
Returns
695699
-------
@@ -722,32 +726,59 @@ def query_by_precursor(self, mz_list, polarity, mz_tol_ppm, mz_tol_da_api=0.2):
722726
tol = (max(mz_group) - min(mz_group)) / 2 + mz_tol_ppm**-6 * max(
723727
mz_group
724728
)
725-
lib = lib + self.get_query(
726-
self.PRECURSOR_MZ_URL.format(str(mz), str(tol), polarity)
729+
730+
# Get first page of results
731+
response = self.get_query(
732+
self.PRECURSOR_MZ_URL.format(str(mz), str(tol), polarity, 1, max_per_page)
727733
)
734+
lib = lib + response['results']
735+
736+
# If there are more pages of results, get them
737+
if response['total_pages'] > 1:
738+
for i in np.arange(2, response['total_pages']+1):
739+
lib = lib + self.get_query(
740+
self.PRECURSOR_MZ_URL.format(str(mz), str(tol), polarity, i, max_per_page)
741+
)['results']
728742

729743
return lib
730744

731-
def request_all_precursors(self, polarity):
745+
def request_all_precursors(self, polarity, per_page = 50000):
732746
"""
733-
Request all precursor m/z values from MetabRef.
747+
Request all precursor m/z values for MS2 spectra from MetabRef.
734748
735749
Parameters
736750
----------
737751
polarity : str
738752
Ionization polarity, either "positive" or "negative".
753+
per_page : int, optional
754+
Number of records to fetch per call. Default is 50000.
739755
740756
Returns
741757
-------
742758
list
743-
List of all precursor m/z values.
759+
List of all precursor m/z values, sorted.
744760
"""
745761
# If polarity is anything other than positive or negative, raise error
746762
if polarity not in ["positive", "negative"]:
747763
raise ValueError("Polarity must be 'positive' or 'negative'")
748764

749-
# Query MetabRef for all precursor m/z values
750-
return self.get_query(self.PRECURSOR_MZ_ALL_URL.format(polarity))
765+
precursors = []
766+
767+
# Get first page of results and total number of pages of results
768+
response = self.get_query(self.PRECURSOR_MZ_ALL_URL.format(polarity, str(1), str(per_page)))
769+
total_pages = response['total_pages']
770+
precursors.extend([x['precursor_ion'] for x in response['results']])
771+
772+
# Go through remaining pages of results
773+
for i in np.arange(2, total_pages + 1):
774+
response = self.get_query(self.PRECURSOR_MZ_ALL_URL.format(polarity, str(i), str(per_page)))
775+
precursors.extend([x['precursor_ion'] for x in response['results']])
776+
777+
# Sort precursors from smallest to largest and remove duplicates
778+
precursors = list(set(precursors))
779+
precursors.sort()
780+
781+
return precursors
751782

752783
def get_lipid_library(
753784
self,
@@ -789,14 +820,25 @@ def get_lipid_library(
789820
790821
"""
791822
mz_list.sort()
823+
mz_list = np.array(mz_list)
792824

793825
# Get all precursors in the library matching the polarity
794826
precusors_in_lib = self.request_all_precursors(polarity=polarity)
795-
precusors_in_lib.sort()
796827
precusors_in_lib = np.array(precusors_in_lib)
797828

798829
# Compare the mz_list with the precursors in the library, keep any mzs that are within mz_tol of any precursor in the library
799-
mz_list = np.array(mz_list)
830+
lib_mz_df = pd.DataFrame(precusors_in_lib, columns=["lib_mz"])
831+
lib_mz_df["closest_obs_mz"] = mz_list[
832+
find_closest(mz_list, lib_mz_df.lib_mz.values)
833+
]
834+
lib_mz_df["mz_diff_ppm"] = np.abs(
835+
(lib_mz_df["lib_mz"] - lib_mz_df["closest_obs_mz"])
836+
/ lib_mz_df["lib_mz"]
837+
* 1e6
838+
)
839+
lib_mz_sub = lib_mz_df[lib_mz_df["mz_diff_ppm"] <= mz_tol_ppm]
840+
841+
# Do the same in the opposite direction: for each observed m/z, find the closest library precursor m/z
800842
mz_df = pd.DataFrame(mz_list, columns=["mass_feature_mz"])
801843
mz_df["closest_lib_pre_mz"] = precusors_in_lib[
802844
find_closest(precusors_in_lib, mz_df.mass_feature_mz.values)
@@ -808,9 +850,15 @@ def get_lipid_library(
808850
)
809851
mz_df_sub = mz_df[mz_df["mz_diff_ppm"] <= mz_tol_ppm]
810852

853+
# Determine which set contains fewer m/z values (lib_mz_sub or mz_df_sub) and use it as the input for the next step
854+
if len(lib_mz_sub) < len(mz_df_sub):
855+
mzs_to_query = lib_mz_sub.lib_mz.values
856+
else:
857+
mzs_to_query = mz_df_sub.mass_feature_mz.values
858+
811859
# Query the library for the precursors in the mz_list that are in the library to retrieve the spectra and metadata
812860
lib = self.query_by_precursor(
813-
mz_list=mz_df_sub.mass_feature_mz.values,
861+
mz_list=mzs_to_query,
814862
polarity=polarity,
815863
mz_tol_ppm=mz_tol_ppm,
816864
mz_tol_da_api=mz_tol_da_api,

support_code/nmdc/lipidomics/lipidomics_workflow.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -416,8 +416,8 @@ def run_lipid_sp_ms1(
416416
check_scan_translator(myLCMSobj, scan_translator)
417417
add_mass_features(myLCMSobj, scan_translator)
418418
myLCMSobj.remove_unprocessed_data()
419-
if ms1_molecular_search:
420-
molecular_formula_search(myLCMSobj)
419+
#if ms1_molecular_search:
420+
# molecular_formula_search(myLCMSobj)
421421
export_results(myLCMSobj, out_path=out_path, final=False)
422422
save_times(myLCMSobj, time_start, out_path)
423423
if return_mzs:
@@ -472,7 +472,6 @@ def prep_metadata(mz_dicts, out_dir, token_path):
472472
mz_list=metadata["mzs"]["positive"],
473473
polarity="positive",
474474
mz_tol_ppm=5,
475-
mz_tol_da_api=0.01,
476475
format="flashentropy",
477476
normalize=True,
478477
fe_kwargs={

0 commit comments

Comments
 (0)