1-
1+ import pandas as pd
22from sklearn .cluster import DBSCAN
33from sklearn .preprocessing import StandardScaler
4- from sklearn .cluster import MeanShift , estimate_bandwidth
54
5+ # import matplotlib.pyplot as plt
66
77
8- import numpy as np
9- import pandas as pd
10- #import matplotlib.pyplot as plt
11-
12- class ClusteringFilter ():
13- """ Class for filtering and clustering mass spectra data using various algorithms.
8+ class ClusteringFilter :
9+ """Class for filtering and clustering mass spectra data using various algorithms.
1410
1511 Attributes
1612 -------
@@ -34,22 +30,23 @@ class ClusteringFilter():
3430 If true, initial kernel locations are not locations of all points, but rather the location of the discretized version of points, where points are binned onto a grid whose coarseness corresponds to the bandwidth. Setting this option to True will speed up the algorithm because fewer seeds will be initialized.
3531 min_peaks_per_class : int
3632 Minimum number of peaks per class.
37-
33+
3834 Methods
3935 -------
4036 * get_mass_error_matrix_data(ms_peaks).
41- Get the mass error matrix data from a list of mass peaks.
37+ Get the mass error matrix data from a list of mass peaks.
4238 * get_kendrick_matrix_data(mass_spectrum).
43- Get the Kendrick matrix data from a mass spectrum.
39+ Get the Kendrick matrix data from a mass spectrum.
4440 * filter_kendrick(mass_spectrum).
45- Filter the mass spectrum data using the Kendrick algorithm.
41+ Filter the mass spectrum data using the Kendrick algorithm.
4642 * filter_kendrick_by_index(ms_peak_indexes, mass_spectrum_obj).
47- Filter the mass spectrum data using the Kendrick algorithm based on a list of peak indexes.
43+ Filter the mass spectrum data using the Kendrick algorithm based on a list of peak indexes.
4844 * remove_assignment_by_mass_error(mass_spectrum).
49- Remove assignments from the mass spectrum based on mass error.
50-
45+ Remove assignments from the mass spectrum based on mass error.
46+
5147
5248 """
49+
def get_mass_error_matrix_data(self, ms_peaks):
    """Collect one (m/z, mass error) row per candidate formula of each assigned peak.

    Parameters
    ----------
    ms_peaks : iterable
        Peak objects. Each one is read for ``is_assigned`` and ``mz_exp``;
        assigned peaks are additionally iterated, and every item they yield
        is read for ``mz_error``.

    Returns
    -------
    matrix_data : ndarray
        float32 array with one row per (peak, formula) pair. Column 0 holds
        the peak's ``mz_exp`` and column 1 the formula's ``mz_error``.
    list_indexes_mass_spec : list
        For every row of ``matrix_data``, the position of the originating
        peak within ``ms_peaks``. A peak with several formulas appears
        several times.
    """
    # Unassigned peaks contribute no rows at all.
    rows = [
        (position, peak.mz_exp, formula.mz_error)
        for position, peak in enumerate(ms_peaks)
        if peak.is_assigned
        for formula in peak
    ]

    list_indexes_mass_spec = [row[0] for row in rows]
    frame = pd.DataFrame(
        {
            "mass": [row[1] for row in rows],
            "error": [row[2] for row in rows],
        }
    )
    matrix_data = frame.values.astype("float32", copy=False)
    return matrix_data, list_indexes_mass_spec
8681
def get_kendrick_matrix_data(self, mass_spectrum):
    """Build a two-column float32 matrix from the spectrum's Kendrick values.

    Parameters
    ----------
    mass_spectrum : MassSpectrum
        Object whose ``kendrick_mass`` and ``kmd`` attributes are read.
        Presumably both are equal-length per-peak sequences; confirm
        against the MassSpectrum class.

    Returns
    -------
    matrix_data : ndarray
        float32 array with ``kendrick_mass`` in column 0 and ``kmd`` in
        column 1.
    """
    columns = {"km": mass_spectrum.kendrick_mass, "kmd": mass_spectrum.kmd}
    frame = pd.DataFrame(columns)
    return frame.values.astype("float32", copy=False)
106101
def filter_kendrick(self, mass_spectrum):
    """Cluster the spectrum in Kendrick space and hand the noise rows to ``filter_by_index``.

    The (km, kmd) matrix from ``get_kendrick_matrix_data`` is standardised,
    clustered with DBSCAN (eps=0.75, min_samples=50), and the row positions
    that DBSCAN labels as noise are passed to
    ``mass_spectrum.filter_by_index``.

    Parameters
    ----------
    mass_spectrum : MassSpectrum
        Mass spectrum object. It is modified only through its own
        ``filter_by_index`` method.

    Returns
    -------
    None
    """
    kendrick_points = self.get_kendrick_matrix_data(mass_spectrum)

    # Put both columns on a comparable scale before distance-based clustering.
    scaled_points = StandardScaler().fit(kendrick_points).transform(kendrick_points)

    labels = DBSCAN(eps=0.75, min_samples=50).fit_predict(scaled_points)

    # DBSCAN reports noise samples with the label -1.
    noise_rows = [row for row, label in enumerate(labels) if label == -1]

    if mass_spectrum.parameters.mass_spectrum.verbose_processing:
        # Cluster count excludes the noise label when it is present.
        distinct_labels = set(labels)
        distinct_labels.discard(-1)
        print("Estimated number of clusters: %d" % len(distinct_labels))
        print("Estimated number of noise points: %d" % len(noise_rows))

    mass_spectrum.filter_by_index(noise_rows)
    # Debugging aid: scatter-plot kendrick_points[:, 0] (km) against
    # kendrick_points[:, 1] (kmd) coloured by `labels` to inspect the clusters.
143138
def filter_kendrick_by_index(self, ms_peak_indexes, mass_spectrum_obj):
    """Return the entries of ``ms_peak_indexes`` that DBSCAN labels as noise in Kendrick space.

    Parameters
    ----------
    ms_peak_indexes : list
        Sequence of 2-item entries. The first item of each entry is used to
        index ``mass_spectrum_obj``; the second item is ignored here.
    mass_spectrum_obj : MassSpectrum
        Mass spectrum object. Indexing it must yield peaks exposing
        ``kendrick_mass`` and ``kmd``. ``min_samples`` for DBSCAN is read from
        ``molecular_search_settings.min_peaks_per_class``.

    Returns
    -------
    noise_idx : list
        The entries of ``ms_peak_indexes`` (whole entries, not bare indexes)
        whose peak was labelled as noise. Empty when fewer than two entries
        are supplied.
    """
    # Guard first: with fewer than two entries nothing is reported as noise,
    # and an empty matrix would make StandardScaler().fit() raise.
    if len(ms_peak_indexes) <= 1:
        return []

    min_samples = mass_spectrum_obj.molecular_search_settings.min_peaks_per_class

    kendrick_dict = {"km": [], "kmd": []}
    for index, _ in ms_peak_indexes:
        peak = mass_spectrum_obj[index]
        kendrick_dict["km"].append(peak.kendrick_mass)
        kendrick_dict["kmd"].append(peak.kmd)

    matrix_data = pd.DataFrame(kendrick_dict).values.astype("float32", copy=False)

    # Put km and kmd on a comparable scale before distance-based clustering.
    matrix_data_scaled = StandardScaler().fit(matrix_data).transform(matrix_data)

    clusters = DBSCAN(eps=0.8, min_samples=min_samples).fit_predict(
        matrix_data_scaled
    )

    # DBSCAN reports noise samples with the label -1; row i of the matrix
    # corresponds to entry i of ms_peak_indexes.
    noise_idx = [
        entry for entry, label in zip(ms_peak_indexes, clusters) if label == -1
    ]

    if mass_spectrum_obj.parameters.mass_spectrum.verbose_processing:
        # Number of clusters in labels, ignoring noise if present.
        n_clusters_ = len(set(clusters)) - (1 if -1 in clusters else 0)
        print("Estimated number of clusters: %d" % n_clusters_)
        print("Estimated number of noise points: %d" % len(noise_idx))

    # Debugging aid: scatter-plot matrix_data[:, 0] (km) against
    # matrix_data[:, 1] (kmd) coloured by `clusters` to inspect the result.
    return noise_idx
214211
def remove_assignment_by_mass_error(self, mass_spectrum):
    """Pass to ``remove_assignment_by_index`` the peaks whose (m/z, error) rows DBSCAN labels as noise.

    The matrix from ``get_mass_error_matrix_data`` is standardised and
    clustered with DBSCAN (eps=0.15, default ``min_samples``). For every row
    labelled as noise, the index of its peak is collected and the resulting
    list is handed to ``mass_spectrum.remove_assignment_by_index``. A peak
    with several noisy formulas appears several times in that list.

    Parameters
    ----------
    mass_spectrum : MassSpectrum
        Mass spectrum object; iterated by ``get_mass_error_matrix_data`` and
        modified only through its own ``remove_assignment_by_index`` method.

    Returns
    -------
    None
        Also returns without calling ``remove_assignment_by_index`` when the
        spectrum yields no (peak, formula) rows.
    """
    # NOTE: data need to be binned by m/z unit or more to be able to use clustering.
    matrix_data, list_indexes_mass_spec = self.get_mass_error_matrix_data(
        mass_spectrum
    )

    # No assigned peaks means zero rows: StandardScaler().fit() rejects an
    # empty array, and there is nothing to remove anyway.
    if len(matrix_data) == 0:
        return

    matrix_data_scaled = StandardScaler().fit(matrix_data).transform(matrix_data)

    # eps and min_samples still need to be optimized by precision and number
    # of peaks. A MeanShift variant with an estimated bandwidth was tried
    # here previously and is left out.
    clusters = DBSCAN(eps=0.15).fit_predict(matrix_data_scaled)

    # DBSCAN reports noise samples with the label -1; row i of the matrix
    # belongs to the peak at list_indexes_mass_spec[i].
    indexes = [
        spectrum_index
        for spectrum_index, label in zip(list_indexes_mass_spec, clusters)
        if label == -1
    ]

    mass_spectrum.remove_assignment_by_index(indexes)
0 commit comments