Skip to content

Commit f7c457a

Browse files
committed
Lint molecular_id and ms_peak modules
1 parent 6e8dc66 commit f7c457a

19 files changed

Lines changed: 3170 additions & 2474 deletions
Lines changed: 101 additions & 102 deletions
Original file line number | Diff line number | Diff line change
@@ -1,16 +1,12 @@
1-
1+
import pandas as pd
22
from sklearn.cluster import DBSCAN
33
from sklearn.preprocessing import StandardScaler
4-
from sklearn.cluster import MeanShift, estimate_bandwidth
54

5+
# import matplotlib.pyplot as plt
66

77

8-
import numpy as np
9-
import pandas as pd
10-
#import matplotlib.pyplot as plt
11-
12-
class ClusteringFilter():
13-
""" Class for filtering and clustering mass spectra data using various algorithms.
8+
class ClusteringFilter:
9+
"""Class for filtering and clustering mass spectra data using various algorithms.
1410
1511
Attributes
1612
-------
@@ -34,22 +30,23 @@ class ClusteringFilter():
3430
If true, initial kernel locations are not locations of all points, but rather the location of the discretized version of points, where points are binned onto a grid whose coarseness corresponds to the bandwidth. Setting this option to True will speed up the algorithm because fewer seeds will be initialized.
3531
min_peaks_per_class : int
3632
Minimum number of peaks per class.
37-
33+
3834
Methods
3935
-------
4036
* get_mass_error_matrix_data(ms_peaks).
41-
Get the mass error matrix data from a list of mass peaks.
37+
Get the mass error matrix data from a list of mass peaks.
4238
* get_kendrick_matrix_data(mass_spectrum).
43-
Get the Kendrick matrix data from a mass spectrum.
39+
Get the Kendrick matrix data from a mass spectrum.
4440
* filter_kendrick(mass_spectrum).
45-
Filter the mass spectrum data using the Kendrick algorithm.
41+
Filter the mass spectrum data using the Kendrick algorithm.
4642
* filter_kendrick_by_index(ms_peak_indexes, mass_spectrum_obj).
47-
Filter the mass spectrum data using the Kendrick algorithm based on a list of peak indexes.
43+
Filter the mass spectrum data using the Kendrick algorithm based on a list of peak indexes.
4844
* remove_assignment_by_mass_error(mass_spectrum).
49-
Remove assignments from the mass spectrum based on mass error.
50-
45+
Remove assignments from the mass spectrum based on mass error.
46+
5147
5248
"""
49+
5350
def get_mass_error_matrix_data(self, ms_peaks):
5451
"""Get the mass error matrix data from a list of mass peaks.
5552
@@ -62,26 +59,24 @@ def get_mass_error_matrix_data(self, ms_peaks):
6259
-------
6360
matrix_data : ndarray
6461
Matrix data containing mass and error values.
65-
list_indexes_mass_spec : list
62+
list_indexes_mass_spec : list
6663
List of indexes of mass peaks in the original mass spectrum.
6764
"""
6865
mass_list = list()
6966
error_list = list()
7067
list_indexes_mass_spec = []
71-
72-
for index, mspeak in enumerate(ms_peaks):
7368

69+
for index, mspeak in enumerate(ms_peaks):
7470
if mspeak.is_assigned:
75-
76-
#print(mspeak.mz_exp, len(mspeak))
71+
# print(mspeak.mz_exp, len(mspeak))
7772
for mformula in mspeak:
7873
mass_list.append(mspeak.mz_exp)
7974
error_list.append(mformula.mz_error)
8075
list_indexes_mass_spec.append(index)
81-
82-
kendrick_dict = {'mass': mass_list, 'error': error_list}
83-
df = pd.DataFrame(kendrick_dict)
84-
matrix_data = df.values.astype("float32", copy = False)
76+
77+
kendrick_dict = {"mass": mass_list, "error": error_list}
78+
df = pd.DataFrame(kendrick_dict)
79+
matrix_data = df.values.astype("float32", copy=False)
8580
return matrix_data, list_indexes_mass_spec
8681

8782
def get_kendrick_matrix_data(self, mass_spectrum):
@@ -99,154 +94,158 @@ def get_kendrick_matrix_data(self, mass_spectrum):
9994
"""
10095
km = mass_spectrum.kendrick_mass
10196
kmd = mass_spectrum.kmd
102-
kendrick_dict = {'km': km, 'kmd': kmd}
103-
df = pd.DataFrame(kendrick_dict)
104-
matrix_data = df.values.astype("float32", copy = False)
97+
kendrick_dict = {"km": km, "kmd": kmd}
98+
df = pd.DataFrame(kendrick_dict)
99+
matrix_data = df.values.astype("float32", copy=False)
105100
return matrix_data
106101

107102
def filter_kendrick(self, mass_spectrum):
108-
""" Filter the mass spectrum data using the Kendrick algorithm.
103+
"""Filter the mass spectrum data using the Kendrick algorithm.
109104
110105
Parameters
111106
----------
112-
mass_spectrum : MassSpectrum
107+
mass_spectrum : MassSpectrum
113108
Mass spectrum object.
114109
115110
"""
116111
matrix_data = self.get_kendrick_matrix_data(mass_spectrum)
117112

118113
stdscaler = StandardScaler().fit(matrix_data)
119-
114+
120115
matrix_data_scaled = stdscaler.transform(matrix_data)
121116

122-
clusters = DBSCAN(eps = .75, min_samples=50).fit_predict(matrix_data_scaled)
123-
117+
clusters = DBSCAN(eps=0.75, min_samples=50).fit_predict(matrix_data_scaled)
118+
124119
# Number of clusters in labels, ignoring noise if present.
125120
n_clusters_ = len(set(clusters)) - (1 if -1 in clusters else 0)
126121
n_noise_ = list(clusters).count(-1)
127-
122+
128123
indexes = []
129124
for i in range(len(clusters)):
130125
if clusters[i] == -1:
131126
indexes.append(i)
132-
127+
133128
if mass_spectrum.parameters.mass_spectrum.verbose_processing:
134-
print('Estimated number of clusters: %d' % n_clusters_)
135-
print('Estimated number of noise points: %d' % n_noise_)
129+
print("Estimated number of clusters: %d" % n_clusters_)
130+
print("Estimated number of noise points: %d" % n_noise_)
136131
mass_spectrum.filter_by_index(indexes)
137-
#from matplotlib import pyplot as plt
138-
#plt.scatter(matrix_data[:, 0], matrix_data[:, 1], c=clusters, cmap="jet")
139-
#plt.xlabel("km")
140-
#plt.ylabel("kmd")
141-
#plt.show()
142-
#plt.close()
132+
# from matplotlib import pyplot as plt
133+
# plt.scatter(matrix_data[:, 0], matrix_data[:, 1], c=clusters, cmap="jet")
134+
# plt.xlabel("km")
135+
# plt.ylabel("kmd")
136+
# plt.show()
137+
# plt.close()
143138

144139
def filter_kendrick_by_index(self, ms_peak_indexes, mass_spectrum_obj):
145-
""" Filter the mass spectrum data using the Kendrick algorithm based on a list of peak indexes.
140+
"""Filter the mass spectrum data using the Kendrick algorithm based on a list of peak indexes.
146141
147142
Parameters
148143
----------
149-
ms_peak_indexes : list
144+
ms_peak_indexes : list
150145
List of peak indexes.
151-
mass_spectrum_obj : MassSpectrum
146+
mass_spectrum_obj : MassSpectrum
152147
Mass spectrum object.
153148
154149
Returns
155150
-------
156-
noise_idx : list
151+
noise_idx : list
157152
List of indexes of noise points in the mass spectrum.
158153
"""
159154
min_samples = mass_spectrum_obj.molecular_search_settings.min_peaks_per_class
160155

161-
kendrick_dict = {'km': list(), 'kmd': list()}
156+
kendrick_dict = {"km": list(), "kmd": list()}
157+
158+
if len(ms_peak_indexes) <= 1:
159+
return []
162160

163-
if len(ms_peak_indexes) <= 1: return []
164-
165161
for index, _ in ms_peak_indexes:
166-
kendrick_dict["km"].append(mass_spectrum_obj[index].kendrick_mass)
167-
kendrick_dict["kmd"].append(mass_spectrum_obj[index].kmd)
168-
162+
kendrick_dict["km"].append(mass_spectrum_obj[index].kendrick_mass)
163+
kendrick_dict["kmd"].append(mass_spectrum_obj[index].kmd)
164+
169165
# check min data points, otherwise StandardScaler().fit() will fail
170-
171-
df = pd.DataFrame(kendrick_dict)
172-
matrix_data = df.values.astype("float32", copy = False)
166+
167+
df = pd.DataFrame(kendrick_dict)
168+
matrix_data = df.values.astype("float32", copy=False)
173169

174170
stdscaler = StandardScaler().fit(matrix_data)
175171
matrix_data_scaled = stdscaler.transform(matrix_data)
176172

177-
clusters = DBSCAN(eps = .8, min_samples=min_samples).fit_predict(matrix_data_scaled)
178-
173+
clusters = DBSCAN(eps=0.8, min_samples=min_samples).fit_predict(
174+
matrix_data_scaled
175+
)
176+
179177
# Number of clusters in labels, ignoring noise if present.
180178
n_clusters_ = len(set(clusters)) - (1 if -1 in clusters else 0)
181179
n_noise_ = list(clusters).count(-1)
182-
180+
183181
if mass_spectrum_obj.parameters.mass_spectrum.verbose_processing:
184-
print('Estimated number of clusters: %d' % n_clusters_)
185-
print('Estimated number of noise points: %d' % n_noise_)
182+
print("Estimated number of clusters: %d" % n_clusters_)
183+
print("Estimated number of noise points: %d" % n_noise_)
186184

187185
noise_idx = []
188-
186+
189187
other_peaks_idx = []
190188

191189
for i in range(len(clusters)):
192-
193190
if clusters[i] == -1:
194191
noise_idx.append(ms_peak_indexes[i])
195-
192+
196193
else:
197-
other_peaks_idx.append(ms_peak_indexes[i])
198-
199-
#mfs = [mass_spectrum_obj[index].best_molecular_formula_candidate.string for index in other_peaks_idx]
200-
201-
#mfs_noise = [mass_spectrum_obj[index].best_molecular_formula_candidate.string for index in noise_idx]
202-
203-
#print(mfs)
204-
#print(mfs_noise)
205-
206-
#from matplotlib import pyplot as plt
207-
#plt.scatter(matrix_data[:, 0], matrix_data[:, 1], c=clusters, cmap="jet")
208-
#plt.xlabel("km")
209-
#plt.ylabel("kmd")
210-
#plt.show()
211-
#plt.close()
212-
213-
return noise_idx
194+
other_peaks_idx.append(ms_peak_indexes[i])
195+
196+
# mfs = [mass_spectrum_obj[index].best_molecular_formula_candidate.string for index in other_peaks_idx]
197+
198+
# mfs_noise = [mass_spectrum_obj[index].best_molecular_formula_candidate.string for index in noise_idx]
199+
200+
# print(mfs)
201+
# print(mfs_noise)
202+
203+
# from matplotlib import pyplot as plt
204+
# plt.scatter(matrix_data[:, 0], matrix_data[:, 1], c=clusters, cmap="jet")
205+
# plt.xlabel("km")
206+
# plt.ylabel("kmd")
207+
# plt.show()
208+
# plt.close()
209+
210+
return noise_idx
214211

215212
def remove_assignment_by_mass_error(self, mass_spectrum):
216-
""" Remove assignments from the mass spectrum based on mass error.
213+
"""Remove assignments from the mass spectrum based on mass error.
217214
218215
Parameters
219216
----------
220217
mass_spectrum : MassSpectrum
221218
Mass spectrum object.
222219
223220
"""
224-
#data need to be binned by mz unit or more to be able to use clustering
225-
226-
matrix_data, list_indexes_mass_spec = self.get_mass_error_matrix_data(mass_spectrum)
221+
# data need to be binned by mz unit or more to be able to use clustering
222+
223+
matrix_data, list_indexes_mass_spec = self.get_mass_error_matrix_data(
224+
mass_spectrum
225+
)
227226

228227
stdscaler = StandardScaler().fit(matrix_data)
229-
228+
230229
matrix_data_scaled = stdscaler.transform(matrix_data)
231-
232-
#bandwidth = estimate_bandwidth(matrix_data_scaled, quantile=0.3, n_samples=int(len(ms_peaks)/3))
233-
234-
#clusters = MeanShift(bandwidth=bandwidth, bin_seeding=True).fit_predict(matrix_data_scaled)
235-
236-
#eps and min_samp need to be optimized by precision and number of mspeaks
237-
clusters = DBSCAN(eps = .15).fit_predict(matrix_data_scaled)
238-
230+
231+
# bandwidth = estimate_bandwidth(matrix_data_scaled, quantile=0.3, n_samples=int(len(ms_peaks)/3))
232+
233+
# clusters = MeanShift(bandwidth=bandwidth, bin_seeding=True).fit_predict(matrix_data_scaled)
234+
235+
# eps and min_samp need to be optimized by precision and number of mspeaks
236+
clusters = DBSCAN(eps=0.15).fit_predict(matrix_data_scaled)
237+
239238
indexes = []
240-
241-
#from matplotlib import pyplot as plt
242-
#plt.scatter(matrix_data[:, 0], matrix_data[:, 1], c=clusters, cmap="plasma")
243-
#plt.xlabel("km")
244-
#plt.ylabel("kmd")
245-
#plt.show()
246-
#plt.close()
239+
240+
# from matplotlib import pyplot as plt
241+
# plt.scatter(matrix_data[:, 0], matrix_data[:, 1], c=clusters, cmap="plasma")
242+
# plt.xlabel("km")
243+
# plt.ylabel("kmd")
244+
# plt.show()
245+
# plt.close()
247246

248247
for i in range(len(clusters)):
249248
if clusters[i] == -1:
250249
indexes.append(list_indexes_mass_spec[i])
251-
252-
mass_spectrum.remove_assignment_by_index(indexes)
250+
251+
mass_spectrum.remove_assignment_by_index(indexes)

corems/molecular_id/calc/KendrickGroup.py

Whitespace-only changes.

0 commit comments

Comments
 (0)