Commit cf42a93

Compute PCA with all possible components then give solution with optimal number of selected criterion (#49)
* Compute PCA with all possible components then select optimal number.
  Also share variance explained with all criteria.
* Updated docstring
* Fix linting error
1 parent e379c26 commit cf42a93

1 file changed: mapca/mapca.py (89 additions & 41 deletions)
@@ -84,22 +84,32 @@ class MovingAveragePCA:
     n_features_ : int
         Number of features in the training data.
     n_samples_ : int
-        Number of samples in the training data.
-    noise_variance_ : float
-        The estimated noise covariance following the Probabilistic PCA model
-        from Tipping and Bishop 1999.
-        See "Pattern Recognition and Machine Learning" by C. Bishop, 12.2.1 p. 574
-        or http://www.miketipping.com/papers/met-mppca.pdf.
-        It is required to compute the estimated data covariance and score samples.
-
-        Equal to the average of (min(n_features, n_samples) - n_components) smallest
-        eigenvalues of the covariance matrix of X.
-    aic_ : :obj:`numpy.ndarray`, shape (n_components)
-        The Akaike Information Criterion optimization curve.
-    kic_ : :obj:`numpy.ndarray`, shape (n_components)
-        The Kullback-Leibler Information Criterion optimization curve.
-    mdl_ : :obj:`numpy.ndarray`, shape (n_components)
-        The Minimum Description Length optimization curve.
+        Number of samples in the training data.
+    aic_ : dict
+        Dictionary containing the Akaike Information Criterion results:
+        - 'n_components': The number of components chosen by the AIC criterion.
+        - 'value': The AIC curve values.
+        - 'explained_variance_total': The total explained variance of the components.
+    kic_ : dict
+        Dictionary containing the Kullback-Leibler Information Criterion results:
+        - 'n_components': The number of components chosen by the KIC criterion.
+        - 'value': The KIC curve values.
+        - 'explained_variance_total': The total explained variance of the components.
+    mdl_ : dict
+        Dictionary containing the Minimum Description Length results:
+        - 'n_components': The number of components chosen by the MDL criterion.
+        - 'value': The MDL curve values.
+        - 'explained_variance_total': The total explained variance of the components.
+    varexp_90_ : dict
+        Dictionary containing the 90% variance explained results:
+        - 'n_components': The number of components chosen by the 90% variance
+          explained criterion.
+        - 'explained_variance_total': The total explained variance of the components.
+    varexp_95_ : dict
+        Dictionary containing the 95% variance explained results:
+        - 'n_components': The number of components chosen by the 95% variance
+          explained criterion.
+        - 'explained_variance_total': The total explained variance of the components.

     References
     ----------
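A usage sketch of the reworked attributes. This is illustrative only: the public fit(img, mask) signature and the toy nibabel inputs are assumptions inferred from this diff, not part of the commit.

import numpy as np
import nibabel as nib
from mapca import MovingAveragePCA

# Toy 4D dataset and mask; real inputs would be fMRI data (assumed shapes).
rng = np.random.default_rng(0)
img = nib.Nifti1Image(rng.standard_normal((10, 10, 10, 100)), np.eye(4))
mask = nib.Nifti1Image(np.ones((10, 10, 10), dtype=np.int32), np.eye(4))

pca = MovingAveragePCA(criterion="mdl")  # criterion that drives n_components_
pca.fit(img, mask)  # assumed public wrapper around _fit(img, mask)

# After this commit, every criterion's summary is available regardless of
# which criterion drove the selection:
for name, res in [("AIC", pca.aic_), ("KIC", pca.kic_), ("MDL", pca.mdl_)]:
    print(name, res["n_components"], res["explained_variance_total"])
print("90% varexp:", pca.varexp_90_["n_components"])
print("95% varexp:", pca.varexp_95_["n_components"])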
@@ -240,66 +250,104 @@ def _fit(self, img, mask):

         LGR.info("Estimating the dimensionality ...")
         p = n_timepoints
-        self.aic_ = np.zeros(p - 1)
-        self.kic_ = np.zeros(p - 1)
-        self.mdl_ = np.zeros(p - 1)
+        aic = np.zeros(p - 1)
+        kic = np.zeros(p - 1)
+        mdl = np.zeros(p - 1)

         for k_idx, k in enumerate(np.arange(1, p)):
             LH = np.log(np.prod(np.power(eigenvalues[k:], 1 / (p - k))) / np.mean(eigenvalues[k:]))
             mlh = 0.5 * N * (p - k) * LH
             df = 1 + 0.5 * k * (2 * p - k + 1)
-            self.aic_[k_idx] = (-2 * mlh) + (2 * df)
-            self.kic_[k_idx] = (-2 * mlh) + (3 * df)
-            self.mdl_[k_idx] = -mlh + (0.5 * df * np.log(N))
+            aic[k_idx] = (-2 * mlh) + (2 * df)
+            kic[k_idx] = (-2 * mlh) + (3 * df)
+            mdl[k_idx] = -mlh + (0.5 * df * np.log(N))

-        itc = np.row_stack([self.aic_, self.kic_, self.mdl_])
+        itc = np.row_stack([aic, kic, mdl])

         dlap = np.diff(itc, axis=1)
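The loop above builds the three information-criterion curves from the eigenvalue spectrum. A minimal standalone sketch of the same computation, assuming eigenvalues is a 1-D array of covariance eigenvalues sorted in descending order and n_obs plays the role of the diff's N:

import numpy as np

def information_criteria(eigenvalues, n_obs):
    """AIC, KIC, and MDL curves for candidate component counts 1..p-1."""
    p = eigenvalues.size
    aic, kic, mdl = np.zeros(p - 1), np.zeros(p - 1), np.zeros(p - 1)
    for k_idx, k in enumerate(np.arange(1, p)):
        # Log ratio of the geometric to the arithmetic mean of the trailing
        # eigenvalues: 0 when they are all equal (isotropic noise), < 0 otherwise.
        lh = np.log(np.prod(np.power(eigenvalues[k:], 1 / (p - k))) / np.mean(eigenvalues[k:]))
        mlh = 0.5 * n_obs * (p - k) * lh    # profile log-likelihood term
        df = 1 + 0.5 * k * (2 * p - k + 1)  # free parameters of a k-component model
        aic[k_idx] = -2 * mlh + 2 * df
        kic[k_idx] = -2 * mlh + 3 * df
        mdl[k_idx] = -mlh + 0.5 * df * np.log(n_obs)
    return aic, kic, mdl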
+        # Calculate optimal number of components with each criterion
         # AIC
         a_aic = np.where(dlap[0, :] > 0)[0] + 1
         if a_aic.size == 0:
-            self.n_aic_ = itc[0, :].shape[0]
+            n_aic = itc[0, :].shape[0]
         else:
-            self.n_aic_ = a_aic[0]
+            n_aic = a_aic[0]

         # KIC
         a_kic = np.where(dlap[1, :] > 0)[0] + 1
         if a_kic.size == 0:
-            self.n_kic_ = itc[1, :].shape[0]
+            n_kic = itc[1, :].shape[0]
         else:
-            self.n_kic_ = a_kic[0]
+            n_kic = a_kic[0]

         # MDL
         a_mdl = np.where(dlap[2, :] > 0)[0] + 1
         if a_mdl.size == 0:
-            self.n_mdl_ = itc[2, :].shape[0]
+            n_mdl = itc[2, :].shape[0]
         else:
-            self.n_mdl_ = a_mdl[0]
+            n_mdl = a_mdl[0]

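Each criterion thus selects the first component count at which its curve stops decreasing, falling back to the full curve length if it decreases monotonically. Equivalently, as a compact sketch (first_elbow is a hypothetical helper, not in the commit):

import numpy as np

def first_elbow(curve):
    """Component count at the curve's first local minimum."""
    rising = np.where(np.diff(curve) > 0)[0] + 1  # counts where the curve turns upward
    return int(rising[0]) if rising.size else curve.shape[0]

# first_elbow(aic), first_elbow(kic), and first_elbow(mdl) reproduce
# n_aic, n_kic, and n_mdl from the diff above.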
         if self.criterion == "aic":
-            n_components = self.n_aic_
+            n_components = n_aic
         elif self.criterion == "kic":
-            n_components = self.n_kic_
+            n_components = n_kic
         elif self.criterion == "mdl":
-            n_components = self.n_mdl_
+            n_components = n_mdl

-        LGR.info("Estimated number of components is %d" % n_components)
+        LGR.info("Performing PCA")

-        # PCA with estimated number of components
-        ppca = PCA(n_components=n_components, svd_solver="full", copy=False, whiten=False)
+        # PCA with all possible components (the estimated selection is made after)
+        ppca = PCA(n_components=None, svd_solver="full", copy=False, whiten=False)
         ppca.fit(X)

+        # Get cumulative explained variance as components are added
+        cumsum_varexp = np.cumsum(ppca.explained_variance_ratio_)
+
+        # Calculate number of components for 90% varexp
+        n_comp_varexp_90 = np.where(cumsum_varexp >= 0.9)[0][0] + 1
+
+        # Calculate number of components for 95% varexp
+        n_comp_varexp_95 = np.where(cumsum_varexp >= 0.95)[0][0] + 1
+
+        LGR.info("Estimated number of components is %d" % n_components)
+
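This is the heart of the change: PCA is fit once with every component, and the 90%/95% counts are read off the cumulative explained-variance curve. A self-contained sketch of that step on random stand-in data:

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(0)
X = rng.standard_normal((200, 50))  # stand-in for the (samples x features) matrix X

ppca = PCA(n_components=None, svd_solver="full").fit(X)
cumsum_varexp = np.cumsum(ppca.explained_variance_ratio_)

# First component count whose cumulative explained variance reaches each threshold
n_comp_varexp_90 = int(np.where(cumsum_varexp >= 0.90)[0][0]) + 1
n_comp_varexp_95 = int(np.where(cumsum_varexp >= 0.95)[0][0]) + 1
print(n_comp_varexp_90, n_comp_varexp_95)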
+        # Save results of each criterion into dictionaries
+        self.aic_ = {
+            "n_components": n_aic,
+            "value": aic,
+            "explained_variance_total": cumsum_varexp[n_aic - 1],
+        }
+        self.kic_ = {
+            "n_components": n_kic,
+            "value": kic,
+            "explained_variance_total": cumsum_varexp[n_kic - 1],
+        }
+        self.mdl_ = {
+            "n_components": n_mdl,
+            "value": mdl,
+            "explained_variance_total": cumsum_varexp[n_mdl - 1],
+        }
+        self.varexp_90_ = {
+            "n_components": n_comp_varexp_90,
+            "explained_variance_total": cumsum_varexp[n_comp_varexp_90 - 1],
+        }
+        self.varexp_95_ = {
+            "n_components": n_comp_varexp_95,
+            "explained_variance_total": cumsum_varexp[n_comp_varexp_95 - 1],
+        }
+
         # Assign attributes from model
-        self.components_ = ppca.components_
-        self.explained_variance_ = ppca.explained_variance_
-        self.explained_variance_ratio_ = ppca.explained_variance_ratio_
-        self.singular_values_ = ppca.singular_values_
+        self.components_ = ppca.components_[:n_components, :]
+        self.explained_variance_ = ppca.explained_variance_[:n_components]
+        self.explained_variance_ratio_ = ppca.explained_variance_ratio_[:n_components]
+        self.singular_values_ = ppca.singular_values_[:n_components]
         self.mean_ = ppca.mean_
-        self.n_components_ = ppca.n_components_
+        self.n_components_ = n_components
         self.n_features_ = ppca.n_features_
         self.n_samples_ = ppca.n_samples_
-        self.noise_variance_ = ppca.noise_variance_
+        # Commenting out noise variance as it depends on the covariance of the estimation
+        # self.noise_variance_ = ppca.noise_variance_
         component_maps = np.dot(
             np.dot(X, self.components_.T), np.diag(1.0 / self.explained_variance_)
         )
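Because svd_solver="full" is deterministic, slicing the all-components fit down to n_components should match fitting a truncated model directly; a quick sanity-check sketch of that equivalence (not part of the commit):

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(0)
X = rng.standard_normal((100, 20))
k = 5

full = PCA(n_components=None, svd_solver="full").fit(X)
trunc = PCA(n_components=k, svd_solver="full").fit(X)

assert np.allclose(full.components_[:k, :], trunc.components_)
assert np.allclose(full.explained_variance_[:k], trunc.explained_variance_)
assert np.allclose(full.singular_values_[:k], trunc.singular_values_)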
