Skip to content

Commit f68b40a

Browse files
authored
Merge pull request #50 from JesseLivezey/pi_stride
Pi stride
2 parents e402859 + 0e30ff0 commit f68b40a

4 files changed

Lines changed: 95 additions & 30 deletions

File tree

dca/cov_util.py

Lines changed: 47 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,10 @@
66
from numpy.lib.stride_tricks import as_strided
77

88
from sklearn.utils.extmath import randomized_svd
9+
from sklearn.utils import check_random_state
910

1011

11-
def form_lag_matrix(X, T, stride=1, stride_tricks=True, writeable=False):
12+
def form_lag_matrix(X, T, stride=1, stride_tricks=True, rng=None, writeable=False):
1213
"""Form the data matrix with `T` lags.
1314
1415
Parameters
@@ -17,8 +18,13 @@ def form_lag_matrix(X, T, stride=1, stride_tricks=True, writeable=False):
1718
Timeseries with no lags.
1819
T : int
1920
Number of lags.
20-
stride : int
21-
Number of original samples to move between lagged samples.
21+
stride : int or float
22+
If stride is an `int`, it defines the stride between lagged samples used
23+
to estimate the cross covariance matrix. Setting stride > 1 can speed up the
24+
calculation, but may lead to a loss in accuracy. Setting stride to a `float`
25+
greater than 0 and less than 1 will randomly subselect samples.
26+
rng : NumPy random state
27+
Only used if `stride` is a float.
2228
stride_tricks : bool
2329
Whether to use numpy stride tricks to form the lagged matrix or create
2430
a new array. Using numpy stride tricks can lower memory usage, especially for
@@ -34,8 +40,14 @@ def form_lag_matrix(X, T, stride=1, stride_tricks=True, writeable=False):
3440
Timeseries with lags.
3541
"""
3642
if not isinstance(stride, int) or stride < 1:
37-
raise ValueError('stride should be an int and greater than or equal to 1.')
43+
if not isinstance(stride, float) or stride <= 0. or stride >= 1.:
44+
raise ValueError('stride should be an int and greater than or equal to 1 or a float ' +
45+
'between 0 and 1.')
3846
N = X.shape[1]
47+
frac = None
48+
if isinstance(stride, float):
49+
frac = stride
50+
stride = 1
3951
n_lagged_samples = (len(X) - T) // stride + 1
4052
if n_lagged_samples < 1:
4153
raise ValueError('T is too long for a timeseries of length {}.'.format(len(X)))
@@ -48,6 +60,12 @@ def form_lag_matrix(X, T, stride=1, stride_tricks=True, writeable=False):
4860
X_with_lags = np.zeros((n_lagged_samples, T * N))
4961
for i in range(n_lagged_samples):
5062
X_with_lags[i, :] = X[i * stride:i * stride + T, :].flatten()
63+
if frac is not None:
64+
rng = check_random_state(rng)
65+
idxs = np.sort(rng.choice(n_lagged_samples, size=int(np.ceil(n_lagged_samples * frac)),
66+
replace=False))
67+
X_with_lags = X_with_lags[idxs]
68+
5169
return X_with_lags
5270

5371

@@ -108,7 +126,7 @@ def toeplitzify(cov, T, N, symmetrize=True):
108126
return cov_toep
109127

110128

111-
def calc_chunked_cov(X, T, stride, chunks, cov_est=None, stride_tricks=True):
129+
def calc_chunked_cov(X, T, stride, chunks, cov_est=None, rng=None, stride_tricks=True):
112130
"""Calculate an unnormalized (by sample count) lagged covariance matrix
113131
in chunks to save memory.
114132
@@ -143,7 +161,7 @@ def calc_chunked_cov(X, T, stride, chunks, cov_est=None, stride_tricks=True):
143161
start = 0
144162
for chunk in range(chunks):
145163
X_with_lags = form_lag_matrix(X[start:ends[chunk]], T, stride=stride,
146-
stride_tricks=stride_tricks)
164+
rng=rng, stride_tricks=stride_tricks)
147165
start = ends[chunk] - T + 1
148166
ni_samples = X_with_lags.shape[0]
149167
cov_est += np.dot(X_with_lags.T, X_with_lags)
@@ -152,7 +170,7 @@ def calc_chunked_cov(X, T, stride, chunks, cov_est=None, stride_tricks=True):
152170

153171

154172
def calc_cross_cov_mats_from_data(X, T, mean=None, chunks=None, stride=1,
155-
regularization=None, reg_ops=None,
173+
rng=None, regularization=None, reg_ops=None,
156174
stride_tricks=True):
157175
"""Compute the N-by-N cross-covariance matrix, where N is the data dimensionality,
158176
for each time lag up to T-1.
@@ -167,6 +185,13 @@ def calc_cross_cov_mats_from_data(X, T, mean=None, chunks=None, stride=1,
167185
chunks : int
168186
Number of chunks to break the data into when calculating the lagged cross
169187
covariance. More chunks will mean less memory used.
188+
stride : int or float
189+
If stride is an `int`, it defines the stride between lagged samples used
190+
to estimate the cross covariance matrix. Setting stride > 1 can speed up the
191+
calculation, but may lead to a loss in accuracy. Setting stride to a `float`
192+
greater than 0 and less than 1 will randomly subselect samples.
193+
rng : NumPy random state
194+
Only used if `stride` is a float.
170195
regularization : string
171196
Regularization method for computing the spatiotemporal covariance matrix.
172197
reg_ops : dict
@@ -200,7 +225,8 @@ def calc_cross_cov_mats_from_data(X, T, mean=None, chunks=None, stride=1,
200225
cov_est = np.zeros((N * T, N * T))
201226
n_samples = 0
202227
for Xi in X:
203-
X_with_lags = form_lag_matrix(Xi, T, stride=stride, stride_tricks=stride_tricks)
228+
X_with_lags = form_lag_matrix(Xi, T, stride=stride, stride_tricks=stride_tricks,
229+
rng=rng)
204230
cov_est += np.dot(X_with_lags.T, X_with_lags)
205231
n_samples += len(X_with_lags)
206232
cov_est /= (n_samples - 1.)
@@ -209,7 +235,7 @@ def calc_cross_cov_mats_from_data(X, T, mean=None, chunks=None, stride=1,
209235
cov_est = np.zeros((N * T, N * T))
210236
for Xi in X:
211237
cov_est, ni_samples = calc_chunked_cov(Xi, T, stride, chunks, cov_est=cov_est,
212-
stride_tricks=stride_tricks)
238+
stride_tricks=stride_tricks, rng=rng)
213239
n_samples += ni_samples
214240
cov_est /= (n_samples - 1.)
215241
else:
@@ -222,11 +248,12 @@ def calc_cross_cov_mats_from_data(X, T, mean=None, chunks=None, stride=1,
222248
X = X - mean
223249
N = X.shape[-1]
224250
if chunks is None:
225-
X_with_lags = form_lag_matrix(X, T, stride=stride, stride_tricks=stride_tricks)
251+
X_with_lags = form_lag_matrix(X, T, stride=stride, stride_tricks=stride_tricks,
252+
rng=rng)
226253
cov_est = np.cov(X_with_lags, rowvar=False)
227254
else:
228255
cov_est, n_samples = calc_chunked_cov(X, T, stride, chunks,
229-
stride_tricks=stride_tricks)
256+
stride_tricks=stride_tricks, rng=rng)
230257
cov_est /= (n_samples - 1.)
231258

232259
if regularization is None:
@@ -342,7 +369,7 @@ def calc_cov_from_cross_cov_mats(cross_cov_mats):
342369
return cov
343370

344371

345-
def calc_pi_from_data(X, T, proj=None):
372+
def calc_pi_from_data(X, T, proj=None, stride=1, rng=None):
346373
"""Calculates the Gaussian Predictive Information between variables
347374
{1,...,T_pi} and {T_pi+1,...,2*T_pi}..
348375
@@ -356,13 +383,20 @@ def calc_pi_from_data(X, T, proj=None):
356383
proj : ndarray or torch tensor
357384
Projection matrix for data (optional). If `proj` is not given, the PI of
358385
the dataset is given.
386+
stride : int or float
387+
If stride is an `int`, it defines the stride between lagged samples used
388+
to estimate the cross covariance matrix. Setting stride > 1 can speed up the
389+
calculation, but may lead to a loss in accuracy. Setting stride to a `float`
390+
greater than 0 and less than 1 will randomly subselect samples.
391+
rng : NumPy random state
392+
Only used if `stride` is a float.
359393
360394
Returns
361395
-------
362396
PI : float
363397
Mutual information in nats.
364398
"""
365-
ccms = calc_cross_cov_mats_from_data(X, T)
399+
ccms = calc_cross_cov_mats_from_data(X, T, stride=stride, rng=rng)
366400

367401
return calc_pi_from_cross_cov_mats(ccms, proj=proj)
368402

dca/dca.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import scipy.stats
44
from scipy.optimize import minimize
55
from scipy.signal.windows import hann
6+
from sklearn.utils import check_random_state
67

78
import torch
89
import torch.nn.functional as F
@@ -275,12 +276,7 @@ def __init__(self, d=None, T=None, init="random_ortho", n_init=1, stride=1, tol=
275276
self.cross_covs = None
276277
self.coef_ = None
277278
self.mean_ = None
278-
if rng_or_seed is None:
279-
self.rng = np.random
280-
elif isinstance(rng_or_seed, np.random.RandomState):
281-
self.rng = rng_or_seed
282-
else:
283-
self.rng = np.random.RandomState(rng_or_seed)
279+
self.rng = check_random_state(rng_or_seed)
284280

285281
def estimate_cross_covariance(self, X, T=None, regularization=None, reg_ops=None):
286282
"""Estimate the cross covariance matrix from data.
@@ -310,6 +306,7 @@ def estimate_cross_covariance(self, X, T=None, regularization=None, reg_ops=None
310306
cross_covs = calc_cross_cov_mats_from_data(X, 2 * self.T, mean=self.mean_,
311307
chunks=self.chunk_cov_estimate,
312308
stride=self.stride,
309+
rng=self.rng,
313310
regularization=regularization,
314311
reg_ops=reg_ops)
315312
self.cross_covs = torch.tensor(cross_covs, device=self.device, dtype=self.dtype)

tests/test_cov_util.py

Lines changed: 6 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
import pytest
44
import torch
55

6-
from dca import DynamicalComponentsAnalysis as DCA
76
from dca.dca import init_coef
87
from dca.data_util import form_lag_matrix
98
from dca.synth_data import gen_lorenz_data
@@ -170,15 +169,11 @@ def test_regularize_cov(lorenz_dataset):
170169
reg_ops={'num_folds': 3})
171170

172171

173-
def test_stride(lorenz_dataset):
172+
def test_stride_pi(lorenz_dataset):
174173
_, _, X, _, _ = lorenz_dataset
175174
X = X[:, :3]
176-
model = DCA(T=1)
177-
model.estimate_cross_covariance(X)
178-
ccms1 = model.cross_covs.numpy()
179-
180-
model = DCA(T=1, stride=2)
181-
model.estimate_cross_covariance(X)
182-
ccms2 = model.cross_covs.numpy()
183-
assert not np.allclose(ccms1, ccms2)
184-
assert_allclose(ccms1, ccms2, atol=5e-2)
175+
pi = calc_pi_from_data(X, 4)
176+
pi2 = calc_pi_from_data(X, 4, stride=2)
177+
assert pi != pi2
178+
pi2 = calc_pi_from_data(X, 4, stride=.5)
179+
assert pi != pi2

tests/test_dca.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from dca import (DynamicalComponentsAnalysis as DCA,
66
DynamicalComponentsAnalysisFFT as DCAFFT)
77
from dca.knn import DynamicalComponentsAnalysisKNN as DCAKNN
8+
from dca.synth_data import gen_lorenz_data
89

910

1011
@pytest.fixture
@@ -13,6 +14,12 @@ def noise_dataset():
1314
return X
1415

1516

17+
@pytest.fixture
18+
def lorenz_dataset():
19+
X = gen_lorenz_data(10000)
20+
return X
21+
22+
1623
def test_DCA(noise_dataset):
1724
"""Test that a DCA model can be fit with no errors.
1825
"""
@@ -145,3 +152,35 @@ def test_DCAKNN(noise_dataset):
145152
model.fit(X)
146153
model.transform(X)
147154
model.fit_transform(X)
155+
156+
157+
def test_stride_DCA(lorenz_dataset):
158+
"""Check that deterministic and random strides work for DCA.
159+
"""
160+
X = lorenz_dataset
161+
model = DCA(T=1)
162+
model.estimate_cross_covariance(X)
163+
ccms1 = model.cross_covs.numpy()
164+
165+
model = DCA(T=1, stride=2)
166+
model.estimate_cross_covariance(X)
167+
ccms2 = model.cross_covs.numpy()
168+
assert not np.allclose(ccms1, ccms2)
169+
assert_allclose(ccms1, ccms2, atol=5e-2)
170+
171+
model = DCA(T=1, stride=.5, rng_or_seed=0)
172+
model.estimate_cross_covariance(X)
173+
ccms2 = model.cross_covs.numpy()
174+
assert not np.allclose(ccms1, ccms2)
175+
assert_allclose(ccms1, ccms2, atol=5e-2)
176+
177+
model = DCA(T=1, stride=.5, rng_or_seed=1)
178+
model.estimate_cross_covariance(X)
179+
ccms1 = model.cross_covs.numpy()
180+
assert not np.allclose(ccms1, ccms2)
181+
assert_allclose(ccms1, ccms2, atol=5e-2)
182+
183+
model = DCA(T=1, stride=.5, rng_or_seed=1)
184+
model.estimate_cross_covariance(X)
185+
ccms2 = model.cross_covs.numpy()
186+
assert_allclose(ccms1, ccms2)

0 commit comments

Comments
 (0)