66from numpy .lib .stride_tricks import as_strided
77
88from sklearn .utils .extmath import randomized_svd
9+ from sklearn .utils import check_random_state
910
1011
11- def form_lag_matrix (X , T , stride = 1 , stride_tricks = True , writeable = False ):
12+ def form_lag_matrix (X , T , stride = 1 , stride_tricks = True , rng = None , writeable = False ):
1213 """Form the data matrix with `T` lags.
1314
1415 Parameters
@@ -17,8 +18,13 @@ def form_lag_matrix(X, T, stride=1, stride_tricks=True, writeable=False):
1718 Timeseries with no lags.
1819 T : int
1920 Number of lags.
20- stride : int
21- Number of original samples to move between lagged samples.
21+ stride : int or float
22+ If stride is an `int`, it defines the stride between lagged samples used
23+ to estimate the cross covariance matrix. Setting stride > 1 can speed up the
24+ calculation, but may lead to a loss in accuracy. Setting stride to a `float`
25+ greater than 0 and less than 1 will random subselect samples.
26+ rng : NumPy random state
27+ Only used if `stride` is a float.
2228 stride_tricks : bool
2329 Whether to use numpy stride tricks to form the lagged matrix or create
2430 a new array. Using numpy stride tricks can can lower memory usage, especially for
@@ -34,8 +40,14 @@ def form_lag_matrix(X, T, stride=1, stride_tricks=True, writeable=False):
3440 Timeseries with lags.
3541 """
3642 if not isinstance (stride , int ) or stride < 1 :
37- raise ValueError ('stride should be an int and greater than or equal to 1.' )
43+ if not isinstance (stride , float ) or stride <= 0. or stride >= 1. :
44+ raise ValueError ('stride should be an int and greater than or equal to 1 or a float ' +
45+ 'between 0 and 1.' )
3846 N = X .shape [1 ]
47+ frac = None
48+ if isinstance (stride , float ):
49+ frac = stride
50+ stride = 1
3951 n_lagged_samples = (len (X ) - T ) // stride + 1
4052 if n_lagged_samples < 1 :
4153 raise ValueError ('T is too long for a timeseries of length {}.' .format (len (X )))
@@ -48,6 +60,12 @@ def form_lag_matrix(X, T, stride=1, stride_tricks=True, writeable=False):
4860 X_with_lags = np .zeros ((n_lagged_samples , T * N ))
4961 for i in range (n_lagged_samples ):
5062 X_with_lags [i , :] = X [i * stride :i * stride + T , :].flatten ()
63+ if frac is not None :
64+ rng = check_random_state (rng )
65+ idxs = np .sort (rng .choice (n_lagged_samples , size = int (np .ceil (n_lagged_samples * frac )),
66+ replace = False ))
67+ X_with_lags = X_with_lags [idxs ]
68+
5169 return X_with_lags
5270
5371
@@ -108,7 +126,7 @@ def toeplitzify(cov, T, N, symmetrize=True):
108126 return cov_toep
109127
110128
111- def calc_chunked_cov (X , T , stride , chunks , cov_est = None , stride_tricks = True ):
129+ def calc_chunked_cov (X , T , stride , chunks , cov_est = None , rng = None , stride_tricks = True ):
112130 """Calculate an unormalized (by sample count) lagged covariance matrix
113131 in chunks to save memory.
114132
@@ -143,7 +161,7 @@ def calc_chunked_cov(X, T, stride, chunks, cov_est=None, stride_tricks=True):
143161 start = 0
144162 for chunk in range (chunks ):
145163 X_with_lags = form_lag_matrix (X [start :ends [chunk ]], T , stride = stride ,
146- stride_tricks = stride_tricks )
164+ rng = rng , stride_tricks = stride_tricks )
147165 start = ends [chunk ] - T + 1
148166 ni_samples = X_with_lags .shape [0 ]
149167 cov_est += np .dot (X_with_lags .T , X_with_lags )
@@ -152,7 +170,7 @@ def calc_chunked_cov(X, T, stride, chunks, cov_est=None, stride_tricks=True):
152170
153171
154172def calc_cross_cov_mats_from_data (X , T , mean = None , chunks = None , stride = 1 ,
155- regularization = None , reg_ops = None ,
173+ rng = None , regularization = None , reg_ops = None ,
156174 stride_tricks = True ):
157175 """Compute the N-by-N cross-covariance matrix, where N is the data dimensionality,
158176 for each time lag up to T-1.
@@ -167,6 +185,13 @@ def calc_cross_cov_mats_from_data(X, T, mean=None, chunks=None, stride=1,
167185 chunks : int
168186 Number of chunks to break the data into when calculating the lagged cross
169187 covariance. More chunks will mean less memory used
188+ stride : int or float
189+ If stride is an `int`, it defines the stride between lagged samples used
190+ to estimate the cross covariance matrix. Setting stride > 1 can speed up the
191+ calculation, but may lead to a loss in accuracy. Setting stride to a `float`
192+ greater than 0 and less than 1 will random subselect samples.
193+ rng : NumPy random state
194+ Only used if `stride` is a float.
170195 regularization : string
171196 Regularization method for computing the spatiotemporal covariance matrix.
172197 reg_ops : dict
@@ -200,7 +225,8 @@ def calc_cross_cov_mats_from_data(X, T, mean=None, chunks=None, stride=1,
200225 cov_est = np .zeros ((N * T , N * T ))
201226 n_samples = 0
202227 for Xi in X :
203- X_with_lags = form_lag_matrix (Xi , T , stride = stride , stride_tricks = stride_tricks )
228+ X_with_lags = form_lag_matrix (Xi , T , stride = stride , stride_tricks = stride_tricks ,
229+ rng = rng )
204230 cov_est += np .dot (X_with_lags .T , X_with_lags )
205231 n_samples += len (X_with_lags )
206232 cov_est /= (n_samples - 1. )
@@ -209,7 +235,7 @@ def calc_cross_cov_mats_from_data(X, T, mean=None, chunks=None, stride=1,
209235 cov_est = np .zeros ((N * T , N * T ))
210236 for Xi in X :
211237 cov_est , ni_samples = calc_chunked_cov (Xi , T , stride , chunks , cov_est = cov_est ,
212- stride_tricks = stride_tricks )
238+ stride_tricks = stride_tricks , rng = rng )
213239 n_samples += ni_samples
214240 cov_est /= (n_samples - 1. )
215241 else :
@@ -222,11 +248,12 @@ def calc_cross_cov_mats_from_data(X, T, mean=None, chunks=None, stride=1,
222248 X = X - mean
223249 N = X .shape [- 1 ]
224250 if chunks is None :
225- X_with_lags = form_lag_matrix (X , T , stride = stride , stride_tricks = stride_tricks )
251+ X_with_lags = form_lag_matrix (X , T , stride = stride , stride_tricks = stride_tricks ,
252+ rng = rng )
226253 cov_est = np .cov (X_with_lags , rowvar = False )
227254 else :
228255 cov_est , n_samples = calc_chunked_cov (X , T , stride , chunks ,
229- stride_tricks = stride_tricks )
256+ stride_tricks = stride_tricks , rng = rng )
230257 cov_est /= (n_samples - 1. )
231258
232259 if regularization is None :
@@ -342,7 +369,7 @@ def calc_cov_from_cross_cov_mats(cross_cov_mats):
342369 return cov
343370
344371
345- def calc_pi_from_data (X , T , proj = None ):
372+ def calc_pi_from_data (X , T , proj = None , stride = 1 , rng = None ):
346373 """Calculates the Gaussian Predictive Information between variables
347374 {1,...,T_pi} and {T_pi+1,...,2*T_pi}..
348375
@@ -356,13 +383,20 @@ def calc_pi_from_data(X, T, proj=None):
356383 proj : ndarray or torch tensor
357384 Projection matrix for data (optional). If `proj` is not given, the PI of
358385 the dataset is given.
386+ stride : int or float
387+ If stride is an `int`, it defines the stride between lagged samples used
388+ to estimate the cross covariance matrix. Setting stride > 1 can speed up the
389+ calculation, but may lead to a loss in accuracy. Setting stride to a `float`
390+ greater than 0 and less than 1 will random subselect samples.
391+ rng : NumPy random state
392+ Only used if `stride` is a float.
359393
360394 Returns
361395 -------
362396 PI : float
363397 Mutual information in nats.
364398 """
365- ccms = calc_cross_cov_mats_from_data (X , T )
399+ ccms = calc_cross_cov_mats_from_data (X , T , stride = stride , rng = rng )
366400
367401 return calc_pi_from_cross_cov_mats (ccms , proj = proj )
368402
0 commit comments