1"""
2Statistical tools for time series analysis
3"""
4from statsmodels.compat.python import iteritems, lrange, lzip
5from statsmodels.compat.pandas import deprecate_kwarg
6from statsmodels.compat.numpy import lstsq
7from statsmodels.compat.scipy import _next_regular
9import numpy as np
10from numpy.linalg import LinAlgError
11from scipy import stats
12import pandas as pd
14from statsmodels.regression.linear_model import OLS, yule_walker
15from statsmodels.tools.sm_exceptions import (InterpolationWarning,
16 MissingDataError,
17 CollinearityWarning)
18from statsmodels.tools.tools import add_constant, Bunch
19from statsmodels.tools.validation import (array_like, string_like, bool_like,
20 int_like, dict_like, float_like)
21from statsmodels.tsa._bds import bds
22from statsmodels.tsa._innovations import innovations_filter, innovations_algo
23from statsmodels.tsa.adfvalues import mackinnonp, mackinnoncrit
24from statsmodels.tsa.arima_model import ARMA
25from statsmodels.tsa.tsatools import lagmat, lagmat2ds, add_trend
27__all__ = ['acovf', 'acf', 'pacf', 'pacf_yw', 'pacf_ols', 'ccovf', 'ccf',
28 'periodogram', 'q_stat', 'coint', 'arma_order_select_ic',
29 'adfuller', 'kpss', 'bds', 'pacf_burg', 'innovations_algo',
30 'innovations_filter', 'levinson_durbin_pacf', 'levinson_durbin',
31 'zivot_andrews']
33SQRTEPS = np.sqrt(np.finfo(np.double).eps)


# NOTE: now in two places to avoid circular import
# TODO: I like the bunch pattern for this too.
class ResultsStore(object):
    def __str__(self):
        return self._str  # pylint: disable=E1101


def _autolag(mod, endog, exog, startlag, maxlag, method, modargs=(),
             fitargs=(), regresults=False):
    """
    Returns the results for the lag length that maximizes the info criterion.

    Parameters
    ----------
    mod : Model class
        Model estimator class
    endog : array_like
        nobs array containing endogenous variable
    exog : array_like
        nobs by (startlag + maxlag) array containing lags and possibly other
        variables
    startlag : int
        The first zero-indexed column to hold a lag.  See Notes.
    maxlag : int
        The highest lag order for lag length selection.
    method : {'aic', 'bic', 't-stat'}
        aic - Akaike Information Criterion
        bic - Bayes Information Criterion
        t-stat - Based on last lag
    modargs : tuple, optional
        args to pass to model.  See notes.
    fitargs : tuple, optional
        args to pass to fit.  See notes.
    regresults : bool, optional
        Flag indicating to return optional return results

    Returns
    -------
    icbest : float
        Best information criteria.
    bestlag : int
        The lag length that maximizes the information criterion.
    results : dict, optional
        Dictionary containing all estimation results

    Notes
    -----
    Does estimation like mod(endog, exog[:, :i], *modargs).fit(*fitargs)
    where i goes from startlag to startlag + maxlag + 1.  Therefore, lags
    are assumed to be in contiguous columns from low to high lag length
    with the highest lag in the last column.
    """
    # TODO: can tcol be replaced by maxlag + 2?
    # TODO: This could be changed to laggedRHS and exog keyword arguments if
    #    this will be more general.
    results = {}
    method = method.lower()
    for lag in range(startlag, startlag + maxlag + 1):
        mod_instance = mod(endog, exog[:, :lag], *modargs)
        results[lag] = mod_instance.fit()

    if method == "aic":
        icbest, bestlag = min((v.aic, k) for k, v in iteritems(results))
    elif method == "bic":
        icbest, bestlag = min((v.bic, k) for k, v in iteritems(results))
    elif method == "t-stat":
        # stop = stats.norm.ppf(.95)
        stop = 1.6448536269514722
        for lag in range(startlag + maxlag, startlag - 1, -1):
            icbest = np.abs(results[lag].tvalues[-1])
            bestlag = lag
            if np.abs(icbest) >= stop:
                # Stop at the first (largest) lag with a significant t-stat
                break
    else:
        raise ValueError("Information Criterion %s not understood." % method)

    if not regresults:
        return icbest, bestlag
    else:
        return icbest, bestlag, results


# this needs to be converted to a class like HetGoldfeldQuandt,
# 3 different returns are a mess
# See:
# Ng and Perron(2001), Lag length selection and the construction of unit root
# tests with good size and power, Econometrica, Vol 69 (6) pp 1519-1554
# TODO: include drift keyword, only valid with regression == "c"
# just changes the distribution of the test statistic to a t distribution
# TODO: autolag is untested
def adfuller(x, maxlag=None, regression="c", autolag='AIC',
             store=False, regresults=False):
    """
    Augmented Dickey-Fuller unit root test.

    The Augmented Dickey-Fuller test can be used to test for a unit root in
    a univariate process in the presence of serial correlation.

    Parameters
    ----------
    x : array_like, 1d
        The data series to test.
    maxlag : int
        Maximum lag which is included in test, default 12*(nobs/100)^{1/4}.
    regression : {'c', 'ct', 'ctt', 'nc'}
        Constant and trend order to include in regression.

        * 'c' : constant only (default).
        * 'ct' : constant and trend.
        * 'ctt' : constant, and linear and quadratic trend.
        * 'nc' : no constant, no trend.

    autolag : {'AIC', 'BIC', 't-stat', None}
        Method to use when automatically determining the lag.

        * if None, then maxlag lags are used.
        * if 'AIC' (default) or 'BIC', then the number of lags is chosen
          to minimize the corresponding information criterion.
        * 't-stat' based choice of maxlag.  Starts with maxlag and drops a
          lag until the t-statistic on the last lag length is significant
          using a 5%-sized test.

    store : bool
        If True, then a result instance is returned additionally to
        the adf statistic.  Default is False.
    regresults : bool, optional
        If True, the full regression results are returned.  Default is
        False.

    Returns
    -------
    adf : float
        The test statistic.
    pvalue : float
        MacKinnon's approximate p-value based on MacKinnon (1994, 2010).
    usedlag : int
        The number of lags used.
    nobs : int
        The number of observations used for the ADF regression and
        calculation of the critical values.
    critical values : dict
        Critical values for the test statistic at the 1 %, 5 %, and 10 %
        levels.  Based on MacKinnon (2010).
    icbest : float
        The maximized information criterion if autolag is not None.
    resstore : ResultStore, optional
        A dummy class with results attached as attributes.

    Notes
    -----
    The null hypothesis of the Augmented Dickey-Fuller is that there is a
    unit root, with the alternative that there is no unit root.  If the
    pvalue is above a critical size, then we cannot reject that there is a
    unit root.

    The p-values are obtained through regression surface approximation from
    MacKinnon 1994, but using the updated 2010 tables.  If the p-value is
    close to significant, then the critical values should be used to judge
    whether to reject the null.

    The autolag option and maxlag for it are described in Greene.

    References
    ----------
    .. [1] W. Greene. "Econometric Analysis," 5th ed., Pearson, 2003.

    .. [2] Hamilton, J.D. "Time Series Analysis". Princeton, 1994.

    .. [3] MacKinnon, J.G. 1994. "Approximate asymptotic distribution
       functions for unit-root and cointegration tests." `Journal of
       Business and Economic Statistics` 12, 167-76.

    .. [4] MacKinnon, J.G. 2010. "Critical Values for Cointegration Tests."
       Queen's University, Dept of Economics, Working Papers.  Available at
       http://ideas.repec.org/p/qed/wpaper/1227.html

    Examples
    --------
    See example notebook
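
    A minimal sketch on a simulated random walk (illustrative values only):

    >>> import numpy as np
    >>> from statsmodels.tsa.stattools import adfuller
    >>> np.random.seed(12345)
    >>> x = np.cumsum(np.random.standard_normal(250))  # a random walk
    >>> adf, pvalue, usedlag, nobs, crit, icbest = adfuller(x)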
214 """
215 x = array_like(x, 'x')
216 maxlag = int_like(maxlag, 'maxlag', optional=True)
217 regression = string_like(regression, 'regression',
218 options=('c', 'ct', 'ctt', 'nc'))
219 autolag = string_like(autolag, 'autolag', optional=True,
220 options=('aic', 'bic', 't-stat'))
221 store = bool_like(store, 'store')
222 regresults = bool_like(regresults, 'regresults')
224 if regresults:
225 store = True
227 trenddict = {None: 'nc', 0: 'c', 1: 'ct', 2: 'ctt'}
228 if regression is None or isinstance(regression, int):
229 regression = trenddict[regression]
230 regression = regression.lower()
231 nobs = x.shape[0]
233 ntrend = len(regression) if regression != 'nc' else 0
234 if maxlag is None:
235 # from Greene referencing Schwert 1989
236 maxlag = int(np.ceil(12. * np.power(nobs / 100., 1 / 4.)))
237 # -1 for the diff
238 maxlag = min(nobs // 2 - ntrend - 1, maxlag)
239 if maxlag < 0:
240 raise ValueError('sample size is too short to use selected '
241 'regression component')
242 elif maxlag > nobs // 2 - ntrend - 1:
243 raise ValueError('maxlag must be less than (nobs/2 - 1 - ntrend) '
244 'where n trend is the number of included '
245 'deterministic regressors')
246 xdiff = np.diff(x)
247 xdall = lagmat(xdiff[:, None], maxlag, trim='both', original='in')
248 nobs = xdall.shape[0]
250 xdall[:, 0] = x[-nobs - 1:-1] # replace 0 xdiff with level of x
251 xdshort = xdiff[-nobs:]
253 if store:
254 resstore = ResultsStore()
255 if autolag:
256 if regression != 'nc':
257 fullRHS = add_trend(xdall, regression, prepend=True)
258 else:
259 fullRHS = xdall
260 startlag = fullRHS.shape[1] - xdall.shape[1] + 1
261 # 1 for level
262 # search for lag length with smallest information criteria
263 # Note: use the same number of observations to have comparable IC
264 # aic and bic: smaller is better
266 if not regresults:
267 icbest, bestlag = _autolag(OLS, xdshort, fullRHS, startlag,
268 maxlag, autolag)
269 else:
270 icbest, bestlag, alres = _autolag(OLS, xdshort, fullRHS, startlag,
271 maxlag, autolag,
272 regresults=regresults)
273 resstore.autolag_results = alres
275 bestlag -= startlag # convert to lag not column index
277 # rerun ols with best autolag
278 xdall = lagmat(xdiff[:, None], bestlag, trim='both', original='in')
279 nobs = xdall.shape[0]
280 xdall[:, 0] = x[-nobs - 1:-1] # replace 0 xdiff with level of x
281 xdshort = xdiff[-nobs:]
282 usedlag = bestlag
283 else:
284 usedlag = maxlag
285 icbest = None
286 if regression != 'nc':
287 resols = OLS(xdshort, add_trend(xdall[:, :usedlag + 1],
288 regression)).fit()
289 else:
290 resols = OLS(xdshort, xdall[:, :usedlag + 1]).fit()
292 adfstat = resols.tvalues[0]
293# adfstat = (resols.params[0]-1.0)/resols.bse[0]
294 # the "asymptotically correct" z statistic is obtained as
295 # nobs/(1-np.sum(resols.params[1:-(trendorder+1)])) (resols.params[0] - 1)
296 # I think this is the statistic that is used for series that are integrated
297 # for orders higher than I(1), ie., not ADF but cointegration tests.
299 # Get approx p-value and critical values
300 pvalue = mackinnonp(adfstat, regression=regression, N=1)
301 critvalues = mackinnoncrit(N=1, regression=regression, nobs=nobs)
302 critvalues = {"1%" : critvalues[0], "5%" : critvalues[1],
303 "10%" : critvalues[2]}
304 if store:
305 resstore.resols = resols
306 resstore.maxlag = maxlag
307 resstore.usedlag = usedlag
308 resstore.adfstat = adfstat
309 resstore.critvalues = critvalues
310 resstore.nobs = nobs
311 resstore.H0 = ("The coefficient on the lagged level equals 1 - "
312 "unit root")
313 resstore.HA = "The coefficient on the lagged level < 1 - stationary"
314 resstore.icbest = icbest
315 resstore._str = 'Augmented Dickey-Fuller Test Results'
316 return adfstat, pvalue, critvalues, resstore
317 else:
318 if not autolag:
319 return adfstat, pvalue, usedlag, nobs, critvalues
320 else:
321 return adfstat, pvalue, usedlag, nobs, critvalues, icbest


def acovf(x, unbiased=False, demean=True, fft=None, missing='none', nlag=None):
    """
    Estimate autocovariances.

    Parameters
    ----------
    x : array_like
        Time series data.  Must be 1d.
    unbiased : bool
        If True, then denominators are n-k, otherwise n.
    demean : bool
        If True, then subtract the mean x from each element of x.
    fft : bool
        If True, use FFT convolution.  This method should be preferred
        for long time series.
    missing : str
        A string in ['none', 'raise', 'conservative', 'drop'] specifying
        how the NaNs are to be treated.
    nlag : {int, None}
        Limit the number of autocovariances returned.  Size of returned
        array is nlag + 1.  Setting nlag when fft is False uses a simple,
        direct estimator of the autocovariances that only computes the
        first nlag + 1 values.  This can be much faster when the time
        series is long and only a small number of autocovariances are
        needed.

    Returns
    -------
    ndarray
        The estimated autocovariances.

    References
    ----------
    .. [1] Parzen, E., 1963. On spectral analysis with missing observations
       and amplitude modulation. Sankhya: The Indian Journal of
       Statistics, Series A, pp.383-392.
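
    Examples
    --------
    A minimal sketch on simulated data (illustrative only):

    >>> import numpy as np
    >>> from statsmodels.tsa.stattools import acovf
    >>> np.random.seed(0)
    >>> x = np.random.standard_normal(200)
    >>> gamma = acovf(x, fft=True, nlag=10)  # autocovariances, lags 0..10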
359 """
360 unbiased = bool_like(unbiased, 'unbiased')
361 demean = bool_like(demean, 'demean')
362 fft = bool_like(fft, 'fft', optional=True)
363 missing = string_like(missing, 'missing',
364 options=('none', 'raise', 'conservative', 'drop'))
365 nlag = int_like(nlag, 'nlag', optional=True)
367 if fft is None:
368 import warnings
369 msg = 'fft=True will become the default in a future version of ' \
370 'statsmodels. To suppress this warning, explicitly set ' \
371 'fft=False.'
372 warnings.warn(msg, FutureWarning)
373 fft = False
375 x = array_like(x, 'x', ndim=1)
377 missing = missing.lower()
378 if missing not in ['none', 'raise', 'conservative', 'drop']:
379 raise ValueError("missing option %s not understood" % missing)
380 if missing == 'none':
381 deal_with_masked = False
382 else:
383 deal_with_masked = has_missing(x)
384 if deal_with_masked:
385 if missing == 'raise':
386 raise MissingDataError("NaNs were encountered in the data")
387 notmask_bool = ~np.isnan(x) # bool
388 if missing == 'conservative':
389 # Must copy for thread safety
390 x = x.copy()
391 x[~notmask_bool] = 0
392 else: # 'drop'
393 x = x[notmask_bool] # copies non-missing
394 notmask_int = notmask_bool.astype(int) # int
396 if demean and deal_with_masked:
397 # whether 'drop' or 'conservative':
398 xo = x - x.sum() / notmask_int.sum()
399 if missing == 'conservative':
400 xo[~notmask_bool] = 0
401 elif demean:
402 xo = x - x.mean()
403 else:
404 xo = x
406 n = len(x)
407 lag_len = nlag
408 if nlag is None:
409 lag_len = n - 1
410 elif nlag > n - 1:
411 raise ValueError('nlag must be smaller than nobs - 1')
413 if not fft and nlag is not None:
414 acov = np.empty(lag_len + 1)
415 acov[0] = xo.dot(xo)
416 for i in range(lag_len):
417 acov[i + 1] = xo[i + 1:].dot(xo[:-(i + 1)])
418 if not deal_with_masked or missing == 'drop':
419 if unbiased:
420 acov /= (n - np.arange(lag_len + 1))
421 else:
422 acov /= n
423 else:
424 if unbiased:
425 divisor = np.empty(lag_len + 1, dtype=np.int64)
426 divisor[0] = notmask_int.sum()
427 for i in range(lag_len):
428 divisor[i + 1] = notmask_int[i + 1:].dot(notmask_int[:-(i + 1)])
429 divisor[divisor == 0] = 1
430 acov /= divisor
431 else: # biased, missing data but npt 'drop'
432 acov /= notmask_int.sum()
433 return acov
435 if unbiased and deal_with_masked and missing == 'conservative':
436 d = np.correlate(notmask_int, notmask_int, 'full')
437 d[d == 0] = 1
438 elif unbiased:
439 xi = np.arange(1, n + 1)
440 d = np.hstack((xi, xi[:-1][::-1]))
441 elif deal_with_masked: # biased and NaNs given and ('drop' or 'conservative')
442 d = notmask_int.sum() * np.ones(2 * n - 1)
443 else: # biased and no NaNs or missing=='none'
444 d = n * np.ones(2 * n - 1)
446 if fft:
447 nobs = len(xo)
448 n = _next_regular(2 * nobs + 1)
449 Frf = np.fft.fft(xo, n=n)
450 acov = np.fft.ifft(Frf * np.conjugate(Frf))[:nobs] / d[nobs - 1:]
451 acov = acov.real
452 else:
453 acov = np.correlate(xo, xo, 'full')[n - 1:] / d[n - 1:]
455 if nlag is not None:
456 # Copy to allow gc of full array rather than view
457 return acov[:lag_len + 1].copy()
458 return acov


def q_stat(x, nobs, type=None):
    """
    Compute Ljung-Box Q Statistic.

    Parameters
    ----------
    x : array_like
        Array of autocorrelation coefficients.  Can be obtained from acf.
    nobs : int, optional
        Number of observations in the entire sample (ie., not just the
        length of the autocorrelation function results).

    Returns
    -------
    q-stat : ndarray
        Ljung-Box Q-statistic for autocorrelation parameters.
    p-value : ndarray
        P-value of the Q statistic.

    Notes
    -----
    Designed to be used with acf.
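
    Examples
    --------
    A minimal sketch, reusing autocorrelations from ``acf`` (illustrative
    only):

    >>> import numpy as np
    >>> from statsmodels.tsa.stattools import acf, q_stat
    >>> np.random.seed(0)
    >>> x = np.random.standard_normal(200)
    >>> r = acf(x, nlags=10, fft=True)
    >>> qstats, pvalues = q_stat(r[1:], nobs=len(x))  # drop lag 0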
483 """
484 x = array_like(x, 'x')
485 nobs = int_like(nobs, 'nobs')
487 if type is not None:
488 import warnings
489 warnings.warn('The `type` argument is deprecated and has no effect',
490 FutureWarning)
491 ret = (nobs * (nobs + 2) *
492 np.cumsum((1. / (nobs - np.arange(1, len(x) + 1))) * x ** 2))
493 chi2 = stats.chi2.sf(ret, np.arange(1, len(x) + 1))
494 return ret, chi2


# NOTE: Changed unbiased to False
# see for example
# http://www.itl.nist.gov/div898/handbook/eda/section3/autocopl.htm
def acf(x, unbiased=False, nlags=40, qstat=False, fft=None, alpha=None,
        missing='none'):
    """
    Calculate the autocorrelation function.

    Parameters
    ----------
    x : array_like
        The time series data.
    unbiased : bool
        If True, then denominators for autocovariance are n-k, otherwise n.
    nlags : int, optional
        Number of lags to return autocorrelation for.
    qstat : bool, optional
        If True, returns the Ljung-Box q statistic for each autocorrelation
        coefficient.  See q_stat for more information.
    fft : bool, optional
        If True, computes the ACF via FFT.
    alpha : scalar, optional
        If a number is given, the confidence intervals for the given level
        are returned.  For instance if alpha=.05, 95 % confidence intervals
        are returned where the standard deviation is computed according to
        Bartlett's formula.
    missing : str, optional
        A string in ['none', 'raise', 'conservative', 'drop'] specifying
        how the NaNs are to be treated.

    Returns
    -------
    acf : ndarray
        The autocorrelation function.
    confint : ndarray, optional
        Confidence intervals for the ACF.  Returned if alpha is not None.
    qstat : ndarray, optional
        The Ljung-Box Q-Statistic.  Returned if qstat is True.
    pvalues : ndarray, optional
        The p-values associated with the Q-statistics.  Returned if qstat
        is True.

    Notes
    -----
    The acf at lag 0 (ie., 1) is returned.

    For very long time series it is recommended to use fft convolution
    instead.  When fft is False uses a simple, direct estimator of the
    autocovariances that only computes the first nlag + 1 values.  This can
    be much faster when the time series is long and only a small number of
    autocovariances are needed.

    If unbiased is true, the denominator for the autocovariance is adjusted
    but the autocorrelation is not an unbiased estimator.

    References
    ----------
    .. [1] Parzen, E., 1963. On spectral analysis with missing observations
       and amplitude modulation. Sankhya: The Indian Journal of
       Statistics, Series A, pp.383-392.
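
    Examples
    --------
    A minimal sketch with Bartlett confidence intervals (illustrative only):

    >>> import numpy as np
    >>> from statsmodels.tsa.stattools import acf
    >>> np.random.seed(0)
    >>> x = np.random.standard_normal(200)
    >>> r, confint = acf(x, nlags=20, alpha=.05, fft=True)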
557 """
558 unbiased = bool_like(unbiased, 'unbiased')
559 nlags = int_like(nlags, 'nlags')
560 qstat = bool_like(qstat, 'qstat')
561 fft = bool_like(fft, 'fft', optional=True)
562 alpha = float_like(alpha, 'alpha', optional=True)
563 missing = string_like(missing, 'missing',
564 options=('none', 'raise', 'conservative', 'drop'))
566 if fft is None:
567 import warnings
568 warnings.warn(
569 'fft=True will become the default in a future version of '
570 'statsmodels. To suppress this warning, explicitly set '
571 'fft=False.',
572 FutureWarning
573 )
574 fft = False
575 x = array_like(x, 'x')
576 nobs = len(x) # TODO: should this shrink for missing='drop' and NaNs in x?
577 avf = acovf(x, unbiased=unbiased, demean=True, fft=fft, missing=missing)
578 acf = avf[:nlags + 1] / avf[0]
579 if not (qstat or alpha):
580 return acf
581 if alpha is not None:
582 varacf = np.ones(nlags + 1) / nobs
583 varacf[0] = 0
584 varacf[1] = 1. / nobs
585 varacf[2:] *= 1 + 2 * np.cumsum(acf[1:-1]**2)
586 interval = stats.norm.ppf(1 - alpha / 2.) * np.sqrt(varacf)
587 confint = np.array(lzip(acf - interval, acf + interval))
588 if not qstat:
589 return acf, confint
590 if qstat:
591 qstat, pvalue = q_stat(acf[1:], nobs=nobs) # drop lag 0
592 if alpha is not None:
593 return acf, confint, qstat, pvalue
594 else:
595 return acf, qstat, pvalue


def pacf_yw(x, nlags=40, method='unbiased'):
    """
    Partial autocorrelation estimated with non-recursive yule_walker.

    Parameters
    ----------
    x : array_like
        The observations of time series for which pacf is calculated.
    nlags : int, optional
        The largest lag for which pacf is returned.
    method : {'unbiased', 'mle'}
        The method for the autocovariance calculations in yule walker.

    Returns
    -------
    ndarray
        The partial autocorrelations, maxlag+1 elements.

    See Also
    --------
    statsmodels.tsa.stattools.pacf
        Partial autocorrelation estimation.
    statsmodels.tsa.stattools.pacf_ols
        Partial autocorrelation estimation using OLS.
    statsmodels.tsa.stattools.pacf_burg
        Partial autocorrelation estimation using Burg's method.

    Notes
    -----
    This solves yule_walker for each desired lag and contains
    currently duplicate calculations.
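
    Examples
    --------
    A minimal sketch (illustrative only):

    >>> import numpy as np
    >>> from statsmodels.tsa.stattools import pacf_yw
    >>> np.random.seed(0)
    >>> x = np.random.standard_normal(200)
    >>> p = pacf_yw(x, nlags=10, method='mle')  # pacf at lags 0..10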
629 """
630 x = array_like(x, 'x')
631 nlags = int_like(nlags, 'nlags')
632 method = string_like(method, 'method', options=('unbiased', 'mle'))
634 pacf = [1.]
635 for k in range(1, nlags + 1):
636 pacf.append(yule_walker(x, k, method=method)[0][-1])
637 return np.array(pacf)


def pacf_burg(x, nlags=None, demean=True):
    """
    Calculate Burg's partial autocorrelation estimator.

    Parameters
    ----------
    x : array_like
        Observations of time series for which pacf is calculated.
    nlags : int, optional
        Number of lags to compute the partial autocorrelations.  If omitted,
        uses the smaller of 10 * log10(nobs) or nobs - 1.
    demean : bool, optional
        Flag indicating to demean the data.  Set to False if x has been
        previously demeaned.

    Returns
    -------
    pacf : ndarray
        Partial autocorrelations for lags 0, 1, ..., nlag.
    sigma2 : ndarray
        Residual variance estimates where the value in position m is the
        residual variance in an AR model that includes m lags.

    See Also
    --------
    statsmodels.tsa.stattools.pacf
        Partial autocorrelation estimation.
    statsmodels.tsa.stattools.pacf_yw
        Partial autocorrelation estimation using Yule-Walker.
    statsmodels.tsa.stattools.pacf_ols
        Partial autocorrelation estimation using OLS.

    References
    ----------
    .. [1] Brockwell, P.J. and Davis, R.A., 2016. Introduction to time series
       and forecasting. Springer.
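
    Examples
    --------
    A minimal sketch (illustrative only):

    >>> import numpy as np
    >>> from statsmodels.tsa.stattools import pacf_burg
    >>> np.random.seed(0)
    >>> x = np.random.standard_normal(200)
    >>> p, sigma2 = pacf_burg(x, nlags=10)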
676 """
677 x = array_like(x, 'x')
678 if demean:
679 x = x - x.mean()
680 nobs = x.shape[0]
681 p = nlags if nlags is not None else min(int(10 * np.log10(nobs)), nobs - 1)
682 if p > nobs - 1:
683 raise ValueError('nlags must be smaller than nobs - 1')
684 d = np.zeros(p + 1)
685 d[0] = 2 * x.dot(x)
686 pacf = np.zeros(p + 1)
687 u = x[::-1].copy()
688 v = x[::-1].copy()
689 d[1] = u[:-1].dot(u[:-1]) + v[1:].dot(v[1:])
690 pacf[1] = 2 / d[1] * v[1:].dot(u[:-1])
691 last_u = np.empty_like(u)
692 last_v = np.empty_like(v)
693 for i in range(1, p):
694 last_u[:] = u
695 last_v[:] = v
696 u[1:] = last_u[:-1] - pacf[i] * last_v[1:]
697 v[1:] = last_v[1:] - pacf[i] * last_u[:-1]
698 d[i + 1] = (1 - pacf[i] ** 2) * d[i] - v[i] ** 2 - u[-1] ** 2
699 pacf[i + 1] = 2 / d[i + 1] * v[i + 1:].dot(u[i:-1])
700 sigma2 = (1 - pacf ** 2) * d / (2. * (nobs - np.arange(0, p + 1)))
701 pacf[0] = 1 # Insert the 0 lag partial autocorrel
703 return pacf, sigma2


def pacf_ols(x, nlags=40, efficient=True, unbiased=False):
    """
    Calculate partial autocorrelations via OLS.

    Parameters
    ----------
    x : array_like
        Observations of time series for which pacf is calculated.
    nlags : int
        Number of lags for which pacf is returned.  Lag 0 is not returned.
    efficient : bool, optional
        If true, uses the maximum number of available observations to
        compute each partial autocorrelation.  If not, uses the same number
        of observations to compute all pacf values.
    unbiased : bool, optional
        Adjust each partial autocorrelation by n / (n - lag).

    Returns
    -------
    ndarray
        The partial autocorrelations, (nlags + 1,) array corresponding to
        lags 0, 1, ..., nlags.

    See Also
    --------
    statsmodels.tsa.stattools.pacf
        Partial autocorrelation estimation.
    statsmodels.tsa.stattools.pacf_yw
        Partial autocorrelation estimation using Yule-Walker.
    statsmodels.tsa.stattools.pacf_burg
        Partial autocorrelation estimation using Burg's method.

    Notes
    -----
    This solves a separate OLS estimation for each desired lag using method
    in [1]_.  Setting efficient to True has two effects.  First, it uses
    `nobs - lag` observations to estimate each pacf.  Second, it
    re-estimates the mean in each regression.  If efficient is False, then
    the data are first demeaned, and then `nobs - maxlag` observations are
    used to estimate each partial autocorrelation.

    The inefficient estimator appears to have better finite sample
    properties.  This option should only be used in time series that are
    covariance stationary.

    OLS estimation of the pacf does not guarantee that all pacf values are
    between -1 and 1.

    References
    ----------
    .. [1] Box, G. E., Jenkins, G. M., Reinsel, G. C., & Ljung, G. M. (2015).
       Time series analysis: forecasting and control. John Wiley & Sons,
       p. 66
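
    Examples
    --------
    A minimal sketch (illustrative only):

    >>> import numpy as np
    >>> from statsmodels.tsa.stattools import pacf_ols
    >>> np.random.seed(0)
    >>> x = np.random.standard_normal(200)
    >>> p = pacf_ols(x, nlags=10)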
758 """
759 x = array_like(x, 'x')
760 nlags = int_like(nlags, 'nlags')
761 efficient = bool_like(efficient, 'efficient')
762 unbiased = bool_like(unbiased, 'unbiased')
764 pacf = np.empty(nlags + 1)
765 pacf[0] = 1.0
766 if efficient:
767 xlags, x0 = lagmat(x, nlags, original='sep')
768 xlags = add_constant(xlags)
769 for k in range(1, nlags + 1):
770 params = lstsq(xlags[k:, :k + 1], x0[k:], rcond=None)[0]
771 pacf[k] = params[-1]
772 else:
773 x = x - np.mean(x)
774 # Create a single set of lags for multivariate OLS
775 xlags, x0 = lagmat(x, nlags, original='sep', trim='both')
776 for k in range(1, nlags + 1):
777 params = lstsq(xlags[:, :k], x0, rcond=None)[0]
778 # Last coefficient corresponds to PACF value (see [1])
779 pacf[k] = params[-1]
781 if unbiased:
782 n = len(x)
783 pacf *= n / (n - np.arange(nlags + 1))
785 return pacf


def pacf(x, nlags=40, method='ywunbiased', alpha=None):
    """
    Partial autocorrelation estimate.

    Parameters
    ----------
    x : array_like
        Observations of time series for which pacf is calculated.
    nlags : int, optional
        The largest lag for which the pacf is returned.
    method : str, optional
        Specifies which method for the calculations to use.

        - 'yw' or 'ywunbiased' : Yule-Walker with bias correction in
          denominator for acovf.  Default.
        - 'ywm' or 'ywmle' : Yule-Walker without bias correction.
        - 'ols' : regression of time series on lags of it and on constant.
        - 'ols-inefficient' : regression of time series on lags using a
          single common sample to estimate all pacf coefficients.
        - 'ols-unbiased' : regression of time series on lags with a bias
          adjustment.
        - 'ld' or 'ldunbiased' : Levinson-Durbin recursion with bias
          correction.
        - 'ldb' or 'ldbiased' : Levinson-Durbin recursion without bias
          correction.

    alpha : float, optional
        If a number is given, the confidence intervals for the given level
        are returned.  For instance if alpha=.05, 95 % confidence intervals
        are returned where the standard deviation is computed according to
        1/sqrt(len(x)).

    Returns
    -------
    pacf : ndarray
        Partial autocorrelations, nlags + 1 elements, including lag zero.
    confint : ndarray, optional
        Confidence intervals for the PACF.  Returned if alpha is not None.

    See Also
    --------
    statsmodels.tsa.stattools.acf
        Estimate the autocorrelation function.
    statsmodels.tsa.stattools.pacf_yw
        Partial autocorrelation estimation using Yule-Walker.
    statsmodels.tsa.stattools.pacf_ols
        Partial autocorrelation estimation using OLS.
    statsmodels.tsa.stattools.pacf_burg
        Partial autocorrelation estimation using Burg's method.

    Notes
    -----
    Based on simulation evidence across a range of low-order ARMA models,
    the best methods based on root MSE are Yule-Walker (MLE), Levinson-Durbin
    (MLE) and Burg, respectively.  The estimators with the lowest bias
    included these three in addition to OLS and OLS-unbiased.

    Yule-Walker (unbiased) and Levinson-Durbin (unbiased) performed
    consistently worse than the other options.
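
    Examples
    --------
    A minimal sketch (illustrative only):

    >>> import numpy as np
    >>> from statsmodels.tsa.stattools import pacf
    >>> np.random.seed(0)
    >>> x = np.random.standard_normal(200)
    >>> p, confint = pacf(x, nlags=10, alpha=.05)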
849 """
850 nlags = int_like(nlags, 'nlags')
851 methods = ('ols', 'ols-inefficient', 'ols-unbiased', 'yw', 'ywu', 'ld',
852 'ywunbiased', 'yw_unbiased', 'ywm', 'ywmle', 'yw_mle', 'ldu',
853 'ldunbiased', 'ld_unbiased', 'ldb', 'ldbiased', 'ld_biased')
854 method = string_like(method, 'method', options=methods)
856 alpha = float_like(alpha, 'alpha', optional=True)
858 if method in ('ols', 'ols-inefficient', 'ols-unbiased'):
859 efficient = 'inefficient' not in method
860 unbiased = 'unbiased' in method
861 ret = pacf_ols(x, nlags=nlags, efficient=efficient, unbiased=unbiased)
862 elif method in ('yw', 'ywu', 'ywunbiased', 'yw_unbiased'):
863 ret = pacf_yw(x, nlags=nlags, method='unbiased')
864 elif method in ('ywm', 'ywmle', 'yw_mle'):
865 ret = pacf_yw(x, nlags=nlags, method='mle')
866 elif method in ('ld', 'ldu', 'ldunbiased', 'ld_unbiased'):
867 acv = acovf(x, unbiased=True, fft=False)
868 ld_ = levinson_durbin(acv, nlags=nlags, isacov=True)
869 ret = ld_[2]
870 # inconsistent naming with ywmle
871 else: # method in ('ldb', 'ldbiased', 'ld_biased')
872 acv = acovf(x, unbiased=False, fft=False)
873 ld_ = levinson_durbin(acv, nlags=nlags, isacov=True)
874 ret = ld_[2]
876 if alpha is not None:
877 varacf = 1. / len(x) # for all lags >=1
878 interval = stats.norm.ppf(1. - alpha / 2.) * np.sqrt(varacf)
879 confint = np.array(lzip(ret - interval, ret + interval))
880 confint[0] = ret[0] # fix confidence interval for lag 0 to varpacf=0
881 return ret, confint
882 else:
883 return ret


def ccovf(x, y, unbiased=True, demean=True):
    """
    Calculate the crosscovariance between two series.

    Parameters
    ----------
    x, y : array_like
        The time series data to use in the calculation.
    unbiased : bool, optional
        If True, then denominators for crosscovariance are n-k, otherwise n.
    demean : bool, optional
        Flag indicating whether to demean x and y.

    Returns
    -------
    ndarray
        The estimated crosscovariance function.

    Notes
    -----
    This uses np.correlate which does full convolution.  For very long time
    series it is recommended to use fft convolution instead.
    """
    x = array_like(x, 'x')
    y = array_like(y, 'y')
    unbiased = bool_like(unbiased, 'unbiased')
    demean = bool_like(demean, 'demean')

    n = len(x)
    if demean:
        xo = x - x.mean()
        yo = y - y.mean()
    else:
        xo = x
        yo = y
    if unbiased:
        xi = np.ones(n)
        d = np.correlate(xi, xi, 'full')
    else:
        d = n
    return (np.correlate(xo, yo, 'full') / d)[n - 1:]


def ccf(x, y, unbiased=True):
    """
    The cross-correlation function.

    Parameters
    ----------
    x, y : array_like
        The time series data to use in the calculation.
    unbiased : bool
        If True, then denominators for cross-correlation are n-k,
        otherwise n.

    Returns
    -------
    ndarray
        The cross-correlation function of x and y.

    Notes
    -----
    This is based on np.correlate which does full convolution.  For very
    long time series it is recommended to use fft convolution instead.

    If unbiased is true, the denominator for the autocovariance is adjusted
    but the autocorrelation is not an unbiased estimator.
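
    Examples
    --------
    A minimal sketch on two simulated related series (illustrative only):

    >>> import numpy as np
    >>> from statsmodels.tsa.stattools import ccf
    >>> np.random.seed(0)
    >>> x = np.random.standard_normal(200)
    >>> y = np.roll(x, 2) + 0.1 * np.random.standard_normal(200)
    >>> r = ccf(x, y)  # cross-correlations at lags 0, 1, ...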
952 """
953 x = array_like(x, 'x')
954 y = array_like(y, 'y')
955 unbiased = bool_like(unbiased, 'unbiased')
957 cvf = ccovf(x, y, unbiased=unbiased, demean=True)
958 return cvf / (np.std(x) * np.std(y))


def periodogram(x):
    """
    Compute the periodogram for the natural frequency of x.

    .. deprecated::
       Use scipy.signal.periodogram instead

    Parameters
    ----------
    x : array_like
        Array for which the periodogram is desired.

    Returns
    -------
    ndarray
        The periodogram defined as 1./len(x) * np.abs(np.fft.fft(x))**2.

    References
    ----------
    .. [1] Brockwell, P.J. and Davis, R.A., 2016. Introduction to time series
       and forecasting. Springer.
    """
    # TODO: Remove after 0.11
    import warnings
    warnings.warn('periodogram is deprecated and will be removed after 0.11. '
                  'Use scipy.signal.periodogram instead.', FutureWarning)
    x = array_like(x, 'x')

    pergr = 1. / len(x) * np.abs(np.fft.fft(x)) ** 2
    pergr[0] = 0.  # what are the implications of this?
    return pergr


# moved from sandbox.tsa.examples.try_ld_nitime, via nitime
# TODO: check what to return, for testing and trying out returns everything
def levinson_durbin(s, nlags=10, isacov=False):
    """
    Levinson-Durbin recursion for autoregressive processes.

    Parameters
    ----------
    s : array_like
        If isacov is False, then this is the time series.  If isacov is
        true then this is interpreted as autocovariance starting with lag 0.
    nlags : int, optional
        The largest lag to include in recursion or order of the
        autoregressive process.
    isacov : bool, optional
        Flag indicating whether the first argument, s, contains the
        autocovariances or the data series.

    Returns
    -------
    sigma_v : float
        The estimate of the error variance.
    arcoefs : ndarray
        The estimate of the autoregressive coefficients for a model
        including nlags.
    pacf : ndarray
        The partial autocorrelation function.
    sigma : ndarray
        The entire sigma array from intermediate result, last value is
        sigma_v.
    phi : ndarray
        The entire phi array from intermediate result, last column contains
        autoregressive coefficients for AR(nlags).

    Notes
    -----
    This function returns currently all results, but maybe we drop sigma
    and phi from the returns.

    If this function is called with the time series (isacov=False), then
    the sample autocovariance function is calculated with the default
    options (biased, no fft).
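
    Examples
    --------
    A minimal sketch (illustrative only):

    >>> import numpy as np
    >>> from statsmodels.tsa.stattools import levinson_durbin
    >>> np.random.seed(0)
    >>> x = np.random.standard_normal(200)
    >>> sigma_v, arcoefs, pacf_, sigma, phi = levinson_durbin(x, nlags=5)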
1035 """
1036 s = array_like(s, 's')
1037 nlags = int_like(nlags, 'nlags')
1038 isacov = bool_like(isacov, 'isacov')
1040 order = nlags
1042 if isacov:
1043 sxx_m = s
1044 else:
1045 sxx_m = acovf(s, fft=False)[:order + 1] # not tested
1047 phi = np.zeros((order + 1, order + 1), 'd')
1048 sig = np.zeros(order + 1)
1049 # initial points for the recursion
1050 phi[1, 1] = sxx_m[1] / sxx_m[0]
1051 sig[1] = sxx_m[0] - phi[1, 1] * sxx_m[1]
1052 for k in range(2, order + 1):
1053 phi[k, k] = (sxx_m[k] - np.dot(phi[1:k, k-1],
1054 sxx_m[1:k][::-1])) / sig[k-1]
1055 for j in range(1, k):
1056 phi[j, k] = phi[j, k-1] - phi[k, k] * phi[k-j, k-1]
1057 sig[k] = sig[k-1] * (1 - phi[k, k]**2)
1059 sigma_v = sig[-1]
1060 arcoefs = phi[1:, -1]
1061 pacf_ = np.diag(phi).copy()
1062 pacf_[0] = 1.
1063 return sigma_v, arcoefs, pacf_, sig, phi # return everything


def levinson_durbin_pacf(pacf, nlags=None):
    """
    Levinson-Durbin algorithm that returns the acf and ar coefficients.

    Parameters
    ----------
    pacf : array_like
        Partial autocorrelation array for lags 0, 1, ... p.
    nlags : int, optional
        Number of lags in the AR model.  If omitted, returns coefficients
        from an AR(p) and the first p autocorrelations.

    Returns
    -------
    arcoefs : ndarray
        AR coefficients computed from the partial autocorrelations.
    acf : ndarray
        The acf computed from the partial autocorrelations.  Array returned
        contains the autocorrelations corresponding to lags 0, 1, ..., p.

    References
    ----------
    .. [1] Brockwell, P.J. and Davis, R.A., 2016. Introduction to time series
       and forecasting. Springer.
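
    Examples
    --------
    A minimal sketch, feeding in partial autocorrelations from
    ``pacf_burg`` (illustrative only):

    >>> import numpy as np
    >>> from statsmodels.tsa.stattools import levinson_durbin_pacf, pacf_burg
    >>> np.random.seed(0)
    >>> x = np.random.standard_normal(200)
    >>> p, _ = pacf_burg(x, nlags=5)  # p[0] is 1, as required
    >>> arcoefs, acf_vals = levinson_durbin_pacf(p)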
1090 """
1091 pacf = array_like(pacf, 'pacf')
1092 nlags = int_like(nlags, 'nlags', optional=True)
1093 pacf = np.squeeze(np.asarray(pacf))
1095 if pacf[0] != 1:
1096 raise ValueError('The first entry of the pacf corresponds to lags 0 '
1097 'and so must be 1.')
1098 pacf = pacf[1:]
1099 n = pacf.shape[0]
1100 if nlags is not None:
1101 if nlags > n:
1102 raise ValueError('Must provide at least as many values from the '
1103 'pacf as the number of lags.')
1104 pacf = pacf[:nlags]
1105 n = pacf.shape[0]
1107 acf = np.zeros(n + 1)
1108 acf[1] = pacf[0]
1109 nu = np.cumprod(1 - pacf ** 2)
1110 arcoefs = pacf.copy()
1111 for i in range(1, n):
1112 prev = arcoefs[:-(n - i)].copy()
1113 arcoefs[:-(n - i)] = prev - arcoefs[i] * prev[::-1]
1114 acf[i + 1] = arcoefs[i] * nu[i-1] + prev.dot(acf[1:-(n - i)][::-1])
1115 acf[0] = 1
1116 return arcoefs, acf


def grangercausalitytests(x, maxlag, addconst=True, verbose=True):
    """
    Four tests for granger non causality of 2 time series.

    All four tests give similar results.  `params_ftest` and `ssr_ftest`
    are equivalent based on F test which is identical to
    lmtest:grangertest in R.

    Parameters
    ----------
    x : array_like
        The data for testing whether the time series in the second column
        Granger causes the time series in the first column.  Missing values
        are not supported.
    maxlag : {int, Iterable[int]}
        If an integer, computes the test for all lags up to maxlag.  If an
        iterable, computes the tests only for the lags in maxlag.
    addconst : bool
        Include a constant in the model.
    verbose : bool
        Print results.

    Returns
    -------
    dict
        All test results, dictionary keys are the number of lags.  For each
        lag the values are a tuple, with the first element a dictionary
        with test statistic, pvalues, degrees of freedom, the second
        element are the OLS estimation results for the restricted model,
        the unrestricted model and the restriction (contrast) matrix for
        the parameter f_test.

    Notes
    -----
    TODO: convert to class and attach results properly

    The Null hypothesis for grangercausalitytests is that the time series
    in the second column, x2, does NOT Granger cause the time series in the
    first column, x1.  Granger causality means that past values of x2 have
    a statistically significant effect on the current value of x1, taking
    past values of x1 into account as regressors.  We reject the null
    hypothesis that x2 does not Granger cause x1 if the pvalues are below a
    desired size of the test.

    The null hypothesis for all four tests is that the coefficients
    corresponding to past values of the second time series are zero.

    'params_ftest', 'ssr_ftest' are based on F distribution

    'ssr_chi2test', 'lrtest' are based on chi-square distribution

    References
    ----------
    .. [1] https://en.wikipedia.org/wiki/Granger_causality

    .. [2] Greene: Econometric Analysis

    Examples
    --------
    >>> import statsmodels.api as sm
    >>> from statsmodels.tsa.stattools import grangercausalitytests
    >>> import numpy as np
    >>> data = sm.datasets.macrodata.load_pandas()
    >>> data = data.data[['realgdp', 'realcons']].pct_change().dropna()

    All lags up to 4:

    >>> gc_res = grangercausalitytests(data, 4)

    Only lag 4:

    >>> gc_res = grangercausalitytests(data, [4])
    """
    x = array_like(x, 'x', ndim=2)
    if not np.isfinite(x).all():
        raise ValueError('x contains NaN or inf values.')
    addconst = bool_like(addconst, 'addconst')
    verbose = bool_like(verbose, 'verbose')
    try:
        lags = np.array([int(lag) for lag in maxlag])
        maxlag = lags.max()
        if lags.min() <= 0 or lags.size == 0:
            raise ValueError('maxlag must be a non-empty list containing '
                             'only positive integers')
    except Exception:
        maxlag = int_like(maxlag, 'maxlag')
        if maxlag <= 0:
            raise ValueError('maxlag must be a positive integer')
        lags = np.arange(1, maxlag + 1)

    if x.shape[0] <= 3 * maxlag + int(addconst):
        raise ValueError("Insufficient observations. Maximum allowable "
                         "lag is {0}".format(int((x.shape[0] -
                                                  int(addconst)) / 3) - 1))

    resli = {}

    for mlg in lags:
        result = {}
        if verbose:
            print('\nGranger Causality')
            print('number of lags (no zero)', mlg)
        mxlg = mlg

        # create lagmat of both time series
        dta = lagmat2ds(x, mxlg, trim='both', dropex=1)

        # add constant
        if addconst:
            dtaown = add_constant(dta[:, 1:(mxlg + 1)], prepend=False)
            dtajoint = add_constant(dta[:, 1:], prepend=False)
        else:
            raise NotImplementedError('Not Implemented')
            # dtaown = dta[:, 1:mxlg]
            # dtajoint = dta[:, 1:]

        # Run ols on both models without and with lags of second variable
        res2down = OLS(dta[:, 0], dtaown).fit()
        res2djoint = OLS(dta[:, 0], dtajoint).fit()

        # print results
        # for ssr based tests see:
        # http://support.sas.com/rnd/app/examples/ets/granger/index.htm
        # the other tests are made-up

        # Granger Causality test using ssr (F statistic)
        fgc1 = ((res2down.ssr - res2djoint.ssr) /
                res2djoint.ssr / mxlg * res2djoint.df_resid)
        if verbose:
            print('ssr based F test: F=%-8.4f, p=%-8.4f, df_denom=%d,'
                  ' df_num=%d' % (fgc1,
                                  stats.f.sf(fgc1, mxlg,
                                             res2djoint.df_resid),
                                  res2djoint.df_resid, mxlg))
        result['ssr_ftest'] = (fgc1,
                               stats.f.sf(fgc1, mxlg, res2djoint.df_resid),
                               res2djoint.df_resid, mxlg)

        # Granger Causality test using ssr (chi2 statistic)
        fgc2 = res2down.nobs * (res2down.ssr - res2djoint.ssr) / res2djoint.ssr
        if verbose:
            print('ssr based chi2 test: chi2=%-8.4f, p=%-8.4f, '
                  'df=%d' % (fgc2, stats.chi2.sf(fgc2, mxlg), mxlg))
        result['ssr_chi2test'] = (fgc2, stats.chi2.sf(fgc2, mxlg), mxlg)

        # likelihood ratio test pvalue:
        lr = -2 * (res2down.llf - res2djoint.llf)
        if verbose:
            print('likelihood ratio test: chi2=%-8.4f, p=%-8.4f, df=%d' %
                  (lr, stats.chi2.sf(lr, mxlg), mxlg))
        result['lrtest'] = (lr, stats.chi2.sf(lr, mxlg), mxlg)

        # F test that all lag coefficients of exog are zero
        rconstr = np.column_stack((np.zeros((mxlg, mxlg)),
                                   np.eye(mxlg, mxlg),
                                   np.zeros((mxlg, 1))))
        ftres = res2djoint.f_test(rconstr)
        if verbose:
            print('parameter F test: F=%-8.4f, p=%-8.4f, df_denom=%d,'
                  ' df_num=%d' % (ftres.fvalue, ftres.pvalue, ftres.df_denom,
                                  ftres.df_num))
        result['params_ftest'] = (np.squeeze(ftres.fvalue)[()],
                                  np.squeeze(ftres.pvalue)[()],
                                  ftres.df_denom, ftres.df_num)

        resli[mxlg] = (result, [res2down, res2djoint, rconstr])

    return resli


def coint(y0, y1, trend='c', method='aeg', maxlag=None, autolag='aic',
          return_results=None):
    """
    Test for no-cointegration of a univariate equation.

    The null hypothesis is no cointegration.  Variables in y0 and y1 are
    assumed to be integrated of order 1, I(1).

    This uses the augmented Engle-Granger two-step cointegration test.
    Constant or trend is included in 1st stage regression, i.e. in
    cointegrating equation.

    **Warning:** The autolag default has changed compared to statsmodels
    0.8.  In 0.8 autolag was always None, now the keyword is used and
    defaults to 'aic'.  Use `autolag=None` to avoid the lag search.

    Parameters
    ----------
    y0 : array_like
        The first element in cointegrated system.  Must be 1-d.
    y1 : array_like
        The remaining elements in cointegrated system.
    trend : str {'c', 'ct'}
        The trend term included in regression for cointegrating equation.

        * 'c' : constant.
        * 'ct' : constant and linear trend.
        * also available quadratic trend 'ctt', and no constant 'nc'.

    method : {'aeg'}
        Only 'aeg' (augmented Engle-Granger) is available.
    maxlag : None or int
        Argument for `adfuller`, largest or given number of lags.
    autolag : str
        Argument for `adfuller`, lag selection criterion.

        * If None, then maxlag lags are used without lag search.
        * If 'AIC' (default) or 'BIC', then the number of lags is chosen
          to minimize the corresponding information criterion.
        * 't-stat' based choice of maxlag.  Starts with maxlag and drops a
          lag until the t-statistic on the last lag length is significant
          using a 5%-sized test.

    return_results : bool
        For future compatibility, currently only tuple available.
        If True, then a results instance is returned.  Otherwise, a tuple
        with the test outcome is returned.  Set `return_results=False` to
        avoid future changes in return.

    Returns
    -------
    coint_t : float
        The t-statistic of unit-root test on residuals.
    pvalue : float
        MacKinnon's approximate, asymptotic p-value based on MacKinnon
        (1994).
    crit_value : dict
        Critical values for the test statistic at the 1 %, 5 %, and 10 %
        levels based on regression curve.  This depends on the number of
        observations.

    Notes
    -----
    The Null hypothesis is that there is no cointegration, the alternative
    hypothesis is that there is cointegrating relationship.  If the pvalue
    is small, below a critical size, then we can reject the hypothesis that
    there is no cointegrating relationship.

    P-values and critical values are obtained through regression surface
    approximation from MacKinnon 1994 and 2010.

    If the two series are almost perfectly collinear, then computing the
    test is numerically unstable.  However, the two series will be
    cointegrated under the maintained assumption that they are integrated.
    In this case the t-statistic will be set to -inf and the pvalue to
    zero.

    TODO: We could handle gaps in data by dropping rows with nans in the
    Auxiliary regressions.  Not implemented yet, currently assumes no nans
    and no gaps in time series.

    References
    ----------
    .. [1] MacKinnon, J.G. 1994. "Approximate Asymptotic Distribution
       Functions for Unit-Root and Cointegration Tests." Journal of
       Business & Economic Statistics, 12.2, 167-76.
    .. [2] MacKinnon, J.G. 2010. "Critical Values for Cointegration Tests."
       Queen's University, Dept of Economics Working Papers 1227.
       http://ideas.repec.org/p/qed/wpaper/1227.html
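
    Examples
    --------
    A minimal sketch on two simulated cointegrated series (illustrative
    only):

    >>> import numpy as np
    >>> from statsmodels.tsa.stattools import coint
    >>> np.random.seed(0)
    >>> y1 = np.cumsum(np.random.standard_normal(250))  # common I(1) trend
    >>> y0 = y1 + np.random.standard_normal(250)
    >>> coint_t, pvalue, crit_values = coint(y0, y1)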
1371 """
1372 y0 = array_like(y0, 'y0')
1373 y1 = array_like(y1, 'y1', ndim=2)
1374 trend = string_like(trend, 'trend', options=('c', 'nc', 'ct', 'ctt'))
1375 method = string_like(method, 'method', options=('aeg',))
1376 maxlag = int_like(maxlag, 'maxlag', optional=True)
1377 autolag = string_like(autolag, 'autolag', optional=True,
1378 options=('aic', 'bic', 't-stat'))
1379 return_results = bool_like(return_results, 'return_results', optional=True)
1381 nobs, k_vars = y1.shape
1382 k_vars += 1 # add 1 for y0
1384 if trend == 'nc':
1385 xx = y1
1386 else:
1387 xx = add_trend(y1, trend=trend, prepend=False)
1389 res_co = OLS(y0, xx).fit()
1391 if res_co.rsquared < 1 - 100 * SQRTEPS:
1392 res_adf = adfuller(res_co.resid, maxlag=maxlag, autolag=autolag,
1393 regression='nc')
1394 else:
1395 import warnings
1396 warnings.warn("y0 and y1 are (almost) perfectly colinear."
1397 "Cointegration test is not reliable in this case.",
1398 CollinearityWarning)
1399 # Edge case where series are too similar
1400 res_adf = (-np.inf,)
1402 # no constant or trend, see egranger in Stata and MacKinnon
1403 if trend == 'nc':
1404 crit = [np.nan] * 3 # 2010 critical values not available
1405 else:
1406 crit = mackinnoncrit(N=k_vars, regression=trend, nobs=nobs - 1)
1407 # nobs - 1, the -1 is to match egranger in Stata, I do not know why.
1408 # TODO: check nobs or df = nobs - k
1410 pval_asy = mackinnonp(res_adf[0], regression=trend, N=k_vars)
1411 return res_adf[0], pval_asy, crit


def _safe_arma_fit(y, order, model_kw, trend, fit_kw, start_params=None):
    try:
        return ARMA(y, order=order, **model_kw).fit(disp=0, trend=trend,
                                                    start_params=start_params,
                                                    **fit_kw)
    except LinAlgError:
        # SVD convergence failure on badly misspecified models
        return

    except ValueError as error:
        if start_params is not None:  # do not recurse again
            # user supplied start_params only get one chance
            return
        # try a little harder, should be handled in fit really
        elif ('initial' not in error.args[0] or 'initial' in str(error)):
            start_params = [.1] * sum(order)
            if trend == 'c':
                start_params = [.1] + start_params
            return _safe_arma_fit(y, order, model_kw, trend, fit_kw,
                                  start_params)
        else:
            return
    except:  # no idea what happened
        return


def arma_order_select_ic(y, max_ar=4, max_ma=2, ic='bic', trend='c',
                         model_kw=None, fit_kw=None):
    """
    Compute information criteria for many ARMA models.

    Parameters
    ----------
    y : array_like
        Array of time-series data.
    max_ar : int
        Maximum number of AR lags to use.  Default 4.
    max_ma : int
        Maximum number of MA lags to use.  Default 2.
    ic : str, list
        Information criteria to report.  Either a single string or a list
        of different criteria is possible.
    trend : str
        The trend to use when fitting the ARMA models.
    model_kw : dict
        Keyword arguments to be passed to the ``ARMA`` model.
    fit_kw : dict
        Keyword arguments to be passed to ``ARMA.fit``.

    Returns
    -------
    Bunch
        Dict-like object with attribute access.  Each ic is an attribute
        with a DataFrame for the results.  The AR order used is the row
        index.  The ma order used is the column index.  The minimum orders
        are available as ``ic_min_order``.

    Notes
    -----
    This method can be used to tentatively identify the order of an ARMA
    process, provided that the time series is stationary and invertible.
    This function computes the full exact MLE estimate of each model and
    can be, therefore, a little slow.  An implementation using approximate
    estimates will be provided in the future.  In the meantime, consider
    passing {method : 'css'} to fit_kw.

    Examples
    --------

    >>> from statsmodels.tsa.arima_process import arma_generate_sample
    >>> import statsmodels.api as sm
    >>> import numpy as np

    >>> arparams = np.array([.75, -.25])
    >>> maparams = np.array([.65, .35])
    >>> arparams = np.r_[1, -arparams]
    >>> maparams = np.r_[1, maparams]
    >>> nobs = 250
    >>> np.random.seed(2014)
    >>> y = arma_generate_sample(arparams, maparams, nobs)
    >>> res = sm.tsa.arma_order_select_ic(y, ic=['aic', 'bic'], trend='nc')
    >>> res.aic_min_order
    >>> res.bic_min_order
    """
    max_ar = int_like(max_ar, 'max_ar')
    max_ma = int_like(max_ma, 'max_ma')
    trend = string_like(trend, 'trend', options=('nc', 'c'))
    model_kw = dict_like(model_kw, 'model_kw', optional=True)
    fit_kw = dict_like(fit_kw, 'fit_kw', optional=True)

    ar_range = lrange(0, max_ar + 1)
    ma_range = lrange(0, max_ma + 1)
    if isinstance(ic, str):
        ic = [ic]
    elif not isinstance(ic, (list, tuple)):
        raise ValueError("Need a list or a tuple for ic if not a string.")

    results = np.zeros((len(ic), max_ar + 1, max_ma + 1))
    model_kw = {} if model_kw is None else model_kw
    fit_kw = {} if fit_kw is None else fit_kw
    y_arr = array_like(y, 'y', contiguous=True)
    for ar in ar_range:
        for ma in ma_range:
            if ar == 0 and ma == 0 and trend == 'nc':
                results[:, ar, ma] = np.nan
                continue

            mod = _safe_arma_fit(y_arr, (ar, ma), model_kw, trend, fit_kw)
            if mod is None:
                results[:, ar, ma] = np.nan
                continue

            for i, criteria in enumerate(ic):
                results[i, ar, ma] = getattr(mod, criteria)

    dfs = [pd.DataFrame(res, columns=ma_range, index=ar_range)
           for res in results]

    res = dict(zip(ic, dfs))

    # add the minimums to the results dict
    min_res = {}
    for i, result in iteritems(res):
        mins = np.where(result.min().min() == result)
        min_res.update({i + '_min_order': (mins[0][0], mins[1][0])})
    res.update(min_res)

    return Bunch(**res)


def has_missing(data):
    """
    Returns True if 'data' contains missing entries, otherwise False.
    """
    return np.isnan(np.sum(data))


@deprecate_kwarg('lags', 'nlags')
def kpss(x, regression='c', nlags=None, store=False):
    """
    Kwiatkowski-Phillips-Schmidt-Shin test for stationarity.

    Computes the Kwiatkowski-Phillips-Schmidt-Shin (KPSS) test for the null
    hypothesis that x is level or trend stationary.

    Parameters
    ----------
    x : array_like, 1d
        The data series to test.
    regression : str {'c', 'ct'}
        The null hypothesis for the KPSS test.

        * 'c' : The data is stationary around a constant (default).
        * 'ct' : The data is stationary around a trend.

    nlags : {None, str, int}, optional
        Indicates the number of lags to be used.  If None (default), lags
        is calculated using the legacy method.  If 'auto', lags is
        calculated using the data-dependent method of Hobijn et al. (1998).
        See also Andrews (1991), Newey & West (1994), and Schwert (1989).
        If set to 'legacy', uses int(12 * (n / 100)**(1 / 4)), as outlined
        in Schwert (1989).
    store : bool
        If True, then a result instance is returned additionally to
        the KPSS statistic (default is False).

    Returns
    -------
    kpss_stat : float
        The KPSS test statistic.
    p_value : float
        The p-value of the test.  The p-value is interpolated from
        Table 1 in Kwiatkowski et al. (1992), and a boundary point
        is returned if the test statistic is outside the table of
        critical values, that is, if the p-value is outside the
        interval (0.01, 0.1).
    lags : int
        The truncation lag parameter.
    crit : dict
        The critical values at 10%, 5%, 2.5% and 1%.  Based on
        Kwiatkowski et al. (1992).
    resstore : (optional) instance of ResultStore
        An instance of a dummy class with results attached as attributes.

    Notes
    -----
    To estimate sigma^2 the Newey-West estimator is used.  If lags is None,
    the truncation lag parameter is set to int(12 * (n / 100) ** (1 / 4)),
    as outlined in Schwert (1989).  The p-values are interpolated from
    Table 1 of Kwiatkowski et al. (1992).  If the computed statistic is
    outside the table of critical values, then a warning message is
    generated.

    Missing values are not handled.

    References
    ----------
    .. [1] Andrews, D.W.K. (1991). Heteroskedasticity and autocorrelation
       consistent covariance matrix estimation. Econometrica, 59: 817-858.

    .. [2] Hobijn, B., Franses, P.H., & Ooms, M. (2004). Generalizations of
       the KPSS-test for stationarity. Statistica Neerlandica, 52: 483-502.

    .. [3] Kwiatkowski, D., Phillips, P.C.B., Schmidt, P., & Shin, Y. (1992).
       Testing the null hypothesis of stationarity against the alternative
       of a unit root. Journal of Econometrics, 54: 159-178.

    .. [4] Newey, W.K., & West, K.D. (1994). Automatic lag selection in
       covariance matrix estimation. Review of Economic Studies, 61:
       631-653.

    .. [5] Schwert, G. W. (1989). Tests for unit roots: A Monte Carlo
       investigation. Journal of Business and Economic Statistics, 7 (2):
       147-159.
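
    Examples
    --------
    A minimal sketch (illustrative only):

    >>> import numpy as np
    >>> from statsmodels.tsa.stattools import kpss
    >>> np.random.seed(0)
    >>> x = np.random.standard_normal(250)  # a level-stationary series
    >>> stat, p_value, lags, crit = kpss(x, nlags='auto')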
1626 """
1627 from warnings import warn
1629 x = array_like(x, 'x')
1630 regression = string_like(regression, 'regression', options=('c', 'ct'))
1631 store = bool_like(store, 'store')
1633 nobs = x.shape[0]
1634 hypo = regression
1636 # if m is not one, n != m * n
1637 if nobs != x.size:
1638 raise ValueError("x of shape {0} not understood".format(x.shape))
1640 if hypo == 'ct':
1641 # p. 162 Kwiatkowski et al. (1992): y_t = beta * t + r_t + e_t,
1642 # where beta is the trend, r_t a random walk and e_t a stationary
1643 # error term.
1644 resids = OLS(x, add_constant(np.arange(1, nobs + 1))).fit().resid
1645 crit = [0.119, 0.146, 0.176, 0.216]
1646 elif hypo == 'c':
1647 # special case of the model above, where beta = 0 (so the null
1648 # hypothesis is that the data is stationary around r_0).
1649 resids = x - x.mean()
1650 crit = [0.347, 0.463, 0.574, 0.739]
1652 if nlags is None:
1653 nlags = 'legacy'
1654 msg = 'The behavior of using lags=None will change in the next ' \
1655 'release. Currently lags=None is the same as ' \
1656 'lags=\'legacy\', and so a sample-size lag length is used. ' \
1657 'After the next release, the default will change to be the ' \
1658 'same as lags=\'auto\' which uses an automatic lag length ' \
1659 'selection method. To silence this warning, either use ' \
1660 '\'auto\' or \'legacy\''
1661 warn(msg, FutureWarning)
1662 if nlags == 'legacy':
1663 nlags = int(np.ceil(12. * np.power(nobs / 100., 1 / 4.)))
1664 nlags = min(nlags, nobs - 1)
1665 elif nlags == 'auto':
1666 # autolag method of Hobijn et al. (1998)
1667 nlags = _kpss_autolag(resids, nobs)
1668 nlags = min(nlags, nobs - 1)
1669 else:
1670 nlags = int(nlags)
1672 if nlags >= nobs:
1673 raise ValueError("lags ({}) must be < number of observations ({})"
1674 .format(nlags, nobs))
1676 pvals = [0.10, 0.05, 0.025, 0.01]
1678 eta = np.sum(resids.cumsum()**2) / (nobs**2) # eq. 11, p. 165
1679 s_hat = _sigma_est_kpss(resids, nobs, nlags)
1681 kpss_stat = eta / s_hat
1682 p_value = np.interp(kpss_stat, crit, pvals)
1684 if p_value == pvals[-1]:
1685 warn("The test statistic is outside of the range of p-values available in the look-up table; the actual p-value is smaller than the p-value returned", InterpolationWarning)
1686 elif p_value == pvals[0]:
1687 warn("The test statistic is outside of the range of p-values available in the look-up table; the actual p-value is greater than the p-value returned", InterpolationWarning)
1689 crit_dict = {'10%': crit[0], '5%': crit[1], '2.5%': crit[2], '1%': crit[3]}
1691 if store:
1692 rstore = ResultsStore()
1693 rstore.lags = nlags
1694 rstore.nobs = nobs
1696 stationary_type = "level" if hypo == 'c' else "trend"
1697 rstore.H0 = "The series is {0} stationary".format(stationary_type)
1698 rstore.HA = "The series is not {0} stationary".format(stationary_type)
1700 return kpss_stat, p_value, crit_dict, rstore
1701 else:
1702 return kpss_stat, p_value, nlags, crit_dict
1705def _sigma_est_kpss(resids, nobs, lags):
1706 """
1707 Computes equation 10, p. 164 of Kwiatkowski et al. (1992). This is the
1708 consistent Newey-West estimator of the long-run variance.
1709 """
1710 s_hat = np.sum(resids**2)
1711 for i in range(1, lags + 1):
1712 resids_prod = np.dot(resids[i:], resids[:nobs - i])
1713 s_hat += 2 * resids_prod * (1. - (i / (lags + 1.)))
1714 return s_hat / nobs
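
# The loop above implements eq. 10 with Bartlett weights: s^2(l) = (1/n) *
# [sum_t e_t**2 + 2 * sum_{s=1}^{l} (1 - s/(l+1)) * sum_t e_t * e_{t-s}].
# A vectorized cross-check, illustrative only (throwaway residual vector,
# not part of the library):
#
#   import numpy as np
#   e = np.random.RandomState(0).standard_normal(200)
#   n, l = e.shape[0], 8
#   w = 1.0 - np.arange(1, l + 1) / (l + 1.0)
#   acov = np.array([e[s:].dot(e[:n - s]) for s in range(1, l + 1)])
#   assert np.isclose((e.dot(e) + 2.0 * w.dot(acov)) / n,
#                     _sigma_est_kpss(e, n, l))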
1717def _kpss_autolag(resids, nobs):
1718 """
1719 Computes the number of lags for covariance matrix estimation in KPSS test
1720 using method of Hobijn et al (1998). See also Andrews (1991), Newey & West
1721 (1994), and Schwert (1989). Assumes Bartlett / Newey-West kernel.
1722 """
1723 covlags = int(np.power(nobs, 2. / 9.))
1724 s0 = np.sum(resids**2) / nobs
1725 s1 = 0
1726 for i in range(1, covlags + 1):
1727 resids_prod = np.dot(resids[i:], resids[:nobs - i])
1728 resids_prod /= (nobs / 2.)
1729 s0 += resids_prod
1730 s1 += i * resids_prod
1731 s_hat = s1 / s0
1732 pwr = 1. / 3.
1733 gamma_hat = 1.1447 * np.power(s_hat * s_hat, pwr)
1734 autolags = int(gamma_hat * np.power(nobs, pwr))
1735 return autolags
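
# In the notation of Hobijn et al., the loop above accumulates s0 (the
# variance plus twice the first n**(2/9) sample autocovariances) and s1 (the
# same autocovariances weighted by their lag), then selects
# nlags = int(1.1447 * ((s1 / s0) ** 2) ** (1 / 3) * nobs ** (1 / 3)).
# Illustrative smoke test (throwaway data, not part of the library):
#
#   import numpy as np
#   e = np.random.RandomState(0).standard_normal(500)
#   assert 0 <= _kpss_autolag(e - e.mean(), 500) < 500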
1738class ZivotAndrewsUnitRoot(object):
1739 """
1740 Class wrapper for Zivot-Andrews structural-break unit-root test
1741 """
1742 def __init__(self):
1743 """
1744 Critical values for the three different models specified for the
1745 Zivot-Andrews unit-root test.
1747 Notes
1748 -----
1749 The p-values are generated through Monte Carlo simulation using
1750 100,000 replications and 2000 data points.
1751 """
1752 self._za_critical_values = {}
1753 # constant-only model
1754 self._c = (
1755 (0.001, -6.78442), (0.100, -5.83192), (0.200, -5.68139),
1756 (0.300, -5.58461), (0.400, -5.51308), (0.500, -5.45043),
1757 (0.600, -5.39924), (0.700, -5.36023), (0.800, -5.33219),
1758 (0.900, -5.30294), (1.000, -5.27644), (2.500, -5.03340),
1759 (5.000, -4.81067), (7.500, -4.67636), (10.000, -4.56618),
1760 (12.500, -4.48130), (15.000, -4.40507), (17.500, -4.33947),
1761 (20.000, -4.28155), (22.500, -4.22683), (25.000, -4.17830),
1762 (27.500, -4.13101), (30.000, -4.08586), (32.500, -4.04455),
1763 (35.000, -4.00380), (37.500, -3.96144), (40.000, -3.92078),
1764 (42.500, -3.88178), (45.000, -3.84503), (47.500, -3.80549),
1765 (50.000, -3.77031), (52.500, -3.73209), (55.000, -3.69600),
1766 (57.500, -3.65985), (60.000, -3.62126), (65.000, -3.54580),
1767 (70.000, -3.46848), (75.000, -3.38533), (80.000, -3.29112),
1768 (85.000, -3.17832), (90.000, -3.04165), (92.500, -2.95146),
1769 (95.000, -2.83179), (96.000, -2.76465), (97.000, -2.68624),
1770 (98.000, -2.57884), (99.000, -2.40044), (99.900, -1.88932)
1771 )
1772 self._za_critical_values['c'] = np.asarray(self._c)
1773 # trend-only model
1774 self._t = (
1775 (0.001, -83.9094), (0.100, -13.8837), (0.200, -9.13205),
1776 (0.300, -6.32564), (0.400, -5.60803), (0.500, -5.38794),
1777 (0.600, -5.26585), (0.700, -5.18734), (0.800, -5.12756),
1778 (0.900, -5.07984), (1.000, -5.03421), (2.500, -4.65634),
1779 (5.000, -4.40580), (7.500, -4.25214), (10.000, -4.13678),
1780 (12.500, -4.03765), (15.000, -3.95185), (17.500, -3.87945),
1781 (20.000, -3.81295), (22.500, -3.75273), (25.000, -3.69836),
1782 (27.500, -3.64785), (30.000, -3.59819), (32.500, -3.55146),
1783 (35.000, -3.50522), (37.500, -3.45987), (40.000, -3.41672),
1784 (42.500, -3.37465), (45.000, -3.33394), (47.500, -3.29393),
1785 (50.000, -3.25316), (52.500, -3.21244), (55.000, -3.17124),
1786 (57.500, -3.13211), (60.000, -3.09204), (65.000, -3.01135),
1787 (70.000, -2.92897), (75.000, -2.83614), (80.000, -2.73893),
1788 (85.000, -2.62840), (90.000, -2.49611), (92.500, -2.41337),
1789 (95.000, -2.30820), (96.000, -2.25797), (97.000, -2.19648),
1790 (98.000, -2.11320), (99.000, -1.99138), (99.900, -1.67466)
1791 )
1792 self._za_critical_values['t'] = np.asarray(self._t)
1793 # constant + trend model
1794 self._ct = (
1795 (0.001, -38.17800), (0.100, -6.43107), (0.200, -6.07279),
1796 (0.300, -5.95496), (0.400, -5.86254), (0.500, -5.77081),
1797 (0.600, -5.72541), (0.700, -5.68406), (0.800, -5.65163),
1798 (0.900, -5.60419), (1.000, -5.57556), (2.500, -5.29704),
1799 (5.000, -5.07332), (7.500, -4.93003), (10.000, -4.82668),
1800 (12.500, -4.73711), (15.000, -4.66020), (17.500, -4.58970),
1801 (20.000, -4.52855), (22.500, -4.47100), (25.000, -4.42011),
1802 (27.500, -4.37387), (30.000, -4.32705), (32.500, -4.28126),
1803 (35.000, -4.23793), (37.500, -4.19822), (40.000, -4.15800),
1804 (42.500, -4.11946), (45.000, -4.08064), (47.500, -4.04286),
1805 (50.000, -4.00489), (52.500, -3.96837), (55.000, -3.93200),
1806 (57.500, -3.89496), (60.000, -3.85577), (65.000, -3.77795),
1807 (70.000, -3.69794), (75.000, -3.61852), (80.000, -3.52485),
1808 (85.000, -3.41665), (90.000, -3.28527), (92.500, -3.19724),
1809 (95.000, -3.08769), (96.000, -3.03088), (97.000, -2.96091),
1810 (98.000, -2.85581), (99.000, -2.71015), (99.900, -2.28767)
1811 )
1812 self._za_critical_values['ct'] = np.asarray(self._ct)
1814 def _za_crit(self, stat, model='c'):
1815 """
1816 Linear interpolation for Zivot-Andrews p-values and critical values
1818 Parameters
1819 ----------
1820 stat : float
1821 The ZA test statistic
1822 model : {'c','t','ct'}
1823 The model used when computing the ZA statistic. 'c' is default.
1825 Returns
1826 -------
1827 pvalue : float
1828 The interpolated p-value
1829 cvdict : dict
1830 Critical values for the test statistic at the 1%, 5%, and 10%
1831 levels
1833 Notes
1834 -----
1835 The p-values are linearly interpolated from the quantiles of the
1836 simulated ZA test statistic distribution.
1837 """
1838 table = self._za_critical_values[model]
1839 pcnts = table[:, 0]
1840 stats = table[:, 1]
1841 # ZA cv table contains quantiles multiplied by 100
1842 pvalue = np.interp(stat, stats, pcnts) / 100.0
1843 cv = [1.0, 5.0, 10.0]
1844 crit_value = np.interp(cv, pcnts, stats)
1845 cvdict = {"1%": crit_value[0], "5%": crit_value[1],
1846 "10%": crit_value[2]}
1847 return pvalue, cvdict
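
# Illustrative sketch of the two-way interpolation above (the statistic
# value -4.5 is an arbitrary example):
#
#   pvalue, cvdict = zivot_andrews._za_crit(-4.5, model='c')
#   # pvalue: the statistic mapped onto the tabulated quantiles (scaled
#   # from percent); cvdict: the 1%/5%/10% levels mapped back to
#   # statistic values.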
1849 def _quick_ols(self, endog, exog):
1850 """
1851 Minimal OLS implementation for internal use; returns only the t-statistics.
1852 """
1853 xpxi = np.linalg.inv(exog.T.dot(exog))
1854 xpy = exog.T.dot(endog)
1855 nobs, k_exog = exog.shape
1856 b = xpxi.dot(xpy)
1857 e = endog - exog.dot(b)
1858 sigma2 = e.T.dot(e) / (nobs - k_exog)
1859 return b / np.sqrt(np.diag(sigma2 * xpxi))
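
# Cross-check of the shortcut above against the full estimator (illustrative
# only; random design and arbitrary coefficients):
#
#   import numpy as np
#   rng = np.random.RandomState(1)
#   X = np.column_stack([np.ones(100), rng.standard_normal((100, 2))])
#   y = X.dot([1.0, 0.5, -0.5]) + rng.standard_normal(100)
#   assert np.allclose(zivot_andrews._quick_ols(y, X),
#                      OLS(y, X).fit().tvalues)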
1861 def _format_regression_data(self, series, nobs, const, trend, cols, lags):
1862 """
1863 Create the endog/exog data for the auxiliary regressions
1864 from the original (standardized) series under test.
1865 """
1866 # first-diff y and standardize for numerical stability
1867 endog = np.diff(series, axis=0)
1868 endog /= np.sqrt(endog.T.dot(endog))
1869 series /= np.sqrt(series.T.dot(series))
1870 # reserve exog space
1871 exog = np.zeros((endog[lags:].shape[0], cols + lags))
1872 exog[:, 0] = const
1873 # lagged y and dy
1874 exog[:, cols - 1] = series[lags:(nobs - 1)]
1875 exog[:, cols:] = lagmat(
1876 endog, lags, trim='none')[lags:exog.shape[0] + lags]
1877 return endog, exog
1879 def _update_regression_exog(self, exog, regression, period, nobs, const,
1880 trend, cols, lags):
1881 """
1882 Update the exog array for the next regression.
1883 """
1884 cutoff = (period - (lags + 1))
1885 if regression != 't':
1886 exog[:cutoff, 1] = 0
1887 exog[cutoff:, 1] = const
1888 exog[:, 2] = trend[(lags + 2):(nobs + 1)]
1889 if regression == 'ct':
1890 exog[:cutoff, 3] = 0
1891 exog[cutoff:, 3] = trend[1:(nobs - period + 1)]
1892 else:
1893 exog[:, 1] = trend[(lags + 2):(nobs + 1)]
1894 exog[:(cutoff-1), 2] = 0
1895 exog[(cutoff-1):, 2] = trend[0:(nobs - period + 1)]
1896 return exog
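
# Column layout produced by the two helpers above (lagged dy terms fill the
# remaining columns in every case):
#   'c'  : [const, break dummy, trend, y_{t-1}, dy lags...]
#   'ct' : [const, break dummy, trend, trend dummy, y_{t-1}, dy lags...]
#   't'  : [const, trend, trend dummy, y_{t-1}, dy lags...]
# run() reads the unit-root t-statistic from the y_{t-1} column, i.e. index
# basecols - 1.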
1898 def run(self, x, trim=0.15, maxlag=None, regression='c', autolag='AIC'):
1899 """
1900 Zivot-Andrews structural-break unit-root test.
1902 The Zivot-Andrews test checks for a unit root in a univariate process
1903 in the presence of serial correlation and a single structural break.
1905 Parameters
1906 ----------
1907 x : array_like
1908 The data series to test.
1909 trim : float
1910 The fraction of the series at each end to exclude from the
1911 break-period search, in the range [0, 1/3] (default=0.15).
1912 maxlag : int
1913 The maximum lag included in the test; the default is
1914 12*(nobs/100)^{1/4} (Schwert, 1989).
1915 regression : {'c','t','ct'}
1916 Constant and trend order to include in regression.
1918 * 'c' : constant only (default).
1919 * 't' : trend only.
1920 * 'ct' : constant and trend.
1921 autolag : {'AIC', 'BIC', 't-stat', None}
1922 The method to select the lag length when using automatic selection.
1924 * if None, then maxlag lags are used,
1925 * if 'AIC' (default) or 'BIC', then the number of lags is chosen
1926 to minimize the corresponding information criterion,
1927 * 't-stat' based choice of maxlag. Starts with maxlag and drops a
1928 lag until the t-statistic on the last lag length is significant
1929 using a 5%-sized test.
1931 Returns
1932 -------
1933 zastat : float
1934 The test statistic.
1935 pvalue : float
1936 The pvalue based on MC-derived critical values.
1937 cvdict : dict
1938 The critical values for the test statistic at the 1%, 5%, and 10%
1939 levels.
1940 baselag : int
1941 The number of lags used for the period regressions.
1942 bpidx : int
1943 The index of x corresponding to the endogenously calculated break
1944 period, with values in the range [0..nobs-1].
1946 Notes
1947 -----
1948 H0 = unit root with a single structural break
1950 The algorithm follows the Baum (2004/2015) approximation to the
1951 original Zivot-Andrews method. Rather than performing an autolag regression at
1952 each candidate break period (as per the original paper), a single
1953 autolag regression is run up-front on the base model (constant + trend
1954 with no dummies) to determine the best lag length. This lag length is
1955 then used for all subsequent break-period regressions. This results in
1956 significant run time reduction but also slightly more pessimistic test
1957 statistics than the original Zivot-Andrews method, although no attempt
1958 has been made to characterize the size/power trade-off.
1960 References
1961 ----------
1962 .. [1] Baum, C.F. (2004). ZANDREWS: Stata module to calculate
1963 Zivot-Andrews unit root test in presence of structural break.
1964 Statistical Software Components S437301, Boston College Department
1965 of Economics, revised 2015.
1967 .. [2] Schwert, G.W. (1989). Tests for unit roots: A Monte Carlo
1968 investigation. Journal of Business & Economic Statistics, 7:
1969 147-159.
1971 .. [3] Zivot, E., and Andrews, D.W.K. (1992). Further evidence on the
1972 great crash, the oil-price shock, and the unit-root hypothesis.
1973 Journal of Business & Economic Statistics, 10: 251-270.
1974 """
1975 x = array_like(x, 'x')
1976 trim = float_like(trim, 'trim')
1977 maxlag = int_like(maxlag, 'maxlag', optional=True)
1978 regression = string_like(regression, 'regression',
1979 options=('c', 't', 'ct'))
1980 autolag = string_like(autolag, 'autolag',
1981 options=('AIC', 'BIC', 't-stat'), optional=True)
1982 if trim < 0 or trim > (1. / 3.):
1983 raise ValueError('trim value must be a float in the range [0, 1/3]')
1984 nobs = x.shape[0]
1985 if autolag:
1986 adf_res = adfuller(x, maxlag=maxlag, regression='ct',
1987 autolag=autolag)
1988 baselags = adf_res[2]
1989 elif maxlag:
1990 baselags = maxlag
1991 else:
1992 baselags = int(12. * np.power(nobs / 100., 1 / 4.))
1993 trimcnt = int(nobs * trim)
1994 start_period = trimcnt
1995 end_period = nobs - trimcnt
1996 if regression == 'ct':
1997 basecols = 5
1998 else:
1999 basecols = 4
2000 # normalize constant and trend terms for stability
2001 c_const = 1 / np.sqrt(nobs)
2002 t_const = np.arange(1.0, nobs + 2)
2003 t_const *= np.sqrt(3) / nobs ** (3 / 2)
2004 # format the auxiliary regression data
2005 endog, exog = self._format_regression_data(
2006 x, nobs, c_const, t_const, basecols, baselags)
2007 # iterate through the time periods
2008 stats = np.full(end_period + 1, np.inf)
2009 for bp in range(start_period + 1, end_period + 1):
2010 # update intercept dummy / trend / trend dummy
2011 exog = self._update_regression_exog(exog, regression, bp, nobs,
2012 c_const, t_const, basecols,
2013 baselags)
2014 # check exog rank on first iteration
2015 if bp == start_period + 1:
2016 o = OLS(endog[baselags:], exog, hasconst=1).fit()
2017 if o.df_model < exog.shape[1] - 1:
2018 raise ValueError(
2019 'ZA: auxiliary exog matrix is not full rank.\n'
2020 ' cols (exc intercept) = {} rank = {}'.format(
2021 exog.shape[1] - 1, o.df_model))
2022 stats[bp] = o.tvalues[basecols - 1]
2023 else:
2024 stats[bp] = self._quick_ols(endog[baselags:],
2025 exog)[basecols - 1]
2026 # return best seen
2027 zastat = np.min(stats)
2028 bpidx = np.argmin(stats) - 1
2029 crit = self._za_crit(zastat, regression)
2030 pval = crit[0]
2031 cvdict = crit[1]
2032 return zastat, pval, cvdict, baselags, bpidx
2034 def __call__(self, x, trim=0.15, maxlag=None, regression='c',
2035 autolag='AIC'):
2036 return self.run(x, trim=trim, maxlag=maxlag, regression=regression,
2037 autolag=autolag)
2040zivot_andrews = ZivotAndrewsUnitRoot()
2041zivot_andrews.__doc__ = zivot_andrews.run.__doc__
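
# Illustrative usage sketch (synthetic series with a level shift at the
# midpoint; all values are arbitrary example choices):
#
#   import numpy as np
#   rng = np.random.RandomState(12345)
#   y = rng.standard_normal(250).cumsum() * 0.1
#   y[125:] += 5.0
#   zastat, pvalue, cvdict, baselags, bpidx = zivot_andrews(y, regression='c')
#   # Reject the unit-root-with-break null when zastat < cvdict['5%'].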