1""" 

2Statistical tools for time series analysis 

3""" 

4from statsmodels.compat.python import iteritems, lrange, lzip 

5from statsmodels.compat.pandas import deprecate_kwarg 

6from statsmodels.compat.numpy import lstsq 

7from statsmodels.compat.scipy import _next_regular 

8 

9import numpy as np 

10from numpy.linalg import LinAlgError 

11from scipy import stats 

12import pandas as pd 

13 

14from statsmodels.regression.linear_model import OLS, yule_walker 

15from statsmodels.tools.sm_exceptions import (InterpolationWarning, 

16 MissingDataError, 

17 CollinearityWarning) 

18from statsmodels.tools.tools import add_constant, Bunch 

19from statsmodels.tools.validation import (array_like, string_like, bool_like, 

20 int_like, dict_like, float_like) 

21from statsmodels.tsa._bds import bds 

22from statsmodels.tsa._innovations import innovations_filter, innovations_algo 

23from statsmodels.tsa.adfvalues import mackinnonp, mackinnoncrit 

24from statsmodels.tsa.arima_model import ARMA 

25from statsmodels.tsa.tsatools import lagmat, lagmat2ds, add_trend 

26 

27__all__ = ['acovf', 'acf', 'pacf', 'pacf_yw', 'pacf_ols', 'ccovf', 'ccf', 

28 'periodogram', 'q_stat', 'coint', 'arma_order_select_ic', 

29 'adfuller', 'kpss', 'bds', 'pacf_burg', 'innovations_algo', 

30 'innovations_filter', 'levinson_durbin_pacf', 'levinson_durbin', 

31 'zivot_andrews'] 

32 

33SQRTEPS = np.sqrt(np.finfo(np.double).eps) 

34 

35 

36#NOTE: now in two places to avoid circular import 

37#TODO: I like the bunch pattern for this too. 

38class ResultsStore(object): 

39 def __str__(self): 

40 return self._str # pylint: disable=E1101 

41 

42 

43def _autolag(mod, endog, exog, startlag, maxlag, method, modargs=(), 

44 fitargs=(), regresults=False): 

45 """ 

46 Returns the results for the lag length that maximizes the info criterion. 

47 

48 Parameters 

49 ---------- 

50 mod : Model class 

51 Model estimator class 

52 endog : array_like 

53 nobs array containing endogenous variable 

54 exog : array_like 

55 nobs by (startlag + maxlag) array containing lags and possibly other 

56 variables 

57 startlag : int 

58 The first zero-indexed column to hold a lag. See Notes. 

59 maxlag : int 

60 The highest lag order for lag length selection. 

61 method : {'aic', 'bic', 't-stat'} 

62 aic - Akaike Information Criterion 

63 bic - Bayes Information Criterion 

64 t-stat - Based on last lag 

65 modargs : tuple, optional 

66 args to pass to model. See notes. 

67 fitargs : tuple, optional 

68 args to pass to fit. See notes. 

69 regresults : bool, optional 

70 Flag indicating to return optional return results 

71 

72 Returns 

73 ------- 

74 icbest : float 

75 Best information criteria. 

76 bestlag : int 

77 The lag length that maximizes the information criterion. 

78 results : dict, optional 

79 Dictionary containing all estimation results 

80 

81 Notes 

82 ----- 

83 Does estimation like mod(endog, exog[:,:i], *modargs).fit(*fitargs) 

84 where i goes from lagstart to lagstart+maxlag+1. Therefore, lags are 

85 assumed to be in contiguous columns from low to high lag length with 

86 the highest lag in the last column. 

87 """ 

88 #TODO: can tcol be replaced by maxlag + 2? 

89 #TODO: This could be changed to laggedRHS and exog keyword arguments if 

90 # this will be more general. 

91 

92 results = {} 

93 method = method.lower() 

94 for lag in range(startlag, startlag + maxlag + 1): 

95 mod_instance = mod(endog, exog[:, :lag], *modargs) 

96 results[lag] = mod_instance.fit() 

97 

98 if method == "aic": 

99 icbest, bestlag = min((v.aic, k) for k, v in iteritems(results)) 

100 elif method == "bic": 

101 icbest, bestlag = min((v.bic, k) for k, v in iteritems(results)) 

102 elif method == "t-stat": 

103 #stop = stats.norm.ppf(.95) 

104 stop = 1.6448536269514722 

105 for lag in range(startlag + maxlag, startlag - 1, -1): 

106 icbest = np.abs(results[lag].tvalues[-1]) 

107 if np.abs(icbest) >= stop: 

108 bestlag = lag 

109 icbest = icbest 

110 break 

111 else: 

112 raise ValueError("Information Criterion %s not understood.") % method 

113 

114 if not regresults: 

115 return icbest, bestlag 

116 else: 

117 return icbest, bestlag, results 

118 

119 

120#this needs to be converted to a class like HetGoldfeldQuandt, 

121# 3 different returns are a mess 

122# See: 

123#Ng and Perron(2001), Lag length selection and the construction of unit root 

124#tests with good size and power, Econometrica, Vol 69 (6) pp 1519-1554 

125#TODO: include drift keyword, only valid with regression == "c" 

126# just changes the distribution of the test statistic to a t distribution 

127#TODO: autolag is untested 

128def adfuller(x, maxlag=None, regression="c", autolag='AIC', 

129 store=False, regresults=False): 

130 """ 

131 Augmented Dickey-Fuller unit root test. 

132 

133 The Augmented Dickey-Fuller test can be used to test for a unit root in a 

134 univariate process in the presence of serial correlation. 

135 

136 Parameters 

137 ---------- 

138 x : array_like, 1d 

139 The data series to test. 

140 maxlag : int 

141 Maximum lag which is included in test, default 12*(nobs/100)^{1/4}. 

142 regression : {'c','ct','ctt','nc'} 

143 Constant and trend order to include in regression. 

144 

145 * 'c' : constant only (default). 

146 * 'ct' : constant and trend. 

147 * 'ctt' : constant, and linear and quadratic trend. 

148 * 'nc' : no constant, no trend. 

149 

150 autolag : {'AIC', 'BIC', 't-stat', None} 

151 Method to use when automatically determining the lag. 

152 

153 * if None, then maxlag lags are used. 

154 * if 'AIC' (default) or 'BIC', then the number of lags is chosen 

155 to minimize the corresponding information criterion. 

156 * 't-stat' based choice of maxlag. Starts with maxlag and drops a 

157 lag until the t-statistic on the last lag length is significant 

158 using a 5%-sized test. 

159 store : bool 

160 If True, then a result instance is returned additionally to 

161 the adf statistic. Default is False. 

162 regresults : bool, optional 

163 If True, the full regression results are returned. Default is False. 

164 

165 Returns 

166 ------- 

167 adf : float 

168 The test statistic. 

169 pvalue : float 

170 MacKinnon's approximate p-value based on MacKinnon (1994, 2010). 

171 usedlag : int 

172 The number of lags used. 

173 nobs : int 

174 The number of observations used for the ADF regression and calculation 

175 of the critical values. 

176 critical values : dict 

177 Critical values for the test statistic at the 1 %, 5 %, and 10 % 

178 levels. Based on MacKinnon (2010). 

179 icbest : float 

180 The maximized information criterion if autolag is not None. 

181 resstore : ResultStore, optional 

182 A dummy class with results attached as attributes. 

183 

184 Notes 

185 ----- 

186 The null hypothesis of the Augmented Dickey-Fuller is that there is a unit 

187 root, with the alternative that there is no unit root. If the pvalue is 

188 above a critical size, then we cannot reject that there is a unit root. 

189 

190 The p-values are obtained through regression surface approximation from 

191 MacKinnon 1994, but using the updated 2010 tables. If the p-value is close 

192 to significant, then the critical values should be used to judge whether 

193 to reject the null. 

194 

195 The autolag option and maxlag for it are described in Greene. 

196 

197 References 

198 ---------- 

199 .. [1] W. Green. "Econometric Analysis," 5th ed., Pearson, 2003. 

200 

201 .. [2] Hamilton, J.D. "Time Series Analysis". Princeton, 1994. 

202 

203 .. [3] MacKinnon, J.G. 1994. "Approximate asymptotic distribution functions for 

204 unit-root and cointegration tests. `Journal of Business and Economic 

205 Statistics` 12, 167-76. 

206 

207 .. [4] MacKinnon, J.G. 2010. "Critical Values for Cointegration Tests." Queen's 

208 University, Dept of Economics, Working Papers. Available at 

209 http://ideas.repec.org/p/qed/wpaper/1227.html 

210 

211 Examples 

212 -------- 

213 See example notebook 
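
    A minimal usage sketch (the macrodata dataset ships with statsmodels;
    the column 'realgdp' and the chosen options are illustrative only):

    >>> import statsmodels.api as sm
    >>> from statsmodels.tsa.stattools import adfuller
    >>> data = sm.datasets.macrodata.load_pandas().data
    >>> result = adfuller(data['realgdp'], regression='ct', autolag='AIC')
    >>> adf_stat, pvalue = result[0], result[1]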

214 """ 

215 x = array_like(x, 'x') 

216 maxlag = int_like(maxlag, 'maxlag', optional=True) 

217 regression = string_like(regression, 'regression', 

218 options=('c', 'ct', 'ctt', 'nc')) 

219 autolag = string_like(autolag, 'autolag', optional=True, 

220 options=('aic', 'bic', 't-stat')) 

221 store = bool_like(store, 'store') 

222 regresults = bool_like(regresults, 'regresults') 

223 

224 if regresults: 

225 store = True 

226 

227 trenddict = {None: 'nc', 0: 'c', 1: 'ct', 2: 'ctt'} 

228 if regression is None or isinstance(regression, int): 

229 regression = trenddict[regression] 

230 regression = regression.lower() 

231 nobs = x.shape[0] 

232 

233 ntrend = len(regression) if regression != 'nc' else 0 

234 if maxlag is None: 

235 # from Greene referencing Schwert 1989 

236 maxlag = int(np.ceil(12. * np.power(nobs / 100., 1 / 4.))) 

237 # -1 for the diff 

238 maxlag = min(nobs // 2 - ntrend - 1, maxlag) 

239 if maxlag < 0: 

240 raise ValueError('sample size is too short to use selected ' 

241 'regression component') 

242 elif maxlag > nobs // 2 - ntrend - 1: 

243 raise ValueError('maxlag must be less than (nobs/2 - 1 - ntrend) ' 

244 'where n trend is the number of included ' 

245 'deterministic regressors') 

246 xdiff = np.diff(x) 

247 xdall = lagmat(xdiff[:, None], maxlag, trim='both', original='in') 

248 nobs = xdall.shape[0] 

249 

250 xdall[:, 0] = x[-nobs - 1:-1] # replace 0 xdiff with level of x 

251 xdshort = xdiff[-nobs:] 

252 

253 if store: 

254 resstore = ResultsStore() 

255 if autolag: 

256 if regression != 'nc': 

257 fullRHS = add_trend(xdall, regression, prepend=True) 

258 else: 

259 fullRHS = xdall 

260 startlag = fullRHS.shape[1] - xdall.shape[1] + 1 

261 # 1 for level 

262 # search for lag length with smallest information criteria 

263 # Note: use the same number of observations to have comparable IC 

264 # aic and bic: smaller is better 

265 

266 if not regresults: 

267 icbest, bestlag = _autolag(OLS, xdshort, fullRHS, startlag, 

268 maxlag, autolag) 

269 else: 

270 icbest, bestlag, alres = _autolag(OLS, xdshort, fullRHS, startlag, 

271 maxlag, autolag, 

272 regresults=regresults) 

273 resstore.autolag_results = alres 

274 

275 bestlag -= startlag # convert to lag not column index 

276 

277 # rerun ols with best autolag 

278 xdall = lagmat(xdiff[:, None], bestlag, trim='both', original='in') 

279 nobs = xdall.shape[0] 

280 xdall[:, 0] = x[-nobs - 1:-1] # replace 0 xdiff with level of x 

281 xdshort = xdiff[-nobs:] 

282 usedlag = bestlag 

283 else: 

284 usedlag = maxlag 

285 icbest = None 

286 if regression != 'nc': 

287 resols = OLS(xdshort, add_trend(xdall[:, :usedlag + 1], 

288 regression)).fit() 

289 else: 

290 resols = OLS(xdshort, xdall[:, :usedlag + 1]).fit() 

291 

292 adfstat = resols.tvalues[0] 

293# adfstat = (resols.params[0]-1.0)/resols.bse[0] 

294 # the "asymptotically correct" z statistic is obtained as 

295 # nobs/(1-np.sum(resols.params[1:-(trendorder+1)])) (resols.params[0] - 1) 

296 # I think this is the statistic that is used for series that are integrated 

297 # for orders higher than I(1), ie., not ADF but cointegration tests. 

298 

299 # Get approx p-value and critical values 

300 pvalue = mackinnonp(adfstat, regression=regression, N=1) 

301 critvalues = mackinnoncrit(N=1, regression=regression, nobs=nobs) 

302 critvalues = {"1%" : critvalues[0], "5%" : critvalues[1], 

303 "10%" : critvalues[2]} 

304 if store: 

305 resstore.resols = resols 

306 resstore.maxlag = maxlag 

307 resstore.usedlag = usedlag 

308 resstore.adfstat = adfstat 

309 resstore.critvalues = critvalues 

310 resstore.nobs = nobs 

311 resstore.H0 = ("The coefficient on the lagged level equals 1 - " 

312 "unit root") 

313 resstore.HA = "The coefficient on the lagged level < 1 - stationary" 

314 resstore.icbest = icbest 

315 resstore._str = 'Augmented Dickey-Fuller Test Results' 

316 return adfstat, pvalue, critvalues, resstore 

317 else: 

318 if not autolag: 

319 return adfstat, pvalue, usedlag, nobs, critvalues 

320 else: 

321 return adfstat, pvalue, usedlag, nobs, critvalues, icbest 

322 

323 

324def acovf(x, unbiased=False, demean=True, fft=None, missing='none', nlag=None): 

325 """ 

326 Estimate autocovariances. 

327 

328 Parameters 

329 ---------- 

330 x : array_like 

331 Time series data. Must be 1d. 

332 unbiased : bool 

333 If True, then denominators is n-k, otherwise n. 

334 demean : bool 

335 If True, then subtract the mean x from each element of x. 

336 fft : bool 

337 If True, use FFT convolution. This method should be preferred 

338 for long time series. 

339 missing : str 

340 A string in ['none', 'raise', 'conservative', 'drop'] specifying how 

341 the NaNs are to be treated. 

342 nlag : {int, None} 

343 Limit the number of autocovariances returned. Size of returned 

344 array is nlag + 1. Setting nlag when fft is False uses a simple, 

345 direct estimator of the autocovariances that only computes the first 

346 nlag + 1 values. This can be much faster when the time series is long 

347 and only a small number of autocovariances are needed. 

348 

349 Returns 

350 ------- 

351 ndarray 

352 The estimated autocovariances. 

353 

354 References 

355 ----------- 

356 .. [1] Parzen, E., 1963. On spectral analysis with missing observations 

357 and amplitude modulation. Sankhya: The Indian Journal of 

358 Statistics, Series A, pp.383-392. 
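
    Examples
    --------
    A minimal sketch (the simulated white-noise series is illustrative only):

    >>> import numpy as np
    >>> from statsmodels.tsa.stattools import acovf
    >>> np.random.seed(12345)
    >>> x = np.random.standard_normal(200)
    >>> gamma = acovf(x, fft=True, nlag=10)  # autocovariances at lags 0..10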

359 """ 

360 unbiased = bool_like(unbiased, 'unbiased') 

361 demean = bool_like(demean, 'demean') 

362 fft = bool_like(fft, 'fft', optional=True) 

363 missing = string_like(missing, 'missing', 

364 options=('none', 'raise', 'conservative', 'drop')) 

365 nlag = int_like(nlag, 'nlag', optional=True) 

366 

367 if fft is None: 

368 import warnings 

369 msg = 'fft=True will become the default in a future version of ' \ 

370 'statsmodels. To suppress this warning, explicitly set ' \ 

371 'fft=False.' 

372 warnings.warn(msg, FutureWarning) 

373 fft = False 

374 

375 x = array_like(x, 'x', ndim=1) 

376 

377 missing = missing.lower() 

378 if missing not in ['none', 'raise', 'conservative', 'drop']: 

379 raise ValueError("missing option %s not understood" % missing) 

380 if missing == 'none': 

381 deal_with_masked = False 

382 else: 

383 deal_with_masked = has_missing(x) 

384 if deal_with_masked: 

385 if missing == 'raise': 

386 raise MissingDataError("NaNs were encountered in the data") 

387 notmask_bool = ~np.isnan(x) # bool 

388 if missing == 'conservative': 

389 # Must copy for thread safety 

390 x = x.copy() 

391 x[~notmask_bool] = 0 

392 else: # 'drop' 

393 x = x[notmask_bool] # copies non-missing 

394 notmask_int = notmask_bool.astype(int) # int 

395 

396 if demean and deal_with_masked: 

397 # whether 'drop' or 'conservative': 

398 xo = x - x.sum() / notmask_int.sum() 

399 if missing == 'conservative': 

400 xo[~notmask_bool] = 0 

401 elif demean: 

402 xo = x - x.mean() 

403 else: 

404 xo = x 

405 

406 n = len(x) 

407 lag_len = nlag 

408 if nlag is None: 

409 lag_len = n - 1 

410 elif nlag > n - 1: 

411 raise ValueError('nlag must be smaller than nobs - 1') 

412 

413 if not fft and nlag is not None: 

414 acov = np.empty(lag_len + 1) 

415 acov[0] = xo.dot(xo) 

416 for i in range(lag_len): 

417 acov[i + 1] = xo[i + 1:].dot(xo[:-(i + 1)]) 

418 if not deal_with_masked or missing == 'drop': 

419 if unbiased: 

420 acov /= (n - np.arange(lag_len + 1)) 

421 else: 

422 acov /= n 

423 else: 

424 if unbiased: 

425 divisor = np.empty(lag_len + 1, dtype=np.int64) 

426 divisor[0] = notmask_int.sum() 

427 for i in range(lag_len): 

428 divisor[i + 1] = notmask_int[i + 1:].dot(notmask_int[:-(i + 1)]) 

429 divisor[divisor == 0] = 1 

430 acov /= divisor 

431 else: # biased, missing data but npt 'drop' 

432 acov /= notmask_int.sum() 

433 return acov 

434 

435 if unbiased and deal_with_masked and missing == 'conservative': 

436 d = np.correlate(notmask_int, notmask_int, 'full') 

437 d[d == 0] = 1 

438 elif unbiased: 

439 xi = np.arange(1, n + 1) 

440 d = np.hstack((xi, xi[:-1][::-1])) 

441 elif deal_with_masked: # biased and NaNs given and ('drop' or 'conservative') 

442 d = notmask_int.sum() * np.ones(2 * n - 1) 

443 else: # biased and no NaNs or missing=='none' 

444 d = n * np.ones(2 * n - 1) 

445 

446 if fft: 

447 nobs = len(xo) 

448 n = _next_regular(2 * nobs + 1) 

449 Frf = np.fft.fft(xo, n=n) 

450 acov = np.fft.ifft(Frf * np.conjugate(Frf))[:nobs] / d[nobs - 1:] 

451 acov = acov.real 

452 else: 

453 acov = np.correlate(xo, xo, 'full')[n - 1:] / d[n - 1:] 

454 

455 if nlag is not None: 

456 # Copy to allow gc of full array rather than view 

457 return acov[:lag_len + 1].copy() 

458 return acov 

459 

460 

461def q_stat(x, nobs, type=None): 

462 """ 

463 Compute Ljung-Box Q Statistic. 

464 

465 Parameters 

466 ---------- 

467 x : array_like 

468 Array of autocorrelation coefficients. Can be obtained from acf. 

469 nobs : int, optional 

470 Number of observations in the entire sample (ie., not just the length 

471 of the autocorrelation function results. 

472 

473 Returns 

474 ------- 

475 q-stat : ndarray 

476 Ljung-Box Q-statistic for autocorrelation parameters. 

477 p-value : ndarray 

478 P-value of the Q statistic. 

479 

480 Notes 

481 ----- 

482 Designed to be used with acf. 
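
    Examples
    --------
    A minimal sketch chaining acf and q_stat (illustrative only):

    >>> import numpy as np
    >>> from statsmodels.tsa.stattools import acf, q_stat
    >>> np.random.seed(0)
    >>> x = np.random.standard_normal(500)
    >>> r = acf(x, nlags=10, fft=True)
    >>> qstats, pvalues = q_stat(r[1:], nobs=len(x))  # drop lag 0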

483 """ 

484 x = array_like(x, 'x') 

485 nobs = int_like(nobs, 'nobs') 

486 

487 if type is not None: 

488 import warnings 

489 warnings.warn('The `type` argument is deprecated and has no effect', 

490 FutureWarning) 

491 ret = (nobs * (nobs + 2) * 

492 np.cumsum((1. / (nobs - np.arange(1, len(x) + 1))) * x ** 2)) 

493 chi2 = stats.chi2.sf(ret, np.arange(1, len(x) + 1)) 

494 return ret, chi2 

495 

496 

497#NOTE: Changed unbiased to False 

498#see for example 

499# http://www.itl.nist.gov/div898/handbook/eda/section3/autocopl.htm 

500def acf(x, unbiased=False, nlags=40, qstat=False, fft=None, alpha=None, 

501 missing='none'): 

502 """ 

503 Calculate the autocorrelation function. 

504 

505 Parameters 

506 ---------- 

507 x : array_like 

508 The time series data. 

509 unbiased : bool 

510 If True, then denominators for autocovariance are n-k, otherwise n. 

511 nlags : int, optional 

512 Number of lags to return autocorrelation for. 

513 qstat : bool, optional 

514 If True, returns the Ljung-Box q statistic for each autocorrelation 

515 coefficient. See q_stat for more information. 

516 fft : bool, optional 

517 If True, computes the ACF via FFT. 

518 alpha : scalar, optional 

519 If a number is given, the confidence intervals for the given level are 

520 returned. For instance if alpha=.05, 95 % confidence intervals are 

521 returned where the standard deviation is computed according to 

522 Bartlett's formula. 

523 missing : str, optional 

524 A string in ['none', 'raise', 'conservative', 'drop'] specifying how the NaNs 

525 are to be treated. 

526 

527 Returns 

528 ------- 

529 acf : ndarray 

530 The autocorrelation function. 

531 confint : ndarray, optional 

532 Confidence intervals for the ACF. Returned if alpha is not None. 

533 qstat : ndarray, optional 

534 The Ljung-Box Q-Statistic. Returned if q_stat is True. 

535 pvalues : ndarray, optional 

536 The p-values associated with the Q-statistics. Returned if q_stat is 

537 True. 

538 

539 Notes 

540 ----- 

541 The acf at lag 0 (ie., 1) is returned. 

542 

543 For very long time series it is recommended to use fft convolution instead. 

544 When fft is False uses a simple, direct estimator of the autocovariances 

545 that only computes the first nlag + 1 values. This can be much faster when 

546 the time series is long and only a small number of autocovariances are 

547 needed. 

548 

549 If unbiased is true, the denominator for the autocovariance is adjusted 

550 but the autocorrelation is not an unbiased estimator. 

551 

552 References 

553 ---------- 

554 .. [1] Parzen, E., 1963. On spectral analysis with missing observations 

555 and amplitude modulation. Sankhya: The Indian Journal of 

556 Statistics, Series A, pp.383-392. 
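
    Examples
    --------
    A minimal sketch with Bartlett confidence intervals (illustrative only):

    >>> import numpy as np
    >>> from statsmodels.tsa.stattools import acf
    >>> np.random.seed(0)
    >>> x = np.random.standard_normal(500).cumsum()  # a random walk
    >>> r, confint = acf(x, nlags=20, alpha=0.05, fft=True)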

557 """ 

558 unbiased = bool_like(unbiased, 'unbiased') 

559 nlags = int_like(nlags, 'nlags') 

560 qstat = bool_like(qstat, 'qstat') 

561 fft = bool_like(fft, 'fft', optional=True) 

562 alpha = float_like(alpha, 'alpha', optional=True) 

563 missing = string_like(missing, 'missing', 

564 options=('none', 'raise', 'conservative', 'drop')) 

565 

566 if fft is None: 

567 import warnings 

568 warnings.warn( 

569 'fft=True will become the default in a future version of ' 

570 'statsmodels. To suppress this warning, explicitly set ' 

571 'fft=False.', 

572 FutureWarning 

573 ) 

574 fft = False 

575 x = array_like(x, 'x') 

576 nobs = len(x) # TODO: should this shrink for missing='drop' and NaNs in x? 

577 avf = acovf(x, unbiased=unbiased, demean=True, fft=fft, missing=missing) 

578 acf = avf[:nlags + 1] / avf[0] 

579 if not (qstat or alpha): 

580 return acf 

581 if alpha is not None: 

582 varacf = np.ones(nlags + 1) / nobs 

583 varacf[0] = 0 

584 varacf[1] = 1. / nobs 

585 varacf[2:] *= 1 + 2 * np.cumsum(acf[1:-1]**2) 

586 interval = stats.norm.ppf(1 - alpha / 2.) * np.sqrt(varacf) 

587 confint = np.array(lzip(acf - interval, acf + interval)) 

588 if not qstat: 

589 return acf, confint 

590 if qstat: 

591 qstat, pvalue = q_stat(acf[1:], nobs=nobs) # drop lag 0 

592 if alpha is not None: 

593 return acf, confint, qstat, pvalue 

594 else: 

595 return acf, qstat, pvalue 

596 

597 

598def pacf_yw(x, nlags=40, method='unbiased'): 

599 """ 

600 Partial autocorrelation estimated with non-recursive yule_walker. 

601 

602 Parameters 

603 ---------- 

604 x : array_like 

605 The observations of time series for which pacf is calculated. 

606 nlags : int, optional 

607 The largest lag for which pacf is returned. 

608 method : {'unbiased', 'mle'} 

609 The method for the autocovariance calculations in yule walker. 

610 

611 Returns 

612 ------- 

613 ndarray 

614 The partial autocorrelations, maxlag+1 elements. 

615 

616 See Also 

617 -------- 

618 statsmodels.tsa.stattools.pacf 

619 Partial autocorrelation estimation. 

620 statsmodels.tsa.stattools.pacf_ols 

621 Partial autocorrelation estimation using OLS. 

622 statsmodels.tsa.stattools.pacf_burg 

623 Partial autocorrelation estimation using Burg's method. 

624 

625 Notes 

626 ----- 

627 This solves yule_walker for each desired lag and contains 

628 currently duplicate calculations. 

629 """ 

630 x = array_like(x, 'x') 

631 nlags = int_like(nlags, 'nlags') 

632 method = string_like(method, 'method', options=('unbiased', 'mle')) 

633 

634 pacf = [1.] 

635 for k in range(1, nlags + 1): 

636 pacf.append(yule_walker(x, k, method=method)[0][-1]) 

637 return np.array(pacf) 

638 

639 

640def pacf_burg(x, nlags=None, demean=True): 

641 """ 

642 Calculate Burg's partial autocorrelation estimator. 

643 

644 Parameters 

645 ---------- 

646 x : array_like 

647 Observations of time series for which pacf is calculated. 

648 nlags : int, optional 

649 Number of lags to compute the partial autocorrelations. If omitted, 

650 uses the smaller of 10(log10(nobs)) or nobs - 1. 

651 demean : bool, optional 

652 Flag indicating to demean that data. Set to False if x has been 

653 previously demeaned. 

654 

655 Returns 

656 ------- 

657 pacf : ndarray 

658 Partial autocorrelations for lags 0, 1, ..., nlag. 

659 sigma2 : ndarray 

660 Residual variance estimates where the value in position m is the 

661 residual variance in an AR model that includes m lags. 

662 

663 See Also 

664 -------- 

665 statsmodels.tsa.stattools.pacf 

666 Partial autocorrelation estimation. 

667 statsmodels.tsa.stattools.pacf_yw 

668 Partial autocorrelation estimation using Yule-Walker. 

669 statsmodels.tsa.stattools.pacf_ols 

670 Partial autocorrelation estimation using OLS. 

671 

672 References 

673 ---------- 

674 .. [1] Brockwell, P.J. and Davis, R.A., 2016. Introduction to time series 

675 and forecasting. Springer. 
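
    Examples
    --------
    A minimal sketch (illustrative only):

    >>> import numpy as np
    >>> from statsmodels.tsa.stattools import pacf_burg
    >>> np.random.seed(0)
    >>> x = np.random.standard_normal(250)
    >>> pacf, sigma2 = pacf_burg(x, nlags=10, demean=True)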

676 """ 

677 x = array_like(x, 'x') 

678 if demean: 

679 x = x - x.mean() 

680 nobs = x.shape[0] 

681 p = nlags if nlags is not None else min(int(10 * np.log10(nobs)), nobs - 1) 

682 if p > nobs - 1: 

683 raise ValueError('nlags must be smaller than nobs - 1') 

684 d = np.zeros(p + 1) 

685 d[0] = 2 * x.dot(x) 

686 pacf = np.zeros(p + 1) 

687 u = x[::-1].copy() 

688 v = x[::-1].copy() 

689 d[1] = u[:-1].dot(u[:-1]) + v[1:].dot(v[1:]) 

690 pacf[1] = 2 / d[1] * v[1:].dot(u[:-1]) 

691 last_u = np.empty_like(u) 

692 last_v = np.empty_like(v) 

693 for i in range(1, p): 

694 last_u[:] = u 

695 last_v[:] = v 

696 u[1:] = last_u[:-1] - pacf[i] * last_v[1:] 

697 v[1:] = last_v[1:] - pacf[i] * last_u[:-1] 

698 d[i + 1] = (1 - pacf[i] ** 2) * d[i] - v[i] ** 2 - u[-1] ** 2 

699 pacf[i + 1] = 2 / d[i + 1] * v[i + 1:].dot(u[i:-1]) 

700 sigma2 = (1 - pacf ** 2) * d / (2. * (nobs - np.arange(0, p + 1))) 

701 pacf[0] = 1 # Insert the 0 lag partial autocorrel 

702 

703 return pacf, sigma2 

704 

705 

706def pacf_ols(x, nlags=40, efficient=True, unbiased=False): 

707 """ 

708 Calculate partial autocorrelations via OLS. 

709 

710 Parameters 

711 ---------- 

712 x : array_like 

713 Observations of time series for which pacf is calculated. 

714 nlags : int 

715 Number of lags for which pacf is returned. Lag 0 is not returned. 

716 efficient : bool, optional 

717 If true, uses the maximum number of available observations to compute 

718 each partial autocorrelation. If not, uses the same number of 

719 observations to compute all pacf values. 

720 unbiased : bool, optional 

721 Adjust each partial autocorrelation by n / (n - lag). 

722 

723 Returns 

724 ------- 

725 ndarray 

726 The partial autocorrelations, (maxlag,) array corresponding to lags 

727 0, 1, ..., maxlag. 

728 

729 See Also 

730 -------- 

731 statsmodels.tsa.stattools.pacf 

732 Partial autocorrelation estimation. 

733 statsmodels.tsa.stattools.pacf_yw 

734 Partial autocorrelation estimation using Yule-Walker. 

735 statsmodels.tsa.stattools.pacf_burg 

736 Partial autocorrelation estimation using Burg's method. 

737 

738 Notes 

739 ----- 

740 This solves a separate OLS estimation for each desired lag using method in 

741 [1]_. Setting efficient to True has two effects. First, it uses 

742 `nobs - lag` observations of estimate each pacf. Second, it re-estimates 

743 the mean in each regression. If efficient is False, then the data are first 

744 demeaned, and then `nobs - maxlag` observations are used to estimate each 

745 partial autocorrelation. 

746 

747 The inefficient estimator appears to have better finite sample properties. 

748 This option should only be used in time series that are covariance 

749 stationary. 

750 

751 OLS estimation of the pacf does not guarantee that all pacf values are 

752 between -1 and 1. 

753 

754 References 

755 ---------- 

756 .. [1] Box, G. E., Jenkins, G. M., Reinsel, G. C., & Ljung, G. M. (2015). 

757 Time series analysis: forecasting and control. John Wiley & Sons, p. 66 
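
    Examples
    --------
    A minimal sketch comparing the efficient and inefficient variants
    (illustrative only):

    >>> import numpy as np
    >>> from statsmodels.tsa.stattools import pacf_ols
    >>> np.random.seed(0)
    >>> x = np.random.standard_normal(250)
    >>> p_eff = pacf_ols(x, nlags=10, efficient=True)
    >>> p_ineff = pacf_ols(x, nlags=10, efficient=False)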

758 """ 

759 x = array_like(x, 'x') 

760 nlags = int_like(nlags, 'nlags') 

761 efficient = bool_like(efficient, 'efficient') 

762 unbiased = bool_like(unbiased, 'unbiased') 

763 

764 pacf = np.empty(nlags + 1) 

765 pacf[0] = 1.0 

766 if efficient: 

767 xlags, x0 = lagmat(x, nlags, original='sep') 

768 xlags = add_constant(xlags) 

769 for k in range(1, nlags + 1): 

770 params = lstsq(xlags[k:, :k + 1], x0[k:], rcond=None)[0] 

771 pacf[k] = params[-1] 

772 else: 

773 x = x - np.mean(x) 

774 # Create a single set of lags for multivariate OLS 

775 xlags, x0 = lagmat(x, nlags, original='sep', trim='both') 

776 for k in range(1, nlags + 1): 

777 params = lstsq(xlags[:, :k], x0, rcond=None)[0] 

778 # Last coefficient corresponds to PACF value (see [1]) 

779 pacf[k] = params[-1] 

780 

781 if unbiased: 

782 n = len(x) 

783 pacf *= n / (n - np.arange(nlags + 1)) 

784 

785 return pacf 

786 

787 

788def pacf(x, nlags=40, method='ywunbiased', alpha=None): 

789 """ 

790 Partial autocorrelation estimate. 

791 

792 Parameters 

793 ---------- 

794 x : array_like 

795 Observations of time series for which pacf is calculated. 

796 nlags : int, optional 

797 The largest lag for which the pacf is returned. 

798 method : str, optional 

799 Specifies which method for the calculations to use. 

800 

801 - 'yw' or 'ywunbiased' : Yule-Walker with bias correction in 

802 denominator for acovf. Default. 

803 - 'ywm' or 'ywmle' : Yule-Walker without bias correction. 

804 - 'ols' : regression of time series on lags of it and on constant. 

805 - 'ols-inefficient' : regression of time series on lags using a single 

806 common sample to estimate all pacf coefficients. 

807 - 'ols-unbiased' : regression of time series on lags with a bias 

808 adjustment. 

809 - 'ld' or 'ldunbiased' : Levinson-Durbin recursion with bias 

810 correction. 

811 - 'ldb' or 'ldbiased' : Levinson-Durbin recursion without bias 

812 correction. 

813 

814 alpha : float, optional 

815 If a number is given, the confidence intervals for the given level are 

816 returned. For instance if alpha=.05, 95 % confidence intervals are 

817 returned where the standard deviation is computed according to 

818 1/sqrt(len(x)). 

819 

820 Returns 

821 ------- 

822 pacf : ndarray 

823 Partial autocorrelations, nlags elements, including lag zero. 

824 confint : ndarray, optional 

825 Confidence intervals for the PACF. Returned if confint is not None. 

826 

827 See Also 

828 -------- 

829 statsmodels.tsa.stattools.acf 

830 Estimate the autocorrelation function. 

831 statsmodels.tsa.stattools.pacf 

832 Partial autocorrelation estimation. 

833 statsmodels.tsa.stattools.pacf_yw 

834 Partial autocorrelation estimation using Yule-Walker. 

835 statsmodels.tsa.stattools.pacf_ols 

836 Partial autocorrelation estimation using OLS. 

837 statsmodels.tsa.stattools.pacf_burg 

838 Partial autocorrelation estimation using Burg's method. 

839 

840 Notes 

841 ----- 

842 Based on simulation evidence across a range of low-order ARMA models, 

843 the best methods based on root MSE are Yule-Walker (MLW), Levinson-Durbin 

844 (MLE) and Burg, respectively. The estimators with the lowest bias included 

845 included these three in addition to OLS and OLS-unbiased. 

846 

847 Yule-Walker (unbiased) and Levinson-Durbin (unbiased) performed 

848 consistently worse than the other options. 
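
    Examples
    --------
    A minimal sketch with confidence intervals (illustrative only):

    >>> import numpy as np
    >>> from statsmodels.tsa.stattools import pacf
    >>> np.random.seed(0)
    >>> x = np.random.standard_normal(300)
    >>> p, confint = pacf(x, nlags=15, method='ywunbiased', alpha=0.05)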

849 """ 

850 nlags = int_like(nlags, 'nlags') 

851 methods = ('ols', 'ols-inefficient', 'ols-unbiased', 'yw', 'ywu', 'ld', 

852 'ywunbiased', 'yw_unbiased', 'ywm', 'ywmle', 'yw_mle', 'ldu', 

853 'ldunbiased', 'ld_unbiased', 'ldb', 'ldbiased', 'ld_biased') 

854 method = string_like(method, 'method', options=methods) 

855 

856 alpha = float_like(alpha, 'alpha', optional=True) 

857 

858 if method in ('ols', 'ols-inefficient', 'ols-unbiased'): 

859 efficient = 'inefficient' not in method 

860 unbiased = 'unbiased' in method 

861 ret = pacf_ols(x, nlags=nlags, efficient=efficient, unbiased=unbiased) 

862 elif method in ('yw', 'ywu', 'ywunbiased', 'yw_unbiased'): 

863 ret = pacf_yw(x, nlags=nlags, method='unbiased') 

864 elif method in ('ywm', 'ywmle', 'yw_mle'): 

865 ret = pacf_yw(x, nlags=nlags, method='mle') 

866 elif method in ('ld', 'ldu', 'ldunbiased', 'ld_unbiased'): 

867 acv = acovf(x, unbiased=True, fft=False) 

868 ld_ = levinson_durbin(acv, nlags=nlags, isacov=True) 

869 ret = ld_[2] 

870 # inconsistent naming with ywmle 

871 else: # method in ('ldb', 'ldbiased', 'ld_biased') 

872 acv = acovf(x, unbiased=False, fft=False) 

873 ld_ = levinson_durbin(acv, nlags=nlags, isacov=True) 

874 ret = ld_[2] 

875 

876 if alpha is not None: 

877 varacf = 1. / len(x) # for all lags >=1 

878 interval = stats.norm.ppf(1. - alpha / 2.) * np.sqrt(varacf) 

879 confint = np.array(lzip(ret - interval, ret + interval)) 

880 confint[0] = ret[0] # fix confidence interval for lag 0 to varpacf=0 

881 return ret, confint 

882 else: 

883 return ret 

884 

885 

886def ccovf(x, y, unbiased=True, demean=True): 

887 """ 

888 Calculate the crosscovariance between two series. 

889 

890 Parameters 

891 ---------- 

892 x, y : array_like 

893 The time series data to use in the calculation. 

894 unbiased : bool, optional 

895 If True, then denominators for autocovariance is n-k, otherwise n. 

896 demean : bool, optional 

897 Flag indicating whether to demean x and y. 

898 

899 Returns 

900 ------- 

901 ndarray 

902 The estimated crosscovariance function. 

903 

904 Notes 

905 ----- 

906 This uses np.correlate which does full convolution. For very long time 

907 series it is recommended to use fft convolution instead. 

908 """ 

909 x = array_like(x, 'x') 

910 y = array_like(y, 'y') 

911 unbiased = bool_like(unbiased, 'unbiased') 

912 demean = bool_like(demean, 'demean') 

913 

914 n = len(x) 

915 if demean: 

916 xo = x - x.mean() 

917 yo = y - y.mean() 

918 else: 

919 xo = x 

920 yo = y 

921 if unbiased: 

922 xi = np.ones(n) 

923 d = np.correlate(xi, xi, 'full') 

924 else: 

925 d = n 

926 return (np.correlate(xo, yo, 'full') / d)[n - 1:] 

927 

928 

929def ccf(x, y, unbiased=True): 

930 """ 

931 The cross-correlation function. 

932 

933 Parameters 

934 ---------- 

935 x, y : array_like 

936 The time series data to use in the calculation. 

937 unbiased : bool 

938 If True, then denominators for autocovariance is n-k, otherwise n. 

939 

940 Returns 

941 ------- 

942 ndarray 

943 The cross-correlation function of x and y. 

944 

945 Notes 

946 ----- 

947 This is based np.correlate which does full convolution. For very long time 

948 series it is recommended to use fft convolution instead. 

949 

950 If unbiased is true, the denominator for the autocovariance is adjusted 

951 but the autocorrelation is not an unbiased estimator. 
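
    Examples
    --------
    A minimal sketch on two simulated series (illustrative only):

    >>> import numpy as np
    >>> from statsmodels.tsa.stattools import ccf
    >>> np.random.seed(0)
    >>> x = np.random.standard_normal(300)
    >>> y = np.random.standard_normal(300)
    >>> r_xy = ccf(x, y)  # cross-correlations at lags 0, 1, 2, ...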

952 """ 

953 x = array_like(x, 'x') 

954 y = array_like(y, 'y') 

955 unbiased = bool_like(unbiased, 'unbiased') 

956 

957 cvf = ccovf(x, y, unbiased=unbiased, demean=True) 

958 return cvf / (np.std(x) * np.std(y)) 

959 

960 

961def periodogram(x): 

962 """ 

963 Compute the periodogram for the natural frequency of x. 

964 

965 .. deprecated:: 

966 Use scipy.signal.periodogram instead 

967 

968 Parameters 

969 ---------- 

970 x : array_like 

971 Array for which the periodogram is desired. 

972 

973 Returns 

974 ------- 

975 ndarray 

976 The periodogram defined as 1./len(x) * np.abs(np.fft.fft(x))**2. 

977 

978 References 

979 ---------- 

980 .. [1] Brockwell, P.J. and Davis, R.A., 2016. Introduction to time series 

981 and forecasting. Springer. 

982 """ 

983 # TODO: Remove after 0.11 

984 import warnings 

985 warnings.warn('periodogram is deprecated and will be removed after 0.11. ' 

986 'Use scipy.signal.periodogram instead.', FutureWarning) 

987 x = array_like(x, 'x') 

988 

989 pergr = 1. / len(x) * np.abs(np.fft.fft(x)) ** 2 

990 pergr[0] = 0. # what are the implications of this? 

991 return pergr 

992 

993 

994# moved from sandbox.tsa.examples.try_ld_nitime, via nitime 

995# TODO: check what to return, for testing and trying out returns everything 

996def levinson_durbin(s, nlags=10, isacov=False): 

997 """ 

998 Levinson-Durbin recursion for autoregressive processes. 

999 

1000 Parameters 

1001 ---------- 

1002 s : array_like 

1003 If isacov is False, then this is the time series. If iasacov is true 

1004 then this is interpreted as autocovariance starting with lag 0. 

1005 nlags : int, optional 

1006 The largest lag to include in recursion or order of the autoregressive 

1007 process. 

1008 isacov : bool, optional 

1009 Flag indicating whether the first argument, s, contains the 

1010 autocovariances or the data series. 

1011 

1012 Returns 

1013 ------- 

1014 sigma_v : float 

1015 The estimate of the error variance. 

1016 arcoefs : ndarray 

1017 The estimate of the autoregressive coefficients for a model including 

1018 nlags. 

1019 pacf : ndarray 

1020 The partial autocorrelation function. 

1021 sigma : ndarray 

1022 The entire sigma array from intermediate result, last value is sigma_v. 

1023 phi : ndarray 

1024 The entire phi array from intermediate result, last column contains 

1025 autoregressive coefficients for AR(nlags). 

1026 

1027 Notes 

1028 ----- 

1029 This function returns currently all results, but maybe we drop sigma and 

1030 phi from the returns. 

1031 

1032 If this function is called with the time series (isacov=False), then the 

1033 sample autocovariance function is calculated with the default options 

1034 (biased, no fft). 
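
    Examples
    --------
    A minimal sketch starting from sample autocovariances (illustrative
    only):

    >>> import numpy as np
    >>> from statsmodels.tsa.stattools import acovf, levinson_durbin
    >>> np.random.seed(0)
    >>> x = np.random.standard_normal(400)
    >>> acov = acovf(x, unbiased=False, fft=False)
    >>> sigma_v, arcoefs, pacf_, sig, phi = levinson_durbin(
    ...     acov, nlags=5, isacov=True)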

1035 """ 

1036 s = array_like(s, 's') 

1037 nlags = int_like(nlags, 'nlags') 

1038 isacov = bool_like(isacov, 'isacov') 

1039 

1040 order = nlags 

1041 

1042 if isacov: 

1043 sxx_m = s 

1044 else: 

1045 sxx_m = acovf(s, fft=False)[:order + 1] # not tested 

1046 

1047 phi = np.zeros((order + 1, order + 1), 'd') 

1048 sig = np.zeros(order + 1) 

1049 # initial points for the recursion 

1050 phi[1, 1] = sxx_m[1] / sxx_m[0] 

1051 sig[1] = sxx_m[0] - phi[1, 1] * sxx_m[1] 

1052 for k in range(2, order + 1): 

1053 phi[k, k] = (sxx_m[k] - np.dot(phi[1:k, k-1], 

1054 sxx_m[1:k][::-1])) / sig[k-1] 

1055 for j in range(1, k): 

1056 phi[j, k] = phi[j, k-1] - phi[k, k] * phi[k-j, k-1] 

1057 sig[k] = sig[k-1] * (1 - phi[k, k]**2) 

1058 

1059 sigma_v = sig[-1] 

1060 arcoefs = phi[1:, -1] 

1061 pacf_ = np.diag(phi).copy() 

1062 pacf_[0] = 1. 

1063 return sigma_v, arcoefs, pacf_, sig, phi # return everything 

1064 

1065 

1066def levinson_durbin_pacf(pacf, nlags=None): 

1067 """ 

1068 Levinson-Durbin algorithm that returns the acf and ar coefficients. 

1069 

1070 Parameters 

1071 ---------- 

1072 pacf : array_like 

1073 Partial autocorrelation array for lags 0, 1, ... p. 

1074 nlags : int, optional 

1075 Number of lags in the AR model. If omitted, returns coefficients from 

1076 an AR(p) and the first p autocorrelations. 

1077 

1078 Returns 

1079 ------- 

1080 arcoefs : ndarray 

1081 AR coefficients computed from the partial autocorrelations. 

1082 acf : ndarray 

1083 The acf computed from the partial autocorrelations. Array returned 

1084 contains the autocorrelations corresponding to lags 0, 1, ..., p. 

1085 

1086 References 

1087 ---------- 

1088 .. [1] Brockwell, P.J. and Davis, R.A., 2016. Introduction to time series 

1089 and forecasting. Springer. 
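
    Examples
    --------
    A minimal sketch mapping a pacf back to ar coefficients and the acf
    (the pacf values are illustrative only):

    >>> import numpy as np
    >>> from statsmodels.tsa.stattools import levinson_durbin_pacf
    >>> pacf = np.array([1.0, 0.5, -0.2, 0.1])
    >>> arcoefs, acf = levinson_durbin_pacf(pacf)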

1090 """ 

1091 pacf = array_like(pacf, 'pacf') 

1092 nlags = int_like(nlags, 'nlags', optional=True) 

1093 pacf = np.squeeze(np.asarray(pacf)) 

1094 

1095 if pacf[0] != 1: 

1096 raise ValueError('The first entry of the pacf corresponds to lags 0 ' 

1097 'and so must be 1.') 

1098 pacf = pacf[1:] 

1099 n = pacf.shape[0] 

1100 if nlags is not None: 

1101 if nlags > n: 

1102 raise ValueError('Must provide at least as many values from the ' 

1103 'pacf as the number of lags.') 

1104 pacf = pacf[:nlags] 

1105 n = pacf.shape[0] 

1106 

1107 acf = np.zeros(n + 1) 

1108 acf[1] = pacf[0] 

1109 nu = np.cumprod(1 - pacf ** 2) 

1110 arcoefs = pacf.copy() 

1111 for i in range(1, n): 

1112 prev = arcoefs[:-(n - i)].copy() 

1113 arcoefs[:-(n - i)] = prev - arcoefs[i] * prev[::-1] 

1114 acf[i + 1] = arcoefs[i] * nu[i-1] + prev.dot(acf[1:-(n - i)][::-1]) 

1115 acf[0] = 1 

1116 return arcoefs, acf 

1117 

1118 

1119def grangercausalitytests(x, maxlag, addconst=True, verbose=True): 

1120 """ 

1121 Four tests for granger non causality of 2 time series. 

1122 

1123 All four tests give similar results. `params_ftest` and `ssr_ftest` are 

1124 equivalent based on F test which is identical to lmtest:grangertest in R. 

1125 

1126 Parameters 

1127 ---------- 

1128 x : array_like 

1129 The data for test whether the time series in the second column Granger 

1130 causes the time series in the first column. Missing values are not 

1131 supported. 

1132 maxlag : {int, Iterable[int]} 

1133 If an integer, computes the test for all lags up to maxlag. If an 

1134 iterable, computes the tests only for the lags in maxlag. 

1135 addconst : bool 

1136 Include a constant in the model. 

1137 verbose : bool 

1138 Print results. 

1139 

1140 Returns 

1141 ------- 

1142 dict 

1143 All test results, dictionary keys are the number of lags. For each 

1144 lag the values are a tuple, with the first element a dictionary with 

1145 test statistic, pvalues, degrees of freedom, the second element are 

1146 the OLS estimation results for the restricted model, the unrestricted 

1147 model and the restriction (contrast) matrix for the parameter f_test. 

1148 

1149 Notes 

1150 ----- 

1151 TODO: convert to class and attach results properly 

1152 

1153 The Null hypothesis for grangercausalitytests is that the time series in 

1154 the second column, x2, does NOT Granger cause the time series in the first 

1155 column, x1. Grange causality means that past values of x2 have a 

1156 statistically significant effect on the current value of x1, taking past 

1157 values of x1 into account as regressors. We reject the null hypothesis 

1158 that x2 does not Granger cause x1 if the pvalues are below a desired size 

1159 of the test. 

1160 

1161 The null hypothesis for all four test is that the coefficients 

1162 corresponding to past values of the second time series are zero. 

1163 

1164 'params_ftest', 'ssr_ftest' are based on F distribution 

1165 

1166 'ssr_chi2test', 'lrtest' are based on chi-square distribution 

1167 

1168 References 

1169 ---------- 

1170 .. [1] https://en.wikipedia.org/wiki/Granger_causality 

1171 

1172 .. [2] Greene: Econometric Analysis 

1173 

1174 Examples 

1175 -------- 

1176 >>> import statsmodels.api as sm 

1177 >>> from statsmodels.tsa.stattools import grangercausalitytests 

1178 >>> import numpy as np 

1179 >>> data = sm.datasets.macrodata.load_pandas() 

1180 >>> data = data.data[['realgdp', 'realcons']].pct_change().dropna() 

1181 

1182 # All lags up to 4 

1183 >>> gc_res = grangercausalitytests(data, 4) 

1184 

1185 # Only lag 4 

1186 >>> gc_res = grangercausalitytests(data, [4]) 

1187 """ 

1188 x = array_like(x, 'x', ndim=2) 

1189 if not np.isfinite(x).all(): 

1190 raise ValueError('x contains NaN or inf values.') 

1191 addconst = bool_like(addconst, 'addconst') 

1192 verbose = bool_like(verbose, 'verbose') 

1193 try: 

1194 lags = np.array([int(lag) for lag in maxlag]) 

1195 maxlag = lags.max() 

1196 if lags.min() <= 0 or lags.size == 0: 

1197 raise ValueError('maxlag must be a non-empty list containing only ' 

1198 'positive integers') 

1199 except Exception: 

1200 maxlag = int_like(maxlag, 'maxlag') 

1201 if maxlag <= 0: 

1202 raise ValueError('maxlag must a a positive integer') 

1203 lags = np.arange(1, maxlag + 1) 

1204 

1205 if x.shape[0] <= 3 * maxlag + int(addconst): 

1206 raise ValueError("Insufficient observations. Maximum allowable " 

1207 "lag is {0}".format(int((x.shape[0] - int(addconst)) / 

1208 3) - 1)) 

1209 

1210 resli = {} 

1211 

1212 for mlg in lags: 

1213 result = {} 

1214 if verbose: 

1215 print('\nGranger Causality') 

1216 print('number of lags (no zero)', mlg) 

1217 mxlg = mlg 

1218 

1219 # create lagmat of both time series 

1220 dta = lagmat2ds(x, mxlg, trim='both', dropex=1) 

1221 

1222 # add constant 

1223 if addconst: 

1224 dtaown = add_constant(dta[:, 1:(mxlg + 1)], prepend=False) 

1225 dtajoint = add_constant(dta[:, 1:], prepend=False) 

1226 else: 

1227 raise NotImplementedError('Not Implemented') 

1228 # dtaown = dta[:, 1:mxlg] 

1229 # dtajoint = dta[:, 1:] 

1230 

1231 # Run ols on both models without and with lags of second variable 

1232 res2down = OLS(dta[:, 0], dtaown).fit() 

1233 res2djoint = OLS(dta[:, 0], dtajoint).fit() 

1234 

1235 # print results 

1236 # for ssr based tests see: 

1237 # http://support.sas.com/rnd/app/examples/ets/granger/index.htm 

1238 # the other tests are made-up 

1239 

1240 # Granger Causality test using ssr (F statistic) 

1241 fgc1 = ((res2down.ssr - res2djoint.ssr) / 

1242 res2djoint.ssr / mxlg * res2djoint.df_resid) 

1243 if verbose: 

1244 print('ssr based F test: F=%-8.4f, p=%-8.4f, df_denom=%d,' 

1245 ' df_num=%d' % (fgc1, 

1246 stats.f.sf(fgc1, mxlg, 

1247 res2djoint.df_resid), 

1248 res2djoint.df_resid, mxlg)) 

1249 result['ssr_ftest'] = (fgc1, 

1250 stats.f.sf(fgc1, mxlg, res2djoint.df_resid), 

1251 res2djoint.df_resid, mxlg) 

1252 

1253 # Granger Causality test using ssr (ch2 statistic) 

1254 fgc2 = res2down.nobs * (res2down.ssr - res2djoint.ssr) / res2djoint.ssr 

1255 if verbose: 

1256 print('ssr based chi2 test: chi2=%-8.4f, p=%-8.4f, ' 

1257 'df=%d' % (fgc2, stats.chi2.sf(fgc2, mxlg), mxlg)) 

1258 result['ssr_chi2test'] = (fgc2, stats.chi2.sf(fgc2, mxlg), mxlg) 

1259 

1260 # likelihood ratio test pvalue: 

1261 lr = -2 * (res2down.llf - res2djoint.llf) 

1262 if verbose: 

1263 print('likelihood ratio test: chi2=%-8.4f, p=%-8.4f, df=%d' % 

1264 (lr, stats.chi2.sf(lr, mxlg), mxlg)) 

1265 result['lrtest'] = (lr, stats.chi2.sf(lr, mxlg), mxlg) 

1266 

1267 # F test that all lag coefficients of exog are zero 

1268 rconstr = np.column_stack((np.zeros((mxlg, mxlg)), 

1269 np.eye(mxlg, mxlg), 

1270 np.zeros((mxlg, 1)))) 

1271 ftres = res2djoint.f_test(rconstr) 

1272 if verbose: 

1273 print('parameter F test: F=%-8.4f, p=%-8.4f, df_denom=%d,' 

1274 ' df_num=%d' % (ftres.fvalue, ftres.pvalue, ftres.df_denom, 

1275 ftres.df_num)) 

1276 result['params_ftest'] = (np.squeeze(ftres.fvalue)[()], 

1277 np.squeeze(ftres.pvalue)[()], 

1278 ftres.df_denom, ftres.df_num) 

1279 

1280 resli[mxlg] = (result, [res2down, res2djoint, rconstr]) 

1281 

1282 return resli 

1283 

1284 

1285def coint(y0, y1, trend='c', method='aeg', maxlag=None, autolag='aic', 

1286 return_results=None): 

1287 """ 

1288 Test for no-cointegration of a univariate equation. 

1289 

1290 The null hypothesis is no cointegration. Variables in y0 and y1 are 

1291 assumed to be integrated of order 1, I(1). 

1292 

1293 This uses the augmented Engle-Granger two-step cointegration test. 

1294 Constant or trend is included in 1st stage regression, i.e. in 

1295 cointegrating equation. 

1296 

1297 **Warning:** The autolag default has changed compared to statsmodels 0.8. 

1298 In 0.8 autolag was always None, no the keyword is used and defaults to 

1299 'aic'. Use `autolag=None` to avoid the lag search. 

1300 

1301 Parameters 

1302 ---------- 

1303 y0 : array_like 

1304 The first element in cointegrated system. Must be 1-d. 

1305 y1 : array_like 

1306 The remaining elements in cointegrated system. 

1307 trend : str {'c', 'ct'} 

1308 The trend term included in regression for cointegrating equation. 

1309 

1310 * 'c' : constant. 

1311 * 'ct' : constant and linear trend. 

1312 * also available quadratic trend 'ctt', and no constant 'nc'. 

1313 

1314 method : {'aeg'} 

1315 Only 'aeg' (augmented Engle-Granger) is available. 

1316 maxlag : None or int 

1317 Argument for `adfuller`, largest or given number of lags. 

1318 autolag : str 

1319 Argument for `adfuller`, lag selection criterion. 

1320 

1321 * If None, then maxlag lags are used without lag search. 

1322 * If 'AIC' (default) or 'BIC', then the number of lags is chosen 

1323 to minimize the corresponding information criterion. 

1324 * 't-stat' based choice of maxlag. Starts with maxlag and drops a 

1325 lag until the t-statistic on the last lag length is significant 

1326 using a 5%-sized test. 

1327 return_results : bool 

1328 For future compatibility, currently only tuple available. 

1329 If True, then a results instance is returned. Otherwise, a tuple 

1330 with the test outcome is returned. Set `return_results=False` to 

1331 avoid future changes in return. 

1332 

1333 Returns 

1334 ------- 

1335 coint_t : float 

1336 The t-statistic of unit-root test on residuals. 

1337 pvalue : float 

1338 MacKinnon's approximate, asymptotic p-value based on MacKinnon (1994). 

1339 crit_value : dict 

1340 Critical values for the test statistic at the 1 %, 5 %, and 10 % 

1341 levels based on regression curve. This depends on the number of 

1342 observations. 

1343 

1344 Notes 

1345 ----- 

1346 The Null hypothesis is that there is no cointegration, the alternative 

1347 hypothesis is that there is cointegrating relationship. If the pvalue is 

1348 small, below a critical size, then we can reject the hypothesis that there 

1349 is no cointegrating relationship. 

1350 

1351 P-values and critical values are obtained through regression surface 

1352 approximation from MacKinnon 1994 and 2010. 

1353 

1354 If the two series are almost perfectly collinear, then computing the 

1355 test is numerically unstable. However, the two series will be cointegrated 

1356 under the maintained assumption that they are integrated. In this case 

1357 the t-statistic will be set to -inf and the pvalue to zero. 

1358 

1359 TODO: We could handle gaps in data by dropping rows with nans in the 

1360 Auxiliary regressions. Not implemented yet, currently assumes no nans 

1361 and no gaps in time series. 

1362 

1363 References 

1364 ---------- 

1365 .. [1] MacKinnon, J.G. 1994 "Approximate Asymptotic Distribution Functions 

1366 for Unit-Root and Cointegration Tests." Journal of Business & Economics 

1367 Statistics, 12.2, 167-76. 

1368 .. [2] MacKinnon, J.G. 2010. "Critical Values for Cointegration Tests." 

1369 Queen's University, Dept of Economics Working Papers 1227. 

1370 http://ideas.repec.org/p/qed/wpaper/1227.html 
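
    Examples
    --------
    A minimal sketch on two simulated I(1) series (illustrative only):

    >>> import numpy as np
    >>> from statsmodels.tsa.stattools import coint
    >>> np.random.seed(0)
    >>> y1 = np.random.standard_normal(250).cumsum()
    >>> y0 = 0.5 * y1 + np.random.standard_normal(250)
    >>> coint_t, pvalue, crit = coint(y0, y1, trend='c')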

1371 """ 

1372 y0 = array_like(y0, 'y0') 

1373 y1 = array_like(y1, 'y1', ndim=2) 

1374 trend = string_like(trend, 'trend', options=('c', 'nc', 'ct', 'ctt')) 

1375 method = string_like(method, 'method', options=('aeg',)) 

1376 maxlag = int_like(maxlag, 'maxlag', optional=True) 

1377 autolag = string_like(autolag, 'autolag', optional=True, 

1378 options=('aic', 'bic', 't-stat')) 

1379 return_results = bool_like(return_results, 'return_results', optional=True) 

1380 

1381 nobs, k_vars = y1.shape 

1382 k_vars += 1 # add 1 for y0 

1383 

1384 if trend == 'nc': 

1385 xx = y1 

1386 else: 

1387 xx = add_trend(y1, trend=trend, prepend=False) 

1388 

1389 res_co = OLS(y0, xx).fit() 

1390 

1391 if res_co.rsquared < 1 - 100 * SQRTEPS: 

1392 res_adf = adfuller(res_co.resid, maxlag=maxlag, autolag=autolag, 

1393 regression='nc') 

1394 else: 

1395 import warnings 

1396 warnings.warn("y0 and y1 are (almost) perfectly colinear." 

1397 "Cointegration test is not reliable in this case.", 

1398 CollinearityWarning) 

1399 # Edge case where series are too similar 

1400 res_adf = (-np.inf,) 

1401 

1402 # no constant or trend, see egranger in Stata and MacKinnon 

1403 if trend == 'nc': 

1404 crit = [np.nan] * 3 # 2010 critical values not available 

1405 else: 

1406 crit = mackinnoncrit(N=k_vars, regression=trend, nobs=nobs - 1) 

1407 # nobs - 1, the -1 is to match egranger in Stata, I do not know why. 

1408 # TODO: check nobs or df = nobs - k 

1409 

1410 pval_asy = mackinnonp(res_adf[0], regression=trend, N=k_vars) 

1411 return res_adf[0], pval_asy, crit 

1412 

1413 

1414def _safe_arma_fit(y, order, model_kw, trend, fit_kw, start_params=None): 

1415 try: 

1416 return ARMA(y, order=order, **model_kw).fit(disp=0, trend=trend, 

1417 start_params=start_params, 

1418 **fit_kw) 

1419 except LinAlgError: 

1420 # SVD convergence failure on badly misspecified models 

1421 return 

1422 

1423 except ValueError as error: 

1424 if start_params is not None: # do not recurse again 

1425 # user supplied start_params only get one chance 

1426 return 

1427 # try a little harder, should be handled in fit really 

1428 elif ('initial' not in error.args[0] or 'initial' in str(error)): 

1429 start_params = [.1] * sum(order) 

1430 if trend == 'c': 

1431 start_params = [.1] + start_params 

1432 return _safe_arma_fit(y, order, model_kw, trend, fit_kw, 

1433 start_params) 

1434 else: 

1435 return 

1436 except: # no idea what happened 

1437 return 

1438 

1439 

1440def arma_order_select_ic(y, max_ar=4, max_ma=2, ic='bic', trend='c', 

1441 model_kw=None, fit_kw=None): 

1442 """ 

1443 Compute information criteria for many ARMA models. 

1444 

1445 Parameters 

1446 ---------- 

1447 y : array_like 

1448 Array of time-series data. 

1449 max_ar : int 

1450 Maximum number of AR lags to use. Default 4. 

1451 max_ma : int 

1452 Maximum number of MA lags to use. Default 2. 

1453 ic : str, list 

1454 Information criteria to report. Either a single string or a list 

1455 of different criteria is possible. 

1456 trend : str 

1457 The trend to use when fitting the ARMA models. 

1458 model_kw : dict 

1459 Keyword arguments to be passed to the ``ARMA`` model. 

1460 fit_kw : dict 

1461 Keyword arguments to be passed to ``ARMA.fit``. 

1462 

1463 Returns 

1464 ------- 

1465 Bunch 

1466 Dict-like object with attribute access. Each ic is an attribute with a 

1467 DataFrame for the results. The AR order used is the row index. The ma 

1468 order used is the column index. The minimum orders are available as 

1469 ``ic_min_order``. 

1470 

1471 Notes 

1472 ----- 

1473 This method can be used to tentatively identify the order of an ARMA 

1474 process, provided that the time series is stationary and invertible. This 

1475 function computes the full exact MLE estimate of each model and can be, 

1476 therefore a little slow. An implementation using approximate estimates 

1477 will be provided in the future. In the meantime, consider passing 

1478 {method : 'css'} to fit_kw. 

1479 

1480 Examples 

1481 -------- 

1482 

1483 >>> from statsmodels.tsa.arima_process import arma_generate_sample 

1484 >>> import statsmodels.api as sm 

1485 >>> import numpy as np 

1486 

1487 >>> arparams = np.array([.75, -.25]) 

1488 >>> maparams = np.array([.65, .35]) 

1489 >>> arparams = np.r_[1, -arparams] 

1490 >>> maparam = np.r_[1, maparams] 

1491 >>> nobs = 250 

1492 >>> np.random.seed(2014) 

1493 >>> y = arma_generate_sample(arparams, maparams, nobs) 

1494 >>> res = sm.tsa.arma_order_select_ic(y, ic=['aic', 'bic'], trend='nc') 

1495 >>> res.aic_min_order 

1496 >>> res.bic_min_order 

1497 """ 

1498 max_ar = int_like(max_ar, 'max_ar') 

1499 max_ma = int_like(max_ma, 'max_ma') 

1500 trend = string_like(trend, 'trend', options=('nc', 'c')) 

1501 model_kw = dict_like(model_kw, 'model_kw', optional=True) 

1502 fit_kw = dict_like(fit_kw, 'fit_kw', optional=True) 

1503 

1504 ar_range = lrange(0, max_ar + 1) 

1505 ma_range = lrange(0, max_ma + 1) 

1506 if isinstance(ic, str): 

1507 ic = [ic] 

1508 elif not isinstance(ic, (list, tuple)): 

1509 raise ValueError("Need a list or a tuple for ic if not a string.") 

1510 

1511 results = np.zeros((len(ic), max_ar + 1, max_ma + 1)) 

1512 model_kw = {} if model_kw is None else model_kw 

1513 fit_kw = {} if fit_kw is None else fit_kw 

1514 y_arr = array_like(y, 'y', contiguous=True) 

1515 for ar in ar_range: 

1516 for ma in ma_range: 

1517 if ar == 0 and ma == 0 and trend == 'nc': 

1518 results[:, ar, ma] = np.nan 

1519 continue 

1520 

1521 mod = _safe_arma_fit(y_arr, (ar, ma), model_kw, trend, fit_kw) 

1522 if mod is None: 

1523 results[:, ar, ma] = np.nan 

1524 continue 

1525 

1526 for i, criteria in enumerate(ic): 

1527 results[i, ar, ma] = getattr(mod, criteria) 

1528 

1529 dfs = [pd.DataFrame(res, columns=ma_range, index=ar_range) for res in 

1530 results] 

1531 

1532 res = dict(zip(ic, dfs)) 

1533 

1534 # add the minimums to the results dict 

1535 min_res = {} 

1536 for i, result in iteritems(res): 

1537 mins = np.where(result.min().min() == result) 

1538 min_res.update({i + '_min_order': (mins[0][0], mins[1][0])}) 

1539 res.update(min_res) 

1540 

1541 return Bunch(**res) 

1542 

1543 

1544def has_missing(data): 

1545 """ 

1546 Returns True if 'data' contains missing entries, otherwise False 

1547 """ 

1548 return np.isnan(np.sum(data)) 


@deprecate_kwarg('lags', 'nlags')
def kpss(x, regression='c', nlags=None, store=False):
    """
    Kwiatkowski-Phillips-Schmidt-Shin test for stationarity.

    Computes the Kwiatkowski-Phillips-Schmidt-Shin (KPSS) test for the null
    hypothesis that x is level or trend stationary.

    Parameters
    ----------
    x : array_like, 1d
        The data series to test.
    regression : str{'c', 'ct'}
        The null hypothesis for the KPSS test.

        * 'c' : The data is stationary around a constant (default).
        * 'ct' : The data is stationary around a trend.
    nlags : {None, str, int}, optional
        Indicates the number of lags to be used. If None (default), lags
        is calculated using the legacy method. If 'auto', lags is
        calculated using the data-dependent method of Hobijn et al. (1998).
        See also Andrews (1991), Newey & West (1994), and Schwert (1989).
        If set to 'legacy', uses int(ceil(12 * (n / 100)**(1 / 4))), as
        outlined in Schwert (1989).
    store : bool
        If True, then a result instance is returned additionally to
        the KPSS statistic (default is False).

    Returns
    -------
    kpss_stat : float
        The KPSS test statistic.
    p_value : float
        The p-value of the test. The p-value is interpolated from
        Table 1 in Kwiatkowski et al. (1992), and a boundary point
        is returned if the test statistic is outside the table of
        critical values, that is, if the p-value is outside the
        interval (0.01, 0.1).
    lags : int
        The truncation lag parameter.
    crit : dict
        The critical values at 10%, 5%, 2.5% and 1%. Based on
        Kwiatkowski et al. (1992).
    resstore : (optional) instance of ResultsStore
        An instance of a dummy class with results attached as attributes.

    Notes
    -----
    To estimate sigma^2 the Newey-West estimator is used. If lags is None,
    the truncation lag parameter is set to int(ceil(12 * (n / 100)**(1 / 4))),
    as outlined in Schwert (1989). The p-values are interpolated from
    Table 1 of Kwiatkowski et al. (1992). If the computed statistic is
    outside the table of critical values, then a warning message is
    generated.

    Missing values are not handled.


    References
    ----------
    .. [1] Andrews, D.W.K. (1991). Heteroskedasticity and autocorrelation
       consistent covariance matrix estimation. Econometrica, 59: 817-858.

    .. [2] Hobijn, B., Franses, P.H., & Ooms, M. (2004). Generalizations of
       the KPSS-test for stationarity. Statistica Neerlandica, 52: 483-502.

    .. [3] Kwiatkowski, D., Phillips, P.C.B., Schmidt, P., & Shin, Y. (1992).
       Testing the null hypothesis of stationarity against the alternative
       of a unit root. Journal of Econometrics, 54: 159-178.

    .. [4] Newey, W.K., & West, K.D. (1994). Automatic lag selection in
       covariance matrix estimation. Review of Economic Studies, 61: 631-653.

    .. [5] Schwert, G. W. (1989). Tests for unit roots: A Monte Carlo
       investigation. Journal of Business and Economic Statistics, 7 (2):
       147-159.
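
    Examples
    --------
    A minimal usage sketch on simulated data (the random-walk series is
    illustrative only, not from the source):

    >>> import numpy as np
    >>> from statsmodels.tsa.stattools import kpss
    >>> np.random.seed(12345)
    >>> x = np.cumsum(np.random.standard_normal(250))
    >>> stat, p_value, lags, crit = kpss(x, regression='c', nlags='auto')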

1626 """ 

1627 from warnings import warn 

1628 

1629 x = array_like(x, 'x') 

1630 regression = string_like(regression, 'regression', options=('c', 'ct')) 

1631 store = bool_like(store, 'store') 

1632 

1633 nobs = x.shape[0] 

1634 hypo = regression 

1635 

1636 # if m is not one, n != m * n 

1637 if nobs != x.size: 

1638 raise ValueError("x of shape {0} not understood".format(x.shape)) 

1639 

1640 if hypo == 'ct': 

1641 # p. 162 Kwiatkowski et al. (1992): y_t = beta * t + r_t + e_t, 

1642 # where beta is the trend, r_t a random walk and e_t a stationary 

1643 # error term. 

1644 resids = OLS(x, add_constant(np.arange(1, nobs + 1))).fit().resid 

1645 crit = [0.119, 0.146, 0.176, 0.216] 

1646 elif hypo == 'c': 

1647 # special case of the model above, where beta = 0 (so the null 

1648 # hypothesis is that the data is stationary around r_0). 

1649 resids = x - x.mean() 

1650 crit = [0.347, 0.463, 0.574, 0.739] 

1651 

1652 if nlags is None: 

1653 nlags = 'legacy' 

1654 msg = 'The behavior of using lags=None will change in the next ' \ 

1655 'release. Currently lags=None is the same as ' \ 

1656 'lags=\'legacy\', and so a sample-size lag length is used. ' \ 

1657 'After the next release, the default will change to be the ' \ 

1658 'same as lags=\'auto\' which uses an automatic lag length ' \ 

1659 'selection method. To silence this warning, either use ' \ 

1660 '\'auto\' or \'legacy\'' 

1661 warn(msg, FutureWarning) 

1662 if nlags == 'legacy': 

1663 nlags = int(np.ceil(12. * np.power(nobs / 100., 1 / 4.))) 

1664 nlags = min(nlags, nobs - 1) 

1665 elif nlags == 'auto': 

1666 # autolag method of Hobijn et al. (1998) 

1667 nlags = _kpss_autolag(resids, nobs) 

1668 nlags = min(nlags, nobs - 1) 

1669 else: 

1670 nlags = int(nlags) 

1671 

1672 if nlags >= nobs: 

1673 raise ValueError("lags ({}) must be < number of observations ({})" 

1674 .format(nlags, nobs)) 

1675 

1676 pvals = [0.10, 0.05, 0.025, 0.01] 

1677 

1678 eta = np.sum(resids.cumsum()**2) / (nobs**2) # eq. 11, p. 165 

1679 s_hat = _sigma_est_kpss(resids, nobs, nlags) 

1680 

1681 kpss_stat = eta / s_hat 

1682 p_value = np.interp(kpss_stat, crit, pvals) 

1683 

1684 if p_value == pvals[-1]: 

1685 warn("p-value is smaller than the indicated p-value", InterpolationWarning) 

1686 elif p_value == pvals[0]: 

1687 warn("p-value is greater than the indicated p-value", InterpolationWarning) 

1688 

1689 crit_dict = {'10%': crit[0], '5%': crit[1], '2.5%': crit[2], '1%': crit[3]} 

1690 

1691 if store: 

1692 rstore = ResultsStore() 

1693 rstore.lags = nlags 

1694 rstore.nobs = nobs 

1695 

1696 stationary_type = "level" if hypo == 'c' else "trend" 

1697 rstore.H0 = "The series is {0} stationary".format(stationary_type) 

1698 rstore.HA = "The series is not {0} stationary".format(stationary_type) 

1699 

1700 return kpss_stat, p_value, crit_dict, rstore 

1701 else: 

1702 return kpss_stat, p_value, nlags, crit_dict 


def _sigma_est_kpss(resids, nobs, lags):
    """
    Computes equation 10, p. 164 of Kwiatkowski et al. (1992). This is the
    consistent estimator for the long-run variance.
    """

    s_hat = np.sum(resids**2)
    for i in range(1, lags + 1):
        resids_prod = np.dot(resids[i:], resids[:nobs - i])
        s_hat += 2 * resids_prod * (1. - (i / (lags + 1.)))
    return s_hat / nobs


def _kpss_autolag(resids, nobs):
    """
    Computes the number of lags for covariance matrix estimation in the KPSS
    test using the method of Hobijn et al. (1998). See also Andrews (1991),
    Newey & West (1994), and Schwert (1989). Assumes Bartlett / Newey-West
    kernel.
    """

    covlags = int(np.power(nobs, 2. / 9.))
    s0 = np.sum(resids**2) / nobs
    s1 = 0
    for i in range(1, covlags + 1):
        resids_prod = np.dot(resids[i:], resids[:nobs - i])
        resids_prod /= (nobs / 2.)
        s0 += resids_prod
        s1 += i * resids_prod
    s_hat = s1 / s0
    pwr = 1. / 3.
    gamma_hat = 1.1447 * np.power(s_hat * s_hat, pwr)
    autolags = int(gamma_hat * np.power(nobs, pwr))
    return autolags


class ZivotAndrewsUnitRoot(object):
    """
    Class wrapper for Zivot-Andrews structural-break unit-root test
    """
    def __init__(self):
        """
        Critical values for the three different models specified for the
        Zivot-Andrews unit-root test.

        Notes
        -----
        The p-values are generated through Monte Carlo simulation using
        100,000 replications and 2000 data points.
        """
        self._za_critical_values = {}
        # constant-only model
        self._c = (
            (0.001, -6.78442), (0.100, -5.83192), (0.200, -5.68139),
            (0.300, -5.58461), (0.400, -5.51308), (0.500, -5.45043),
            (0.600, -5.39924), (0.700, -5.36023), (0.800, -5.33219),
            (0.900, -5.30294), (1.000, -5.27644), (2.500, -5.03340),
            (5.000, -4.81067), (7.500, -4.67636), (10.000, -4.56618),
            (12.500, -4.48130), (15.000, -4.40507), (17.500, -4.33947),
            (20.000, -4.28155), (22.500, -4.22683), (25.000, -4.17830),
            (27.500, -4.13101), (30.000, -4.08586), (32.500, -4.04455),
            (35.000, -4.00380), (37.500, -3.96144), (40.000, -3.92078),
            (42.500, -3.88178), (45.000, -3.84503), (47.500, -3.80549),
            (50.000, -3.77031), (52.500, -3.73209), (55.000, -3.69600),
            (57.500, -3.65985), (60.000, -3.62126), (65.000, -3.54580),
            (70.000, -3.46848), (75.000, -3.38533), (80.000, -3.29112),
            (85.000, -3.17832), (90.000, -3.04165), (92.500, -2.95146),
            (95.000, -2.83179), (96.000, -2.76465), (97.000, -2.68624),
            (98.000, -2.57884), (99.000, -2.40044), (99.900, -1.88932)
        )
        self._za_critical_values['c'] = np.asarray(self._c)
        # trend-only model
        self._t = (
            (0.001, -83.9094), (0.100, -13.8837), (0.200, -9.13205),
            (0.300, -6.32564), (0.400, -5.60803), (0.500, -5.38794),
            (0.600, -5.26585), (0.700, -5.18734), (0.800, -5.12756),
            (0.900, -5.07984), (1.000, -5.03421), (2.500, -4.65634),
            (5.000, -4.40580), (7.500, -4.25214), (10.000, -4.13678),
            (12.500, -4.03765), (15.000, -3.95185), (17.500, -3.87945),
            (20.000, -3.81295), (22.500, -3.75273), (25.000, -3.69836),
            (27.500, -3.64785), (30.000, -3.59819), (32.500, -3.55146),
            (35.000, -3.50522), (37.500, -3.45987), (40.000, -3.41672),
            (42.500, -3.37465), (45.000, -3.33394), (47.500, -3.29393),
            (50.000, -3.25316), (52.500, -3.21244), (55.000, -3.17124),
            (57.500, -3.13211), (60.000, -3.09204), (65.000, -3.01135),
            (70.000, -2.92897), (75.000, -2.83614), (80.000, -2.73893),
            (85.000, -2.62840), (90.000, -2.49611), (92.500, -2.41337),
            (95.000, -2.30820), (96.000, -2.25797), (97.000, -2.19648),
            (98.000, -2.11320), (99.000, -1.99138), (99.900, -1.67466)
        )
        self._za_critical_values['t'] = np.asarray(self._t)
        # constant + trend model
        self._ct = (
            (0.001, -38.17800), (0.100, -6.43107), (0.200, -6.07279),
            (0.300, -5.95496), (0.400, -5.86254), (0.500, -5.77081),
            (0.600, -5.72541), (0.700, -5.68406), (0.800, -5.65163),
            (0.900, -5.60419), (1.000, -5.57556), (2.500, -5.29704),
            (5.000, -5.07332), (7.500, -4.93003), (10.000, -4.82668),
            (12.500, -4.73711), (15.000, -4.66020), (17.500, -4.58970),
            (20.000, -4.52855), (22.500, -4.47100), (25.000, -4.42011),
            (27.500, -4.37387), (30.000, -4.32705), (32.500, -4.28126),
            (35.000, -4.23793), (37.500, -4.19822), (40.000, -4.15800),
            (42.500, -4.11946), (45.000, -4.08064), (47.500, -4.04286),
            (50.000, -4.00489), (52.500, -3.96837), (55.000, -3.93200),
            (57.500, -3.89496), (60.000, -3.85577), (65.000, -3.77795),
            (70.000, -3.69794), (75.000, -3.61852), (80.000, -3.52485),
            (85.000, -3.41665), (90.000, -3.28527), (92.500, -3.19724),
            (95.000, -3.08769), (96.000, -3.03088), (97.000, -2.96091),
            (98.000, -2.85581), (99.000, -2.71015), (99.900, -2.28767)
        )
        self._za_critical_values['ct'] = np.asarray(self._ct)


    def _za_crit(self, stat, model='c'):
        """
        Linear interpolation for Zivot-Andrews p-values and critical values.

        Parameters
        ----------
        stat : float
            The ZA test statistic
        model : {'c','t','ct'}
            The model used when computing the ZA statistic. 'c' is default.

        Returns
        -------
        pvalue : float
            The interpolated p-value
        cvdict : dict
            Critical values for the test statistic at the 1%, 5%, and 10%
            levels

        Notes
        -----
        The p-values are linearly interpolated from the quantiles of the
        simulated ZA test statistic distribution
        """
        table = self._za_critical_values[model]
        pcnts = table[:, 0]
        stats = table[:, 1]
        # ZA cv table contains quantiles multiplied by 100
        pvalue = np.interp(stat, stats, pcnts) / 100.0
        cv = [1.0, 5.0, 10.0]
        crit_value = np.interp(cv, pcnts, stats)
        cvdict = {"1%": crit_value[0], "5%": crit_value[1],
                  "10%": crit_value[2]}
        return pvalue, cvdict


    def _quick_ols(self, endog, exog):
        """
        Minimal implementation of OLS that returns only the t-statistics
        of the estimated coefficients, for internal use.
        """
        xpxi = np.linalg.inv(exog.T.dot(exog))
        xpy = exog.T.dot(endog)
        nobs, k_exog = exog.shape
        b = xpxi.dot(xpy)
        e = endog - exog.dot(b)
        sigma2 = e.T.dot(e) / (nobs - k_exog)
        return b / np.sqrt(np.diag(sigma2 * xpxi))


    def _format_regression_data(self, series, nobs, const, trend, cols, lags):
        """
        Create the endog/exog data for the auxiliary regressions
        from the original (standardized) series under test.
        """
        # first-diff y and standardize for numerical stability
        endog = np.diff(series, axis=0)
        endog /= np.sqrt(endog.T.dot(endog))
        series /= np.sqrt(series.T.dot(series))
        # reserve exog space
        exog = np.zeros((endog[lags:].shape[0], cols + lags))
        exog[:, 0] = const
        # lagged y and dy
        exog[:, cols - 1] = series[lags:(nobs - 1)]
        exog[:, cols:] = lagmat(
            endog, lags, trim='none')[lags:exog.shape[0] + lags]
        return endog, exog


    def _update_regression_exog(self, exog, regression, period, nobs, const,
                                trend, cols, lags):
        """
        Update the exog array for the next regression.
        """

        cutoff = (period - (lags + 1))
        if regression != 't':
            exog[:cutoff, 1] = 0
            exog[cutoff:, 1] = const
            exog[:, 2] = trend[(lags + 2):(nobs + 1)]
            if regression == 'ct':
                exog[:cutoff, 3] = 0
                exog[cutoff:, 3] = trend[1:(nobs - period + 1)]
        else:
            exog[:, 1] = trend[(lags + 2):(nobs + 1)]
            exog[:(cutoff - 1), 2] = 0
            exog[(cutoff - 1):, 2] = trend[0:(nobs - period + 1)]
        return exog


    def run(self, x, trim=0.15, maxlag=None, regression='c', autolag='AIC'):
        """
        Zivot-Andrews structural-break unit-root test.

        The Zivot-Andrews test tests for a unit root in a univariate process
        in the presence of serial correlation and a single structural break.

        Parameters
        ----------
        x : array_like
            The data series to test.
        trim : float
            The percentage of the series at the beginning/end to exclude
            from the break-period calculation, in the range [0, 0.333]
            (default=0.15).
        maxlag : int
            The maximum lag which is included in the test, default is
            12*(nobs/100)^{1/4} (Schwert, 1989).
        regression : {'c','t','ct'}
            Constant and trend order to include in regression.

            * 'c' : constant only (default).
            * 't' : trend only.
            * 'ct' : constant and trend.
        autolag : {'AIC', 'BIC', 't-stat', None}
            The method to select the lag length when using automatic
            selection.

            * if None, then maxlag lags are used,
            * if 'AIC' (default) or 'BIC', then the number of lags is chosen
              to minimize the corresponding information criterion,
            * 't-stat' based choice of maxlag. Starts with maxlag and drops
              a lag until the t-statistic on the last lag length is
              significant using a 5%-sized test.

        Returns
        -------
        zastat : float
            The test statistic.
        pvalue : float
            The pvalue based on MC-derived critical values.
        cvdict : dict
            The critical values for the test statistic at the 1%, 5%, and
            10% levels.
        baselag : int
            The number of lags used for period regressions.
        bpidx : int
            The index of x corresponding to the endogenously calculated
            break period, with values in the range [0..nobs-1].

        Notes
        -----
        H0 = unit root with a single structural break

        Algorithm follows Baum (2004/2015) approximation to original
        Zivot-Andrews method. Rather than performing an autolag regression
        at each candidate break period (as per the original paper), a single
        autolag regression is run up-front on the base model (constant +
        trend with no dummies) to determine the best lag length. This lag
        length is then used for all subsequent break-period regressions.
        This results in significant run time reduction but also slightly
        more pessimistic test statistics than the original Zivot-Andrews
        method, although no attempt has been made to characterize the
        size/power trade-off.


        References
        ----------
        .. [1] Baum, C.F. (2004). ZANDREWS: Stata module to calculate
           Zivot-Andrews unit root test in presence of structural break.
           Statistical Software Components S437301, Boston College
           Department of Economics, revised 2015.

        .. [2] Schwert, G.W. (1989). Tests for unit roots: A Monte Carlo
           investigation. Journal of Business & Economic Statistics, 7:
           147-159.

        .. [3] Zivot, E., and Andrews, D.W.K. (1992). Further evidence on
           the great crash, the oil-price shock, and the unit-root
           hypothesis. Journal of Business & Economic Statistics, 10:
           251-270.
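
        Examples
        --------
        A minimal usage sketch on simulated data (the random-walk series is
        illustrative only, not from the source):

        >>> import numpy as np
        >>> from statsmodels.tsa.stattools import zivot_andrews
        >>> np.random.seed(12345)
        >>> y = np.cumsum(np.random.standard_normal(250))
        >>> zastat, pvalue, cvdict, baselag, bpidx = zivot_andrews(y)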

1974 """ 

1975 x = array_like(x, 'x') 

1976 trim = float_like(trim, 'trim') 

1977 maxlag = int_like(maxlag, 'maxlag', optional=True) 

1978 regression = string_like(regression, 'regression', 

1979 options=('c', 't', 'ct')) 

1980 autolag = string_like(autolag, 'autolag', 

1981 options=('AIC', 'BIC', 't-stat'), optional=True) 

1982 if trim < 0 or trim > (1. / 3.): 

1983 raise ValueError('trim value must be a float in range [0, 1/3)') 

1984 nobs = x.shape[0] 

1985 if autolag: 

1986 adf_res = adfuller(x, maxlag=maxlag, regression='ct', 

1987 autolag=autolag) 

1988 baselags = adf_res[2] 

1989 elif maxlag: 

1990 baselags = maxlag 

1991 else: 

1992 baselags = int(12. * np.power(nobs / 100., 1 / 4.)) 

1993 trimcnt = int(nobs * trim) 

1994 start_period = trimcnt 

1995 end_period = nobs - trimcnt 

1996 if regression == 'ct': 

1997 basecols = 5 

1998 else: 

1999 basecols = 4 

2000 # normalize constant and trend terms for stability 

2001 c_const = 1 / np.sqrt(nobs) 

2002 t_const = np.arange(1.0, nobs + 2) 

2003 t_const *= np.sqrt(3) / nobs ** (3 / 2) 

2004 # format the auxiliary regression data 

2005 endog, exog = self._format_regression_data( 

2006 x, nobs, c_const, t_const, basecols, baselags) 

2007 # iterate through the time periods 

2008 stats = np.full(end_period + 1, np.inf) 

2009 for bp in range(start_period + 1, end_period + 1): 

2010 # update intercept dummy / trend / trend dummy 

2011 exog = self._update_regression_exog(exog, regression, bp, nobs, 

2012 c_const, t_const, basecols, 

2013 baselags) 

2014 # check exog rank on first iteration 

2015 if bp == start_period + 1: 

2016 o = OLS(endog[baselags:], exog, hasconst=1).fit() 

2017 if o.df_model < exog.shape[1] - 1: 

2018 raise ValueError( 

2019 'ZA: auxiliary exog matrix is not full rank.\n' 

2020 ' cols (exc intercept) = {} rank = {}'.format( 

2021 exog.shape[1] - 1, o.df_model)) 

2022 stats[bp] = o.tvalues[basecols - 1] 

2023 else: 

2024 stats[bp] = self._quick_ols(endog[baselags:], 

2025 exog)[basecols - 1] 

2026 # return best seen 

2027 zastat = np.min(stats) 

2028 bpidx = np.argmin(stats) - 1 

2029 crit = self._za_crit(zastat, regression) 

2030 pval = crit[0] 

2031 cvdict = crit[1] 

2032 return zastat, pval, cvdict, baselags, bpidx 


    def __call__(self, x, trim=0.15, maxlag=None, regression='c',
                 autolag='AIC'):
        return self.run(x, trim=trim, maxlag=maxlag, regression=regression,
                        autolag=autolag)


zivot_andrews = ZivotAndrewsUnitRoot()
zivot_andrews.__doc__ = zivot_andrews.run.__doc__