Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/statsmodels/stats/stattools.py : 11%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2Statistical tests to be used in conjunction with the models
4Notes
5-----
6These functions have not been formally tested.
7"""
9from scipy import stats
10import numpy as np
11from statsmodels.tools.sm_exceptions import ValueWarning
14# TODO: these are pretty straightforward but they should be tested
def durbin_watson(resids, axis=0):
    r"""
    Calculates the Durbin-Watson statistic.

    Parameters
    ----------
    resids : array_like
        Data for which to compute the statistic. Usually regression model
        residuals.
    axis : int, optional
        Axis along which to compute the statistic. Default is 0.

    Returns
    -------
    dw : float, array_like
        The Durbin-Watson statistic.

    Notes
    -----
    The null hypothesis of the test is that there is no serial correlation.
    The Durbin-Watson test statistic is defined as:

    .. math::

       \sum_{t=2}^T((e_t - e_{t-1})^2)/\sum_{t=1}^Te_t^2

    The test statistic is approximately equal to 2*(1-r) where ``r`` is the
    sample autocorrelation of the residuals. Thus, for r == 0, indicating no
    serial correlation, the test statistic equals 2. This statistic will
    always be between 0 and 4. The closer to 0 the statistic, the more
    evidence for positive serial correlation. The closer to 4, the more
    evidence for negative serial correlation.
    """
    resids = np.asarray(resids)
    diff_resids = np.diff(resids, 1, axis=axis)
    dw = np.sum(diff_resids**2, axis=axis) / np.sum(resids**2, axis=axis)
    return dw
def omni_normtest(resids, axis=0):
    """
    Omnibus test for normality.

    Parameters
    ----------
    resids : array_like
        Data to test for normality.
    axis : int, optional
        Axis along which the test is computed. Default is 0.

    Returns
    -------
    Chi^2 score, two-tail probability.

    Notes
    -----
    Requires at least 8 observations along `axis`; with fewer, a
    ValueWarning is issued and ``(nan, nan)`` is returned.
    """
    # TODO: change to exception in summary branch and catch in summary()
    # behavior changed between scipy 0.9 and 0.10
    resids = np.asarray(resids)
    n = resids.shape[axis]
    if n < 8:
        from warnings import warn
        warn("omni_normtest is not valid with less than 8 observations; %i "
             "samples were given." % int(n), ValueWarning)
        return np.nan, np.nan

    return stats.normaltest(resids, axis=axis)
def jarque_bera(resids, axis=0):
    r"""
    The Jarque-Bera test of normality.

    Parameters
    ----------
    resids : array_like
        Data to test for normality. Usually regression model residuals that
        are mean 0.
    axis : int, optional
        Axis to use if data has more than 1 dimension. Default is 0.

    Returns
    -------
    JB : {float, ndarray}
        The Jarque-Bera test statistic.
    JBpv : {float, ndarray}
        The pvalue of the test statistic.
    skew : {float, ndarray}
        Estimated skewness of the data.
    kurtosis : {float, ndarray}
        Estimated kurtosis of the data.

    Notes
    -----
    Each output returned has 1 dimension fewer than data.

    The Jarque-Bera test statistic tests the null that the data is normally
    distributed against an alternative that the data follow some other
    distribution. The test statistic is based on two moments of the data,
    the skewness, and the kurtosis, and has an asymptotic :math:`\chi^2_2`
    distribution.

    The test statistic is defined

    .. math:: JB = n(S^2/6+(K-3)^2/24)

    where n is the number of data points, S is the sample skewness, and K is
    the sample kurtosis of the data.
    """
    resids = np.asarray(resids)
    # Calculate residual skewness and kurtosis
    skew = stats.skew(resids, axis=axis)
    kurtosis = 3 + stats.kurtosis(resids, axis=axis)

    # Calculate the Jarque-Bera test for normality
    n = resids.shape[axis]
    jb = (n / 6.) * (skew ** 2 + (1 / 4.) * (kurtosis - 3) ** 2)
    jb_pv = stats.chi2.sf(jb, 2)

    return jb, jb_pv, skew, kurtosis
def robust_skewness(y, axis=0):
    r"""
    Calculates the four skewness measures in Kim & White.

    Parameters
    ----------
    y : array_like
        Data to compute use in the estimator.
    axis : int or None, optional
        Axis along which the skewness measures are computed. If `None`, the
        entire array is used.

    Returns
    -------
    sk1 : ndarray
        The standard skewness estimator.
    sk2 : ndarray
        Skewness estimator based on quartiles.
    sk3 : ndarray
        Skewness estimator based on mean-median difference, standardized by
        absolute deviation.
    sk4 : ndarray
        Skewness estimator based on mean-median difference, standardized by
        standard deviation.

    Notes
    -----
    The robust skewness measures are defined

    .. math::

        SK_{2}=\frac{\left(q_{.75}-q_{.5}\right)
        -\left(q_{.5}-q_{.25}\right)}{q_{.75}-q_{.25}}

    .. math::

        SK_{3}=\frac{\mu-\hat{q}_{0.5}}
        {\hat{E}\left[\left|y-\hat{\mu}\right|\right]}

    .. math::

        SK_{4}=\frac{\mu-\hat{q}_{0.5}}{\hat{\sigma}}

    .. [*] Tae-Hwan Kim and Halbert White, "On more robust estimation of
       skewness and kurtosis," Finance Research Letters, vol. 1, pp. 56-73,
       March 2004.
    """
    if axis is None:
        y = y.ravel()
        axis = 0

    y = np.sort(y, axis)

    q1, q2, q3 = np.percentile(y, [25.0, 50.0, 75.0], axis=axis)

    mu = y.mean(axis)
    # Build a broadcast-compatible shape so mu and q2 can be subtracted
    # from y along the reduced axis.
    shape = (y.size,)
    if axis is not None:
        shape = list(mu.shape)
        shape.insert(axis, 1)
        shape = tuple(shape)

    mu_b = np.reshape(mu, shape)
    q2_b = np.reshape(q2, shape)

    sigma = np.sqrt(np.mean(((y - mu_b)**2), axis))

    sk1 = stats.skew(y, axis=axis)
    sk2 = (q1 + q3 - 2.0 * q2) / (q3 - q1)
    sk3 = (mu - q2) / np.mean(abs(y - q2_b), axis=axis)
    sk4 = (mu - q2) / sigma

    return sk1, sk2, sk3, sk4
206def _kr3(y, alpha=5.0, beta=50.0):
207 """
208 KR3 estimator from Kim & White
210 Parameters
211 ----------
212 y : array_like, 1-d
213 Data to compute use in the estimator.
214 alpha : float, optional
215 Lower cut-off for measuring expectation in tail.
216 beta : float, optional
217 Lower cut-off for measuring expectation in center.
219 Returns
220 -------
221 kr3 : float
222 Robust kurtosis estimator based on standardized lower- and upper-tail
223 expected values
225 Notes
226 -----
227 .. [*] Tae-Hwan Kim and Halbert White, "On more robust estimation of
228 skewness and kurtosis," Finance Research Letters, vol. 1, pp. 56-73,
229 March 2004.
230 """
231 perc = (alpha, 100.0 - alpha, beta, 100.0 - beta)
232 lower_alpha, upper_alpha, lower_beta, upper_beta = np.percentile(y, perc)
233 l_alpha = np.mean(y[y < lower_alpha])
234 u_alpha = np.mean(y[y > upper_alpha])
236 l_beta = np.mean(y[y < lower_beta])
237 u_beta = np.mean(y[y > upper_beta])
239 return (u_alpha - l_alpha) / (u_beta - l_beta)
def expected_robust_kurtosis(ab=(5.0, 50.0), dg=(2.5, 25.0)):
    """
    Calculates the expected value of the robust kurtosis measures in Kim and
    White assuming the data are normally distributed.

    Parameters
    ----------
    ab : iterable, optional
        Contains 100*(alpha, beta) in the kr3 measure where alpha is the tail
        quantile cut-off for measuring the extreme tail and beta is the
        central quantile cutoff for the standardization of the measure
    dg : iterable, optional
        Contains 100*(delta, gamma) in the kr4 measure where delta is the
        tail quantile for measuring extreme values and gamma is the central
        quantile used in the standardization of the measure

    Returns
    -------
    ekr : ndarray, 4-element
        Contains the expected values of the 4 robust kurtosis measures

    Notes
    -----
    See `robust_kurtosis` for definitions of the robust kurtosis measures
    """
    alpha, beta = ab
    delta, gamma = dg
    expected_value = np.zeros(4)
    ppf = stats.norm.ppf
    pdf = stats.norm.pdf
    q1, q2, q3, q5, q6, q7 = ppf(np.array((1.0, 2.0, 3.0, 5.0, 6.0, 7.0)) / 8)
    # kr1: standard (Pearson) kurtosis of a normal is 3
    expected_value[0] = 3

    # kr2: octile-based spread ratio
    expected_value[1] = ((q7 - q5) + (q3 - q1)) / (q6 - q2)

    # kr3: ratio of expected tail exceedances (normal truncated means)
    q_alpha, q_beta = ppf(np.array((alpha / 100.0, beta / 100.0)))
    expected_value[2] = (2 * pdf(q_alpha) / alpha) / (2 * pdf(q_beta) / beta)

    # kr4: ratio of symmetric quantile spreads
    q_delta, q_gamma = ppf(np.array((delta / 100.0, gamma / 100.0)))
    expected_value[3] = (-2.0 * q_delta) / (-2.0 * q_gamma)

    return expected_value
def robust_kurtosis(y, axis=0, ab=(5.0, 50.0), dg=(2.5, 25.0), excess=True):
    r"""
    Calculates the four kurtosis measures in Kim & White.

    Parameters
    ----------
    y : array_like
        Data to compute use in the estimator.
    axis : int or None, optional
        Axis along which the kurtosis are computed. If `None`, the
        entire array is used.
    ab : iterable, optional
        Contains 100*(alpha, beta) in the kr3 measure where alpha is the tail
        quantile cut-off for measuring the extreme tail and beta is the
        central quantile cutoff for the standardization of the measure
    dg : iterable, optional
        Contains 100*(delta, gamma) in the kr4 measure where delta is the
        tail quantile for measuring extreme values and gamma is the central
        quantile used in the standardization of the measure
    excess : bool, optional
        If true (default), computed values are excess of those for a standard
        normal distribution.

    Returns
    -------
    kr1 : ndarray
        The standard kurtosis estimator.
    kr2 : ndarray
        Kurtosis estimator based on octiles.
    kr3 : ndarray
        Kurtosis estimators based on exceedance expectations.
    kr4 : ndarray
        Kurtosis measure based on the spread between high and low quantiles.

    Notes
    -----
    The robust kurtosis measures are defined

    .. math::

        KR_{2}=\frac{\left(\hat{q}_{.875}-\hat{q}_{.625}\right)
        +\left(\hat{q}_{.375}-\hat{q}_{.125}\right)}
        {\hat{q}_{.75}-\hat{q}_{.25}}

    .. math::

        KR_{3}=\frac{\hat{E}\left(y|y>\hat{q}_{1-\alpha}\right)
        -\hat{E}\left(y|y<\hat{q}_{\alpha}\right)}
        {\hat{E}\left(y|y>\hat{q}_{1-\beta}\right)
        -\hat{E}\left(y|y<\hat{q}_{\beta}\right)}

    .. math::

        KR_{4}=\frac{\hat{q}_{1-\delta}-\hat{q}_{\delta}}
        {\hat{q}_{1-\gamma}-\hat{q}_{\gamma}}

    where :math:`\hat{q}_{p}` is the estimated quantile at :math:`p`.

    .. [*] Tae-Hwan Kim and Halbert White, "On more robust estimation of
       skewness and kurtosis," Finance Research Letters, vol. 1, pp. 56-73,
       March 2004.
    """
    # Flatten when no axis is given, or when y is effectively 1-d but
    # carries extra singleton dimensions.
    if (axis is None or
            (y.squeeze().ndim == 1 and y.ndim != 1)):
        y = y.ravel()
        axis = 0

    alpha, beta = ab
    delta, gamma = dg

    perc = (12.5, 25.0, 37.5, 62.5, 75.0, 87.5,
            delta, 100.0 - delta, gamma, 100.0 - gamma)
    e1, e2, e3, e5, e6, e7, fd, f1md, fg, f1mg = np.percentile(y, perc,
                                                               axis=axis)

    expected_value = (expected_robust_kurtosis(ab, dg)
                      if excess else np.zeros(4))

    kr1 = stats.kurtosis(y, axis, False) - expected_value[0]
    kr2 = ((e7 - e5) + (e3 - e1)) / (e6 - e2) - expected_value[1]
    if y.ndim == 1:
        kr3 = _kr3(y, alpha, beta)
    else:
        kr3 = np.apply_along_axis(_kr3, axis, y, alpha, beta)
    kr3 -= expected_value[2]
    kr4 = (f1md - fd) / (f1mg - fg) - expected_value[3]
    return kr1, kr2, kr3, kr4
376def _medcouple_1d(y):
377 """
378 Calculates the medcouple robust measure of skew.
380 Parameters
381 ----------
382 y : array_like, 1-d
383 Data to compute use in the estimator.
385 Returns
386 -------
387 mc : float
388 The medcouple statistic
390 Notes
391 -----
392 The current algorithm requires a O(N**2) memory allocations, and so may
393 not work for very large arrays (N>10000).
395 .. [*] M. Huberta and E. Vandervierenb, "An adjusted boxplot for skewed
396 distributions" Computational Statistics & Data Analysis, vol. 52, pp.
397 5186-5201, August 2008.
398 """
400 # Parameter changes the algorithm to the slower for large n
402 y = np.squeeze(np.asarray(y))
403 if y.ndim != 1:
404 raise ValueError("y must be squeezable to a 1-d array")
406 y = np.sort(y)
408 n = y.shape[0]
409 if n % 2 == 0:
410 mf = (y[n // 2 - 1] + y[n // 2]) / 2
411 else:
412 mf = y[(n - 1) // 2]
414 z = y - mf
415 lower = z[z <= 0.0]
416 upper = z[z >= 0.0]
417 upper = upper[:, None]
418 standardization = upper - lower
419 is_zero = np.logical_and(lower == 0.0, upper == 0.0)
420 standardization[is_zero] = np.inf
421 spread = upper + lower
422 h = spread / standardization
423 # GH5395
424 num_ties = np.sum(lower == 0.0)
425 if num_ties:
426 # Replacements has -1 above the anti-diagonal, 0 on the anti-diagonal,
427 # and 1 below the anti-diagonal
428 replacements = np.ones((num_ties, num_ties)) - np.eye(num_ties)
429 replacements -= 2 * np.triu(replacements)
430 # Convert diagonal to anti-diagonal
431 replacements = np.fliplr(replacements)
432 # Always replace upper right block
433 h[:num_ties, -num_ties:] = replacements
435 return np.median(h)
def medcouple(y, axis=0):
    """
    Calculate the medcouple robust measure of skew.

    Parameters
    ----------
    y : array_like
        Data to compute use in the estimator.
    axis : {int, None}
        Axis along which the medcouple statistic is computed. If `None`, the
        entire array is used.

    Returns
    -------
    mc : ndarray
        The medcouple statistic with the same shape as `y`, with the specified
        axis removed.

    Notes
    -----
    The current algorithm requires a O(N**2) memory allocations, and so may
    not work for very large arrays (N>10000).

    .. [*] M. Huberta and E. Vandervierenb, "An adjusted boxplot for skewed
       distributions" Computational Statistics & Data Analysis, vol. 52, pp.
       5186-5201, August 2008.
    """
    y = np.asarray(y, dtype=np.double)  # GH 4243
    if axis is None:
        return _medcouple_1d(y.ravel())

    return np.apply_along_axis(_medcouple_1d, axis, y)