
'''extra statistical function and helper functions

contains:

* goodness-of-fit tests
  - powerdiscrepancy
  - gof_chisquare_discrete
  - gof_binning_discrete


Author: Josef Perktold
License: BSD-3

changes
-------
2013-02-25 : add chisquare_power, effectsize and "value"

'''

from statsmodels.compat.python import lrange
import numpy as np
from scipy import stats


# copied from regression/stats.utils

def powerdiscrepancy(observed, expected, lambd=0.0, axis=0, ddof=0):
    r"""Calculates power discrepancy, a class of goodness-of-fit tests
    as a measure of discrepancy between observed and expected data.

    This contains several goodness-of-fit tests as special cases, see the
    description of lambd, the exponent of the power discrepancy. The pvalue
    is based on the asymptotic chi-square distribution of the test statistic.

    freeman_tukey:
    D(x|\theta) = \sum_j (\sqrt{x_j} - \sqrt{e_j})^2

    Parameters
    ----------
    observed : Iterable
        Observed values
    expected : Iterable
        Expected values
    lambd : {float, str}
        * float : exponent `a` for power discrepancy
        * 'loglikeratio': a = 0
        * 'freeman_tukey': a = -0.5
        * 'pearson': a = 1 (standard chisquare test statistic)
        * 'modified_loglikeratio': a = -1
        * 'cressie_read': a = 2/3
        * 'neyman' : a = -2 (Neyman-modified chisquare, reference from a book?)
    axis : int
        axis for observations of one series
    ddof : int
        degrees of freedom correction

    Returns
    -------
    D_obs : float or ndarray
        discrepancy of the observed values
    pvalue : float or ndarray
        asymptotic p-value based on the chi-square distribution

    References
    ----------
    Cressie, Noel and Timothy R. C. Read, Multinomial Goodness-of-Fit Tests,
    Journal of the Royal Statistical Society. Series B (Methodological),
    Vol. 46, No. 3 (1984), pp. 440-464

    Campbell B. Read: Freeman-Tukey chi-squared goodness-of-fit statistics,
    Statistics & Probability Letters 18 (1993) 271-278

    Nobuhiro Taneichi, Yuri Sekiya, Akio Suzukawa, Asymptotic Approximations
    for the Distributions of the Multinomial Goodness-of-Fit Statistics
    under Local Alternatives, Journal of Multivariate Analysis 81,
    335-359 (2002)

    Steele, M., C. Hurst and J. Chaseling, Simulated Power of Discrete
    Goodness-of-Fit Tests for Likert Type Data

    Examples
    --------

    >>> observed = np.array([ 2., 4., 2., 1., 1.])
    >>> expected = np.array([ 0.2, 0.2, 0.2, 0.2, 0.2])

    for checking correct dimension with multiple series

    >>> powerdiscrepancy(np.column_stack((observed,observed)).T, 10*expected, lambd='freeman_tukey',axis=1)
    (array([[ 2.745166, 2.745166]]), array([[ 0.6013346, 0.6013346]]))
    >>> powerdiscrepancy(np.column_stack((observed,observed)).T, 10*expected,axis=1)
    (array([[ 2.77258872, 2.77258872]]), array([[ 0.59657359, 0.59657359]]))
    >>> powerdiscrepancy(np.column_stack((observed,observed)).T, 10*expected, lambd=0,axis=1)
    (array([[ 2.77258872, 2.77258872]]), array([[ 0.59657359, 0.59657359]]))
    >>> powerdiscrepancy(np.column_stack((observed,observed)).T, 10*expected, lambd=1,axis=1)
    (array([[ 3., 3.]]), array([[ 0.5578254, 0.5578254]]))
    >>> powerdiscrepancy(np.column_stack((observed,observed)).T, 10*expected, lambd=2/3.0,axis=1)
    (array([[ 2.89714546, 2.89714546]]), array([[ 0.57518277, 0.57518277]]))
    >>> powerdiscrepancy(np.column_stack((observed,observed)).T, expected, lambd=2/3.0,axis=1)
    (array([[ 2.89714546, 2.89714546]]), array([[ 0.57518277, 0.57518277]]))
    >>> powerdiscrepancy(np.column_stack((observed,observed)), expected, lambd=2/3.0, axis=0)
    (array([[ 2.89714546, 2.89714546]]), array([[ 0.57518277, 0.57518277]]))

    each random variable can have a different total count/sum

    >>> powerdiscrepancy(np.column_stack((observed,2*observed)), expected, lambd=2/3.0, axis=0)
    (array([[ 2.89714546, 5.79429093]]), array([[ 0.57518277, 0.21504648]]))
    >>> powerdiscrepancy(np.column_stack((observed,2*observed)), expected, lambd=2/3.0, axis=0)
    (array([[ 2.89714546, 5.79429093]]), array([[ 0.57518277, 0.21504648]]))
    >>> powerdiscrepancy(np.column_stack((2*observed,2*observed)), expected, lambd=2/3.0, axis=0)
    (array([[ 5.79429093, 5.79429093]]), array([[ 0.21504648, 0.21504648]]))
    >>> powerdiscrepancy(np.column_stack((2*observed,2*observed)), 20*expected, lambd=2/3.0, axis=0)
    (array([[ 5.79429093, 5.79429093]]), array([[ 0.21504648, 0.21504648]]))
    >>> powerdiscrepancy(np.column_stack((observed,2*observed)), np.column_stack((10*expected,20*expected)), lambd=2/3.0, axis=0)
    (array([[ 2.89714546, 5.79429093]]), array([[ 0.57518277, 0.21504648]]))
    >>> powerdiscrepancy(np.column_stack((observed,2*observed)), np.column_stack((10*expected,20*expected)), lambd=-1, axis=0)
    (array([[ 2.77258872, 5.54517744]]), array([[ 0.59657359, 0.2357868 ]]))
    """

    o = np.array(observed)
    e = np.array(expected)

    if not isinstance(lambd, str):
        a = lambd
    else:
        if lambd == 'loglikeratio':
            a = 0
        elif lambd == 'freeman_tukey':
            a = -0.5
        elif lambd == 'pearson':
            a = 1
        elif lambd == 'modified_loglikeratio':
            a = -1
        elif lambd == 'cressie_read':
            a = 2/3.0
        elif lambd == 'neyman':
            a = -2
        else:
            raise ValueError('lambd has to be a number or one of '
                             'loglikeratio, freeman_tukey, pearson, '
                             'modified_loglikeratio, cressie_read or neyman')

    n = np.sum(o, axis=axis)
    nt = n
    if n.size > 1:
        n = np.atleast_2d(n)
        if axis == 1:
            nt = n.T  # need both for 2d, n and nt for broadcasting
        if e.ndim == 1:
            e = np.atleast_2d(e)
            if axis == 0:
                e = e.T

    if np.all(np.sum(e, axis=axis) == n):
        p = e / (1.0 * nt)
    elif np.all(np.sum(e, axis=axis) == 1):
        p = e
        e = nt * e
    else:
        raise ValueError('observed and expected need to have the same '
                         'number of observations, or e needs to add to 1')
    k = o.shape[axis]
    if e.shape[axis] != k:
        raise ValueError('observed and expected need to have the same '
                         'number of bins')

    # Note: taken from formulas, to simplify cancel n
    if a == 0:     # log likelihood ratio
        D_obs = 2 * n * np.sum(o / (1.0 * nt) * np.log(o / e), axis=axis)
    elif a == -1:  # modified log likelihood ratio
        D_obs = 2 * n * np.sum(e / (1.0 * nt) * np.log(e / o), axis=axis)
    else:
        D_obs = 2 * n / a / (a + 1) * np.sum(o / (1.0 * nt) * ((o / e)**a - 1),
                                             axis=axis)

    return D_obs, stats.chi2.sf(D_obs, k - 1 - ddof)
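

# usage sketch (added example, not part of the original module): with
# ``lambd='pearson'`` the power discrepancy reduces to the standard Pearson
# chisquare statistic, so it can be cross-checked against
# ``scipy.stats.chisquare``. The data are hypothetical; the ``_example_*``
# name is illustrative only.
def _example_powerdiscrepancy():
    observed = np.array([2., 4., 2., 1., 1.])
    expected = np.array([0.2, 0.2, 0.2, 0.2, 0.2])  # probabilities, add to 1

    # power discrepancy with the Pearson exponent a = 1
    d_obs, pval = powerdiscrepancy(observed, expected, lambd='pearson')
    # reference result from scipy; expected frequencies = nobs * probabilities
    chis, p = stats.chisquare(observed, observed.sum() * expected)
    print(d_obs, pval)  # should match chis, p up to floating point noise
    print(chis, p)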



#TODO: need also binning for continuous distribution
# and separated binning function to be used for powerdiscrepancy


def gof_chisquare_discrete(distfn, arg, rvs, alpha, msg):
    '''perform chisquare test for random sample of a discrete distribution

    Parameters
    ----------
    distfn : distribution instance
        discrete distribution with `cdf` method and support bounds `a`, `b`
    arg : sequence
        parameters of distribution
    rvs : ndarray
        random sample to be tested
    alpha : float
        significance level, threshold for p-value
    msg : str
        distribution name or label used in the result message

    Returns
    -------
    chis : float
        chisquare test statistic
    pval : float
        p-value of the test
    result : bool
        True if the test passes (pval > alpha), False if it fails
    msg : str
        message describing the test result

    Notes
    -----
    originally written for scipy.stats test suite,
    still needs to be checked for standalone usage, insufficient input checking
    may not run yet (after copy/paste)

    refactor: maybe a class, check returns, or separate binning from
    test results
    '''


    # define parameters for test
##    n=2000
    n = len(rvs)
    nsupp = 20
    wsupp = 1.0/nsupp

##    distfn = getattr(stats, distname)
##    np.random.seed(9765456)
##    rvs = distfn.rvs(size=n,*arg)

    # construct intervals with minimum mass 1/nsupp
    # intervals are left-half-open as in a cdf difference
    distsupport = lrange(max(distfn.a, -1000), min(distfn.b, 1000) + 1)
    last = 0
    distsupp = [max(distfn.a, -1000)]
    distmass = []
    for ii in distsupport:
        current = distfn.cdf(ii, *arg)
        if current - last >= wsupp - 1e-14:
            distsupp.append(ii)
            distmass.append(current - last)
            last = current
            if current > (1 - wsupp):
                break
    if distsupp[-1] < distfn.b:
        distsupp.append(distfn.b)
        distmass.append(1 - last)
    distsupp = np.array(distsupp)
    distmass = np.array(distmass)

    # convert intervals to right-half-open as required by histogram
    histsupp = distsupp + 1e-8
    histsupp[0] = distfn.a

    # find sample frequencies and perform chisquare test
    #TODO: move to compatibility.py
    freq, hsupp = np.histogram(rvs, histsupp)
    cdfs = distfn.cdf(distsupp, *arg)
    (chis, pval) = stats.chisquare(np.array(freq), n*distmass)

    return chis, pval, (pval > alpha), \
           'chisquare - test for %s at arg = %s with pval = %s' % \
           (msg, str(arg), str(pval))
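

# usage sketch (added example, not part of the original module), assuming a
# scipy.stats discrete distribution instance; distribution, seed and
# parameters are arbitrary illustrations.
def _example_gof_chisquare_discrete():
    # draw a Poisson(5) sample and test it against the distribution it came
    # from; the test should pass for most seeds
    sample = stats.poisson.rvs(5, size=2000, random_state=12345)
    chis, pval, passed, msg = gof_chisquare_discrete(
        stats.poisson, (5,), sample, alpha=0.05, msg='poisson')
    print(msg)
    print('passed at alpha=0.05:', passed)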


# copy/paste, remove code duplication when it works

def gof_binning_discrete(rvs, distfn, arg, nsupp=20):
    '''get bins for chisquare type gof tests for a discrete distribution

    Parameters
    ----------
    rvs : ndarray
        sample data
    distfn : distribution instance
        discrete distribution with `cdf` method and support bounds `a`, `b`
    arg : sequence
        parameters of distribution
    nsupp : int
        number of bins. The algorithm tries to find bins with equal weights.
        Depending on the distribution, the actual number of bins can be
        smaller.

    Returns
    -------
    freq : ndarray
        empirical frequencies for sample; not normalized, adds up to sample
        size
    expfreq : ndarray
        theoretical frequencies according to distribution
    histsupp : ndarray
        bin boundaries for histogram (1e-8 is added for numerical robustness)

    Notes
    -----
    The results can be used for a chisquare test ::

        (chis, pval) = stats.chisquare(freq, expfreq)

    originally written for scipy.stats test suite,
    still needs to be checked for standalone usage, insufficient input checking
    may not run yet (after copy/paste)

    refactor: maybe a class, check returns, or separate binning from
    test results
    todo :
      optimal number of bins ? (check easyfit),
      recommendation in literature is at least 5 expected observations in
      each bin

    '''


    # define parameters for test
##    n=2000
    n = len(rvs)

    wsupp = 1.0/nsupp

##    distfn = getattr(stats, distname)
##    np.random.seed(9765456)
##    rvs = distfn.rvs(size=n,*arg)

    # construct intervals with minimum mass 1/nsupp
    # intervals are left-half-open as in a cdf difference
    distsupport = lrange(max(distfn.a, -1000), min(distfn.b, 1000) + 1)
    last = 0
    distsupp = [max(distfn.a, -1000)]
    distmass = []
    for ii in distsupport:
        current = distfn.cdf(ii, *arg)
        if current - last >= wsupp - 1e-14:
            distsupp.append(ii)
            distmass.append(current - last)
            last = current
            if current > (1 - wsupp):
                break
    if distsupp[-1] < distfn.b:
        distsupp.append(distfn.b)
        distmass.append(1 - last)
    distsupp = np.array(distsupp)
    distmass = np.array(distmass)

    # convert intervals to right-half-open as required by histogram
    histsupp = distsupp + 1e-8
    histsupp[0] = distfn.a

    # find sample frequencies and perform chisquare test
    freq, hsupp = np.histogram(rvs, histsupp)
    #freq, hsupp = np.histogram(rvs, histsupp, new=True)
    cdfs = distfn.cdf(distsupp, *arg)
    return np.array(freq), n*distmass, histsupp
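

# usage sketch (added example, not part of the original module): the returned
# frequencies feed directly into ``scipy.stats.chisquare``, as shown in the
# Notes section above; distribution, seed and parameters are arbitrary.
def _example_gof_binning_discrete():
    sample = stats.poisson.rvs(5, size=2000, random_state=12345)
    freq, expfreq, histsupp = gof_binning_discrete(sample, stats.poisson, (5,))
    chis, pval = stats.chisquare(freq, expfreq)
    print('chisquare statistic:', chis, 'pvalue:', pval)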



# Extension to chisquare goodness-of-fit test
#
# Created on Mon Feb 25 13:46:53 2013
#
# Author: Josef Perktold
# License: BSD-3



def chisquare(f_obs, f_exp=None, value=0, ddof=0, return_basic=True):
    '''chisquare goodness-of-fit test

    The null hypothesis is that the distance between the expected distribution
    and the observed frequencies is ``value``. The alternative hypothesis is
    that the distance is larger than ``value``. ``value`` is normalized in
    terms of effect size.

    The standard chisquare test has the null hypothesis that ``value=0``,
    that is, the distributions are the same.

    Parameters
    ----------
    f_obs : array_like
        observed frequencies
    f_exp : None or array_like
        expected frequencies; if None, a uniform distribution is assumed
    value : float
        distance (effect size) between the distributions under the null
        hypothesis, default 0
    ddof : int
        degrees of freedom correction
    return_basic : bool
        if True, return only (chisq, pvalue)

    Returns
    -------
    chisq : float
        chisquare test statistic
    pvalue : float
        p-value of the test

    Notes
    -----
    The case with value greater than zero is similar to an equivalence test,
    in that the exact null hypothesis is replaced by an approximate
    hypothesis. However, TOST "reverses" null and alternative hypotheses,
    while here the alternative hypothesis is that the distance (divergence)
    is larger than a threshold.

    References
    ----------
    McLaren, ...
    Drost,...

    See Also
    --------
    powerdiscrepancy
    scipy.stats.chisquare

    '''

    f_obs = np.asarray(f_obs)
    n_bins = len(f_obs)
    nobs = f_obs.sum(0)
    if f_exp is None:
        # uniform distribution
        f_exp = np.empty(n_bins, float)
        f_exp.fill(nobs / float(n_bins))

    f_exp = np.asarray(f_exp, float)

    chisq = ((f_obs - f_exp)**2 / f_exp).sum(0)
    if value == 0:
        pvalue = stats.chi2.sf(chisq, n_bins - 1 - ddof)
    else:
        pvalue = stats.ncx2.sf(chisq, n_bins - 1 - ddof, value**2 * nobs)

    if return_basic:
        return chisq, pvalue
    else:
        return chisq, pvalue  #TODO: replace with TestResults
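

# usage sketch (added example, not part of the original module): hypothetical
# counts; ``value=0`` reproduces the standard chisquare test against a
# uniform distribution, while a positive ``value`` shifts the null to a
# nonzero effect size via the noncentral chisquare distribution.
def _example_chisquare():
    f_obs = np.array([15., 25., 20., 40.])    # hypothetical counts, nobs=100

    stat, pval0 = chisquare(f_obs)             # H0: distance (effect size) 0
    stat, pval1 = chisquare(f_obs, value=0.3)  # H0: distance 0.3
    print('standard test:  stat=%g, pvalue=%g' % (stat, pval0))
    print('value=0.3 null: stat=%g, pvalue=%g' % (stat, pval1))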


def chisquare_power(effect_size, nobs, n_bins, alpha=0.05, ddof=0):
    '''power of chisquare goodness of fit test

    The effect size is the square root of the chisquare statistic divided
    by the number of observations, sqrt(chisq / nobs).

    Parameters
    ----------
    effect_size : float
        This is the deviation from the Null of the normalized chi_square
        statistic. This follows Cohen's definition (sqrt).
    nobs : int or float
        number of observations
    n_bins : int (or float)
        number of bins, or points in the discrete distribution
    alpha : float in (0,1)
        significance level of the test, default alpha=0.05
    ddof : int
        degrees of freedom correction

    Returns
    -------
    power : float
        power of the test at the given significance level and effect size

    Notes
    -----
    This function also works vectorized if all arguments broadcast.

    This can also be used to calculate the power for the power divergence
    test. However, for more extreme values of the power divergence parameter,
    this power is not a very good approximation for samples of small to
    medium size (Drost et al. 1989).

    References
    ----------
    Drost, ...

    See Also
    --------
    chisquare_effectsize
    statsmodels.stats.GofChisquarePower

    '''

    crit = stats.chi2.isf(alpha, n_bins - 1 - ddof)
    power = stats.ncx2.sf(crit, n_bins - 1 - ddof, effect_size**2 * nobs)
    return power
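

# usage sketch (added example, not part of the original module): the call is
# vectorized over ``nobs`` as noted in the docstring; the effect size and
# sample sizes are hypothetical.
def _example_chisquare_power():
    nobs = np.array([50, 100, 200, 500])
    power = chisquare_power(effect_size=0.3, nobs=nobs, n_bins=5, alpha=0.05)
    print('power at nobs =', nobs, ':', power)  # power grows with nobs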


def chisquare_effectsize(probs0, probs1, correction=None, cohen=True, axis=0):
    '''effect size for a chisquare goodness-of-fit test

    Parameters
    ----------
    probs0 : array_like
        probabilities or cell frequencies under the Null hypothesis
    probs1 : array_like
        probabilities or cell frequencies under the Alternative hypothesis.
        probs0 and probs1 need to have the same length in the ``axis``
        dimension and broadcast in the other dimensions.
        Both probs0 and probs1 are normalized to add to one (in the ``axis``
        dimension).
    correction : None or tuple
        If None, then the effect size is the chisquare statistic divided by
        the number of observations.
        If the correction is a tuple (nobs, df), then the effect size is
        corrected to have less bias and a smaller variance. However, the
        correction can make the effect size negative. In that case, the
        effect size is set to zero.
        Pederson and Johnson (1990) as referenced in McLaren et al. (1994).
    cohen : bool
        If True, then the square root is returned as in the definition of
        the effect size by Cohen (1977). If False, then the original effect
        size is returned.
    axis : int
        If the probability arrays broadcast to more than 1 dimension, then
        this is the axis over which the sums are taken.

    Returns
    -------
    effectsize : float
        effect size of chisquare test

    '''

    probs0 = np.asarray(probs0, float)
    probs1 = np.asarray(probs1, float)
    probs0 = probs0 / probs0.sum(axis)
    probs1 = probs1 / probs1.sum(axis)

    d2 = ((probs1 - probs0)**2 / probs0).sum(axis)

    if correction is not None:
        nobs, df = correction
        diff = ((probs1 - probs0) / probs0).sum(axis)
        d2 = np.maximum((d2 * nobs - diff - df) / (nobs - 1.), 0)

    if cohen:
        return np.sqrt(d2)
    else:
        return d2
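

# usage sketch (added example, not part of the original module): compute the
# effect size between a hypothetical alternative and the uniform null, then
# plug it into ``chisquare_power`` for the power at a given sample size.
def _example_chisquare_effectsize():
    probs0 = [0.25, 0.25, 0.25, 0.25]  # Null: uniform over 4 cells
    probs1 = [0.40, 0.20, 0.20, 0.20]  # hypothetical Alternative

    es = chisquare_effectsize(probs0, probs1)  # Cohen's w, sqrt(0.12) here
    power = chisquare_power(es, nobs=200, n_bins=4)
    print('effect size:', es, 'power at nobs=200:', power)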