
1""" 

2An extension of scipy.stats.stats to support masked arrays 

3 

4""" 

5# Original author (2007): Pierre GF Gerard-Marchant 

6 

7# TODO : f_value_wilks_lambda looks botched... what are dfnum & dfden for ? 

8# TODO : ttest_rel looks botched: what are x1,x2,v1,v2 for ? 

9 

10 

11__all__ = ['argstoarray', 

12 'count_tied_groups', 

13 'describe', 

14 'f_oneway', 'find_repeats','friedmanchisquare', 

15 'kendalltau','kendalltau_seasonal','kruskal','kruskalwallis', 

16 'ks_twosamp', 'ks_2samp', 'kurtosis', 'kurtosistest', 

17 'ks_1samp', 'kstest', 

18 'linregress', 

19 'mannwhitneyu', 'meppf','mode','moment','mquantiles','msign', 

20 'normaltest', 

21 'obrientransform', 

22 'pearsonr','plotting_positions','pointbiserialr', 

23 'rankdata', 

24 'scoreatpercentile','sem', 

25 'sen_seasonal_slopes','skew','skewtest','spearmanr', 

26 'siegelslopes', 'theilslopes', 

27 'tmax','tmean','tmin','trim','trimboth', 

28 'trimtail','trima','trimr','trimmed_mean','trimmed_std', 

29 'trimmed_stde','trimmed_var','tsem','ttest_1samp','ttest_onesamp', 

30 'ttest_ind','ttest_rel','tvar', 

31 'variation', 

32 'winsorize', 

33 'brunnermunzel', 

34 ] 

35 

36import numpy as np 

37from numpy import ndarray 

38import numpy.ma as ma 

39from numpy.ma import masked, nomask 

40 

41import itertools 

42import warnings 

43from collections import namedtuple 

44 

45from . import distributions 

46import scipy.special as special 

47import scipy.stats.stats 

48 

49from ._stats_mstats_common import ( 

50 _find_repeats, 

51 linregress as stats_linregress, 

52 theilslopes as stats_theilslopes, 

53 siegelslopes as stats_siegelslopes 

54 ) 

55 

56def _chk_asarray(a, axis): 

57 # Always returns a masked array, raveled for axis=None 

58 a = ma.asanyarray(a) 

59 if axis is None: 

60 a = ma.ravel(a) 

61 outaxis = 0 

62 else: 

63 outaxis = axis 

64 return a, outaxis 

65 

66 

67def _chk2_asarray(a, b, axis): 

68 a = ma.asanyarray(a) 

69 b = ma.asanyarray(b) 

70 if axis is None: 

71 a = ma.ravel(a) 

72 b = ma.ravel(b) 

73 outaxis = 0 

74 else: 

75 outaxis = axis 

76 return a, b, outaxis 

77 

78 

79def _chk_size(a, b): 

80 a = ma.asanyarray(a) 

81 b = ma.asanyarray(b) 

82 (na, nb) = (a.size, b.size) 

83 if na != nb: 

84 raise ValueError("The size of the input array should match!" 

85 " (%s <> %s)" % (na, nb)) 

86 return (a, b, na) 

87 

88 

89def argstoarray(*args): 

90 """ 

91 Constructs a 2D array from a group of sequences. 

92 

93 Sequences are filled with missing values to match the length of the longest 

94 sequence. 

95 

96 Parameters 

97 ---------- 

98 args : sequences 

99 Group of sequences. 

100 

101 Returns 

102 ------- 

103 argstoarray : MaskedArray 

104 A ( `m` x `n` ) masked array, where `m` is the number of arguments and 

105 `n` the length of the longest argument. 

106 

107 Notes 

108 ----- 

109 `numpy.ma.row_stack` has identical behavior, but is called with a sequence 

110 of sequences. 

111 

112 Examples 

113 -------- 

114 A 2D masked array constructed from a group of sequences is returned. 

115 

116 >>> from scipy.stats.mstats import argstoarray 

117 >>> argstoarray([1, 2, 3], [4, 5, 6]) 

118 masked_array( 

119 data=[[1.0, 2.0, 3.0], 

120 [4.0, 5.0, 6.0]], 

121 mask=[[False, False, False], 

122 [False, False, False]], 

123 fill_value=1e+20) 

124 

125 The returned masked array filled with missing values when the lengths of 

126 sequences are different. 

127 

128 >>> argstoarray([1, 3], [4, 5, 6]) 

129 masked_array( 

130 data=[[1.0, 3.0, --], 

131 [4.0, 5.0, 6.0]], 

132 mask=[[False, False, True], 

133 [False, False, False]], 

134 fill_value=1e+20) 

135 

136 """ 

137 if len(args) == 1 and not isinstance(args[0], ndarray): 

138 output = ma.asarray(args[0]) 

139 if output.ndim != 2: 

140 raise ValueError("The input should be 2D") 

141 else: 

142 n = len(args) 

143 m = max([len(k) for k in args]) 

144 output = ma.array(np.empty((n,m), dtype=float), mask=True) 

145 for (k,v) in enumerate(args): 

146 output[k,:len(v)] = v 

147 

148 output[np.logical_not(np.isfinite(output._data))] = masked 

149 return output 

150 

151 

152def find_repeats(arr): 

153 """Find repeats in arr and return a tuple (repeats, repeat_count). 

154 

155 The input is cast to float64. Masked values are discarded. 

156 

157 Parameters 

158 ---------- 

159 arr : sequence 

160 Input array. The array is flattened if it is not 1D. 

161 

162 Returns 

163 ------- 

164 repeats : ndarray 

165 Array of repeated values. 

166 counts : ndarray 

167 Array of counts. 
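
    Examples
    --------
    A small illustrative run (exact array formatting may vary with the
    NumPy version):

    >>> from scipy.stats import mstats
    >>> mstats.find_repeats([2, 1, 2, 3, 2, 2, 5])
    (array([2.]), array([4]))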

    """
    # Make sure we get a copy. ma.compressed promises a "new array", but can
    # actually return a reference.
    compr = np.asarray(ma.compressed(arr), dtype=np.float64)
    try:
        need_copy = np.may_share_memory(compr, arr)
    except AttributeError:
        # numpy < 1.8.2 bug: np.may_share_memory([], []) raises,
        # while in numpy 1.8.2 and above it just (correctly) returns False.
        need_copy = False
    if need_copy:
        compr = compr.copy()
    return _find_repeats(compr)


def count_tied_groups(x, use_missing=False):
    """
    Counts the number of tied values.

    Parameters
    ----------
    x : sequence
        Sequence of data on which to count the ties
    use_missing : bool, optional
        Whether to consider missing values as tied.

    Returns
    -------
    count_tied_groups : dict
        Returns a dictionary (nb of ties: nb of groups).

    Examples
    --------
    >>> from scipy.stats import mstats
    >>> z = [0, 0, 0, 2, 2, 2, 3, 3, 4, 5, 6]
    >>> mstats.count_tied_groups(z)
    {2: 1, 3: 2}

    In the above example, the ties were 0 (3x), 2 (3x) and 3 (2x).

    >>> z = np.ma.array([0, 0, 1, 2, 2, 2, 3, 3, 4, 5, 6])
    >>> mstats.count_tied_groups(z)
    {2: 2, 3: 1}
    >>> z[[1,-1]] = np.ma.masked
    >>> mstats.count_tied_groups(z, use_missing=True)
    {2: 2, 3: 1}

    """
    nmasked = ma.getmask(x).sum()
    # We need the copy as find_repeats will overwrite the initial data
    data = ma.compressed(x).copy()
    (ties, counts) = find_repeats(data)
    nties = {}
    if len(ties):
        nties = dict(zip(np.unique(counts), itertools.repeat(1)))
        nties.update(dict(zip(*find_repeats(counts))))

    if nmasked and use_missing:
        try:
            nties[nmasked] += 1
        except KeyError:
            nties[nmasked] = 1

    return nties


def rankdata(data, axis=None, use_missing=False):
    """Returns the rank (also known as order statistics) of each data point
    along the given axis.

    If some values are tied, their rank is averaged.
    If some values are masked, their rank is set to 0 if use_missing is False,
    or set to the average rank of the unmasked values if use_missing is True.

    Parameters
    ----------
    data : sequence
        Input data. The data is transformed to a masked array
    axis : {None,int}, optional
        Axis along which to perform the ranking.
        If None, the array is first flattened. An exception is raised if
        the axis is specified for arrays with a dimension larger than 2
    use_missing : bool, optional
        Whether the masked values have a rank of 0 (False) or equal to the
        average rank of the unmasked values (True).
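
    Examples
    --------
    A minimal sketch of the masked-value handling (expected output for this
    implementation; array formatting may vary with the NumPy version):

    >>> import numpy as np
    >>> from scipy.stats import mstats
    >>> a = np.ma.array([40, 10, 30, 10], mask=[0, 1, 0, 0])
    >>> mstats.rankdata(a)
    array([3., 0., 2., 1.])
    >>> mstats.rankdata(a, use_missing=True)
    array([3., 2., 2., 1.])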

    """
    def _rank1d(data, use_missing=False):
        n = data.count()
        rk = np.empty(data.size, dtype=float)
        idx = data.argsort()
        rk[idx[:n]] = np.arange(1,n+1)

        if use_missing:
            rk[idx[n:]] = (n+1)/2.
        else:
            rk[idx[n:]] = 0

        repeats = find_repeats(data.copy())
        for r in repeats[0]:
            condition = (data == r).filled(False)
            rk[condition] = rk[condition].mean()
        return rk

    data = ma.array(data, copy=False)
    if axis is None:
        if data.ndim > 1:
            return _rank1d(data.ravel(), use_missing).reshape(data.shape)
        else:
            return _rank1d(data, use_missing)
    else:
        return ma.apply_along_axis(_rank1d,axis,data,use_missing).view(ndarray)


ModeResult = namedtuple('ModeResult', ('mode', 'count'))


def mode(a, axis=0):
    """
    Returns an array of the modal (most common) value in the passed array.

    Parameters
    ----------
    a : array_like
        n-dimensional array of which to find mode(s).
    axis : int or None, optional
        Axis along which to operate. Default is 0. If None, compute over
        the whole array `a`.

    Returns
    -------
    mode : ndarray
        Array of modal values.
    count : ndarray
        Array of counts for each mode.

    Notes
    -----
    For more details, see `stats.mode`.

    Examples
    --------
    >>> from scipy import stats
    >>> from scipy.stats import mstats
    >>> m_arr = np.ma.array([1, 1, 0, 0, 0, 0], mask=[0, 0, 1, 1, 1, 0])
    >>> stats.mode(m_arr)
    ModeResult(mode=array([0]), count=array([4]))
    >>> mstats.mode(m_arr)
    ModeResult(mode=array([1.]), count=array([2.]))

    """
    a, axis = _chk_asarray(a, axis)

    def _mode1D(a):
        (rep,cnt) = find_repeats(a)
        if not cnt.ndim:
            return (0, 0)
        elif cnt.size:
            return (rep[cnt.argmax()], cnt.max())
        else:
            return (a.min(), 1)

    if axis is None:
        output = _mode1D(ma.ravel(a))
        output = (ma.array(output[0]), ma.array(output[1]))
    else:
        output = ma.apply_along_axis(_mode1D, axis, a)
        newshape = list(a.shape)
        newshape[axis] = 1
        slices = [slice(None)] * output.ndim
        slices[axis] = 0
        modes = output[tuple(slices)].reshape(newshape)
        slices[axis] = 1
        counts = output[tuple(slices)].reshape(newshape)
        output = (modes, counts)

    return ModeResult(*output)


def _betai(a, b, x):
    x = np.asanyarray(x)
    x = ma.where(x < 1.0, x, 1.0)  # if x > 1 then return 1.0
    return special.betainc(a, b, x)


def msign(x):
    """Returns the sign of x, or 0 if x is masked."""
    return ma.filled(np.sign(x), 0)


def pearsonr(x,y):
    """
    Calculates a Pearson correlation coefficient and the p-value for testing
    non-correlation.

    The Pearson correlation coefficient measures the linear relationship
    between two datasets. Strictly speaking, Pearson's correlation requires
    that each dataset be normally distributed. Like other correlation
    coefficients, this one varies between -1 and +1 with 0 implying no
    correlation. Correlations of -1 or +1 imply an exact linear
    relationship. Positive correlations imply that as `x` increases, so does
    `y`. Negative correlations imply that as `x` increases, `y` decreases.

    The p-value roughly indicates the probability of an uncorrelated system
    producing datasets that have a Pearson correlation at least as extreme
    as the one computed from these datasets. The p-values are not entirely
    reliable but are probably reasonable for datasets larger than 500 or so.

    Parameters
    ----------
    x : 1-D array_like
        Input
    y : 1-D array_like
        Input

    Returns
    -------
    pearsonr : float
        Pearson's correlation coefficient.
    p-value : float
        2-tailed p-value.

    References
    ----------
    http://www.statsoft.com/textbook/glosp.html#Pearson%20Correlation
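
    Examples
    --------
    An illustrative run on a pair of masked samples (expected value for
    this implementation; the p-value is omitted here for brevity):

    >>> import numpy as np
    >>> from scipy.stats import mstats
    >>> x = np.ma.array([1, 2, 3, 4, 5, 6], mask=[0, 0, 0, 0, 0, 1])
    >>> y = np.ma.array([1, 2, 4, 3, 5, 6], mask=[0, 0, 0, 0, 0, 1])
    >>> r, p = mstats.pearsonr(x, y)
    >>> float(r)
    0.9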

    """
    (x, y, n) = _chk_size(x, y)
    (x, y) = (x.ravel(), y.ravel())
    # Get the common mask and the total nb of unmasked elements
    m = ma.mask_or(ma.getmask(x), ma.getmask(y))
    n -= m.sum()
    df = n-2
    if df < 0:
        return (masked, masked)

    (mx, my) = (x.mean(), y.mean())
    (xm, ym) = (x-mx, y-my)

    r_num = ma.add.reduce(xm*ym)
    r_den = ma.sqrt(ma.dot(xm,xm) * ma.dot(ym,ym))
    r = r_num / r_den
    # Presumably, if r > 1, then it is only some small artifact of floating
    # point arithmetic.
    r = min(r, 1.0)
    r = max(r, -1.0)

    if r is masked or abs(r) == 1.0:
        prob = 0.
    else:
        t_squared = (df / ((1.0 - r) * (1.0 + r))) * r * r
        prob = _betai(0.5*df, 0.5, df/(df + t_squared))

    return r, prob


SpearmanrResult = namedtuple('SpearmanrResult', ('correlation', 'pvalue'))


def spearmanr(x, y=None, use_ties=True, axis=None, nan_policy='propagate'):
    """
    Calculates a Spearman rank-order correlation coefficient and the p-value
    to test for non-correlation.

    The Spearman correlation is a nonparametric measure of the monotonicity
    of the relationship between two datasets. Unlike the Pearson correlation,
    the Spearman correlation does not assume that both datasets are normally
    distributed. Like other correlation coefficients, this one varies
    between -1 and +1 with 0 implying no correlation. Correlations of -1 or
    +1 imply an exact monotonic relationship. Positive correlations imply
    that as `x` increases, so does `y`. Negative correlations imply that as
    `x` increases, `y` decreases.

    Missing values are discarded pair-wise: if a value is missing in `x`, the
    corresponding value in `y` is masked.

    The p-value roughly indicates the probability of an uncorrelated system
    producing datasets that have a Spearman correlation at least as extreme
    as the one computed from these datasets. The p-values are not entirely
    reliable but are probably reasonable for datasets larger than 500 or so.

    Parameters
    ----------
    x, y : 1D or 2D array_like, y is optional
        One or two 1-D or 2-D arrays containing multiple variables and
        observations. When these are 1-D, each represents a vector of
        observations of a single variable. For the behavior in the 2-D case,
        see under ``axis``, below.
    use_ties : bool, optional
        DO NOT USE. Does not do anything, keyword is only left in place for
        backwards compatibility reasons.
    axis : int or None, optional
        If axis=0, then each column represents a variable, with observations
        in the rows. If axis=1, the relationship is transposed: each row
        represents a variable, while the columns contain observations.
        If axis=None (the default), then both arrays will be raveled.
    nan_policy : {'propagate', 'raise', 'omit'}, optional
        Defines how to handle when input contains nan. 'propagate' returns
        nan, 'raise' throws an error, 'omit' performs the calculations
        ignoring nan values. Default is 'propagate'.

    Returns
    -------
    correlation : float
        Spearman correlation coefficient
    pvalue : float
        2-tailed p-value.

    References
    ----------
    [CRCProbStat2000] section 14.7
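
    Examples
    --------
    A quick sanity check on two short unmasked samples (values rounded
    here; they should agree with `scipy.stats.spearmanr` when no data
    are masked):

    >>> from scipy.stats import mstats
    >>> res = mstats.spearmanr([1, 2, 3, 4, 5], [5, 6, 7, 8, 7])
    >>> print(round(res.correlation, 4), round(res.pvalue, 4))
    0.8208 0.0886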

    """
    if not use_ties:
        raise ValueError("`use_ties=False` is not supported in SciPy >= 1.2.0")

    # Always returns a masked array, raveled if axis=None
    x, axisout = _chk_asarray(x, axis)
    if y is not None:
        # Deal only with 2-D `x` case.
        y, _ = _chk_asarray(y, axis)
        if axisout == 0:
            x = ma.column_stack((x, y))
        else:
            x = ma.row_stack((x, y))

    if axisout == 1:
        # To simplify the code that follows (always use `n_obs, n_vars` shape)
        x = x.T

    if nan_policy == 'omit':
        x = ma.masked_invalid(x)

    def _spearmanr_2cols(x):
        # Mask the same observations for all variables, and then drop those
        # observations (can't leave them masked, rankdata is weird).
        x = ma.mask_rowcols(x, axis=0)
        x = x[~x.mask.any(axis=1), :]

        m = ma.getmask(x)
        n_obs = x.shape[0]
        dof = n_obs - 2 - int(m.sum(axis=0)[0])
        if dof < 0:
            raise ValueError("The input must have at least 3 entries!")

        # Gets the ranks and rank differences
        x_ranked = rankdata(x, axis=0)
        rs = ma.corrcoef(x_ranked, rowvar=False).data

        # rs can have elements equal to 1, so avoid zero division warnings
        with np.errstate(divide='ignore'):
            # clip the small negative values possibly caused by rounding
            # errors before taking the square root
            t = rs * np.sqrt((dof / ((rs+1.0) * (1.0-rs))).clip(0))

        prob = 2 * distributions.t.sf(np.abs(t), dof)

        # For backwards compatibility, return scalars when comparing 2 columns
        if rs.shape == (2, 2):
            return SpearmanrResult(rs[1, 0], prob[1, 0])
        else:
            return SpearmanrResult(rs, prob)

    # Need to do this per pair of variables, otherwise the dropped observations
    # in a third column mess up the result for a pair.
    n_vars = x.shape[1]
    if n_vars == 2:
        return _spearmanr_2cols(x)
    else:
        rs = np.ones((n_vars, n_vars), dtype=float)
        prob = np.zeros((n_vars, n_vars), dtype=float)
        for var1 in range(n_vars - 1):
            for var2 in range(var1+1, n_vars):
                result = _spearmanr_2cols(x[:, [var1, var2]])
                rs[var1, var2] = result.correlation
                rs[var2, var1] = result.correlation
                prob[var1, var2] = result.pvalue
                prob[var2, var1] = result.pvalue

        return SpearmanrResult(rs, prob)


KendalltauResult = namedtuple('KendalltauResult', ('correlation', 'pvalue'))


def kendalltau(x, y, use_ties=True, use_missing=False, method='auto'):
    """
    Computes Kendall's rank correlation tau on two variables *x* and *y*.

    Parameters
    ----------
    x : sequence
        First data list (for example, time).
    y : sequence
        Second data list.
    use_ties : {True, False}, optional
        Whether ties correction should be performed.
    use_missing : {False, True}, optional
        Whether missing data should be allocated a rank of 0 (False) or the
        average rank (True)
    method : {'auto', 'asymptotic', 'exact'}, optional
        Defines which method is used to calculate the p-value [1]_.
        'asymptotic' uses a normal approximation valid for large samples.
        'exact' computes the exact p-value, but can only be used if no ties
        are present. 'auto' is the default and selects the appropriate
        method based on a trade-off between speed and accuracy.

    Returns
    -------
    correlation : float
        Kendall tau
    pvalue : float
        Approximate two-sided p-value.

    References
    ----------
    .. [1] Maurice G. Kendall, "Rank Correlation Methods" (4th Edition),
           Charles Griffin & Co., 1970.
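
    Examples
    --------
    A small tie-free sample, so the exact p-value path is taken (values
    rounded here):

    >>> from scipy.stats import mstats
    >>> tau, p = mstats.kendalltau([1, 2, 3, 4, 5], [2, 1, 4, 3, 5])
    >>> print(round(float(tau), 2), round(float(p), 4))
    0.6 0.2333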

    """
    (x, y, n) = _chk_size(x, y)
    (x, y) = (x.flatten(), y.flatten())
    m = ma.mask_or(ma.getmask(x), ma.getmask(y))
    if m is not nomask:
        x = ma.array(x, mask=m, copy=True)
        y = ma.array(y, mask=m, copy=True)
        # need int() here, otherwise numpy defaults to 32 bit
        # integer on all Windows architectures, causing overflow.
        # int() will keep it an arbitrary-precision Python integer.
        n -= int(m.sum())

    if n < 2:
        return KendalltauResult(np.nan, np.nan)

    rx = ma.masked_equal(rankdata(x, use_missing=use_missing), 0)
    ry = ma.masked_equal(rankdata(y, use_missing=use_missing), 0)
    idx = rx.argsort()
    (rx, ry) = (rx[idx], ry[idx])
    C = np.sum([((ry[i+1:] > ry[i]) * (rx[i+1:] > rx[i])).filled(0).sum()
                for i in range(len(ry)-1)], dtype=float)
    D = np.sum([((ry[i+1:] < ry[i])*(rx[i+1:] > rx[i])).filled(0).sum()
                for i in range(len(ry)-1)], dtype=float)
    xties = count_tied_groups(x)
    yties = count_tied_groups(y)
    if use_ties:
        corr_x = np.sum([v*k*(k-1) for (k,v) in xties.items()], dtype=float)
        corr_y = np.sum([v*k*(k-1) for (k,v) in yties.items()], dtype=float)
        denom = ma.sqrt((n*(n-1)-corr_x)/2. * (n*(n-1)-corr_y)/2.)
    else:
        denom = n*(n-1)/2.
    tau = (C-D) / denom

    if method == 'exact' and (xties or yties):
        raise ValueError("Ties found, exact method cannot be used.")

    if method == 'auto':
        if (not xties and not yties) and (n <= 33 or min(C, n*(n-1)/2.0-C) <= 1):
            method = 'exact'
        else:
            method = 'asymptotic'

    if not xties and not yties and method == 'exact':
        # Exact p-value, see Maurice G. Kendall, "Rank Correlation Methods"
        # (4th Edition), Charles Griffin & Co., 1970.
        c = int(min(C, (n*(n-1))/2-C))
        if n <= 0:
            raise ValueError
        elif c < 0 or 2*c > n*(n-1):
            raise ValueError
        elif n == 1:
            prob = 1.0
        elif n == 2:
            prob = 1.0
        elif c == 0:
            prob = 2.0/np.math.factorial(n)
        elif c == 1:
            prob = 2.0/np.math.factorial(n-1)
        elif 2*c == (n*(n-1))//2:
            prob = 1.0
        else:
            old = [0.0]*(c+1)
            new = [0.0]*(c+1)
            new[0] = 1.0
            new[1] = 1.0
            for j in range(3,n+1):
                old = new[:]
                for k in range(1,min(j,c+1)):
                    new[k] += new[k-1]
                for k in range(j,c+1):
                    new[k] += new[k-1] - old[k-j]
            prob = 2.0*sum(new)/np.math.factorial(n)
    elif method == 'asymptotic':
        var_s = n*(n-1)*(2*n+5)
        if use_ties:
            var_s -= np.sum([v*k*(k-1)*(2*k+5)*1. for (k,v) in xties.items()])
            var_s -= np.sum([v*k*(k-1)*(2*k+5)*1. for (k,v) in yties.items()])
            v1 = np.sum([v*k*(k-1) for (k, v) in xties.items()], dtype=float) *\
                 np.sum([v*k*(k-1) for (k, v) in yties.items()], dtype=float)
            v1 /= 2.*n*(n-1)
            if n > 2:
                v2 = np.sum([v*k*(k-1)*(k-2) for (k,v) in xties.items()],
                            dtype=float) * \
                     np.sum([v*k*(k-1)*(k-2) for (k,v) in yties.items()],
                            dtype=float)
                v2 /= 9.*n*(n-1)*(n-2)
            else:
                v2 = 0
        else:
            v1 = v2 = 0

        var_s /= 18.
        var_s += (v1 + v2)
        z = (C-D)/np.sqrt(var_s)
        prob = special.erfc(abs(z)/np.sqrt(2))
    else:
        raise ValueError("Unknown method " + str(method) + " specified; "
                         "please use 'auto', 'exact' or 'asymptotic'.")

    return KendalltauResult(tau, prob)


def kendalltau_seasonal(x):
    """
    Computes a multivariate Kendall's rank correlation tau, for seasonal data.

    Parameters
    ----------
    x : 2-D ndarray
        Array of seasonal data, with seasons in columns.

    """
    x = ma.array(x, subok=True, copy=False, ndmin=2)
    (n,m) = x.shape
    n_p = x.count(0)

    S_szn = sum(msign(x[i:]-x[i]).sum(0) for i in range(n))
    S_tot = S_szn.sum()

    n_tot = x.count()
    ties = count_tied_groups(x.compressed())
    corr_ties = sum(v*k*(k-1) for (k,v) in ties.items())
    denom_tot = ma.sqrt(1.*n_tot*(n_tot-1)*(n_tot*(n_tot-1)-corr_ties))/2.

    R = rankdata(x, axis=0, use_missing=True)
    K = ma.empty((m,m), dtype=int)
    covmat = ma.empty((m,m), dtype=float)
    denom_szn = ma.empty(m, dtype=float)
    for j in range(m):
        ties_j = count_tied_groups(x[:,j].compressed())
        corr_j = sum(v*k*(k-1) for (k,v) in ties_j.items())
        cmb = n_p[j]*(n_p[j]-1)
        for k in range(j,m,1):
            K[j,k] = sum(msign((x[i:,j]-x[i,j])*(x[i:,k]-x[i,k])).sum()
                         for i in range(n))
            covmat[j,k] = (K[j,k] + 4*(R[:,j]*R[:,k]).sum() -
                           n*(n_p[j]+1)*(n_p[k]+1))/3.
            K[k,j] = K[j,k]
            covmat[k,j] = covmat[j,k]

        denom_szn[j] = ma.sqrt(cmb*(cmb-corr_j)) / 2.

    var_szn = covmat.diagonal()

    z_szn = msign(S_szn) * (abs(S_szn)-1) / ma.sqrt(var_szn)
    z_tot_ind = msign(S_tot) * (abs(S_tot)-1) / ma.sqrt(var_szn.sum())
    z_tot_dep = msign(S_tot) * (abs(S_tot)-1) / ma.sqrt(covmat.sum())

    prob_szn = special.erfc(abs(z_szn)/np.sqrt(2))
    prob_tot_ind = special.erfc(abs(z_tot_ind)/np.sqrt(2))
    prob_tot_dep = special.erfc(abs(z_tot_dep)/np.sqrt(2))

    chi2_tot = (z_szn*z_szn).sum()
    chi2_trd = m * z_szn.mean()**2
    output = {'seasonal tau': S_szn/denom_szn,
              'global tau': S_tot/denom_tot,
              'global tau (alt)': S_tot/denom_szn.sum(),
              'seasonal p-value': prob_szn,
              'global p-value (indep)': prob_tot_ind,
              'global p-value (dep)': prob_tot_dep,
              'chi2 total': chi2_tot,
              'chi2 trend': chi2_trd,
              }
    return output


PointbiserialrResult = namedtuple('PointbiserialrResult', ('correlation',
                                                           'pvalue'))


def pointbiserialr(x, y):
    """Calculates a point biserial correlation coefficient and its p-value.

    Parameters
    ----------
    x : array_like of bools
        Input array.
    y : array_like
        Input array.

    Returns
    -------
    correlation : float
        R value
    pvalue : float
        2-tailed p-value

    Notes
    -----
    Missing values are considered pair-wise: if a value is missing in x,
    the corresponding value in y is masked.

    For more details on `pointbiserialr`, see `stats.pointbiserialr`.
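
    Examples
    --------
    A small unmasked example (expected values, rounded, matching
    `stats.pointbiserialr` on the same data):

    >>> import numpy as np
    >>> from scipy.stats import mstats
    >>> a = np.array([0, 0, 0, 1, 1, 1, 1])
    >>> b = np.arange(7)
    >>> res = mstats.pointbiserialr(a, b)
    >>> print(round(float(res.correlation), 4), round(float(res.pvalue), 4))
    0.866 0.0117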

    """
    x = ma.fix_invalid(x, copy=True).astype(bool)
    y = ma.fix_invalid(y, copy=True).astype(float)
    # Get rid of the missing data
    m = ma.mask_or(ma.getmask(x), ma.getmask(y))
    if m is not nomask:
        unmask = np.logical_not(m)
        x = x[unmask]
        y = y[unmask]

    n = len(x)
    # phat is the fraction of x values that are True
    phat = x.sum() / float(n)
    y0 = y[~x]  # y-values where x is False
    y1 = y[x]   # y-values where x is True
    y0m = y0.mean()
    y1m = y1.mean()

    rpb = (y1m - y0m)*np.sqrt(phat * (1-phat)) / y.std()

    df = n-2
    t = rpb*ma.sqrt(df/(1.0-rpb**2))
    prob = _betai(0.5*df, 0.5, df/(df+t*t))

    return PointbiserialrResult(rpb, prob)


LinregressResult = namedtuple('LinregressResult', ('slope', 'intercept',
                                                   'rvalue', 'pvalue',
                                                   'stderr'))


def linregress(x, y=None):
    """
    Linear regression calculation

    Note that the non-masked version is used, and that this docstring is
    replaced by the non-masked docstring + some info on missing data.

    """
    if y is None:
        x = ma.array(x)
        if x.shape[0] == 2:
            x, y = x
        elif x.shape[1] == 2:
            x, y = x.T
        else:
            msg = ("If only `x` is given as input, it has to be of shape "
                   "(2, N) or (N, 2), provided shape was %s" % str(x.shape))
            raise ValueError(msg)
    else:
        x = ma.array(x)
        y = ma.array(y)

    x = x.flatten()
    y = y.flatten()

    m = ma.mask_or(ma.getmask(x), ma.getmask(y), shrink=False)
    if m is not nomask:
        x = ma.array(x, mask=m)
        y = ma.array(y, mask=m)
        if np.any(~m):
            slope, intercept, r, prob, sterrest = stats_linregress(x.data[~m],
                                                                   y.data[~m])
        else:
            # All data is masked
            return None, None, None, None, None
    else:
        slope, intercept, r, prob, sterrest = stats_linregress(x.data, y.data)

    return LinregressResult(slope, intercept, r, prob, sterrest)


def theilslopes(y, x=None, alpha=0.95):
    r"""
    Computes the Theil-Sen estimator for a set of points (x, y).

    `theilslopes` implements a method for robust linear regression. It
    computes the slope as the median of all slopes between paired values.

    Parameters
    ----------
    y : array_like
        Dependent variable.
    x : array_like or None, optional
        Independent variable. If None, use ``arange(len(y))`` instead.
    alpha : float, optional
        Confidence degree between 0 and 1. Default is 95% confidence.
        Note that `alpha` is symmetric around 0.5, i.e. both 0.1 and 0.9 are
        interpreted as "find the 90% confidence interval".

    Returns
    -------
    medslope : float
        Theil slope.
    medintercept : float
        Intercept of the Theil line, as ``median(y) - medslope*median(x)``.
    lo_slope : float
        Lower bound of the confidence interval on `medslope`.
    up_slope : float
        Upper bound of the confidence interval on `medslope`.

    See also
    --------
    siegelslopes : a similar technique with repeated medians

    Notes
    -----
    For more details on `theilslopes`, see `stats.theilslopes`.
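
    Examples
    --------
    A short masked series; the masked point is dropped before the fit
    (slope and intercept rounded):

    >>> import numpy as np
    >>> from scipy.stats import mstats
    >>> y = np.ma.array([0.0, 1.1, 2.0, 2.9, 99.0], mask=[0, 0, 0, 0, 1])
    >>> slope, intercept, lo, up = mstats.theilslopes(y)
    >>> print(round(float(slope), 4), round(float(intercept), 4))
    0.9333 0.15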

    """
    y = ma.asarray(y).flatten()
    if x is None:
        x = ma.arange(len(y), dtype=float)
    else:
        x = ma.asarray(x).flatten()
        if len(x) != len(y):
            raise ValueError("Incompatible lengths! (%s<>%s)" % (len(y), len(x)))

    m = ma.mask_or(ma.getmask(x), ma.getmask(y))
    y._mask = x._mask = m
    # Disregard any masked elements of x or y
    y = y.compressed()
    x = x.compressed().astype(float)
    # We now have unmasked arrays so can use `stats.theilslopes`
    return stats_theilslopes(y, x, alpha=alpha)


def siegelslopes(y, x=None, method="hierarchical"):
    r"""
    Computes the Siegel estimator for a set of points (x, y).

    `siegelslopes` implements a method for robust linear regression
    using repeated medians to fit a line to the points (x, y).
    The method is robust to outliers with an asymptotic breakdown point
    of 50%.

    Parameters
    ----------
    y : array_like
        Dependent variable.
    x : array_like or None, optional
        Independent variable. If None, use ``arange(len(y))`` instead.
    method : {'hierarchical', 'separate'}
        If 'hierarchical', estimate the intercept using the estimated
        slope ``medslope`` (default option).
        If 'separate', estimate the intercept independent of the estimated
        slope. See Notes for details.

    Returns
    -------
    medslope : float
        Estimate of the slope of the regression line.
    medintercept : float
        Estimate of the intercept of the regression line.

    See also
    --------
    theilslopes : a similar technique without repeated medians

    Notes
    -----
    For more details on `siegelslopes`, see `scipy.stats.siegelslopes`.

    """
    y = ma.asarray(y).ravel()
    if x is None:
        x = ma.arange(len(y), dtype=float)
    else:
        x = ma.asarray(x).ravel()
        if len(x) != len(y):
            raise ValueError("Incompatible lengths! (%s<>%s)" % (len(y), len(x)))

    m = ma.mask_or(ma.getmask(x), ma.getmask(y))
    y._mask = x._mask = m
    # Disregard any masked elements of x or y
    y = y.compressed()
    x = x.compressed().astype(float)
    # We now have unmasked arrays so can use `stats.siegelslopes`;
    # pass `method` through so the keyword is not silently ignored.
    return stats_siegelslopes(y, x, method=method)


def sen_seasonal_slopes(x):
    x = ma.array(x, subok=True, copy=False, ndmin=2)
    (n,_) = x.shape
    # Get list of slopes per season
    szn_slopes = ma.vstack([(x[i+1:]-x[i])/np.arange(1,n-i)[:,None]
                            for i in range(n)])
    szn_medslopes = ma.median(szn_slopes, axis=0)
    medslope = ma.median(szn_slopes, axis=None)
    return szn_medslopes, medslope


Ttest_1sampResult = namedtuple('Ttest_1sampResult', ('statistic', 'pvalue'))


def ttest_1samp(a, popmean, axis=0):
    """
    Calculates the T-test for the mean of ONE group of scores.

    Parameters
    ----------
    a : array_like
        sample observation
    popmean : float or array_like
        expected value in null hypothesis; if array_like, then it must have
        the same shape as `a` excluding the axis dimension
    axis : int or None, optional
        Axis along which to compute test. If None, compute over the whole
        array `a`.

    Returns
    -------
    statistic : float or array
        t-statistic
    pvalue : float or array
        two-tailed p-value

    Notes
    -----
    For more details on `ttest_1samp`, see `stats.ttest_1samp`.
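
    Examples
    --------
    A masked observation is excluded from the test (expected values for
    this implementation):

    >>> import numpy as np
    >>> from scipy.stats import mstats
    >>> a = np.ma.array([1.0, 2.0, 3.0, 4.0, 5.0, 99.0],
    ...                 mask=[0, 0, 0, 0, 0, 1])
    >>> res = mstats.ttest_1samp(a, popmean=3.0)
    >>> print(round(float(res.statistic), 4), round(float(res.pvalue), 4))
    0.0 1.0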

    """
    a, axis = _chk_asarray(a, axis)
    if a.size == 0:
        return (np.nan, np.nan)

    x = a.mean(axis=axis)
    v = a.var(axis=axis, ddof=1)
    n = a.count(axis=axis)
    # force df to be an array for masked division not to throw a warning
    df = ma.asanyarray(n - 1.0)
    svar = ((n - 1.0) * v) / df
    with np.errstate(divide='ignore', invalid='ignore'):
        t = (x - popmean) / ma.sqrt(svar / n)
    prob = special.betainc(0.5*df, 0.5, df/(df + t*t))

    return Ttest_1sampResult(t, prob)


ttest_onesamp = ttest_1samp


Ttest_indResult = namedtuple('Ttest_indResult', ('statistic', 'pvalue'))


def ttest_ind(a, b, axis=0, equal_var=True):
    """
    Calculates the T-test for the means of TWO INDEPENDENT samples of scores.

    Parameters
    ----------
    a, b : array_like
        The arrays must have the same shape, except in the dimension
        corresponding to `axis` (the first, by default).
    axis : int or None, optional
        Axis along which to compute test. If None, compute over the whole
        arrays, `a`, and `b`.
    equal_var : bool, optional
        If True, perform a standard independent 2 sample test that assumes
        equal population variances.
        If False, perform Welch's t-test, which does not assume equal
        population variance.

        .. versionadded:: 0.17.0

    Returns
    -------
    statistic : float or array
        The calculated t-statistic.
    pvalue : float or array
        The two-tailed p-value.

    Notes
    -----
    For more details on `ttest_ind`, see `stats.ttest_ind`.
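
    Examples
    --------
    With the masked value ignored, the two samples below are identical, so
    the statistic is 0 and the p-value 1 (expected values for this
    implementation):

    >>> import numpy as np
    >>> from scipy.stats import mstats
    >>> a = np.ma.array([1.0, 2.0, 3.0, 99.0], mask=[0, 0, 0, 1])
    >>> res = mstats.ttest_ind(a, [1.0, 2.0, 3.0])
    >>> print(round(float(res.statistic), 4), round(float(res.pvalue), 4))
    0.0 1.0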

    """
    a, b, axis = _chk2_asarray(a, b, axis)

    if a.size == 0 or b.size == 0:
        return Ttest_indResult(np.nan, np.nan)

    (x1, x2) = (a.mean(axis), b.mean(axis))
    (v1, v2) = (a.var(axis=axis, ddof=1), b.var(axis=axis, ddof=1))
    (n1, n2) = (a.count(axis), b.count(axis))

    if equal_var:
        # force df to be an array for masked division not to throw a warning
        df = ma.asanyarray(n1 + n2 - 2.0)
        svar = ((n1-1)*v1+(n2-1)*v2) / df
        denom = ma.sqrt(svar*(1.0/n1 + 1.0/n2))  # n-D computation here!
    else:
        vn1 = v1/n1
        vn2 = v2/n2
        with np.errstate(divide='ignore', invalid='ignore'):
            df = (vn1 + vn2)**2 / (vn1**2 / (n1 - 1) + vn2**2 / (n2 - 1))

        # If df is undefined, variances are zero.
        # It doesn't matter what df is as long as it is not NaN.
        df = np.where(np.isnan(df), 1, df)
        denom = ma.sqrt(vn1 + vn2)

    with np.errstate(divide='ignore', invalid='ignore'):
        t = (x1-x2) / denom
    probs = special.betainc(0.5*df, 0.5, df/(df + t*t)).reshape(t.shape)

    return Ttest_indResult(t, probs.squeeze())


Ttest_relResult = namedtuple('Ttest_relResult', ('statistic', 'pvalue'))


def ttest_rel(a, b, axis=0):
    """
    Calculates the T-test on TWO RELATED samples of scores, a and b.

    Parameters
    ----------
    a, b : array_like
        The arrays must have the same shape.
    axis : int or None, optional
        Axis along which to compute test. If None, compute over the whole
        arrays, `a`, and `b`.

    Returns
    -------
    statistic : float or array
        t-statistic
    pvalue : float or array
        two-tailed p-value

    Notes
    -----
    For more details on `ttest_rel`, see `stats.ttest_rel`.
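
    Examples
    --------
    Pairs with a masked entry in either input drop out of the test
    (expected values, rounded, for this implementation):

    >>> import numpy as np
    >>> from scipy.stats import mstats
    >>> a = np.ma.array([2.0, 4.0, 6.0, 99.0], mask=[0, 0, 0, 1])
    >>> b = np.ma.array([1.0, 2.0, 3.0, 99.0], mask=[0, 0, 0, 1])
    >>> res = mstats.ttest_rel(a, b)
    >>> print(round(float(res.statistic), 4), round(float(res.pvalue), 4))
    3.4641 0.0742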

    """
    a, b, axis = _chk2_asarray(a, b, axis)
    if len(a) != len(b):
        raise ValueError('unequal length arrays')

    if a.size == 0 or b.size == 0:
        return Ttest_relResult(np.nan, np.nan)

    n = a.count(axis)
    df = ma.asanyarray(n-1.0)
    d = (a-b).astype('d')
    dm = d.mean(axis)
    v = d.var(axis=axis, ddof=1)
    denom = ma.sqrt(v / n)
    with np.errstate(divide='ignore', invalid='ignore'):
        t = dm / denom

    probs = special.betainc(0.5*df, 0.5, df/(df + t*t)).reshape(t.shape).squeeze()

    return Ttest_relResult(t, probs)


MannwhitneyuResult = namedtuple('MannwhitneyuResult', ('statistic',
                                                       'pvalue'))


def mannwhitneyu(x,y, use_continuity=True):
    """
    Computes the Mann-Whitney statistic.

    Missing values in `x` and/or `y` are discarded.

    Parameters
    ----------
    x : sequence
        Input
    y : sequence
        Input
    use_continuity : {True, False}, optional
        Whether a continuity correction (1/2.) should be taken into account.

    Returns
    -------
    statistic : float
        The Mann-Whitney statistic
    pvalue : float
        Approximate p-value assuming a normal distribution.
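
    Examples
    --------
    Two fully separated samples (values rounded; the statistic returned
    is the smaller of the two U values):

    >>> from scipy.stats import mstats
    >>> res = mstats.mannwhitneyu([1, 2, 3], [4, 5, 6])
    >>> print(float(res.statistic), round(float(res.pvalue), 3))
    0.0 0.081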

    """
    x = ma.asarray(x).compressed().view(ndarray)
    y = ma.asarray(y).compressed().view(ndarray)
    ranks = rankdata(np.concatenate([x,y]))
    (nx, ny) = (len(x), len(y))
    nt = nx + ny
    U = ranks[:nx].sum() - nx*(nx+1)/2.
    U = max(U, nx*ny - U)
    u = nx*ny - U

    mu = (nx*ny)/2.
    sigsq = (nt**3 - nt)/12.
    ties = count_tied_groups(ranks)
    sigsq -= sum(v*(k**3-k) for (k,v) in ties.items())/12.
    sigsq *= nx*ny/float(nt*(nt-1))

    if use_continuity:
        z = (U - 1/2. - mu) / ma.sqrt(sigsq)
    else:
        z = (U - mu) / ma.sqrt(sigsq)

    prob = special.erfc(abs(z)/np.sqrt(2))
    return MannwhitneyuResult(u, prob)


KruskalResult = namedtuple('KruskalResult', ('statistic', 'pvalue'))


def kruskal(*args):
    """
    Compute the Kruskal-Wallis H-test for independent samples

    Parameters
    ----------
    sample1, sample2, ... : array_like
        Two or more arrays with the sample measurements can be given as
        arguments.

    Returns
    -------
    statistic : float
        The Kruskal-Wallis H statistic, corrected for ties
    pvalue : float
        The p-value for the test using the assumption that H has a chi
        square distribution

    Notes
    -----
    For more details on `kruskal`, see `stats.kruskal`.

    Examples
    --------
    >>> from scipy.stats.mstats import kruskal

    Random samples from three different brands of batteries were tested
    to see how long the charge lasted. Results were as follows:

    >>> a = [6.3, 5.4, 5.7, 5.2, 5.0]
    >>> b = [6.9, 7.0, 6.1, 7.9]
    >>> c = [7.2, 6.9, 6.1, 6.5]

    Test the hypothesis that the distribution functions for all of the
    brands' durations are identical. Use a 5% level of significance.

    >>> kruskal(a, b, c)
    KruskalResult(statistic=7.113812154696133, pvalue=0.028526948491942164)

    The null hypothesis is rejected at the 5% level of significance
    because the returned p-value is less than the critical value of 5%.

    """
    output = argstoarray(*args)
    ranks = ma.masked_equal(rankdata(output, use_missing=False), 0)
    sumrk = ranks.sum(-1)
    ngrp = ranks.count(-1)
    ntot = ranks.count()
    H = 12./(ntot*(ntot+1)) * (sumrk**2/ngrp).sum() - 3*(ntot+1)
    # Tie correction
    ties = count_tied_groups(ranks)
    T = 1. - sum(v*(k**3-k) for (k,v) in ties.items())/float(ntot**3-ntot)
    if T == 0:
        raise ValueError('All numbers are identical in kruskal')

    H /= T
    df = len(output) - 1
    prob = distributions.chi2.sf(H, df)
    return KruskalResult(H, prob)


kruskalwallis = kruskal


def ks_1samp(x, cdf, args=(), alternative="two-sided", mode='auto'):
    """
    Computes the Kolmogorov-Smirnov test on one sample of masked values.

    Missing values in `x` are discarded.

    Parameters
    ----------
    x : array_like
        a 1-D array of observations of random variables.
    cdf : str or callable
        If a string, it should be the name of a distribution in `scipy.stats`.
        If a callable, that callable is used to calculate the cdf.
    args : tuple, sequence, optional
        Distribution parameters, used if `cdf` is a string.
    alternative : {'two-sided', 'less', 'greater'}, optional
        Indicates the alternative hypothesis. Default is 'two-sided'.
    mode : {'auto', 'exact', 'asymp'}, optional
        Defines the method used for calculating the p-value.
        The following options are available (default is 'auto'):

          * 'auto' : use 'exact' for small size arrays, 'asymp' for large
          * 'exact' : use approximation to exact distribution of test statistic
          * 'asymp' : use asymptotic distribution of test statistic

    Returns
    -------
    d : float
        Value of the Kolmogorov-Smirnov test
    p : float
        Corresponding p-value.

    """
    alternative = {'t': 'two-sided', 'g': 'greater', 'l': 'less'}.get(
        alternative.lower()[0], alternative)
    return scipy.stats.stats.ks_1samp(
        x, cdf, args=args, alternative=alternative, mode=mode)


def ks_2samp(data1, data2, alternative="two-sided", mode='auto'):
    """
    Computes the Kolmogorov-Smirnov test on two samples.

    Missing values in `x` and/or `y` are discarded.

    Parameters
    ----------
    data1 : array_like
        First data set
    data2 : array_like
        Second data set
    alternative : {'two-sided', 'less', 'greater'}, optional
        Indicates the alternative hypothesis. Default is 'two-sided'.
    mode : {'auto', 'exact', 'asymp'}, optional
        Defines the method used for calculating the p-value.
        The following options are available (default is 'auto'):

          * 'auto' : use 'exact' for small size arrays, 'asymp' for large
          * 'exact' : use approximation to exact distribution of test statistic
          * 'asymp' : use asymptotic distribution of test statistic

    Returns
    -------
    d : float
        Value of the Kolmogorov-Smirnov test
    p : float
        Corresponding p-value.

    """
    # Ideally this would be accomplished by
    # ks_2samp = scipy.stats.stats.ks_2samp
    # but the circular dependencies between mstats_basic and stats prevent that.
    alternative = {'t': 'two-sided', 'g': 'greater', 'l': 'less'}.get(
        alternative.lower()[0], alternative)
    return scipy.stats.stats.ks_2samp(data1, data2, alternative=alternative,
                                      mode=mode)


ks_twosamp = ks_2samp


def kstest(data1, data2, args=(), alternative='two-sided', mode='auto'):
    """
    Computes the Kolmogorov-Smirnov test on one or two samples.

    Parameters
    ----------
    data1 : array_like
    data2 : str, callable or array_like
    args : tuple, sequence, optional
        Distribution parameters, used if `data1` or `data2` are strings.
    alternative : str, as documented in stats.kstest
    mode : str, as documented in stats.kstest

    Returns
    -------
    tuple of (K-S statistic, probability)

    """
    return scipy.stats.stats.kstest(data1, data2, args,
                                    alternative=alternative, mode=mode)


def trima(a, limits=None, inclusive=(True,True)):
    """
    Trims an array by masking the data outside some given limits.

    Returns a masked version of the input array.

    Parameters
    ----------
    a : array_like
        Input array.
    limits : {None, tuple}, optional
        Tuple of (lower limit, upper limit) in absolute values.
        Values of the input array lower (greater) than the lower (upper)
        limit will be masked. A limit of None indicates an open interval.
    inclusive : (bool, bool) tuple, optional
        Tuple of (lower flag, upper flag), indicating whether values exactly
        equal to the lower (upper) limit are allowed.

    Examples
    --------
    >>> from scipy.stats.mstats import trima

    >>> a = np.arange(10)

    The interval is left-closed and right-open, i.e., `[2, 8)`.
    Trim the array by keeping only values in the interval.

    >>> trima(a, limits=(2, 8), inclusive=(True, False))
    masked_array(data=[--, --, 2, 3, 4, 5, 6, 7, --, --],
                 mask=[ True,  True, False, False, False, False, False, False,
                        True,  True],
           fill_value=999999)

    """
    a = ma.asarray(a)
    a.unshare_mask()
    if (limits is None) or (limits == (None, None)):
        return a

    (lower_lim, upper_lim) = limits
    (lower_in, upper_in) = inclusive
    condition = False
    if lower_lim is not None:
        if lower_in:
            condition |= (a < lower_lim)
        else:
            condition |= (a <= lower_lim)

    if upper_lim is not None:
        if upper_in:
            condition |= (a > upper_lim)
        else:
            condition |= (a >= upper_lim)

    a[condition.filled(True)] = masked
    return a


def trimr(a, limits=None, inclusive=(True, True), axis=None):
    """
    Trims an array by masking some proportion of the data on each end.
    Returns a masked version of the input array.

    Parameters
    ----------
    a : sequence
        Input array.
    limits : {None, tuple}, optional
        Tuple of the percentages to cut on each side of the array, with
        respect to the number of unmasked data, as floats between 0. and 1.
        Noting n the number of unmasked data before trimming, the
        (n*limits[0])th smallest data and the (n*limits[1])th largest data
        are masked, and the total number of unmasked data after trimming
        is n*(1.-sum(limits)). The value of one limit can be set to None
        to indicate an open interval.
    inclusive : {(True,True) tuple}, optional
        Tuple of flags indicating whether the number of data being masked
        on the left (right) end should be truncated (True) or rounded
        (False) to integers.
    axis : {None,int}, optional
        Axis along which to trim. If None, the whole array is trimmed, but
        its shape is maintained.

    """
    def _trimr1D(a, low_limit, up_limit, low_inclusive, up_inclusive):
        n = a.count()
        idx = a.argsort()
        if low_limit:
            if low_inclusive:
                lowidx = int(low_limit*n)
            else:
                lowidx = int(np.round(low_limit*n))
            a[idx[:lowidx]] = masked
        if up_limit is not None:
            if up_inclusive:
                upidx = n - int(n*up_limit)
            else:
                upidx = n - int(np.round(n*up_limit))
            a[idx[upidx:]] = masked
        return a

    a = ma.asarray(a)
    a.unshare_mask()
    if limits is None:
        return a

    # Check the limits
    (lolim, uplim) = limits
    errmsg = "The proportion to cut from the %s should be between 0. and 1."
    if lolim is not None:
        if lolim > 1. or lolim < 0:
            raise ValueError(errmsg % 'beginning' + "(got %s)" % lolim)
    if uplim is not None:
        if uplim > 1. or uplim < 0:
            raise ValueError(errmsg % 'end' + "(got %s)" % uplim)

    (loinc, upinc) = inclusive

    if axis is None:
        shp = a.shape
        return _trimr1D(a.ravel(),lolim,uplim,loinc,upinc).reshape(shp)
    else:
        return ma.apply_along_axis(_trimr1D, axis, a, lolim,uplim,loinc,upinc)


trimdoc = """
    Parameters
    ----------
    a : sequence
        Input array
    limits : {None, tuple}, optional
        If `relative` is False, tuple (lower limit, upper limit) in
        absolute values. Values of the input array lower (greater) than
        the lower (upper) limit are masked.

        If `relative` is True, tuple (lower percentage, upper percentage)
        to cut on each side of the array, with respect to the number of
        unmasked data.

        Noting n the number of unmasked data before trimming, the
        (n*limits[0])th smallest data and the (n*limits[1])th largest data
        are masked, and the total number of unmasked data after trimming
        is n*(1.-sum(limits)). In each case, the value of one limit can be
        set to None to indicate an open interval.

        If limits is None, no trimming is performed
    inclusive : {(bool, bool) tuple}, optional
        If `relative` is False, tuple indicating whether values exactly
        equal to the absolute limits are allowed.
        If `relative` is True, tuple indicating whether the number of data
        being masked on each side should be truncated (True) or rounded
        (False).
    relative : bool, optional
        Whether to consider the limits as absolute values (False) or
        proportions to cut (True).
    axis : int, optional
        Axis along which to trim.
"""


def trim(a, limits=None, inclusive=(True,True), relative=False, axis=None):
    """
    Trims an array by masking the data outside some given limits.

    Returns a masked version of the input array.

    %s

    Examples
    --------
    >>> from scipy.stats.mstats import trim
    >>> z = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    >>> print(trim(z,(3,8)))
    [-- -- 3 4 5 6 7 8 -- --]
    >>> print(trim(z,(0.1,0.2),relative=True))
    [-- 2 3 4 5 6 7 8 -- --]

    """
    if relative:
        return trimr(a, limits=limits, inclusive=inclusive, axis=axis)
    else:
        return trima(a, limits=limits, inclusive=inclusive)


if trim.__doc__:
    trim.__doc__ = trim.__doc__ % trimdoc


def trimboth(data, proportiontocut=0.2, inclusive=(True,True), axis=None):
    """
    Trims the smallest and largest data values.

    Trims the `data` by masking the ``int(proportiontocut * n)`` smallest and
    ``int(proportiontocut * n)`` largest values of data along the given axis,
    where n is the number of unmasked values before trimming.

    Parameters
    ----------
    data : ndarray
        Data to trim.
    proportiontocut : float, optional
        Percentage of trimming (as a float between 0 and 1).
        If n is the number of unmasked values before trimming, the number of
        values after trimming is ``(1 - 2*proportiontocut) * n``.
        Default is 0.2.
    inclusive : {(bool, bool) tuple}, optional
        Tuple indicating whether the number of data being masked on each side
        should be truncated (True) or rounded (False).
    axis : int, optional
        Axis along which to perform the trimming.
        If None, the input array is first flattened.
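
    Examples
    --------
    With ``proportiontocut=0.2`` and ten values, two values are masked on
    each end (expected output for this implementation):

    >>> from scipy.stats.mstats import trimboth
    >>> z = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    >>> print(trimboth(z, proportiontocut=0.2))
    [-- -- 3 4 5 6 7 8 -- --]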

    """
    return trimr(data, limits=(proportiontocut,proportiontocut),
                 inclusive=inclusive, axis=axis)


def trimtail(data, proportiontocut=0.2, tail='left', inclusive=(True,True),
             axis=None):
    """
    Trims the data by masking values from one tail.

    Parameters
    ----------
    data : array_like
        Data to trim.
    proportiontocut : float, optional
        Percentage of trimming. If n is the number of unmasked values
        before trimming, the number of values after trimming is
        ``(1 - proportiontocut) * n``. Default is 0.2.
    tail : {'left','right'}, optional
        If 'left' the `proportiontocut` lowest values will be masked.
        If 'right' the `proportiontocut` highest values will be masked.
        Default is 'left'.
    inclusive : {(bool, bool) tuple}, optional
        Tuple indicating whether the number of data being masked on each side
        should be truncated (True) or rounded (False). Default is
        (True, True).
    axis : int, optional
        Axis along which to perform the trimming.
        If None, the input array is first flattened. Default is None.

    Returns
    -------
    trimtail : ndarray
        Returned array of same shape as `data` with masked tail values.
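
    Examples
    --------
    Masking 30% of the values from the right tail (expected output for
    this implementation):

    >>> from scipy.stats.mstats import trimtail
    >>> z = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    >>> print(trimtail(z, proportiontocut=0.3, tail='right'))
    [1 2 3 4 5 6 7 -- -- --]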

1600 

1601 """ 

1602 tail = str(tail).lower()[0] 

1603 if tail == 'l': 

1604 limits = (proportiontocut,None) 

1605 elif tail == 'r': 

1606 limits = (None, proportiontocut) 

1607 else: 

1608 raise TypeError("The tail argument should be in ('left','right')") 

1609 

1610 return trimr(data, limits=limits, axis=axis, inclusive=inclusive) 

1611 

1612 

1613trim1 = trimtail 

1614 

1615 

1616def trimmed_mean(a, limits=(0.1,0.1), inclusive=(1,1), relative=True, 

1617 axis=None): 

1618 """Returns the trimmed mean of the data along the given axis. 

1619 

1620 %s 

1621 

1622 """ 

1623 if (not isinstance(limits,tuple)) and isinstance(limits,float): 

1624 limits = (limits, limits) 

1625 if relative: 

1626 return trimr(a,limits=limits,inclusive=inclusive,axis=axis).mean(axis=axis) 

1627 else: 

1628 return trima(a,limits=limits,inclusive=inclusive).mean(axis=axis) 

1629 

1630 

1631if trimmed_mean.__doc__: 

1632 trimmed_mean.__doc__ = trimmed_mean.__doc__ % trimdoc 

1633 

1634 

1635def trimmed_var(a, limits=(0.1,0.1), inclusive=(1,1), relative=True, 

1636 axis=None, ddof=0): 

1637 """Returns the trimmed variance of the data along the given axis. 

1638 

1639 %s 

1640 ddof : {0,integer}, optional 

1641 Means Delta Degrees of Freedom. The denominator used during computations 

1642 is (n-ddof). DDOF=0 corresponds to a biased estimate, DDOF=1 to an un- 

1643 biased estimate of the variance. 

1644 

1645 """ 

1646 if (not isinstance(limits,tuple)) and isinstance(limits,float): 

1647 limits = (limits, limits) 

1648 if relative: 

1649 out = trimr(a,limits=limits, inclusive=inclusive,axis=axis) 

1650 else: 

1651 out = trima(a,limits=limits,inclusive=inclusive) 

1652 

1653 return out.var(axis=axis, ddof=ddof) 

1654 

1655 

1656if trimmed_var.__doc__: 

1657 trimmed_var.__doc__ = trimmed_var.__doc__ % trimdoc 

1658 

1659 

1660def trimmed_std(a, limits=(0.1,0.1), inclusive=(1,1), relative=True, 

1661 axis=None, ddof=0): 

1662 """Returns the trimmed standard deviation of the data along the given axis. 

1663 

1664 %s 

1665 ddof : {0,integer}, optional 

1666 Means Delta Degrees of Freedom. The denominator used during computations 

1667 is (n-ddof). DDOF=0 corresponds to a biased estimate, DDOF=1 to an un- 

1668 biased estimate of the variance. 

1669 

1670 """ 

1671 if (not isinstance(limits,tuple)) and isinstance(limits,float): 

1672 limits = (limits, limits) 

1673 if relative: 

1674 out = trimr(a,limits=limits,inclusive=inclusive,axis=axis) 

1675 else: 

1676 out = trima(a,limits=limits,inclusive=inclusive) 

1677 return out.std(axis=axis,ddof=ddof) 

1678 

1679 

1680if trimmed_std.__doc__: 

1681 trimmed_std.__doc__ = trimmed_std.__doc__ % trimdoc 

1682 

1683 

1684def trimmed_stde(a, limits=(0.1,0.1), inclusive=(1,1), axis=None): 

1685 """ 

1686 Returns the standard error of the trimmed mean along the given axis. 

1687 

1688 Parameters 

1689 ---------- 

1690 a : sequence 

1691 Input array 

1692 limits : {(0.1,0.1), tuple of float}, optional 

1693 tuple (lower percentage, upper percentage) to cut on each side of the 

1694 array, with respect to the number of unmasked data. 

1695 

1696 If n is the number of unmasked data before trimming, the values 

1697 smaller than ``n * limits[0]`` and the values larger than 

1698 ``n * `limits[1]`` are masked, and the total number of unmasked 

1699 data after trimming is ``n * (1.-sum(limits))``. In each case, 

1700 the value of one limit can be set to None to indicate an open interval. 

1701 If `limits` is None, no trimming is performed. 

1702 inclusive : {(bool, bool) tuple} optional 

1703 Tuple indicating whether the number of data being masked on each side 

1704 should be rounded (True) or truncated (False). 

1705 axis : int, optional 

1706 Axis along which to trim. 

1707 

1708 Returns 

1709 ------- 

1710 trimmed_stde : scalar or ndarray 

1711 

1712 """ 

1713 def _trimmed_stde_1D(a, low_limit, up_limit, low_inclusive, up_inclusive): 

1714 "Returns the standard error of the trimmed mean for a 1D input data." 

1715 n = a.count() 

1716 idx = a.argsort() 

1717 if low_limit: 

1718 if low_inclusive: 

1719 lowidx = int(low_limit*n) 

1720 else: 

1721 lowidx = int(np.round(low_limit*n))  # slices need integer indices

1722 a[idx[:lowidx]] = masked 

1723 if up_limit is not None: 

1724 if up_inclusive: 

1725 upidx = n - int(n*up_limit) 

1726 else: 

1727 upidx = n - int(np.round(n*up_limit))  # slices need integer indices

1728 a[idx[upidx:]] = masked 

1729 a[idx[:lowidx]] = a[idx[lowidx]] 

1730 a[idx[upidx:]] = a[idx[upidx-1]] 

1731 winstd = a.std(ddof=1) 

1732 return winstd / ((1 - (low_limit or 0.) - (up_limit or 0.))*np.sqrt(len(a)))

1733 

1734 a = ma.array(a, copy=True, subok=True) 

1735 a.unshare_mask() 

1736 if limits is None: 

1737 return a.std(axis=axis,ddof=1)/ma.sqrt(a.count(axis)) 

1738 if (not isinstance(limits,tuple)) and isinstance(limits,float): 

1739 limits = (limits, limits) 

1740 

1741 # Check the limits 

1742 (lolim, uplim) = limits 

1743 errmsg = "The proportion to cut from the %s should be between 0. and 1." 

1744 if lolim is not None: 

1745 if lolim > 1. or lolim < 0: 

1746 raise ValueError(errmsg % 'beginning' + " (got %s)" % lolim)

1747 if uplim is not None: 

1748 if uplim > 1. or uplim < 0: 

1749 raise ValueError(errmsg % 'end' + " (got %s)" % uplim)

1750 

1751 (loinc, upinc) = inclusive 

1752 if (axis is None): 

1753 return _trimmed_stde_1D(a.ravel(),lolim,uplim,loinc,upinc) 

1754 else: 

1755 if a.ndim > 2: 

1756 raise ValueError("Array 'a' must be at most two dimensional, but got a.ndim = %d" % a.ndim) 

1757 return ma.apply_along_axis(_trimmed_stde_1D, axis, a, 

1758 lolim,uplim,loinc,upinc) 
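
As an illustrative sketch of the winsorized computation above: trimming 10%
from each tail of the integers 1..10 pins the extremes to 2 and 9 before the
ddof=1 standard deviation is taken, giving a standard error of about 1.07:

    from scipy.stats import mstats

    x = list(range(1, 11))
    print(mstats.trimmed_stde(x, limits=0.1))  # about 1.074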

1759 

1760 

1761def _mask_to_limits(a, limits, inclusive): 

1762 """Mask an array for values outside of given limits. 

1763 

1764 This is primarily a utility function. 

1765 

1766 Parameters 

1767 ---------- 

1768 a : array 

1769 limits : (float or None, float or None) 

1770 A tuple consisting of the (lower limit, upper limit). Values in the 

1771 input array less than the lower limit or greater than the upper limit 

1772 will be masked out. None implies no limit. 

1773 inclusive : (bool, bool) 

1774 A tuple consisting of the (lower flag, upper flag). These flags 

1775 determine whether values exactly equal to lower or upper are allowed. 

1776 

1777 Returns 

1778 ------- 

1779 A MaskedArray. 

1780 

1781 Raises 

1782 ------ 

1783 A ValueError if there are no values within the given limits. 

1784 """ 

1785 lower_limit, upper_limit = limits 

1786 lower_include, upper_include = inclusive 

1787 am = ma.MaskedArray(a) 

1788 if lower_limit is not None: 

1789 if lower_include: 

1790 am = ma.masked_less(am, lower_limit) 

1791 else: 

1792 am = ma.masked_less_equal(am, lower_limit) 

1793 

1794 if upper_limit is not None: 

1795 if upper_include: 

1796 am = ma.masked_greater(am, upper_limit) 

1797 else: 

1798 am = ma.masked_greater_equal(am, upper_limit) 

1799 

1800 if am.count() == 0: 

1801 raise ValueError("No array values within given limits") 

1802 

1803 return am 

1804 

1805 

1806def tmean(a, limits=None, inclusive=(True, True), axis=None): 

1807 """ 

1808 Compute the trimmed mean. 

1809 

1810 Parameters 

1811 ---------- 

1812 a : array_like 

1813 Array of values. 

1814 limits : None or (lower limit, upper limit), optional 

1815 Values in the input array less than the lower limit or greater than the 

1816 upper limit will be ignored. When limits is None (default), then all 

1817 values are used. Either of the limit values in the tuple can also be 

1818 None representing a half-open interval. 

1819 inclusive : (bool, bool), optional 

1820 A tuple consisting of the (lower flag, upper flag). These flags 

1821 determine whether values exactly equal to the lower or upper limits 

1822 are included. The default value is (True, True). 

1823 axis : int or None, optional 

1824 Axis along which to operate. If None, compute over the 

1825 whole array. Default is None. 

1826 

1827 Returns 

1828 ------- 

1829 tmean : float 

1830 

1831 Notes 

1832 ----- 

1833 For more details on `tmean`, see `stats.tmean`. 

1834 

1835 Examples 

1836 -------- 

1837 >>> from scipy.stats import mstats 

1838 >>> a = np.array([[6, 8, 3, 0], 

1839 ... [3, 9, 1, 2], 

1840 ... [8, 7, 8, 2], 

1841 ... [5, 6, 0, 2], 

1842 ... [4, 5, 5, 2]]) 

1843 ... 

1844 ... 

1845 >>> mstats.tmean(a, (2,5)) 

1846 3.3 

1847 >>> mstats.tmean(a, (2,5), axis=0) 

1848 masked_array(data=[4.0, 5.0, 4.0, 2.0], 

1849 mask=[False, False, False, False], 

1850 fill_value=1e+20) 

1851 

1852 """ 

1853 return trima(a, limits=limits, inclusive=inclusive).mean(axis=axis) 

1854 

1855 

1856def tvar(a, limits=None, inclusive=(True, True), axis=0, ddof=1): 

1857 """ 

1858 Compute the trimmed variance 

1859 

1860 This function computes the sample variance of an array of values, 

1861 while ignoring values which are outside of given `limits`. 

1862 

1863 Parameters 

1864 ---------- 

1865 a : array_like 

1866 Array of values. 

1867 limits : None or (lower limit, upper limit), optional 

1868 Values in the input array less than the lower limit or greater than the 

1869 upper limit will be ignored. When limits is None, then all values are 

1870 used. Either of the limit values in the tuple can also be None 

1871 representing a half-open interval. The default value is None. 

1872 inclusive : (bool, bool), optional 

1873 A tuple consisting of the (lower flag, upper flag). These flags 

1874 determine whether values exactly equal to the lower or upper limits 

1875 are included. The default value is (True, True). 

1876 axis : int or None, optional 

1877 Axis along which to operate. If None, compute over the 

1878 whole array. Default is zero. 

1879 ddof : int, optional 

1880 Delta degrees of freedom. Default is 1. 

1881 

1882 Returns 

1883 ------- 

1884 tvar : float 

1885 Trimmed variance. 

1886 

1887 Notes 

1888 ----- 

1889 For more details on `tvar`, see `stats.tvar`. 

1890 

1891 """ 

1892 a = ma.asarray(a).astype(float).ravel()

1893 if limits is None:

1894 # No trimming: np.ma.var honors the mask and the requested `ddof`

1895 return np.ma.var(a, ddof=ddof)

1896 am = _mask_to_limits(a, limits=limits, inclusive=inclusive) 

1897 

1898 return np.ma.var(am, axis=axis, ddof=ddof) 
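
A minimal check (sketch): with limits=(2, 4) only the values 2, 3 and 4 of a
1..5 sample survive, and their ddof=1 variance is exactly 1:

    import numpy as np
    from scipy.stats import mstats

    x = np.array([1., 2., 3., 4., 5.])
    print(mstats.tvar(x, (2, 4)))  # 1.0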

1899 

1900 

1901def tmin(a, lowerlimit=None, axis=0, inclusive=True): 

1902 """ 

1903 Compute the trimmed minimum 

1904 

1905 Parameters 

1906 ---------- 

1907 a : array_like 

1908 array of values 

1909 lowerlimit : None or float, optional 

1910 Values in the input array less than the given limit will be ignored. 

1911 When lowerlimit is None, then all values are used. The default value 

1912 is None. 

1913 axis : int or None, optional 

1914 Axis along which to operate. Default is 0. If None, compute over the 

1915 whole array `a`. 

1916 inclusive : {True, False}, optional 

1917 This flag determines whether values exactly equal to the lower limit 

1918 are included. The default value is True. 

1919 

1920 Returns 

1921 ------- 

1922 tmin : float, int or ndarray 

1923 

1924 Notes 

1925 ----- 

1926 For more details on `tmin`, see `stats.tmin`. 

1927 

1928 Examples 

1929 -------- 

1930 >>> from scipy.stats import mstats 

1931 >>> a = np.array([[6, 8, 3, 0], 

1932 ... [3, 2, 1, 2], 

1933 ... [8, 1, 8, 2], 

1934 ... [5, 3, 0, 2], 

1935 ... [4, 7, 5, 2]]) 

1936 ... 

1937 >>> mstats.tmin(a, 5) 

1938 masked_array(data=[5, 7, 5, --], 

1939 mask=[False, False, False, True], 

1940 fill_value=999999) 

1941 

1942 """ 

1943 a, axis = _chk_asarray(a, axis) 

1944 am = trima(a, (lowerlimit, None), (inclusive, False)) 

1945 return ma.minimum.reduce(am, axis) 

1946 

1947 

1948def tmax(a, upperlimit=None, axis=0, inclusive=True): 

1949 """ 

1950 Compute the trimmed maximum 

1951 

1952 This function computes the maximum value of an array along a given axis, 

1953 while ignoring values larger than a specified upper limit. 

1954 

1955 Parameters 

1956 ---------- 

1957 a : array_like 

1958 array of values 

1959 upperlimit : None or float, optional 

1960 Values in the input array greater than the given limit will be ignored. 

1961 When upperlimit is None, then all values are used. The default value 

1962 is None. 

1963 axis : int or None, optional 

1964 Axis along which to operate. Default is 0. If None, compute over the 

1965 whole array `a`. 

1966 inclusive : {True, False}, optional 

1967 This flag determines whether values exactly equal to the upper limit 

1968 are included. The default value is True. 

1969 

1970 Returns 

1971 ------- 

1972 tmax : float, int or ndarray 

1973 

1974 Notes 

1975 ----- 

1976 For more details on `tmax`, see `stats.tmax`. 

1977 

1978 Examples 

1979 -------- 

1980 >>> from scipy.stats import mstats 

1981 >>> a = np.array([[6, 8, 3, 0], 

1982 ... [3, 9, 1, 2], 

1983 ... [8, 7, 8, 2], 

1984 ... [5, 6, 0, 2], 

1985 ... [4, 5, 5, 2]]) 

1986 ... 

1987 ... 

1988 >>> mstats.tmax(a, 4) 

1989 masked_array(data=[4, --, 3, 2], 

1990 mask=[False, True, False, False], 

1991 fill_value=999999) 

1992 

1993 """ 

1994 a, axis = _chk_asarray(a, axis) 

1995 am = trima(a, (None, upperlimit), (False, inclusive)) 

1996 return ma.maximum.reduce(am, axis) 

1997 

1998 

1999def tsem(a, limits=None, inclusive=(True, True), axis=0, ddof=1): 

2000 """ 

2001 Compute the trimmed standard error of the mean. 

2002 

2003 This function finds the standard error of the mean for given 

2004 values, ignoring values outside the given `limits`. 

2005 

2006 Parameters 

2007 ---------- 

2008 a : array_like 

2009 array of values 

2010 limits : None or (lower limit, upper limit), optional 

2011 Values in the input array less than the lower limit or greater than the 

2012 upper limit will be ignored. When limits is None, then all values are 

2013 used. Either of the limit values in the tuple can also be None 

2014 representing a half-open interval. The default value is None. 

2015 inclusive : (bool, bool), optional 

2016 A tuple consisting of the (lower flag, upper flag). These flags 

2017 determine whether values exactly equal to the lower or upper limits 

2018 are included. The default value is (True, True). 

2019 axis : int or None, optional 

2020 Axis along which to operate. If None, compute over the 

2021 whole array. Default is zero. 

2022 ddof : int, optional 

2023 Delta degrees of freedom. Default is 1. 

2024 

2025 Returns 

2026 ------- 

2027 tsem : float 

2028 

2029 Notes 

2030 ----- 

2031 For more details on `tsem`, see `stats.tsem`. 

2032 

2033 """ 

2034 a = ma.asarray(a).ravel() 

2035 if limits is None: 

2036 n = float(a.count()) 

2037 return a.std(axis=axis, ddof=ddof)/ma.sqrt(n) 

2038 

2039 am = trima(a, limits, inclusive)  # `a` was already raveled above

2040 sd = np.sqrt(am.var(axis=axis, ddof=ddof)) 

2041 return sd / np.sqrt(am.count()) 
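
Continuing the same toy sample (sketch): the trimmed standard error is the
ddof=1 standard deviation of the surviving values divided by the square root
of their count:

    import numpy as np
    from scipy.stats import mstats

    x = np.array([1., 2., 3., 4., 5.])
    print(mstats.tsem(x, limits=(2, 4)))  # 1/sqrt(3), about 0.577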

2042 

2043 

2044def winsorize(a, limits=None, inclusive=(True, True), inplace=False, 

2045 axis=None, nan_policy='propagate'): 

2046 """Returns a Winsorized version of the input array. 

2047 

2048 The lowest ``limits[0]`` fraction of the values is set to the ``limits[0]``th

2049 quantile, and the highest ``limits[1]`` fraction is set to the

2050 ``(1 - limits[1])``th quantile.

2051 Masked values are skipped. 

2052 

2053 

2054 Parameters 

2055 ---------- 

2056 a : sequence 

2057 Input array. 

2058 limits : {None, tuple of float}, optional 

2059 Tuple of the percentages to cut on each side of the array, with respect 

2060 to the number of unmasked data, as floats between 0. and 1. 

2061 Noting n the number of unmasked data before trimming, the 

2062 (n*limits[0])th smallest data and the (n*limits[1])th largest data are 

2063 masked, and the total number of unmasked data after trimming 

2064 is n*(1.-sum(limits)) The value of one limit can be set to None to 

2065 indicate an open interval. 

2066 inclusive : {(True, True) tuple}, optional 

2067 Tuple indicating whether the number of data being masked on each side 

2068 should be truncated (True) or rounded (False). 

2069 inplace : {False, True}, optional 

2070 Whether to winsorize in place (True) or to use a copy (False) 

2071 axis : {None, int}, optional 

2072 Axis along which to trim. If None, the whole array is trimmed, but its 

2073 shape is maintained. 

2074 nan_policy : {'propagate', 'raise', 'omit'}, optional 

2075 Defines how to handle when input contains nan. 

2076 The following options are available (default is 'propagate'): 

2077 

2078 * 'propagate': allows nan values and may overwrite or propagate them 

2079 * 'raise': throws an error 

2080 * 'omit': performs the calculations ignoring nan values 

2081 

2082 Notes 

2083 ----- 

2084 This function is applied to reduce the effect of possibly spurious outliers 

2085 by limiting the extreme values. 

2086 

2087 Examples 

2088 -------- 

2089 >>> from scipy.stats.mstats import winsorize 

2090 

2091 A shuffled array contains integers from 1 to 10. 

2092 

2093 >>> a = np.array([10, 4, 9, 8, 5, 3, 7, 2, 1, 6]) 

2094 

2095 The lowest 10% of the values (i.e., `1`) and the highest 20% of the

2096 values (i.e., `9` and `10`) are replaced.

2097 

2098 >>> winsorize(a, limits=[0.1, 0.2]) 

2099 masked_array(data=[8, 4, 8, 8, 5, 3, 7, 2, 2, 6], 

2100 mask=False, 

2101 fill_value=999999) 

2102 

2103 """ 

2104 def _winsorize1D(a, low_limit, up_limit, low_include, up_include, 

2105 contains_nan, nan_policy): 

2106 n = a.count() 

2107 idx = a.argsort() 

2108 if contains_nan: 

2109 nan_count = np.count_nonzero(np.isnan(a)) 

2110 if low_limit: 

2111 if low_include: 

2112 lowidx = int(low_limit * n) 

2113 else: 

2114 lowidx = np.round(low_limit * n).astype(int) 

2115 if contains_nan and nan_policy == 'omit': 

2116 lowidx = min(lowidx, n-nan_count-1) 

2117 a[idx[:lowidx]] = a[idx[lowidx]] 

2118 if up_limit is not None: 

2119 if up_include: 

2120 upidx = n - int(n * up_limit) 

2121 else: 

2122 upidx = n - np.round(n * up_limit).astype(int) 

2123 if contains_nan and nan_policy == 'omit': 

2124 a[idx[upidx:-nan_count]] = a[idx[upidx - 1]] 

2125 else: 

2126 a[idx[upidx:]] = a[idx[upidx - 1]] 

2127 return a 

2128 

2129 contains_nan, nan_policy = scipy.stats.stats._contains_nan(a, nan_policy) 

2130 # We are going to modify a: better make a copy 

2131 a = ma.array(a, copy=np.logical_not(inplace)) 

2132 

2133 if limits is None: 

2134 return a 

2135 if (not isinstance(limits, tuple)) and isinstance(limits, float): 

2136 limits = (limits, limits) 

2137 

2138 # Check the limits 

2139 (lolim, uplim) = limits 

2140 errmsg = "The proportion to cut from the %s should be between 0. and 1." 

2141 if lolim is not None: 

2142 if lolim > 1. or lolim < 0: 

2143 raise ValueError(errmsg % 'beginning' + " (got %s)" % lolim)

2144 if uplim is not None: 

2145 if uplim > 1. or uplim < 0: 

2146 raise ValueError(errmsg % 'end' + " (got %s)" % uplim)

2147 

2148 (loinc, upinc) = inclusive 

2149 

2150 if axis is None: 

2151 shp = a.shape 

2152 return _winsorize1D(a.ravel(), lolim, uplim, loinc, upinc, 

2153 contains_nan, nan_policy).reshape(shp) 

2154 else: 

2155 return ma.apply_along_axis(_winsorize1D, axis, a, lolim, uplim, loinc, 

2156 upinc, contains_nan, nan_policy) 
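
One-sided limits also work (sketch): with limits=(None, 0.2) only the top 20%
is clipped, here the two largest of ten values, which are replaced by the
third largest:

    import numpy as np
    from scipy.stats.mstats import winsorize

    a = np.arange(10)
    print(winsorize(a, limits=(None, 0.2)))  # [0 1 2 3 4 5 6 7 7 7]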

2157 

2158 

2159def moment(a, moment=1, axis=0): 

2160 """ 

2161 Calculates the nth moment about the mean for a sample. 

2162 

2163 Parameters 

2164 ---------- 

2165 a : array_like 

2166 data 

2167 moment : int, optional 

2168 order of central moment that is returned 

2169 axis : int or None, optional 

2170 Axis along which the central moment is computed. Default is 0. 

2171 If None, compute over the whole array `a`. 

2172 

2173 Returns 

2174 ------- 

2175 n-th central moment : ndarray or float 

2176 The appropriate moment along the given axis or over all values if axis 

2177 is None. The denominator for the moment calculation is the number of 

2178 observations, no degrees of freedom correction is done. 

2179 

2180 Notes 

2181 ----- 

2182 For more details about `moment`, see `stats.moment`. 

2183 

2184 """ 

2185 a, axis = _chk_asarray(a, axis) 

2186 if moment == 1: 

2187 # By definition the first moment about the mean is 0. 

2188 shape = list(a.shape) 

2189 del shape[axis] 

2190 if shape: 

2191 # return an actual array of the appropriate shape 

2192 return np.zeros(shape, dtype=float) 

2193 else: 

2194 # the input was 1D, so return a scalar instead of a rank-0 array 

2195 return np.float64(0.0) 

2196 else: 

2197 # Exponentiation by squares: form exponent sequence 

2198 n_list = [moment] 

2199 current_n = moment 

2200 while current_n > 2: 

2201 if current_n % 2: 

2202 current_n = (current_n-1)//2

2203 else:

2204 current_n //= 2

2205 n_list.append(current_n) 

2206 

2207 # Starting point for exponentiation by squares 

2208 a_zero_mean = a - ma.expand_dims(a.mean(axis), axis) 

2209 if n_list[-1] == 1: 

2210 s = a_zero_mean.copy() 

2211 else: 

2212 s = a_zero_mean**2 

2213 

2214 # Perform multiplications 

2215 for n in n_list[-2::-1]: 

2216 s = s**2 

2217 if n % 2: 

2218 s *= a_zero_mean 

2219 return s.mean(axis) 
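
The exponentiation-by-squares loop above computes (x - mean)**moment with a
logarithmic number of multiplications. A quick check (sketch): the second
central moment is the biased variance, and masked entries are ignored:

    import numpy as np
    from scipy.stats import mstats

    x = np.ma.array([1, 2, 3, 4, 99], mask=[0, 0, 0, 0, 1])
    print(mstats.moment(x, moment=2))  # 1.25, the ddof=0 variance of 1..4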

2220 

2221 

2222def variation(a, axis=0): 

2223 """ 

2224 Computes the coefficient of variation, the ratio of the biased standard 

2225 deviation to the mean. 

2226 

2227 Parameters 

2228 ---------- 

2229 a : array_like 

2230 Input array. 

2231 axis : int or None, optional 

2232 Axis along which to calculate the coefficient of variation. Default 

2233 is 0. If None, compute over the whole array `a`. 

2234 

2235 Returns 

2236 ------- 

2237 variation : ndarray 

2238 The calculated variation along the requested axis. 

2239  

2240 Notes 

2241 ----- 

2242 For more details about `variation`, see `stats.variation`. 

2243  

2244 Examples 

2245 -------- 

2246 >>> from scipy.stats.mstats import variation 

2247 >>> a = np.array([2,8,4]) 

2248 >>> variation(a) 

2249 0.5345224838248487 

2250 >>> b = np.array([2,8,3,4]) 

2251 >>> c = np.ma.masked_array(b, mask=[0,0,1,0]) 

2252 >>> variation(c) 

2253 0.5345224838248487 

2254 

2255 As the example above shows, this works the same as

2256 `stats.variation` except that `stats.mstats.variation` ignores masked

2257 array elements.

2258 

2259 """ 

2260 a, axis = _chk_asarray(a, axis) 

2261 return a.std(axis)/a.mean(axis) 

2262 

2263 

2264def skew(a, axis=0, bias=True): 

2265 """ 

2266 Computes the skewness of a data set. 

2267 

2268 Parameters 

2269 ---------- 

2270 a : ndarray 

2271 data 

2272 axis : int or None, optional 

2273 Axis along which skewness is calculated. Default is 0. 

2274 If None, compute over the whole array `a`. 

2275 bias : bool, optional 

2276 If False, then the calculations are corrected for statistical bias. 

2277 

2278 Returns 

2279 ------- 

2280 skewness : ndarray 

2281 The skewness of values along an axis, returning 0 where all values are 

2282 equal. 

2283 

2284 Notes 

2285 ----- 

2286 For more details about `skew`, see `stats.skew`. 

2287 

2288 """ 

2289 a, axis = _chk_asarray(a,axis) 

2290 n = a.count(axis) 

2291 m2 = moment(a, 2, axis) 

2292 m3 = moment(a, 3, axis) 

2293 with np.errstate(all='ignore'): 

2294 vals = ma.where(m2 == 0, 0, m3 / m2**1.5) 

2295 

2296 if not bias: 

2297 can_correct = (n > 2) & (m2 > 0) 

2298 if can_correct.any(): 

2299 m2 = np.extract(can_correct, m2) 

2300 m3 = np.extract(can_correct, m3) 

2301 nval = ma.sqrt((n-1.0)*n)/(n-2.0)*m3/m2**1.5 

2302 np.place(vals, can_correct, nval) 

2303 return vals 
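
A smoke test (sketch): a symmetric sample has zero third central moment,
hence zero skewness:

    from scipy.stats import mstats

    print(mstats.skew([1, 2, 3, 4]))  # 0.0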

2304 

2305 

2306def kurtosis(a, axis=0, fisher=True, bias=True): 

2307 """ 

2308 Computes the kurtosis (Fisher or Pearson) of a dataset. 

2309 

2310 Kurtosis is the fourth central moment divided by the square of the 

2311 variance. If Fisher's definition is used, then 3.0 is subtracted from 

2312 the result to give 0.0 for a normal distribution. 

2313 

2314 If bias is False then the kurtosis is calculated using k statistics to 

2315 eliminate bias coming from biased moment estimators 

2316 

2317 Use `kurtosistest` to see if result is close enough to normal. 

2318 

2319 Parameters 

2320 ---------- 

2321 a : array 

2322 data for which the kurtosis is calculated 

2323 axis : int or None, optional 

2324 Axis along which the kurtosis is calculated. Default is 0. 

2325 If None, compute over the whole array `a`. 

2326 fisher : bool, optional 

2327 If True, Fisher's definition is used (normal ==> 0.0). If False, 

2328 Pearson's definition is used (normal ==> 3.0). 

2329 bias : bool, optional 

2330 If False, then the calculations are corrected for statistical bias. 

2331 

2332 Returns 

2333 ------- 

2334 kurtosis : array 

2335 The kurtosis of values along an axis. If all values are equal, 

2336 return -3 for Fisher's definition and 0 for Pearson's definition. 

2337 

2338 Notes 

2339 ----- 

2340 For more details about `kurtosis`, see `stats.kurtosis`. 

2341 

2342 """ 

2343 a, axis = _chk_asarray(a, axis) 

2344 m2 = moment(a, 2, axis) 

2345 m4 = moment(a, 4, axis) 

2346 with np.errstate(all='ignore'): 

2347 vals = ma.where(m2 == 0, 0, m4 / m2**2.0) 

2348 

2349 if not bias: 

2350 n = a.count(axis) 

2351 can_correct = (n > 3) & (m2 is not ma.masked and m2 > 0) 

2352 if can_correct.any(): 

2353 n = np.extract(can_correct, n) 

2354 m2 = np.extract(can_correct, m2) 

2355 m4 = np.extract(can_correct, m4) 

2356 nval = 1.0/(n-2)/(n-3)*((n*n-1.0)*m4/m2**2.0-3*(n-1)**2.0) 

2357 np.place(vals, can_correct, nval+3.0) 

2358 if fisher: 

2359 return vals - 3 

2360 else: 

2361 return vals 
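
A hand-checkable case (sketch): for a symmetric two-point sample,
m4/m2**2 equals 1, its smallest possible value, so the Fisher kurtosis is -2:

    from scipy.stats import mstats

    print(mstats.kurtosis([1, 1, 9, 9]))  # -2.0 (1.0 with fisher=False)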

2362 

2363 

2364DescribeResult = namedtuple('DescribeResult', ('nobs', 'minmax', 'mean', 

2365 'variance', 'skewness', 

2366 'kurtosis')) 

2367 

2368 

2369def describe(a, axis=0, ddof=0, bias=True): 

2370 """ 

2371 Computes several descriptive statistics of the passed array. 

2372 

2373 Parameters 

2374 ---------- 

2375 a : array_like 

2376 Data array 

2377 axis : int or None, optional 

2378 Axis along which to calculate statistics. Default 0. If None, 

2379 compute over the whole array `a`. 

2380 ddof : int, optional 

2381 degree of freedom (default 0); note that default ddof is different 

2382 from the same routine in stats.describe 

2383 bias : bool, optional 

2384 If False, then the skewness and kurtosis calculations are corrected for 

2385 statistical bias. 

2386 

2387 Returns 

2388 ------- 

2389 nobs : int 

2390 size of the data (discarding missing values)

2391 

2392 minmax : (int, int) 

2393 min, max 

2394 

2395 mean : float 

2396 arithmetic mean 

2397 

2398 variance : float 

2399 unbiased variance 

2400 

2401 skewness : float 

2402 biased skewness 

2403 

2404 kurtosis : float 

2405 biased kurtosis 

2406 

2407 Examples 

2408 -------- 

2409 >>> from scipy.stats.mstats import describe 

2410 >>> ma = np.ma.array(range(6), mask=[0, 0, 0, 1, 1, 1]) 

2411 >>> describe(ma) 

2412 DescribeResult(nobs=3, minmax=(masked_array(data=0, 

2413 mask=False, 

2414 fill_value=999999), masked_array(data=2, 

2415 mask=False, 

2416 fill_value=999999)), mean=1.0, variance=0.6666666666666666, 

2417 skewness=masked_array(data=0., mask=False, fill_value=1e+20), 

2418 kurtosis=-1.5) 

2419 

2420 """ 

2421 a, axis = _chk_asarray(a, axis) 

2422 n = a.count(axis) 

2423 mm = (ma.minimum.reduce(a, axis), ma.maximum.reduce(a, axis))

2424 m = a.mean(axis) 

2425 v = a.var(axis, ddof=ddof) 

2426 sk = skew(a, axis, bias=bias) 

2427 kurt = kurtosis(a, axis, bias=bias) 

2428 

2429 return DescribeResult(n, mm, m, v, sk, kurt) 

2430 

2431 

2432def stde_median(data, axis=None): 

2433 """Returns the McKean-Schrader estimate of the standard error of the sample 

2434 median along the given axis. Masked values are discarded.

2435 

2436 Parameters 

2437 ---------- 

2438 data : ndarray

2439 Input data.

2440 axis : {None,int}, optional

2441 Axis along which to compute the estimate.

2442 If None, the input array is first flattened.

2443 

2444 """ 

2445 def _stdemed_1D(data): 

2446 data = np.sort(data.compressed()) 

2447 n = len(data) 

2448 z = 2.5758293035489004 

2449 k = int(np.round((n+1)/2. - z * np.sqrt(n/4.),0)) 

2450 return ((data[n-k] - data[k-1])/(2.*z)) 

2451 

2452 data = ma.array(data, copy=False, subok=True) 

2453 if (axis is None): 

2454 return _stdemed_1D(data) 

2455 else: 

2456 if data.ndim > 2: 

2457 raise ValueError("Array 'data' must be at most two dimensional, " 

2458 "but got data.ndim = %d" % data.ndim) 

2459 return ma.apply_along_axis(_stdemed_1D, axis, data) 

2460 

2461 

2462SkewtestResult = namedtuple('SkewtestResult', ('statistic', 'pvalue')) 

2463 

2464 

2465def skewtest(a, axis=0): 

2466 """ 

2467 Tests whether the skew is different from the normal distribution. 

2468 

2469 Parameters 

2470 ---------- 

2471 a : array 

2472 The data to be tested 

2473 axis : int or None, optional 

2474 Axis along which statistics are calculated. Default is 0. 

2475 If None, compute over the whole array `a`. 

2476 

2477 Returns 

2478 ------- 

2479 statistic : float 

2480 The computed z-score for this test. 

2481 pvalue : float 

2482 The 2-sided p-value for the hypothesis test.

2483 

2484 Notes 

2485 ----- 

2486 For more details about `skewtest`, see `stats.skewtest`. 

2487 

2488 """ 

2489 a, axis = _chk_asarray(a, axis) 

2490 if axis is None: 

2491 a = a.ravel() 

2492 axis = 0 

2493 b2 = skew(a,axis) 

2494 n = a.count(axis) 

2495 if np.min(n) < 8: 

2496 raise ValueError( 

2497 "skewtest is not valid with less than 8 samples; %i samples" 

2498 " were given." % np.min(n)) 

2499 

2500 y = b2 * ma.sqrt(((n+1)*(n+3)) / (6.0*(n-2))) 

2501 beta2 = (3.0*(n*n+27*n-70)*(n+1)*(n+3)) / ((n-2.0)*(n+5)*(n+7)*(n+9)) 

2502 W2 = -1 + ma.sqrt(2*(beta2-1)) 

2503 delta = 1/ma.sqrt(0.5*ma.log(W2)) 

2504 alpha = ma.sqrt(2.0/(W2-1)) 

2505 y = ma.where(y == 0, 1, y) 

2506 Z = delta*ma.log(y/alpha + ma.sqrt((y/alpha)**2+1)) 

2507 

2508 return SkewtestResult(Z, 2 * distributions.norm.sf(np.abs(Z))) 

2509 

2510 

2511KurtosistestResult = namedtuple('KurtosistestResult', ('statistic', 'pvalue')) 

2512 

2513 

2514def kurtosistest(a, axis=0): 

2515 """ 

2516 Tests whether a dataset has normal kurtosis 

2517 

2518 Parameters 

2519 ---------- 

2520 a : array 

2521 array of the sample data 

2522 axis : int or None, optional 

2523 Axis along which to compute test. Default is 0. If None, 

2524 compute over the whole array `a`. 

2525 

2526 Returns 

2527 ------- 

2528 statistic : float 

2529 The computed z-score for this test. 

2530 pvalue : float 

2531 The 2-sided p-value for the hypothesis test 

2532 

2533 Notes 

2534 ----- 

2535 For more details about `kurtosistest`, see `stats.kurtosistest`. 

2536 

2537 """ 

2538 a, axis = _chk_asarray(a, axis) 

2539 n = a.count(axis=axis) 

2540 if np.min(n) < 5: 

2541 raise ValueError( 

2542 "kurtosistest requires at least 5 observations; %i observations" 

2543 " were given." % np.min(n)) 

2544 if np.min(n) < 20: 

2545 warnings.warn( 

2546 "kurtosistest only valid for n>=20 ... continuing anyway, n=%i" % 

2547 np.min(n)) 

2548 

2549 b2 = kurtosis(a, axis, fisher=False) 

2550 E = 3.0*(n-1) / (n+1) 

2551 varb2 = 24.0*n*(n-2.)*(n-3) / ((n+1)*(n+1.)*(n+3)*(n+5)) 

2552 x = (b2-E)/ma.sqrt(varb2) 

2553 sqrtbeta1 = 6.0*(n*n-5*n+2)/((n+7)*(n+9)) * np.sqrt((6.0*(n+3)*(n+5)) / 

2554 (n*(n-2)*(n-3))) 

2555 A = 6.0 + 8.0/sqrtbeta1 * (2.0/sqrtbeta1 + np.sqrt(1+4.0/(sqrtbeta1**2))) 

2556 term1 = 1 - 2./(9.0*A) 

2557 denom = 1 + x*ma.sqrt(2/(A-4.0)) 

2558 if np.ma.isMaskedArray(denom): 

2559 # For multi-dimensional array input 

2560 denom[denom == 0.0] = masked 

2561 elif denom == 0.0: 

2562 denom = masked 

2563 

2564 term2 = np.ma.where(denom > 0, ma.power((1-2.0/A)/denom, 1/3.0), 

2565 -ma.power(-(1-2.0/A)/denom, 1/3.0)) 

2566 Z = (term1 - term2) / np.sqrt(2/(9.0*A)) 

2567 

2568 return KurtosistestResult(Z, 2 * distributions.norm.sf(np.abs(Z))) 

2569 

2570 

2571NormaltestResult = namedtuple('NormaltestResult', ('statistic', 'pvalue')) 

2572 

2573 

2574def normaltest(a, axis=0): 

2575 """ 

2576 Tests whether a sample differs from a normal distribution. 

2577 

2578 Parameters 

2579 ---------- 

2580 a : array_like 

2581 The array containing the data to be tested. 

2582 axis : int or None, optional 

2583 Axis along which to compute test. Default is 0. If None, 

2584 compute over the whole array `a`. 

2585 

2586 Returns 

2587 ------- 

2588 statistic : float or array 

2589 ``s^2 + k^2``, where ``s`` is the z-score returned by `skewtest` and 

2590 ``k`` is the z-score returned by `kurtosistest`. 

2591 pvalue : float or array 

2592 A 2-sided chi squared probability for the hypothesis test. 

2593 

2594 Notes 

2595 ----- 

2596 For more details about `normaltest`, see `stats.normaltest`. 

2597 

2598 """ 

2599 a, axis = _chk_asarray(a, axis) 

2600 s, _ = skewtest(a, axis) 

2601 k, _ = kurtosistest(a, axis) 

2602 k2 = s*s + k*k 

2603 

2604 return NormaltestResult(k2, distributions.chi2.sf(k2, 2)) 
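
A usage sketch (the seed and sample size are arbitrary; assumes NumPy's
default_rng is available): for data actually drawn from a normal distribution
the p-value is usually large, so normality is not rejected:

    import numpy as np
    from scipy.stats import mstats

    rng = np.random.default_rng(1234)
    x = np.ma.array(rng.normal(size=200))
    stat, p = mstats.normaltest(x)
    # p is typically well above 0.05 here; a small p argues against normality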

2605 

2606 

2607def mquantiles(a, prob=list([.25,.5,.75]), alphap=.4, betap=.4, axis=None, 

2608 limit=()): 

2609 """ 

2610 Computes empirical quantiles for a data array. 

2611 

2612 Sample quantiles are defined by ``Q(p) = (1-gamma)*x[j] + gamma*x[j+1]``,

2613 where ``x[j]`` is the j-th order statistic, and gamma is a function of

2614 ``j = floor(n*p + m)`` and ``m = alphap + p*(1 - alphap - betap)``,

2615 with ``gamma = n*p + m - j``.

2616 

2617 Reinterpreting the above equations to compare to **R** leads to the

2618 equation: ``p(k) = (k - alphap)/(n + 1 - alphap - betap)``

2619 

2620 Typical values of (alphap,betap) are: 

2621 - (0,1) : ``p(k) = k/n`` : linear interpolation of cdf 

2622 (**R** type 4) 

2623 - (.5,.5) : ``p(k) = (k - 1/2.)/n`` : piecewise linear function 

2624 (**R** type 5) 

2625 - (0,0) : ``p(k) = k/(n+1)`` : 

2626 (**R** type 6) 

2627 - (1,1) : ``p(k) = (k-1)/(n-1)``: p(k) = mode[F(x[k])]. 

2628 (**R** type 7, **R** default) 

2629 - (1/3,1/3): ``p(k) = (k-1/3)/(n+1/3)``: Then p(k) ~ median[F(x[k])]. 

2630 The resulting quantile estimates are approximately median-unbiased 

2631 regardless of the distribution of x. 

2632 (**R** type 8) 

2633 - (3/8,3/8): ``p(k) = (k-3/8)/(n+1/4)``: Blom. 

2634 The resulting quantile estimates are approximately unbiased 

2635 if x is normally distributed 

2636 (**R** type 9) 

2637 - (.4,.4) : approximately quantile unbiased (Cunnane) 

2638 - (.35,.35): APL, used with PWM 

2639 

2640 Parameters 

2641 ---------- 

2642 a : array_like 

2643 Input data, as a sequence or array of dimension at most 2. 

2644 prob : array_like, optional 

2645 List of quantiles to compute. 

2646 alphap : float, optional 

2647 Plotting positions parameter, default is 0.4. 

2648 betap : float, optional 

2649 Plotting positions parameter, default is 0.4. 

2650 axis : int, optional 

2651 Axis along which to perform the trimming. 

2652 If None (default), the input array is first flattened. 

2653 limit : tuple, optional 

2654 Tuple of (lower, upper) values. 

2655 Values of `a` outside this open interval are ignored. 

2656 

2657 Returns 

2658 ------- 

2659 mquantiles : MaskedArray 

2660 An array containing the calculated quantiles. 

2661 

2662 Notes 

2663 ----- 

2664 This formulation is very similar to **R** except the calculation of 

2665 ``m`` from ``alphap`` and ``betap``, where in **R** ``m`` is defined 

2666 with each type. 

2667 

2668 References 

2669 ---------- 

2670 .. [1] *R* statistical software: https://www.r-project.org/ 

2671 .. [2] *R* ``quantile`` function: 

2672 http://stat.ethz.ch/R-manual/R-devel/library/stats/html/quantile.html 

2673 

2674 Examples 

2675 -------- 

2676 >>> from scipy.stats.mstats import mquantiles 

2677 >>> a = np.array([6., 47., 49., 15., 42., 41., 7., 39., 43., 40., 36.]) 

2678 >>> mquantiles(a) 

2679 array([ 19.2, 40. , 42.8]) 

2680 

2681 Using a 2D array, specifying axis and limit. 

2682 

2683 >>> data = np.array([[ 6., 7., 1.], 

2684 ... [ 47., 15., 2.], 

2685 ... [ 49., 36., 3.], 

2686 ... [ 15., 39., 4.], 

2687 ... [ 42., 40., -999.], 

2688 ... [ 41., 41., -999.], 

2689 ... [ 7., -999., -999.], 

2690 ... [ 39., -999., -999.], 

2691 ... [ 43., -999., -999.], 

2692 ... [ 40., -999., -999.], 

2693 ... [ 36., -999., -999.]]) 

2694 >>> print(mquantiles(data, axis=0, limit=(0, 50))) 

2695 [[19.2 14.6 1.45] 

2696 [40. 37.5 2.5 ] 

2697 [42.8 40.05 3.55]] 

2698 

2699 >>> data[:, 2] = -999. 

2700 >>> print(mquantiles(data, axis=0, limit=(0, 50))) 

2701 [[19.200000000000003 14.6 --] 

2702 [40.0 37.5 --] 

2703 [42.800000000000004 40.05 --]] 

2704 

2705 """ 

2706 def _quantiles1D(data,m,p): 

2707 x = np.sort(data.compressed()) 

2708 n = len(x) 

2709 if n == 0: 

2710 return ma.array(np.empty(len(p), dtype=float), mask=True) 

2711 elif n == 1: 

2712 return ma.array(np.resize(x, p.shape), mask=nomask) 

2713 aleph = (n*p + m) 

2714 k = np.floor(aleph.clip(1, n-1)).astype(int) 

2715 gamma = (aleph-k).clip(0,1) 

2716 return (1.-gamma)*x[(k-1).tolist()] + gamma*x[k.tolist()] 

2717 

2718 data = ma.array(a, copy=False) 

2719 if data.ndim > 2: 

2720 raise TypeError("Array should be 2D at most !") 

2721 

2722 if limit: 

2723 condition = (limit[0] < data) & (data < limit[1]) 

2724 data[~condition.filled(True)] = masked 

2725 

2726 p = np.array(prob, copy=False, ndmin=1) 

2727 m = alphap + p*(1.-alphap-betap) 

2728 # Computes quantiles along axis (or globally) 

2729 if (axis is None): 

2730 return _quantiles1D(data, m, p) 

2731 

2732 return ma.apply_along_axis(_quantiles1D, axis, data, m, p) 
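
The (alphap, betap) = (1, 1) setting listed above reproduces R's default
type 7, which is also what np.percentile computes; a quick cross-check
(sketch):

    import numpy as np
    from scipy.stats.mstats import mquantiles

    x = np.arange(1., 11.)
    print(mquantiles(x, prob=[0.25, 0.5, 0.75], alphap=1, betap=1))
    # [3.25 5.5 7.75]
    print(np.percentile(x, [25, 50, 75]))  # the same values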

2733 

2734 

2735def scoreatpercentile(data, per, limit=(), alphap=.4, betap=.4): 

2736 """Calculate the score at the given 'per' percentile of the 

2737 sequence a. For example, the score at per=50 is the median. 

2738 

2739 This function is a shortcut to mquantile 

2740 

2741 """ 

2742 if (per < 0) or (per > 100.): 

2743 raise ValueError("The percentile should be between 0. and 100. !" 

2744 " (got %s)" % per) 

2745 

2746 return mquantiles(data, prob=[per/100.], alphap=alphap, betap=betap, 

2747 limit=limit, axis=0).squeeze() 

2748 

2749 

2750def plotting_positions(data, alpha=0.4, beta=0.4): 

2751 """ 

2752 Returns plotting positions (or empirical percentile points) for the data. 

2753 

2754 Plotting positions are defined as ``(i-alpha)/(n+1-alpha-beta)``, where: 

2755 - i is the rank of the observation in the sorted sample

2756 - n is the number of unmasked values along the given axis 

2757 - `alpha` and `beta` are two parameters. 

2758 

2759 Typical values for `alpha` and `beta` are: 

2760 - (0,1) : ``p(k) = k/n``, linear interpolation of cdf (R, type 4) 

2761 - (.5,.5) : ``p(k) = (k-1/2.)/n``, piecewise linear function 

2762 (R, type 5) 

2763 - (0,0) : ``p(k) = k/(n+1)``, Weibull (R type 6) 

2764 - (1,1) : ``p(k) = (k-1)/(n-1)``, in this case, 

2765 ``p(k) = mode[F(x[k])]``. That's R default (R type 7) 

2766 - (1/3,1/3): ``p(k) = (k-1/3)/(n+1/3)``, then 

2767 ``p(k) ~ median[F(x[k])]``. 

2768 The resulting quantile estimates are approximately median-unbiased 

2769 regardless of the distribution of x. (R type 8) 

2770 - (3/8,3/8): ``p(k) = (k-3/8)/(n+1/4)``, Blom. 

2771 The resulting quantile estimates are approximately unbiased 

2772 if x is normally distributed (R type 9) 

2773 - (.4,.4) : approximately quantile unbiased (Cunnane) 

2774 - (.35,.35): APL, used with PWM 

2775 - (.3175, .3175): used in scipy.stats.probplot 

2776 

2777 Parameters 

2778 ---------- 

2779 data : array_like 

2780 Input data, as a sequence or array of dimension at most 2. 

2781 alpha : float, optional 

2782 Plotting positions parameter. Default is 0.4. 

2783 beta : float, optional 

2784 Plotting positions parameter. Default is 0.4. 

2785 

2786 Returns 

2787 ------- 

2788 positions : MaskedArray 

2789 The calculated plotting positions. 

2790 

2791 """ 

2792 data = ma.array(data, copy=False).reshape(1,-1) 

2793 n = data.count() 

2794 plpos = np.empty(data.size, dtype=float) 

2795 plpos[n:] = 0 

2796 plpos[data.argsort(axis=None)[:n]] = ((np.arange(1, n+1) - alpha) / 

2797 (n + 1.0 - alpha - beta)) 

2798 return ma.array(plpos, mask=data._mask) 
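
For example (sketch), with alpha = beta = 0.5 the formula reduces to
(i - 1/2)/n, the piecewise linear positions of R type 5:

    from scipy.stats.mstats import plotting_positions

    print(plotting_positions([10, 20, 30, 40], alpha=0.5, beta=0.5))
    # [0.125 0.375 0.625 0.875]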

2799 

2800 

2801meppf = plotting_positions 

2802 

2803 

2804def obrientransform(*args): 

2805 """ 

2806 Computes a transform on input data (any number of columns). Used to 

2807 test for homogeneity of variance prior to running one-way stats. Each 

2808 array in ``*args`` is one level of a factor. If an `f_oneway()` run on

2809 the transformed data is found significant, the variances are unequal. From

2810 Maxwell and Delaney, p.112. 

2811 

2812 Returns: transformed data for use in an ANOVA 

2813 """ 

2814 data = argstoarray(*args).T 

2815 v = data.var(axis=0,ddof=1) 

2816 m = data.mean(0) 

2817 n = data.count(0).astype(float) 

2818 # result = ((n-1.5)*n*(data-m)**2 - 0.5*v*(n-1))/((n-1)*(n-2))

2819 data -= m 

2820 data **= 2 

2821 data *= (n-1.5)*n 

2822 data -= 0.5*v*(n-1) 

2823 data /= (n-1.)*(n-2.) 

2824 if not ma.allclose(v,data.mean(0)): 

2825 raise ValueError("Lack of convergence in obrientransform.") 

2826 

2827 return data 
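
By construction (and enforced by the convergence check above), the column
means of the transform equal each group's ddof=1 variance; a sketch:

    from scipy.stats.mstats import obrientransform

    g1 = [10, 11, 12, 13]   # variance 5/3
    g2 = [10, 12, 14, 16]   # variance 20/3
    t = obrientransform(g1, g2)
    print(t.mean(0))        # approximately [1.667 6.667]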

2828 

2829 

2830def sem(a, axis=0, ddof=1): 

2831 """ 

2832 Calculates the standard error of the mean of the input array. 

2833 

2834 This is the standard deviation divided by the square root of the sample size.

2835 

2836 Parameters 

2837 ---------- 

2838 a : array_like 

2839 An array containing the values for which the standard error is 

2840 returned. 

2841 axis : int or None, optional 

2842 If axis is None, ravel `a` first. If axis is an integer, this will be 

2843 the axis over which to operate. Defaults to 0. 

2844 ddof : int, optional 

2845 Delta degrees-of-freedom. How many degrees of freedom to adjust 

2846 for bias in limited samples relative to the population estimate 

2847 of variance. Defaults to 1. 

2848 

2849 Returns 

2850 ------- 

2851 s : ndarray or float 

2852 The standard error of the mean in the sample(s), along the input axis. 

2853 

2854 Notes 

2855 ----- 

2856 The default value for `ddof` changed in scipy 0.15.0 to be consistent with 

2857 `stats.sem` as well as with the most common definition used (like in the R 

2858 documentation). 

2859 

2860 Examples 

2861 -------- 

2862 Find standard error along the first axis: 

2863 

2864 >>> from scipy import stats 

2865 >>> a = np.arange(20).reshape(5,4) 

2866 >>> print(stats.mstats.sem(a)) 

2867 [2.8284271247461903 2.8284271247461903 2.8284271247461903 

2868 2.8284271247461903] 

2869 

2870 Find standard error across the whole array, using n degrees of freedom: 

2871 

2872 >>> print(stats.mstats.sem(a, axis=None, ddof=0)) 

2873 1.2893796958227628 

2874 

2875 """ 

2876 a, axis = _chk_asarray(a, axis) 

2877 n = a.count(axis=axis) 

2878 s = a.std(axis=axis, ddof=ddof) / ma.sqrt(n) 

2879 return s 

2880 

2881 

2882F_onewayResult = namedtuple('F_onewayResult', ('statistic', 'pvalue')) 

2883 

2884 

2885def f_oneway(*args): 

2886 """ 

2887 Performs a 1-way ANOVA, returning an F-value and probability given 

2888 any number of groups. From Heiman, pp.394-7. 

2889 

2890 Usage: ``f_oneway(*args)``, where ``*args`` is 2 or more arrays, 

2891 one per treatment group. 

2892 

2893 Returns 

2894 ------- 

2895 statistic : float 

2896 The computed F-value of the test. 

2897 pvalue : float 

2898 The associated p-value from the F-distribution. 

2899 

2900 """ 

2901 # Construct a single array of arguments: each row is a group 

2902 data = argstoarray(*args) 

2903 ngroups = len(data) 

2904 ntot = data.count() 

2905 sstot = (data**2).sum() - (data.sum())**2/float(ntot) 

2906 ssbg = (data.count(-1) * (data.mean(-1)-data.mean())**2).sum() 

2907 sswg = sstot-ssbg 

2908 dfbg = ngroups-1 

2909 dfwg = ntot - ngroups 

2910 msb = ssbg/float(dfbg) 

2911 msw = sswg/float(dfwg) 

2912 f = msb/msw 

2913 prob = special.fdtrc(dfbg, dfwg, f) # equivalent to stats.f.sf 

2914 

2915 return F_onewayResult(f, prob) 
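
A degenerate check (sketch): identical groups have zero between-group sum of
squares, so F is 0 and the p-value is 1:

    from scipy.stats import mstats

    res = mstats.f_oneway([1, 2, 3], [1, 2, 3])
    print(res.statistic, res.pvalue)  # 0.0 1.0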

2916 

2917 

2918FriedmanchisquareResult = namedtuple('FriedmanchisquareResult', 

2919 ('statistic', 'pvalue')) 

2920 

2921 

2922def friedmanchisquare(*args): 

2923 """Friedman Chi-Square is a non-parametric, one-way within-subjects ANOVA. 

2924 This function calculates the Friedman Chi-square test for repeated measures 

2925 and returns the result, along with the associated probability value. 

2926 

2927 Each input is considered a given group. Ideally, the number of treatments 

2928 among each group should be equal. If this is not the case, only the first 

2929 n treatments are taken into account, where n is the number of treatments 

2930 of the smallest group. 

2931 If a group has some missing values, the corresponding treatments are masked 

2932 in the other groups. 

2933 The test statistic is corrected for ties. 

2934 

2935 Masked values in one group are propagated to the other groups. 

2936 

2937 Returns 

2938 ------- 

2939 statistic : float 

2940 the test statistic. 

2941 pvalue : float 

2942 the associated p-value. 

2943 

2944 """ 

2945 data = argstoarray(*args).astype(float) 

2946 k = len(data) 

2947 if k < 3: 

2948 raise ValueError("Less than 3 groups (%i): " % k + 

2949 "the Friedman test is NOT appropriate.") 

2950 

2951 ranked = ma.masked_values(rankdata(data, axis=0), 0) 

2952 if ranked._mask is not nomask: 

2953 ranked = ma.mask_cols(ranked) 

2954 ranked = ranked.compressed().reshape(k,-1).view(ndarray) 

2955 else: 

2956 ranked = ranked._data 

2957 (k,n) = ranked.shape 

2958 # Ties correction 

2959 repeats = [find_repeats(row) for row in ranked.T] 

2960 ties = np.array([y for x, y in repeats if x.size > 0]) 

2961 tie_correction = 1 - (ties**3-ties).sum()/float(n*(k**3-k)) 

2962 

2963 ssbg = np.sum((ranked.sum(-1) - n*(k+1)/2.)**2) 

2964 chisq = ssbg * 12./(n*k*(k+1)) * 1./tie_correction 

2965 

2966 return FriedmanchisquareResult(chisq, 

2967 distributions.chi2.sf(chisq, k-1)) 
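
A hand-checkable case (sketch): three groups with the same within-column
ordering get rank sums 3, 6 and 9, so the statistic is exactly 6 on k-1 = 2
degrees of freedom:

    from scipy.stats import mstats

    res = mstats.friedmanchisquare([1, 2, 3], [2, 3, 4], [3, 4, 5])
    print(res.statistic)  # 6.0
    print(res.pvalue)     # exp(-3), about 0.0498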

2968 

2969 

2970BrunnerMunzelResult = namedtuple('BrunnerMunzelResult', ('statistic', 'pvalue')) 

2971 

2972 

2973def brunnermunzel(x, y, alternative="two-sided", distribution="t"): 

2974 """ 

2975 Computes the Brunner-Munzel test on samples x and y 

2976 

2977 Missing values in `x` and/or `y` are discarded. 

2978 

2979 Parameters 

2980 ---------- 

2981 x, y : array_like 

2982 Array of samples, should be one-dimensional. 

2983 alternative : 'less', 'two-sided', or 'greater', optional 

2984 Whether to get the p-value for the one-sided hypothesis ('less' 

2985 or 'greater') or for the two-sided hypothesis ('two-sided'). 

2986 Default value is 'two-sided'.

2987 distribution : 't' or 'normal', optional

2988 Whether to get the p-value by the t-distribution or by the standard

2989 normal distribution.

2990 Default value is 't'.

2991 

2992 Returns 

2993 ------- 

2994 statistic : float 

2995 The Brunner-Munzel W statistic.

2996 pvalue : float

2997 The p-value assuming a t distribution. One-sided or

2998 two-sided, depending on the choice of `alternative` and `distribution`. 

2999 

3000 See Also 

3001 -------- 

3002 mannwhitneyu : Mann-Whitney rank test on two samples. 

3003 

3004 Notes 

3005 -----

3006 For more details on `brunnermunzel`, see `stats.brunnermunzel`. 

3007 

3008 """ 

3009 x = ma.asarray(x).compressed().view(ndarray) 

3010 y = ma.asarray(y).compressed().view(ndarray) 

3011 nx = len(x) 

3012 ny = len(y) 

3013 if nx == 0 or ny == 0: 

3014 return BrunnerMunzelResult(np.nan, np.nan) 

3015 rankc = rankdata(np.concatenate((x,y))) 

3016 rankcx = rankc[0:nx] 

3017 rankcy = rankc[nx:nx+ny] 

3018 rankcx_mean = np.mean(rankcx) 

3019 rankcy_mean = np.mean(rankcy) 

3020 rankx = rankdata(x) 

3021 ranky = rankdata(y) 

3022 rankx_mean = np.mean(rankx) 

3023 ranky_mean = np.mean(ranky) 

3024 

3025 Sx = np.sum(np.power(rankcx - rankx - rankcx_mean + rankx_mean, 2.0)) 

3026 Sx /= nx - 1 

3027 Sy = np.sum(np.power(rankcy - ranky - rankcy_mean + ranky_mean, 2.0)) 

3028 Sy /= ny - 1 

3029 

3030 wbfn = nx * ny * (rankcy_mean - rankcx_mean) 

3031 wbfn /= (nx + ny) * np.sqrt(nx * Sx + ny * Sy) 

3032 

3033 if distribution == "t": 

3034 df_numer = np.power(nx * Sx + ny * Sy, 2.0) 

3035 df_denom = np.power(nx * Sx, 2.0) / (nx - 1) 

3036 df_denom += np.power(ny * Sy, 2.0) / (ny - 1) 

3037 df = df_numer / df_denom 

3038 p = distributions.t.cdf(wbfn, df) 

3039 elif distribution == "normal": 

3040 p = distributions.norm.cdf(wbfn) 

3041 else: 

3042 raise ValueError( 

3043 "distribution should be 't' or 'normal'") 

3044 

3045 if alternative == "greater": 

3046 pass 

3047 elif alternative == "less": 

3048 p = 1 - p 

3049 elif alternative == "two-sided": 

3050 p = 2 * np.min([p, 1-p]) 

3051 else: 

3052 raise ValueError( 

3053 "alternative should be 'less', 'greater' or 'two-sided'") 

3054 

3055 return BrunnerMunzelResult(wbfn, p)
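
A usage sketch (the samples are arbitrary illustrative data; read the exact
statistic and p-value off at runtime rather than trusting hard-coded values):

    from scipy.stats import mstats

    x = [1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 4, 1, 1]
    y = [3, 3, 4, 3, 1, 2, 3, 1, 1, 5, 4]
    res = mstats.brunnermunzel(x, y)
    print(res.statistic, res.pvalue)  # a small p suggests y tends to exceed x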