Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1from statsmodels.compat.python import lrange, lmap, iterkeys, iteritems 

2from statsmodels.compat.pandas import Appender 

3 

4import numpy as np 

5from scipy import stats 

6from statsmodels.iolib.table import SimpleTable 

7from statsmodels.tools.decorators import nottest 

8 

9 

10def _kurtosis(a): 

11 '''wrapper for scipy.stats.kurtosis that returns nan instead of raising Error 

12 

13 missing options 

14 ''' 

15 try: 

16 res = stats.kurtosis(a) 

17 except ValueError: 

18 res = np.nan 

19 return res 

20 

21 

22def _skew(a): 

23 '''wrapper for scipy.stats.skew that returns nan instead of raising Error 

24 

25 missing options 

26 ''' 

27 try: 

28 res = stats.skew(a) 

29 except ValueError: 

30 res = np.nan 

31 return res 

32 

33 

34@nottest 

def sign_test(samp, mu0=0):
    """
    Signs test.

    Parameters
    ----------
    samp : array_like
        1d array. The sample for which you want to perform the signs
        test.
    mu0 : float
        See Notes for the definition of the sign test. mu0 is 0 by
        default, but it is common to set it to the median.

    Returns
    -------
    M : float
        The test statistic ``(N(+) - N(-)) / 2``.
    p-value : float
        The p-value of the binomial test. It is 1.0 when no observation
        differs from `mu0`, because there are then no trials to test.

    Notes
    -----
    The signs test returns

    M = (N(+) - N(-))/2

    where N(+) is the number of values above `mu0`, N(-) is the number of
    values below.  Values equal to `mu0` are discarded.

    The p-value for M is calculated using the binomial distribution
    and can be interpreted the same as for a t-test. The test-statistic
    is distributed Binom(min(N(+), N(-)), n_trials, .5) where n_trials
    equals N(+) + N(-).

    See Also
    --------
    scipy.stats.wilcoxon
    """
    samp = np.asarray(samp)
    pos = np.sum(samp > mu0)
    neg = np.sum(samp < mu0)
    M = (pos - neg) / 2.

    # Plain ints keep the SciPy argument validation happy on every version.
    n_success = int(min(pos, neg))
    n_trials = int(pos + neg)

    if n_trials == 0:
        # Every value equals mu0: nothing to test, and ``binomtest``
        # rejects zero trials, so report "no evidence" directly.
        return M, 1.0

    # ``binom_test`` was removed from recent SciPy releases in favour of
    # ``binomtest``; prefer the new function and fall back to the old one.
    binomtest = getattr(stats, "binomtest", None)
    if binomtest is not None:
        p = binomtest(n_success, n_trials, .5).pvalue
    else:
        p = stats.binom_test(n_success, n_trials, .5)
    return M, p

77 

78 

class Describe(object):
    '''
    Calculates descriptive statistics for data.

    Defaults to a basic set of statistics, "all" can be specified, or a list
    can be given.

    Parameters
    ----------
    dataset : array_like
        2D dataset for descriptive statistics. The object is stored as
        given; it must provide ``dtype`` and, unless it is a structured
        array, ``shape`` and ``dataset[:, col]`` indexing.
    '''

    # Percentile levels reported by the 'percentiles' statistic.
    _PERCENTILES = (1, 5, 10, 25, 50, 75, 90, 95, 99)

    def __init__(self, dataset):
        self.dataset = dataset

        # Registry of the available statistics, keyed by name. Every value
        # is a three-item list:
        #   [0] the function applied to a single column,
        #   [1] the labels of the columns it was last computed for,
        #   [2] the results, in the same order as the labels.
        self.univariate = dict(
            obs=[len, None, None],
            mean=[np.mean, None, None],
            std=[np.std, None, None],
            min=[np.min, None, None],
            max=[np.max, None, None],
            ptp=[np.ptp, None, None],
            var=[np.var, None, None],
            mode_val=[self._mode_val, None, None],
            mode_bin=[self._mode_bin, None, None],
            median=[np.median, None, None],
            skew=[stats.skew, None, None],
            uss=[lambda x: np.sum(np.asarray(x)**2, axis=0), None, None],
            kurtosis=[stats.kurtosis, None, None],
            percentiles=[self._percentiles, None, None],
            # The sign test is not registered: it returns two values
            # (M and the p-value) rather than one number per column.
        )

        # TODO: Basic stats for strings (unique values, counts, ...)
        # TODO: Multivariate stats (corrcoef, cov, histogram2d, ...)
        self._arraytype = None
        self._columns_list = None

    def _percentiles(self, x):
        # All percentile levels at once, as a list (one cell per column).
        return [stats.scoreatpercentile(x, per) for per in self._PERCENTILES]

    def _percentile_stats(self):
        # One registry entry per percentile level ('perc_01', 'perc_05', ...)
        # so that each level gets its own cell in the summary table.
        def make(per):
            # Bind ``per`` now; a bare lambda in the loop would see only
            # the last level.
            return lambda x: stats.scoreatpercentile(x, per)

        return dict(('perc_%02d' % per, [make(per), None, None])
                    for per in self._PERCENTILES)

    def _mode_val(self, x):
        # ``stats.mode`` returns arrays in older SciPy and scalars in newer
        # releases; ``ravel`` makes the first element reachable in both.
        return np.ravel(stats.mode(x)[0])[0]

    def _mode_bin(self, x):
        # Count of the modal value; see ``_mode_val`` for the ``ravel``.
        return np.ravel(stats.mode(x)[1])[0]

    def _array_typer(self):
        """Record whether the dataset is a structured or a plain array."""
        if self.dataset.dtype.names:
            # structured or rec array
            self._arraytype = 'sctruct'
        else:
            # homogeneous dtype array
            self._arraytype = 'homog'

    def _is_dtype_like(self, col):
        """
        Check whether self.dataset[col][0] behaves like a string or a number.

        Returns
        -------
        str
            'string' for text or bytes, 'number' if ``1.0`` can be added to
            the value, 'unknown' otherwise.
        """
        # TODO: not sure what the result is if the first item is some
        # type of missing value
        first = self.dataset[col][0]
        if isinstance(first, (str, bytes)):
            return 'string'
        try:
            first + 1.0
        except (TypeError, ValueError):
            return 'unknown'
        return 'number'

    def summary(self, stats='basic', columns='all', orientation='auto'):
        """
        Return a summary of descriptive statistics.

        Parameters
        ----------
        stats : list or str
            The desired statistics, Accepts 'basic' or 'all' or a list.
               'basic' = ('obs', 'mean', 'std', 'min', 'max')
               'all' = ('obs', 'mean', 'std', 'min', 'max', 'ptp', 'var',
                        'mode_val', 'mode_bin', 'median', 'uss', 'skew',
                        'kurtosis', 'percentiles')
            Any other string is treated as the name of a single statistic.
            'percentiles' is expanded into one entry per percentile level.
            The sequence passed in is not modified.
        columns : list or str
            The columns/variables to report the statistics, default is 'all'
            If the dataset is a structured array, give field names;
            otherwise give column indices. Fields of a structured array
            that are not numeric are left out of the table.
        orientation : str
            'varcols' puts the variables in the table columns. 'auto' does
            so only when there are fewer variables than statistics. Any
            other value puts the variables in the table rows.

        Returns
        -------
        SimpleTable
            The table built by `print_summary`.

        Raises
        ------
        KeyError
            If a requested statistic is not available.
        ValueError
            If a requested field does not exist in a structured dataset,
            or explicit columns are requested from a non-numeric array.

        Notes
        -----
        The computed labels and results are stored in ``self.univariate``
        and the selected columns in ``self._columns_list``.
        """
        if self._arraytype is None:
            self._array_typer()

        # Always work on a private list: the percentile expansion below
        # edits it in place and must not touch the caller's object.
        if isinstance(stats, str):
            if stats == 'basic':
                stats = ['obs', 'mean', 'std', 'min', 'max']
            elif stats == 'all':
                stats = ['obs', 'mean', 'std', 'min', 'max', 'ptp', 'var',
                         'mode_val', 'mode_bin', 'median', 'uss', 'skew',
                         'kurtosis', 'percentiles']
            else:
                stats = [stats]
        else:
            stats = list(stats)

        # 'percentiles' returns a list per column; replace it by one
        # single-valued statistic per level.
        if 'percentiles' in stats:
            perdict = self._percentile_stats()
            self.univariate.update(perdict)
            idx = stats.index('percentiles')
            stats[idx:idx + 1] = sorted(perdict)

        # Validate before computing so a bad name leaves no partial results.
        unknown = [astat for astat in stats if astat not in self.univariate]
        if unknown:
            raise KeyError('unknown statistic(s): %s'
                           % ', '.join(str(astat) for astat in unknown))

        use_all = isinstance(columns, str) and columns == 'all'
        structured = self._arraytype == 'sctruct'

        if structured:
            names = self.dataset.dtype.names
            if use_all:
                selected = names
            else:
                selected = columns
                missing = [col for col in selected if col not in names]
                if missing:
                    raise ValueError('columns not in dataset: %s'
                                     % ', '.join(str(col) for col in missing))
            # Keep labels and results aligned: only numeric fields are
            # summarised, so only they may appear as labels.
            selected = [col for col in selected
                        if self._is_dtype_like(col) == 'number']
        else:
            if use_all:
                selected = list(range(self.dataset.shape[1]))
            else:
                selected = columns
                if self.dataset.dtype.kind not in 'biufc':
                    raise ValueError('dataset must be numeric to select '
                                     'columns by index')
        self._columns_list = selected

        #TODO: do we need to make sure they dtype is float64 ?
        for astat in stats:
            calc = self.univariate[astat]
            func = calc[0]
            if structured:
                calc[1] = list(selected)
                calc[2] = [func(self.dataset[col]) for col in selected]
            else:
                calc[1] = ['Col ' + str(col) for col in selected]
                calc[2] = [func(self.dataset[:, col]) for col in selected]

        return self.print_summary(stats, orientation=orientation)

    def print_summary(self, stats, orientation='auto'):
        """
        Build the table for statistics that have already been computed.

        Parameters
        ----------
        stats : sequence of str
            Names of the statistics to show; their results must already be
            stored in ``self.univariate``.
        orientation : str
            'varcols' puts the variables in the table columns. 'auto' does
            so only when there are fewer variables than statistics. Any
            other value puts the variables in the table rows.

        Returns
        -------
        SimpleTable
        """
        #TODO: need to specify a table formating for the numbers, using defualt
        title = 'Summary Statistics'
        header = stats
        # Take the column labels from a statistic that is actually shown;
        # 'obs' may not have been requested, or may hold older results.
        anchor = stats[0] if len(stats) else 'obs'
        stubs = self.univariate[anchor][1]
        n_vars = len(self.univariate[anchor][2])
        data = [[self.univariate[astat][2][col] for astat in stats]
                for col in range(n_vars)]

        if (orientation == 'varcols') or \
           (orientation == 'auto' and len(stubs) < len(header)):
            #swap rows and columns
            data = [list(row) for row in zip(*data)]
            header, stubs = stubs, header

        part_fmt = dict(data_fmts=["%#8.4g"] * (len(header) - 1))
        table = SimpleTable(data,
                            header,
                            stubs,
                            title=title,
                            txt_fmt=part_fmt)

        return table

    # No docstring here on purpose: Appender supplies the documentation of
    # the module-level sign_test, which this method simply delegates to.
    @Appender(sign_test.__doc__)  # i.e. module-level sign_test
    def sign_test(self, samp, mu0=0):
        return sign_test(samp, mu0)

310 

311 #TODO: There must be a better way but formating the stats of a fuction that 

312 # returns 2 values is a problem. 

313 #def sign_test_m(samp,mu0=0): 

314 #return self.sign_test(samp,mu0)[0] 

315 #def sign_test_p(samp,mu0=0): 

316 #return self.sign_test(samp,mu0)[1] 

317 

if __name__ == "__main__":
    # Ad-hoc demonstration: print several summaries of a small integer
    # array, then define further sample data and a class of print-based
    # checks. The class is only defined here; nothing below runs it.
    #unittest.main()

    data4 = np.array([[1, 2, 3, 4, 5, 6],
                      [6, 5, 4, 3, 2, 1],
                      [9, 9, 9, 9, 9, 9]])

    t1 = Describe(data4)
    #TODO: mode var raise exception,
    #TODO: percentile writes list in cell (?), huge wide format
    noperc = ['obs', 'mean', 'std', 'min', 'max', 'ptp',  # 'mode', 'var',
              'median', 'skew', 'uss', 'kurtosis']

    # Keyword arguments of the summaries to print, in order.
    demo_calls = (
        dict(stats=noperc),
        dict(),
        dict(orientation='varcols'),
        dict(stats=['mean', 'median', 'min', 'max'], orientation='varcols'),
        dict(stats='all'),
    )
    for call_kwargs in demo_calls:
        print(t1.summary(**call_kwargs))

    # Structured array mixing numeric and byte-string fields.
    data1 = np.array([(1, 2, 'a', 'aa'),
                      (2, 3, 'b', 'bb'),
                      (2, 4, 'b', 'cc')],
                     dtype=[('alpha', float), ('beta', int),
                            ('gamma', '|S1'), ('delta', '|S2')])
    # Structured array with numeric fields only.
    data2 = np.array([(1, 2),
                      (2, 3),
                      (2, 4)],
                     dtype=[('alpha', float), ('beta', float)])

    # Plain two-dimensional float array.
    data3 = np.array([[1, 2, 4, 4],
                      [2, 3, 3, 3],
                      [2, 4, 4, 3]], dtype=float)

    class TestSimpleTable(object):
        # Each method prints a summary for one of the sample arrays above.

        def test_basic_1(self):
            print('test_basic_1')
            print(Describe(data1).summary())

        def test_basic_2(self):
            print('test_basic_2')
            print(Describe(data2).summary())

        def test_describe_summary_float_ndarray(self):
            print('test_describe_summary_float_ndarray')
            print(Describe(data3).summary())

        def test_basic_4(self):
            print('test_basic_4')
            print(Describe(data4).summary())

        def test_basic_1a(self):
            print('test_basic_1a')
            print(Describe(data1).summary(stats='basic', columns=['alpha']))

        def test_basic_1b(self):
            print('test_basic_1b')
            print(Describe(data1).summary(stats='basic', columns='all'))

        def test_basic_2a(self):
            print('test_basic_2a')
            print(Describe(data2).summary(stats='all'))

        def test_basic_3(self):
            print(Describe(data3).summary(stats='all'))

        def test_basic_4a(self):
            print(Describe(data4).summary(stats='all'))
394 print(t1.summary(stats='all')) 

395 

396 def test_basic_4a(self): 

397 t1 = Describe(data4) 

398 print(t1.summary(stats='all'))