Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/statsmodels/stats/descriptivestats.py : 14%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1from statsmodels.compat.python import lrange, lmap, iterkeys, iteritems
2from statsmodels.compat.pandas import Appender
4import numpy as np
5from scipy import stats
6from statsmodels.iolib.table import SimpleTable
7from statsmodels.tools.decorators import nottest
def _kurtosis(a):
    """Compute ``scipy.stats.kurtosis(a)``, mapping ``ValueError`` to NaN.

    Only the default options of ``scipy.stats.kurtosis`` are used; none of
    its keyword arguments are exposed.
    """
    try:
        return stats.kurtosis(a)
    except ValueError:
        # Report "not available" instead of propagating the error.
        return np.nan
def _skew(a):
    """Compute ``scipy.stats.skew(a)``, mapping ``ValueError`` to NaN.

    Only the default options of ``scipy.stats.skew`` are used; none of its
    keyword arguments are exposed.
    """
    try:
        return stats.skew(a)
    except ValueError:
        # Report "not available" instead of propagating the error.
        return np.nan
34@nottest
def sign_test(samp, mu0=0):
    """
    Signs test.

    Parameters
    ----------
    samp : array_like
        1d array. The sample for which you want to perform the signs
        test.
    mu0 : float
        See Notes for the definition of the sign test. mu0 is 0 by
        default, but it is common to set it to the median.

    Returns
    -------
    M
        The test statistic, ``(N(+) - N(-)) / 2``.
    p-value
        Two-sided p-value from the binomial distribution.

    Notes
    -----
    The signs test returns

    M = (N(+) - N(-))/2

    where N(+) is the number of values above `mu0`, N(-) is the number of
    values below. Values equal to `mu0` are discarded.

    The p-value for M is calculated using the binomial distribution
    and can be interpreted the same as for a t-test. Under the null
    hypothesis min(N(+), N(-)) is distributed Binom(n_trials, .5) where
    n_trials equals N(+) + N(-).

    If every value equals `mu0` there are no trials left and the p-value
    is reported as 1.0.

    See Also
    --------
    scipy.stats.wilcoxon
    """
    samp = np.asarray(samp)
    pos = np.sum(samp > mu0)
    neg = np.sum(samp < mu0)
    M = (pos - neg) / 2.

    # Plain Python ints: the SciPy binomial tests validate their arguments.
    n_trials = int(pos + neg)
    n_minor = int(min(pos, neg))

    if n_trials == 0:
        # ``binomtest`` rejects zero trials; the legacy ``binom_test``
        # returned 1.0 here, so keep that result.
        p = 1.0
    elif hasattr(stats, "binomtest"):
        # ``binom_test`` was deprecated and later removed from SciPy;
        # ``binomtest`` is its replacement.
        p = stats.binomtest(n_minor, n_trials, .5).pvalue
    else:
        p = stats.binom_test(n_minor, n_trials, .5)
    return M, p
class Describe(object):
    '''
    Calculates descriptive statistics for data.

    Defaults to a basic set of statistics, "all" can be specified, or a list
    can be given.

    Parameters
    ----------
    dataset : array_like
        2D dataset for descriptive statistics. The methods read
        ``dataset.dtype`` and index the data by column, so a NumPy array
        (homogeneous 2D, or structured) is what the code actually handles.
    '''
    # Percentiles reported when 'percentiles' is requested.
    _PERCENTILES = (1, 5, 10, 25, 50, 75, 90, 95, 99)

    def __init__(self, dataset):
        self.dataset = dataset

        # Maps a statistic name to a three-item list:
        #   [0] function computing the statistic for one column,
        #   [1] labels of the columns it was computed for (set by summary),
        #   [2] results, in the same order as [1] (set by summary).
        self.univariate = dict(
            obs=[len, None, None],
            mean=[np.mean, None, None],
            std=[np.std, None, None],
            min=[np.min, None, None],
            max=[np.max, None, None],
            ptp=[np.ptp, None, None],
            var=[np.var, None, None],
            mode_val=[self._mode_val, None, None],
            mode_bin=[self._mode_bin, None, None],
            median=[np.median, None, None],
            skew=[stats.skew, None, None],
            uss=[lambda x: np.sum(np.asarray(x)**2, axis=0), None, None],
            kurtosis=[stats.kurtosis, None, None],
            percentiles=[self._percentiles, None, None],
            # BUG: not single value
            # sign_test_M = [self.sign_test_m, None, None],
            # sign_test_P = [self.sign_test_p, None, None]
        )

        # TODO: Basic stats for strings (unique values, number of unique
        #       values, most/least frequent).
        # TODO: Multivariate statistics (corrcoef, cov, histogram2d).
        self._arraytype = None
        self._columns_list = None

    def _percentiles(self, x):
        """Return the list of scores of `x` at each reported percentile."""
        return [stats.scoreatpercentile(x, per) for per in self._PERCENTILES]

    def _percentile_entries(self):
        """Return one ``univariate``-style entry per reported percentile."""
        # ``per=per`` binds the current value; a plain closure would make
        # every entry compute the last percentile.
        return dict(
            ('perc_%02d' % per,
             [lambda x, per=per: stats.scoreatpercentile(x, per), None, None])
            for per in self._PERCENTILES)

    def _mode_val(self, x):
        """Return the modal value of `x`."""
        # Depending on the SciPy version ``stats.mode`` returns length-1
        # arrays or scalars for 1-D input; ``atleast_1d`` accepts both.
        return np.atleast_1d(stats.mode(x)[0])[0]

    def _mode_bin(self, x):
        """Return the number of occurrences of the modal value of `x`."""
        return np.atleast_1d(stats.mode(x)[1])[0]

    def _array_typer(self):
        """Record whether the dataset is a structured or homogeneous array."""
        if self.dataset.dtype.names:
            # structured or rec array
            self._arraytype = 'sctruct'
        else:
            # homogeneous dtype array
            self._arraytype = 'homog'

    def _is_dtype_like(self, col):
        """
        Check whether ``self.dataset[col][0]`` behaves like a string or a
        number.

        Parameters
        ----------
        col : object
            Any valid index into the dataset, e.g. a field name of a
            structured array.

        Returns
        -------
        'number', 'string', or None when the value behaves like both.

        Raises
        ------
        AssertionError
            If the value behaves like neither a number nor a string.
        """
        # TODO: not sure what the result is if the first item is some
        # type of missing value
        first = self.dataset[col][0]

        def supports_add(other):
            try:
                first + other
            except (TypeError, ValueError):
                return False
            return True

        number_like = supports_add(1.0)
        # Byte strings (e.g. '|S1' fields) cannot be added to ``str`` on
        # Python 3, so probe with ``bytes`` as well.
        string_like = supports_add('') or supports_add(b'')

        if number_like and not string_like:
            return 'number'
        if string_like and not number_like:
            return 'string'
        if not number_like:
            raise AssertionError(' Not sure of dtype' + str(first))
        return None

    #@property
    def summary(self, stats='basic', columns='all', orientation='auto'):
        """
        Return a summary of descriptive statistics.

        Parameters
        ----------
        stats: list or str
            The desired statistics, Accepts 'basic' or 'all', a single
            statistic name, or a list of names.
            'basic' = ('obs', 'mean', 'std', 'min', 'max')
            'all' = ('obs', 'mean', 'std', 'min', 'max', 'ptp', 'var',
                     'mode_val', 'mode_bin', 'median', 'uss', 'skew',
                     'kurtosis', 'percentiles')
            'percentiles' is expanded into one entry per percentile
            ('perc_01', 'perc_05', ..., 'perc_99'). The sequence passed
            in is not modified.
        columns : list or str
            The columns/variables to report the statistics, default is 'all'
            If an object with named columns is given, you may specify the
            column names; for a plain 2D array give column numbers.
        orientation : str
            'varcols' puts the variables in columns. 'auto' does the same
            when there are fewer variables than statistics. Any other
            value keeps one row per variable.

        Returns
        -------
        SimpleTable
            The table built by `print_summary`.

        Raises
        ------
        KeyError
            If a requested statistic is not known.
        AssertionError
            If a requested column does not exist (structured arrays) or
            does not hold numbers (homogeneous arrays).
        """
        #NOTE
        # standard array: Specify column numbers (NEED TO TEST)
        # mode requires mode_val and mode_bin separately
        if self._arraytype is None:
            self._array_typer()

        if isinstance(stats, str):
            if stats == 'basic':
                stats = ['obs', 'mean', 'std', 'min', 'max']
            elif stats == 'all':
                # dict does not keep an order, use full list instead
                stats = ['obs', 'mean', 'std', 'min', 'max', 'ptp', 'var',
                         'mode_val', 'mode_bin', 'median', 'uss', 'skew',
                         'kurtosis', 'percentiles']
            else:
                stats = [stats]
        else:
            # Work on a copy: the percentile expansion below edits the list
            # in place, which must work for tuples and must not leak back
            # to the caller.
            stats = list(stats)

        # 'percentiles' yields several values per column, so it is replaced
        # by one single-valued statistic per percentile.
        if 'percentiles' in stats:
            perdict = self._percentile_entries()
            self.univariate.update(perdict)
            idx = stats.index('percentiles')
            stats[idx:idx + 1] = sorted(perdict)

        structured = self._arraytype == 'sctruct'
        if columns == 'all':
            if structured:
                self._columns_list = self.dataset.dtype.names
            else:
                self._columns_list = lrange(self.dataset.shape[1])
        else:
            self._columns_list = columns
            if structured:
                for col in self._columns_list:
                    assert (col in self.dataset.dtype.names), \
                        'unknown column %r' % (col,)
            else:
                for col in self._columns_list:
                    # Look at the first element of the requested column.
                    assert self._is_dtype_like((slice(None), col)) == \
                        'number', 'column %r is not numeric' % (col,)

        #TODO: do we need to make sure they dtype is float64 ?
        if structured:
            # Only numeric fields are summarised; labels and data are built
            # from the same selection so that they always line up.
            labels = [col for col in self._columns_list
                      if self._is_dtype_like(col) == 'number']
            column_values = [self.dataset[col] for col in labels]
        else:
            labels = ['Col ' + str(col) for col in self._columns_list]
            column_values = [self.dataset[:, col]
                             for col in self._columns_list]

        for astat in stats:
            calc = self.univariate[astat]
            calc[1] = list(labels)
            calc[2] = [calc[0](values) for values in column_values]

        return self.print_summary(stats, orientation=orientation)

    def print_summary(self, stats, orientation='auto'):
        """
        Build the table of already computed statistics.

        Parameters
        ----------
        stats : list
            Names of the statistics to show; their results must already be
            stored in ``self.univariate`` (see `summary`).
        orientation : str
            'varcols' puts the variables in columns. 'auto' does the same
            when there are fewer variables than statistics. Any other
            value keeps one row per variable.

        Returns
        -------
        SimpleTable
        """
        #TODO: need to specify a table formatting for the numbers, using
        # the default
        title = 'Summary Statistics'
        header = stats
        # Take labels and row count from the first requested statistic, so
        # that 'obs' does not have to be among the requested ones.
        if len(stats):
            reference = self.univariate[stats[0]]
        else:
            reference = self.univariate['obs']
        stubs = reference[1]
        data = [[self.univariate[astat][2][col] for astat in stats]
                for col in range(len(reference[2]))]

        if (orientation == 'varcols') or \
                (orientation == 'auto' and len(stubs) < len(header)):
            # swap rows and columns
            data = [list(row) for row in zip(*data)]
            header, stubs = stubs, header

        part_fmt = dict(data_fmts=["%#8.4g"] * (len(header) - 1))
        table = SimpleTable(data,
                            header,
                            stubs,
                            title=title,
                            txt_fmt=part_fmt)

        return table

    @Appender(sign_test.__doc__)  # i.e. module-level sign_test
    def sign_test(self, samp, mu0=0):
        # Inside the method body the name resolves to the module-level
        # function, not to this method.
        return sign_test(samp, mu0)

    #TODO: There must be a better way but formatting the stats of a function
    # that returns 2 values is a problem.
    #def sign_test_m(samp,mu0=0):
        #return self.sign_test(samp,mu0)[0]
    #def sign_test_p(samp,mu0=0):
        #return self.sign_test(samp,mu0)[1]
if __name__ == "__main__":
    # Manual smoke run: print a few summaries for a small integer array.
    #unittest.main()

    data4 = np.array([[1, 2, 3, 4, 5, 6],
                      [6, 5, 4, 3, 2, 1],
                      [9, 9, 9, 9, 9, 9]])

    t1 = Describe(data4)
    #print(t1.summary(stats='all'))
    noperc = ['obs', 'mean', 'std', 'min', 'max', 'ptp',  #'mode', #'var',
              'median', 'skew', 'uss', 'kurtosis']
    #TODO: mode var raise exception,
    #TODO: percentile writes list in cell (?), huge wide format
    for options in (
            dict(stats=noperc),
            dict(),
            dict(orientation='varcols'),
            dict(stats=['mean', 'median', 'min', 'max'],
                 orientation='varcols'),
            dict(stats='all'),
    ):
        print(t1.summary(**options))

    # Fixtures used by the test class below.
    data1 = np.array(
        [(1, 2, 'a', 'aa'),
         (2, 3, 'b', 'bb'),
         (2, 4, 'b', 'cc')],
        dtype=[('alpha', float), ('beta', int),
               ('gamma', '|S1'), ('delta', '|S2')])
    data2 = np.array(
        [(1, 2),
         (2, 3),
         (2, 4)],
        dtype=[('alpha', float), ('beta', float)])

    data3 = np.array([[1, 2, 4, 4],
                      [2, 3, 3, 3],
                      [2, 4, 4, 3]], dtype=float)

    class TestSimpleTable(object):
        # Defined for manual use only; nothing in this script runs it.
        #from statsmodels.iolib.table import SimpleTable, default_txt_fmt

        def test_basic_1(self):
            print('test_basic_1')
            described = Describe(data1)
            print(described.summary())

        def test_basic_2(self):
            print('test_basic_2')
            described = Describe(data2)
            print(described.summary())

        def test_describe_summary_float_ndarray(self):
            print('test_describe_summary_float_ndarray')
            described = Describe(data3)
            print(described.summary())

        def test_basic_4(self):
            print('test_basic_4')
            described = Describe(data4)
            print(described.summary())

        def test_basic_1a(self):
            print('test_basic_1a')
            described = Describe(data1)
            print(described.summary(stats='basic', columns=['alpha']))

        def test_basic_1b(self):
            print('test_basic_1b')
            described = Describe(data1)
            print(described.summary(stats='basic', columns='all'))

        def test_basic_2a(self):
            print('test_basic_2a')
            described = Describe(data2)
            print(described.summary(stats='all'))

        def test_basic_3(self):
            described = Describe(data3)
            print(described.summary(stats='all'))

        def test_basic_4a(self):
            described = Describe(data4)
            print(described.summary(stats='all'))