Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/scipy/stats/mstats_extras.py : 11%

1"""
2Additional statistics functions with support for masked arrays.
4"""
6# Original author (2007): Pierre GF Gerard-Marchant
9__all__ = ['compare_medians_ms',
10 'hdquantiles', 'hdmedian', 'hdquantiles_sd',
11 'idealfourths',
12 'median_cihs','mjci','mquantiles_cimj',
13 'rsh',
14 'trimmed_mean_ci',]
17import numpy as np
18from numpy import float_, int_, ndarray
20import numpy.ma as ma
21from numpy.ma import MaskedArray
23from . import mstats_basic as mstats
25from scipy.stats.distributions import norm, beta, t, binom
def hdquantiles(data, prob=list([.25,.5,.75]), axis=None, var=False,):
    """
    Computes quantile estimates with the Harrell-Davis method.

    The quantile estimates are calculated as a weighted linear combination
    of order statistics.

    Parameters
    ----------
    data : array_like
        Data array.
    prob : sequence, optional
        Sequence of quantiles to compute.
    axis : int or None, optional
        Axis along which to compute the quantiles. If None, use a flattened
        array.
    var : bool, optional
        Whether to return the variance of the estimate.

    Returns
    -------
    hdquantiles : MaskedArray
        A (p,) array of quantiles (if `var` is False), or a (2,p) array of
        quantiles and variances (if `var` is True), where ``p`` is the
        number of quantiles.

    See Also
    --------
    hdquantiles_sd

    """
    def _hd_1D(data, prob, var):
        "Computes the HD quantiles for a 1D array. Returns nan for invalid data."
        xsorted = np.squeeze(np.sort(data.compressed().view(ndarray)))
        # Don't use length here, in case we have a numpy scalar
        n = xsorted.size

        hd = np.empty((2, len(prob)), float_)
        if n < 2:
            hd.flat = np.nan
            if var:
                return hd
            return hd[0]

        v = np.arange(n+1) / float(n)
        betacdf = beta.cdf
        for (i, p) in enumerate(prob):
            _w = betacdf(v, (n+1)*p, (n+1)*(1-p))
            w = _w[1:] - _w[:-1]
            hd_mean = np.dot(w, xsorted)
            hd[0, i] = hd_mean
            hd[1, i] = np.dot(w, (xsorted-hd_mean)**2)

        hd[0, prob == 0] = xsorted[0]
        hd[0, prob == 1] = xsorted[-1]
        if var:
            hd[1, prob == 0] = hd[1, prob == 1] = np.nan
            return hd
        return hd[0]

    # Initialization & checks
    data = ma.array(data, copy=False, dtype=float_)
    p = np.array(prob, copy=False, ndmin=1)
    # Computes quantiles along axis (or globally)
    if (axis is None) or (data.ndim == 1):
        result = _hd_1D(data, p, var)
    else:
        if data.ndim > 2:
            raise ValueError("Array 'data' must be at most two dimensional, "
                             "but got data.ndim = %d" % data.ndim)
        result = ma.apply_along_axis(_hd_1D, axis, data, p, var)

    return ma.fix_invalid(result, copy=False)
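
# Usage sketch (illustrative addition, not part of the original scipy source).
# The sample values below are made up; with var=True the second row of the
# result holds the estimated variance of each quantile.
def _example_hdquantiles():  # pragma: no cover
    sample = ma.array([2., 4., 5., 7., 8., 9., 11., 13., 15., 18.])
    # Harrell-Davis estimates of the quartiles, plus their variances.
    return hdquantiles(sample, prob=[0.25, 0.5, 0.75], var=True)
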
def hdmedian(data, axis=-1, var=False):
    """
    Returns the Harrell-Davis estimate of the median along the given axis.

    Parameters
    ----------
    data : ndarray
        Data array.
    axis : int, optional
        Axis along which to compute the quantiles. If None, use a flattened
        array.
    var : bool, optional
        Whether to return the variance of the estimate.

    Returns
    -------
    hdmedian : MaskedArray
        The median values. If ``var=True``, the variance is returned inside
        the masked array. E.g. for a 1-D array the shape changes from (1,) to
        (2,).

    """
    result = hdquantiles(data, [0.5], axis=axis, var=var)
    return result.squeeze()
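
# Usage sketch (illustrative, not part of the original source): hdmedian is a
# thin wrapper around hdquantiles at prob=0.5; the data below are made up.
def _example_hdmedian():  # pragma: no cover
    sample = ma.array([3., 1., 4., 1., 5., 9., 2., 6.])
    return hdmedian(sample, axis=-1, var=False)
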
def hdquantiles_sd(data, prob=list([.25,.5,.75]), axis=None):
    """
    The standard error of the Harrell-Davis quantile estimates by jackknife.

    Parameters
    ----------
    data : array_like
        Data array.
    prob : sequence, optional
        Sequence of quantiles to compute.
    axis : int, optional
        Axis along which to compute the quantiles. If None, use a flattened
        array.

    Returns
    -------
    hdquantiles_sd : MaskedArray
        Standard error of the Harrell-Davis quantile estimates.

    See Also
    --------
    hdquantiles

    """
    def _hdsd_1D(data, prob):
        "Computes the std error for 1D arrays."
        xsorted = np.sort(data.compressed())
        n = len(xsorted)

        hdsd = np.empty(len(prob), float_)
        if n < 2:
            hdsd.flat = np.nan

        vv = np.arange(n) / float(n-1)
        betacdf = beta.cdf

        for (i, p) in enumerate(prob):
            _w = betacdf(vv, (n+1)*p, (n+1)*(1-p))
            w = _w[1:] - _w[:-1]
            mx_ = np.fromiter([np.dot(w, xsorted[np.r_[list(range(0, k)),
                                                       list(range(k+1, n))].astype(int_)])
                               for k in range(n)], dtype=float_)
            mx_var = np.array(mx_.var(), copy=False, ndmin=1) * n / float(n-1)
            hdsd[i] = float(n-1) * np.sqrt(np.diag(mx_var).diagonal() / float(n))
        return hdsd

    # Initialization & checks
    data = ma.array(data, copy=False, dtype=float_)
    p = np.array(prob, copy=False, ndmin=1)
    # Computes quantiles along axis (or globally)
    if (axis is None):
        result = _hdsd_1D(data, p)
    else:
        if data.ndim > 2:
            raise ValueError("Array 'data' must be at most two dimensional, "
                             "but got data.ndim = %d" % data.ndim)
        result = ma.apply_along_axis(_hdsd_1D, axis, data, p)

    return ma.fix_invalid(result, copy=False).ravel()
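
# Usage sketch (illustrative, not part of the original source): jackknife
# standard errors paired with the point estimates from hdquantiles; the
# sample values are made up.
def _example_hdquantiles_sd():  # pragma: no cover
    sample = ma.array([7., 3., 9., 4., 12., 5., 8., 10., 6., 11.])
    estimates = hdquantiles(sample, prob=[0.25, 0.5, 0.75])
    std_errors = hdquantiles_sd(sample, prob=[0.25, 0.5, 0.75])
    return estimates, std_errors
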
def trimmed_mean_ci(data, limits=(0.2,0.2), inclusive=(True,True),
                    alpha=0.05, axis=None):
    """
    Selected confidence interval of the trimmed mean along the given axis.

    Parameters
    ----------
    data : array_like
        Input data.
    limits : {None, tuple}, optional
        None or a two item tuple.
        Tuple of the percentages to cut on each side of the array, with respect
        to the number of unmasked data, as floats between 0. and 1. If ``n``
        is the number of unmasked data before trimming, then the
        (``n * limits[0]``)th smallest data and the (``n * limits[1]``)th
        largest data are masked. The total number of unmasked data after
        trimming is ``n * (1. - sum(limits))``.
        The value of one limit can be set to None to indicate an open interval.

        Defaults to (0.2, 0.2).
    inclusive : (2,) tuple of boolean, optional
        If relative==False, tuple indicating whether values exactly equal to
        the absolute limits are allowed.
        If relative==True, tuple indicating whether the number of data being
        masked on each side should be rounded (True) or truncated (False).

        Defaults to (True, True).
    alpha : float, optional
        Confidence level of the intervals.

        Defaults to 0.05.
    axis : int, optional
        Axis along which to cut. If None, uses a flattened version of `data`.

        Defaults to None.

    Returns
    -------
    trimmed_mean_ci : (2,) ndarray
        The lower and upper confidence intervals of the trimmed data.

    """
    data = ma.array(data, copy=False)
    trimmed = mstats.trimr(data, limits=limits, inclusive=inclusive, axis=axis)
    tmean = trimmed.mean(axis)
    tstde = mstats.trimmed_stde(data, limits=limits, inclusive=inclusive, axis=axis)
    df = trimmed.count(axis) - 1
    tppf = t.ppf(1-alpha/2., df)
    return np.array((tmean - tppf*tstde, tmean+tppf*tstde))
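
# Usage sketch (illustrative, not part of the original source): a 95% CI for
# the 20%-trimmed mean of a small sample with one outlier; values are made up.
def _example_trimmed_mean_ci():  # pragma: no cover
    sample = ma.array([4., 5., 5., 6., 6., 7., 7., 8., 9., 60.])
    # limits=(0.2, 0.2) trims 20% of the data on each side before averaging.
    return trimmed_mean_ci(sample, limits=(0.2, 0.2), alpha=0.05)
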
def mjci(data, prob=[0.25,0.5,0.75], axis=None):
    """
    Returns the Maritz-Jarrett estimators of the standard error of selected
    experimental quantiles of the data.

    Parameters
    ----------
    data : ndarray
        Data array.
    prob : sequence, optional
        Sequence of quantiles to compute.
    axis : int or None, optional
        Axis along which to compute the quantiles. If None, use a flattened
        array.

    """
    def _mjci_1D(data, p):
        data = np.sort(data.compressed())
        n = data.size
        prob = (np.array(p) * n + 0.5).astype(int_)
        betacdf = beta.cdf

        mj = np.empty(len(prob), float_)
        x = np.arange(1, n+1, dtype=float_) / n
        y = x - 1./n
        for (i, m) in enumerate(prob):
            W = betacdf(x, m-1, n-m) - betacdf(y, m-1, n-m)
            C1 = np.dot(W, data)
            C2 = np.dot(W, data**2)
            mj[i] = np.sqrt(C2 - C1**2)
        return mj

    data = ma.array(data, copy=False)
    if data.ndim > 2:
        raise ValueError("Array 'data' must be at most two dimensional, "
                         "but got data.ndim = %d" % data.ndim)

    p = np.array(prob, copy=False, ndmin=1)
    # Computes quantiles along axis (or globally)
    if (axis is None):
        return _mjci_1D(data, p)
    else:
        return ma.apply_along_axis(_mjci_1D, axis, data, p)
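
# Usage sketch (illustrative, not part of the original source): Maritz-Jarrett
# standard errors for the three quartiles of a made-up sample.
def _example_mjci():  # pragma: no cover
    sample = ma.array([1., 2., 3., 5., 8., 13., 21., 34., 55.])
    return mjci(sample, prob=[0.25, 0.5, 0.75])
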
def mquantiles_cimj(data, prob=[0.25,0.50,0.75], alpha=0.05, axis=None):
    """
    Computes the alpha confidence interval for the selected quantiles of the
    data, with Maritz-Jarrett estimators.

    Parameters
    ----------
    data : ndarray
        Data array.
    prob : sequence, optional
        Sequence of quantiles to compute.
    alpha : float, optional
        Confidence level of the intervals.
    axis : int or None, optional
        Axis along which to compute the quantiles.
        If None, use a flattened array.

    Returns
    -------
    ci_lower : ndarray
        The lower boundaries of the confidence interval. Of the same length as
        `prob`.
    ci_upper : ndarray
        The upper boundaries of the confidence interval. Of the same length as
        `prob`.

    """
    alpha = min(alpha, 1 - alpha)
    z = norm.ppf(1 - alpha/2.)
    xq = mstats.mquantiles(data, prob, alphap=0, betap=0, axis=axis)
    smj = mjci(data, prob, axis=axis)
    return (xq - z * smj, xq + z * smj)
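
# Usage sketch (illustrative, not part of the original source): the returned
# pair of arrays gives the lower and upper confidence bounds for each
# requested quantile of a made-up sample.
def _example_mquantiles_cimj():  # pragma: no cover
    sample = ma.array([10., 12., 13., 15., 18., 21., 22., 25., 27., 30.])
    lower, upper = mquantiles_cimj(sample, prob=[0.25, 0.50, 0.75], alpha=0.05)
    return lower, upper
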
def median_cihs(data, alpha=0.05, axis=None):
    """
    Computes the alpha-level confidence interval for the median of the data.

    Uses the Hettmansperger-Sheather method.

    Parameters
    ----------
    data : array_like
        Input data. Masked values are discarded. The input should be 1D only,
        or `axis` should be set to None.
    alpha : float, optional
        Confidence level of the intervals.
    axis : int or None, optional
        Axis along which to compute the quantiles. If None, use a flattened
        array.

    Returns
    -------
    median_cihs
        Alpha level confidence interval.

    """
    def _cihs_1D(data, alpha):
        data = np.sort(data.compressed())
        n = len(data)
        alpha = min(alpha, 1-alpha)
        k = int(binom._ppf(alpha/2., n, 0.5))
        gk = binom.cdf(n-k, n, 0.5) - binom.cdf(k-1, n, 0.5)
        if gk < 1-alpha:
            k -= 1
            gk = binom.cdf(n-k, n, 0.5) - binom.cdf(k-1, n, 0.5)
        gkk = binom.cdf(n-k-1, n, 0.5) - binom.cdf(k, n, 0.5)
        I = (gk - 1 + alpha)/(gk - gkk)
        lambd = (n-k) * I / float(k + (n-2*k)*I)
        lims = (lambd*data[k] + (1-lambd)*data[k-1],
                lambd*data[n-k-1] + (1-lambd)*data[n-k])
        return lims

    data = ma.array(data, copy=False)
    # Computes quantiles along axis (or globally)
    if (axis is None):
        result = _cihs_1D(data, alpha)
    else:
        if data.ndim > 2:
            raise ValueError("Array 'data' must be at most two dimensional, "
                             "but got data.ndim = %d" % data.ndim)
        result = ma.apply_along_axis(_cihs_1D, axis, data, alpha)

    return result
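
# Usage sketch (illustrative, not part of the original source): a 95%
# Hettmansperger-Sheather interval for the median of a made-up sample.
def _example_median_cihs():  # pragma: no cover
    sample = ma.array([2., 3., 5., 6., 8., 9., 11., 12., 14.])
    return median_cihs(sample, alpha=0.05)
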
def compare_medians_ms(group_1, group_2, axis=None):
    """
    Compares the medians from two independent groups along the given axis.

    The comparison is performed using the McKean-Schrader estimate of the
    standard error of the medians.

    Parameters
    ----------
    group_1 : array_like
        First dataset. Has to be of size >= 7.
    group_2 : array_like
        Second dataset. Has to be of size >= 7.
    axis : int, optional
        Axis along which the medians are estimated. If None, the arrays are
        flattened. If `axis` is not None, then `group_1` and `group_2`
        should have the same shape.

    Returns
    -------
    compare_medians_ms : {float, ndarray}
        If `axis` is None, then returns a float, otherwise returns a 1-D
        ndarray of floats with a length equal to the length of `group_1`
        along `axis`.

    """
    (med_1, med_2) = (ma.median(group_1, axis=axis), ma.median(group_2, axis=axis))
    (std_1, std_2) = (mstats.stde_median(group_1, axis=axis),
                      mstats.stde_median(group_2, axis=axis))
    W = np.abs(med_1 - med_2) / ma.sqrt(std_1**2 + std_2**2)
    return 1 - norm.cdf(W)
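
# Usage sketch (illustrative, not part of the original source): an approximate
# p-value for the difference between the medians of two made-up groups, each
# of size >= 7 as the docstring requires.
def _example_compare_medians_ms():  # pragma: no cover
    group_1 = ma.array([5., 6., 7., 8., 9., 10., 11.])
    group_2 = ma.array([8., 9., 10., 11., 12., 13., 14.])
    return compare_medians_ms(group_1, group_2)
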
def idealfourths(data, axis=None):
    """
    Returns an estimate of the lower and upper quartiles.

    Uses the ideal fourths algorithm.

    Parameters
    ----------
    data : array_like
        Input array.
    axis : int, optional
        Axis along which the quartiles are estimated. If None, the arrays are
        flattened.

    Returns
    -------
    idealfourths : {list of floats, masked array}
        Returns the two internal values that divide `data` into four parts
        using the ideal fourths algorithm either along the flattened array
        (if `axis` is None) or along `axis` of `data`.

    """
    def _idf(data):
        x = data.compressed()
        n = len(x)
        if n < 3:
            return [np.nan, np.nan]

        (j, h) = divmod(n/4. + 5/12., 1)
        j = int(j)
        qlo = (1-h)*x[j-1] + h*x[j]
        k = n - j
        qup = (1-h)*x[k] + h*x[k-1]
        return [qlo, qup]

    data = ma.sort(data, axis=axis).view(MaskedArray)
    if (axis is None):
        return _idf(data)
    else:
        return ma.apply_along_axis(_idf, axis, data)
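
# Usage sketch (illustrative, not part of the original source): ideal-fourths
# estimates of the lower and upper quartiles of a made-up sample, returned as
# a two-element list when axis is None.
def _example_idealfourths():  # pragma: no cover
    sample = ma.array([1., 2., 4., 7., 11., 16., 22., 29., 37., 46.])
    return idealfourths(sample, axis=None)
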
def rsh(data, points=None):
    """
    Evaluates Rosenblatt's shifted histogram estimators for each data point.

    Rosenblatt's estimator is a centered finite-difference approximation to the
    derivative of the empirical cumulative distribution function.

    Parameters
    ----------
    data : sequence
        Input data, should be 1-D. Masked values are ignored.
    points : sequence or None, optional
        Sequence of points at which to evaluate the Rosenblatt shifted
        histogram. If None, use the data.

    """
    data = ma.array(data, copy=False)
    if points is None:
        points = data
    else:
        points = np.array(points, copy=False, ndmin=1)

    if data.ndim != 1:
        raise AttributeError("The input array should be 1D only!")

    n = data.count()
    r = idealfourths(data, axis=None)
    h = 1.2 * (r[-1]-r[0]) / n**(1./5)
    nhi = (data[:, None] <= points[None, :] + h).sum(0)
    nlo = (data[:, None] < points[None, :] - h).sum(0)
    return (nhi-nlo) / (2.*n*h)
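
# Usage sketch (illustrative, not part of the original source): density
# estimates at a few chosen evaluation points for a made-up 1-D sample; when
# points is None the estimator is evaluated at the data values themselves.
def _example_rsh():  # pragma: no cover
    sample = ma.array([0.5, 1.0, 1.2, 1.9, 2.3, 2.4, 3.1, 3.8, 4.0, 4.4])
    return rsh(sample, points=[1.0, 2.0, 3.0, 4.0])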