Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/scipy/stats/_stats_mstats_common.py : 8%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1from collections import namedtuple
3import numpy as np
5from . import distributions
8__all__ = ['_find_repeats', 'linregress', 'theilslopes', 'siegelslopes']
10LinregressResult = namedtuple('LinregressResult', ('slope', 'intercept',
11 'rvalue', 'pvalue',
12 'stderr'))
15def linregress(x, y=None):
16 """
17 Calculate a linear least-squares regression for two sets of measurements.
19 Parameters
20 ----------
21 x, y : array_like
22 Two sets of measurements. Both arrays should have the same length. If
23 only `x` is given (and ``y=None``), then it must be a two-dimensional
24 array where one dimension has length 2. The two sets of measurements
25 are then found by splitting the array along the length-2 dimension. In
26 the case where ``y=None`` and `x` is a 2x2 array, ``linregress(x)`` is
27 equivalent to ``linregress(x[0], x[1])``.
29 Returns
30 -------
31 slope : float
32 Slope of the regression line.
33 intercept : float
34 Intercept of the regression line.
35 rvalue : float
36 Correlation coefficient.
37 pvalue : float
38 Two-sided p-value for a hypothesis test whose null hypothesis is
39 that the slope is zero, using Wald Test with t-distribution of
40 the test statistic.
41 stderr : float
42 Standard error of the estimated gradient.
44 See also
45 --------
46 :func:`scipy.optimize.curve_fit` : Use non-linear
47 least squares to fit a function to data.
48 :func:`scipy.optimize.leastsq` : Minimize the sum of
49 squares of a set of equations.
51 Notes
52 -----
53 Missing values are considered pair-wise: if a value is missing in `x`,
54 the corresponding value in `y` is masked.
56 Examples
57 --------
58 >>> import matplotlib.pyplot as plt
59 >>> from scipy import stats
61 Generate some data:
63 >>> np.random.seed(12345678)
64 >>> x = np.random.random(10)
65 >>> y = 1.6*x + np.random.random(10)
67 Perform the linear regression:
69 >>> slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
70 >>> print("slope: %f intercept: %f" % (slope, intercept))
71 slope: 1.944864 intercept: 0.268578
73 To get coefficient of determination (R-squared):
75 >>> print("R-squared: %f" % r_value**2)
76 R-squared: 0.735498
78 Plot the data along with the fitted line:
80 >>> plt.plot(x, y, 'o', label='original data')
81 >>> plt.plot(x, intercept + slope*x, 'r', label='fitted line')
82 >>> plt.legend()
83 >>> plt.show()
85 Example for the case where only x is provided as a 2x2 array:
87 >>> x = np.array([[0, 1], [0, 2]])
88 >>> r = stats.linregress(x)
89 >>> r.slope, r.intercept
90 (2.0, 0.0)
92 """
93 TINY = 1.0e-20
94 if y is None: # x is a (2, N) or (N, 2) shaped array_like
95 x = np.asarray(x)
96 if x.shape[0] == 2:
97 x, y = x
98 elif x.shape[1] == 2:
99 x, y = x.T
100 else:
101 msg = ("If only `x` is given as input, it has to be of shape "
102 "(2, N) or (N, 2), provided shape was %s" % str(x.shape))
103 raise ValueError(msg)
104 else:
105 x = np.asarray(x)
106 y = np.asarray(y)
108 if x.size == 0 or y.size == 0:
109 raise ValueError("Inputs must not be empty.")
111 n = len(x)
112 xmean = np.mean(x, None)
113 ymean = np.mean(y, None)
115 # average sum of squares:
116 ssxm, ssxym, ssyxm, ssym = np.cov(x, y, bias=1).flat
117 r_num = ssxym
118 r_den = np.sqrt(ssxm * ssym)
119 if r_den == 0.0:
120 r = 0.0
121 else:
122 r = r_num / r_den
123 # test for numerical error propagation
124 if r > 1.0:
125 r = 1.0
126 elif r < -1.0:
127 r = -1.0
129 df = n - 2
130 slope = r_num / ssxm
131 intercept = ymean - slope*xmean
132 if n == 2:
133 # handle case when only two points are passed in
134 if y[0] == y[1]:
135 prob = 1.0
136 else:
137 prob = 0.0
138 sterrest = 0.0
139 else:
140 t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY)))
141 prob = 2 * distributions.t.sf(np.abs(t), df)
142 sterrest = np.sqrt((1 - r**2) * ssym / ssxm / df)
144 return LinregressResult(slope, intercept, r, prob, sterrest)
147def theilslopes(y, x=None, alpha=0.95):
148 r"""
149 Computes the Theil-Sen estimator for a set of points (x, y).
151 `theilslopes` implements a method for robust linear regression. It
152 computes the slope as the median of all slopes between paired values.
154 Parameters
155 ----------
156 y : array_like
157 Dependent variable.
158 x : array_like or None, optional
159 Independent variable. If None, use ``arange(len(y))`` instead.
160 alpha : float, optional
161 Confidence degree between 0 and 1. Default is 95% confidence.
162 Note that `alpha` is symmetric around 0.5, i.e. both 0.1 and 0.9 are
163 interpreted as "find the 90% confidence interval".
165 Returns
166 -------
167 medslope : float
168 Theil slope.
169 medintercept : float
170 Intercept of the Theil line, as ``median(y) - medslope*median(x)``.
171 lo_slope : float
172 Lower bound of the confidence interval on `medslope`.
173 up_slope : float
174 Upper bound of the confidence interval on `medslope`.
176 See also
177 --------
178 siegelslopes : a similar technique using repeated medians
180 Notes
181 -----
182 The implementation of `theilslopes` follows [1]_. The intercept is
183 not defined in [1]_, and here it is defined as ``median(y) -
184 medslope*median(x)``, which is given in [3]_. Other definitions of
185 the intercept exist in the literature. A confidence interval for
186 the intercept is not given as this question is not addressed in
187 [1]_.
189 References
190 ----------
191 .. [1] P.K. Sen, "Estimates of the regression coefficient based on Kendall's tau",
192 J. Am. Stat. Assoc., Vol. 63, pp. 1379-1389, 1968.
193 .. [2] H. Theil, "A rank-invariant method of linear and polynomial
194 regression analysis I, II and III", Nederl. Akad. Wetensch., Proc.
195 53:, pp. 386-392, pp. 521-525, pp. 1397-1412, 1950.
196 .. [3] W.L. Conover, "Practical nonparametric statistics", 2nd ed.,
197 John Wiley and Sons, New York, pp. 493.
199 Examples
200 --------
201 >>> from scipy import stats
202 >>> import matplotlib.pyplot as plt
204 >>> x = np.linspace(-5, 5, num=150)
205 >>> y = x + np.random.normal(size=x.size)
206 >>> y[11:15] += 10 # add outliers
207 >>> y[-5:] -= 7
209 Compute the slope, intercept and 90% confidence interval. For comparison,
210 also compute the least-squares fit with `linregress`:
212 >>> res = stats.theilslopes(y, x, 0.90)
213 >>> lsq_res = stats.linregress(x, y)
215 Plot the results. The Theil-Sen regression line is shown in red, with the
216 dashed red lines illustrating the confidence interval of the slope (note
217 that the dashed red lines are not the confidence interval of the regression
218 as the confidence interval of the intercept is not included). The green
219 line shows the least-squares fit for comparison.
221 >>> fig = plt.figure()
222 >>> ax = fig.add_subplot(111)
223 >>> ax.plot(x, y, 'b.')
224 >>> ax.plot(x, res[1] + res[0] * x, 'r-')
225 >>> ax.plot(x, res[1] + res[2] * x, 'r--')
226 >>> ax.plot(x, res[1] + res[3] * x, 'r--')
227 >>> ax.plot(x, lsq_res[1] + lsq_res[0] * x, 'g-')
228 >>> plt.show()
230 """
231 # We copy both x and y so we can use _find_repeats.
232 y = np.array(y).flatten()
233 if x is None:
234 x = np.arange(len(y), dtype=float)
235 else:
236 x = np.array(x, dtype=float).flatten()
237 if len(x) != len(y):
238 raise ValueError("Incompatible lengths ! (%s<>%s)" % (len(y), len(x)))
240 # Compute sorted slopes only when deltax > 0
241 deltax = x[:, np.newaxis] - x
242 deltay = y[:, np.newaxis] - y
243 slopes = deltay[deltax > 0] / deltax[deltax > 0]
244 slopes.sort()
245 medslope = np.median(slopes)
246 medinter = np.median(y) - medslope * np.median(x)
247 # Now compute confidence intervals
248 if alpha > 0.5:
249 alpha = 1. - alpha
251 z = distributions.norm.ppf(alpha / 2.)
252 # This implements (2.6) from Sen (1968)
253 _, nxreps = _find_repeats(x)
254 _, nyreps = _find_repeats(y)
255 nt = len(slopes) # N in Sen (1968)
256 ny = len(y) # n in Sen (1968)
257 # Equation 2.6 in Sen (1968):
258 sigsq = 1/18. * (ny * (ny-1) * (2*ny+5) -
259 sum(k * (k-1) * (2*k + 5) for k in nxreps) -
260 sum(k * (k-1) * (2*k + 5) for k in nyreps))
261 # Find the confidence interval indices in `slopes`
262 sigma = np.sqrt(sigsq)
263 Ru = min(int(np.round((nt - z*sigma)/2.)), len(slopes)-1)
264 Rl = max(int(np.round((nt + z*sigma)/2.)) - 1, 0)
265 delta = slopes[[Rl, Ru]]
266 return medslope, medinter, delta[0], delta[1]
269def _find_repeats(arr):
270 # This function assumes it may clobber its input.
271 if len(arr) == 0:
272 return np.array(0, np.float64), np.array(0, np.intp)
274 # XXX This cast was previously needed for the Fortran implementation,
275 # should we ditch it?
276 arr = np.asarray(arr, np.float64).ravel()
277 arr.sort()
279 # Taken from NumPy 1.9's np.unique.
280 change = np.concatenate(([True], arr[1:] != arr[:-1]))
281 unique = arr[change]
282 change_idx = np.concatenate(np.nonzero(change) + ([arr.size],))
283 freq = np.diff(change_idx)
284 atleast2 = freq > 1
285 return unique[atleast2], freq[atleast2]
288def siegelslopes(y, x=None, method="hierarchical"):
289 r"""
290 Computes the Siegel estimator for a set of points (x, y).
292 `siegelslopes` implements a method for robust linear regression
293 using repeated medians (see [1]_) to fit a line to the points (x, y).
294 The method is robust to outliers with an asymptotic breakdown point
295 of 50%.
297 Parameters
298 ----------
299 y : array_like
300 Dependent variable.
301 x : array_like or None, optional
302 Independent variable. If None, use ``arange(len(y))`` instead.
303 method : {'hierarchical', 'separate'}
304 If 'hierarchical', estimate the intercept using the estimated
305 slope ``medslope`` (default option).
306 If 'separate', estimate the intercept independent of the estimated
307 slope. See Notes for details.
309 Returns
310 -------
311 medslope : float
312 Estimate of the slope of the regression line.
313 medintercept : float
314 Estimate of the intercept of the regression line.
316 See also
317 --------
318 theilslopes : a similar technique without repeated medians
320 Notes
321 -----
322 With ``n = len(y)``, compute ``m_j`` as the median of
323 the slopes from the point ``(x[j], y[j])`` to all other `n-1` points.
324 ``medslope`` is then the median of all slopes ``m_j``.
325 Two ways are given to estimate the intercept in [1]_ which can be chosen
326 via the parameter ``method``.
327 The hierarchical approach uses the estimated slope ``medslope``
328 and computes ``medintercept`` as the median of ``y - medslope*x``.
329 The other approach estimates the intercept separately as follows: for
330 each point ``(x[j], y[j])``, compute the intercepts of all the `n-1`
331 lines through the remaining points and take the median ``i_j``.
332 ``medintercept`` is the median of the ``i_j``.
334 The implementation computes `n` times the median of a vector of size `n`
335 which can be slow for large vectors. There are more efficient algorithms
336 (see [2]_) which are not implemented here.
338 References
339 ----------
340 .. [1] A. Siegel, "Robust Regression Using Repeated Medians",
341 Biometrika, Vol. 69, pp. 242-244, 1982.
343 .. [2] A. Stein and M. Werman, "Finding the repeated median regression
344 line", Proceedings of the Third Annual ACM-SIAM Symposium on
345 Discrete Algorithms, pp. 409-413, 1992.
347 Examples
348 --------
349 >>> from scipy import stats
350 >>> import matplotlib.pyplot as plt
352 >>> x = np.linspace(-5, 5, num=150)
353 >>> y = x + np.random.normal(size=x.size)
354 >>> y[11:15] += 10 # add outliers
355 >>> y[-5:] -= 7
357 Compute the slope and intercept. For comparison, also compute the
358 least-squares fit with `linregress`:
360 >>> res = stats.siegelslopes(y, x)
361 >>> lsq_res = stats.linregress(x, y)
363 Plot the results. The Siegel regression line is shown in red. The green
364 line shows the least-squares fit for comparison.
366 >>> fig = plt.figure()
367 >>> ax = fig.add_subplot(111)
368 >>> ax.plot(x, y, 'b.')
369 >>> ax.plot(x, res[1] + res[0] * x, 'r-')
370 >>> ax.plot(x, lsq_res[1] + lsq_res[0] * x, 'g-')
371 >>> plt.show()
373 """
374 if method not in ['hierarchical', 'separate']:
375 raise ValueError("method can only be 'hierarchical' or 'separate'")
376 y = np.asarray(y).ravel()
377 if x is None:
378 x = np.arange(len(y), dtype=float)
379 else:
380 x = np.asarray(x, dtype=float).ravel()
381 if len(x) != len(y):
382 raise ValueError("Incompatible lengths ! (%s<>%s)" % (len(y), len(x)))
384 deltax = x[:, np.newaxis] - x
385 deltay = y[:, np.newaxis] - y
386 slopes, intercepts = [], []
388 for j in range(len(x)):
389 id_nonzero = deltax[j, :] != 0
390 slopes_j = deltay[j, id_nonzero] / deltax[j, id_nonzero]
391 medslope_j = np.median(slopes_j)
392 slopes.append(medslope_j)
393 if method == 'separate':
394 z = y*x[j] - y[j]*x
395 medintercept_j = np.median(z[id_nonzero] / deltax[j, id_nonzero])
396 intercepts.append(medintercept_j)
398 medslope = np.median(np.asarray(slopes))
399 if method == "separate":
400 medinter = np.median(np.asarray(intercepts))
401 else:
402 medinter = np.median(y - medslope*x)
404 return medslope, medinter