Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/statsmodels/graphics/gofplots.py : 14%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1from statsmodels.compat.python import lzip
3import numpy as np
4from scipy import stats
6from statsmodels.regression.linear_model import OLS
7from statsmodels.tools.tools import add_constant
8from statsmodels.tools.decorators import cache_readonly
9from statsmodels.distributions import ECDF
10from . import utils
12__all__ = ['qqplot', 'qqplot_2samples', 'qqline', 'ProbPlot']
class ProbPlot(object):
    """
    Q-Q and P-P Probability Plots

    Can take arguments specifying the parameters for dist or fit them
    automatically. (See fit under Parameters.)

    Parameters
    ----------
    data : array_like
        A 1d data array
    dist : callable or str
        Compare x against dist. A scipy.stats or statsmodels distribution.
        A string is looked up as an attribute of scipy.stats. The default
        is scipy.stats.distributions.norm (a standard normal).
    fit : bool
        If fit is false, loc, scale, and distargs are passed to the
        distribution. If fit is True then the parameters for dist
        are fit automatically using dist.fit. The quantiles are formed
        from the standardized data, after subtracting the fitted loc
        and dividing by the fitted scale.
    distargs : tuple
        A tuple of arguments passed to dist to specify it fully
        so dist.ppf may be called. distargs must not contain loc
        or scale. These values must be passed using the loc or
        scale inputs.
    a : float
        Offset for the plotting position of an expected order
        statistic, for example. The plotting positions are given
        by (i - a)/(nobs - 2*a + 1) for i in range(1, nobs+1)
    loc : float
        Location parameter for dist
    scale : float
        Scale parameter for dist

    See Also
    --------
    scipy.stats.probplot

    Notes
    -----
    1) Depends on matplotlib.
    2) If `fit` is True then the parameters are fit using the
       distribution's `fit()` method.
    3) The call signatures for the `qqplot`, `ppplot`, and `probplot`
       methods are similar, so examples 1 through 4 apply to all
       three methods.
    4) The three plotting methods are summarized below:

       ppplot : Probability-Probability plot
           Compares the sample and theoretical probabilities
           (percentiles).
       qqplot : Quantile-Quantile plot
           Compares the sample and theoretical quantiles
       probplot : Probability plot
           Same as a Q-Q plot, however probabilities are shown in the
           scale of the theoretical distribution (x-axis) and the y-axis
           contains unscaled quantiles of the sample data.

    Examples
    --------
    The first example shows a Q-Q plot for regression residuals

    >>> # example 1
    >>> import statsmodels.api as sm
    >>> from matplotlib import pyplot as plt
    >>> data = sm.datasets.longley.load(as_pandas=False)
    >>> data.exog = sm.add_constant(data.exog)
    >>> model = sm.OLS(data.endog, data.exog)
    >>> mod_fit = model.fit()
    >>> res = mod_fit.resid # residuals
    >>> probplot = sm.ProbPlot(res)
    >>> fig = probplot.qqplot()
    >>> h = plt.title('Ex. 1 - qqplot - residuals of OLS fit')
    >>> plt.show()

    qqplot of the residuals against quantiles of t-distribution with 4
    degrees of freedom:

    >>> # example 2
    >>> import scipy.stats as stats
    >>> probplot = sm.ProbPlot(res, stats.t, distargs=(4,))
    >>> fig = probplot.qqplot()
    >>> h = plt.title('Ex. 2 - qqplot - residuals against quantiles of t-dist')
    >>> plt.show()

    qqplot against same as above, but with mean 3 and std 10:

    >>> # example 3
    >>> probplot = sm.ProbPlot(res, stats.t, distargs=(4,), loc=3, scale=10)
    >>> fig = probplot.qqplot()
    >>> h = plt.title('Ex. 3 - qqplot - resids vs quantiles of t-dist')
    >>> plt.show()

    Automatically determine parameters for t distribution including the
    loc and scale:

    >>> # example 4
    >>> probplot = sm.ProbPlot(res, stats.t, fit=True)
    >>> fig = probplot.qqplot(line='45')
    >>> h = plt.title('Ex. 4 - qqplot - resids vs. quantiles of fitted t-dist')
    >>> plt.show()

    A second `ProbPlot` object can be used to compare two separate sample
    sets by using the `other` kwarg in the `qqplot` and `ppplot` methods.

    >>> # example 5
    >>> import numpy as np
    >>> x = np.random.normal(loc=8.25, scale=2.75, size=37)
    >>> y = np.random.normal(loc=8.75, scale=3.25, size=37)
    >>> pp_x = sm.ProbPlot(x, fit=True)
    >>> pp_y = sm.ProbPlot(y, fit=True)
    >>> fig = pp_x.qqplot(line='45', other=pp_y)
    >>> h = plt.title('Ex. 5 - qqplot - compare two sample sets')
    >>> plt.show()

    In qqplot, sample size of `other` can be equal or larger than the first.
    In case of larger, size of `other` samples will be reduced to match the
    size of the first by interpolation

    >>> # example 6
    >>> x = np.random.normal(loc=8.25, scale=2.75, size=37)
    >>> y = np.random.normal(loc=8.75, scale=3.25, size=57)
    >>> pp_x = sm.ProbPlot(x, fit=True)
    >>> pp_y = sm.ProbPlot(y, fit=True)
    >>> fig = pp_x.qqplot(line='45', other=pp_y)
    >>> title = 'Ex. 6 - qqplot - compare different sample sizes'
    >>> h = plt.title(title)
    >>> plt.show()

    In ppplot, sample size of `other` and the first can be different. `other`
    will be used to estimate an empirical cumulative distribution function
    (ECDF). ECDF(x) will be plotted against p(x)=0.5/n, 1.5/n, ..., (n-0.5)/n
    where x are sorted samples from the first.

    >>> # example 7
    >>> x = np.random.normal(loc=8.25, scale=2.75, size=37)
    >>> y = np.random.normal(loc=8.75, scale=3.25, size=57)
    >>> pp_x = sm.ProbPlot(x, fit=True)
    >>> pp_y = sm.ProbPlot(y, fit=True)
    >>> fig = pp_y.ppplot(line='45', other=pp_x)
    >>> h = plt.title('Ex. 7A- ppplot - compare two sample sets, other=pp_x')
    >>> fig = pp_x.ppplot(line='45', other=pp_y)
    >>> h = plt.title('Ex. 7B- ppplot - compare two sample sets, other=pp_y')
    >>> plt.show()

    The following plot displays some options, follow the link to see the
    code.

    .. plot:: plots/graphics_gofplots_qqplot.py
    """

    def __init__(self, data, dist=stats.norm, fit=False, distargs=(), a=0,
                 loc=0, scale=1):

        self.data = data
        self.a = a
        # np.shape works for any array_like (list, tuple, ndarray, ...),
        # whereas `data.shape` requires an object with a shape attribute.
        self.nobs = np.shape(data)[0]
        self.distargs = distargs
        self.fit = fit

        if isinstance(dist, str):
            dist = getattr(stats, dist)

        if fit:
            # Fit all parameters; the last two entries are loc and scale.
            # The comparison distribution is frozen at loc=0, scale=1
            # because the sample itself is standardized instead.
            self.fit_params = dist.fit(data)
            self.loc = self.fit_params[-2]
            self.scale = self.fit_params[-1]
            if len(self.fit_params) > 2:
                self.dist = dist(*self.fit_params[:-2],
                                 **dict(loc=0, scale=1))
            else:
                self.dist = dist(loc=0, scale=1)
        elif distargs or loc != 0 or scale != 1:
            # Freeze the distribution with the user-supplied parameters.
            try:
                self.dist = dist(*distargs, **dict(loc=loc, scale=scale))
            except Exception:
                distargs = ', '.join([str(da) for da in distargs])
                cmd = 'dist({distargs}, loc={loc}, scale={scale})'
                cmd = cmd.format(distargs=distargs, loc=loc, scale=scale)
                raise TypeError('Initializing the distribution failed.  This '
                                'can occur if distargs contains loc or scale. '
                                'The distribution initialization command '
                                'is:\n{cmd}'.format(cmd=cmd))
            self.loc = loc
            self.scale = scale
            self.fit_params = np.r_[distargs, loc, scale]
        else:
            # Default parameters: use the distribution object as passed.
            self.dist = dist
            self.loc = loc
            self.scale = scale
            self.fit_params = np.r_[loc, scale]

        # storage used by the cache_readonly properties below
        self._cache = {}

    @cache_readonly
    def theoretical_percentiles(self):
        """Theoretical percentiles"""
        return plotting_pos(self.nobs, self.a)

    @cache_readonly
    def theoretical_quantiles(self):
        """Theoretical quantiles

        Raises
        ------
        TypeError
            If ``dist.ppf`` raises TypeError, re-raised with a message
            naming the distribution. Other exceptions propagate unchanged.
        """
        try:
            return self.dist.ppf(self.theoretical_percentiles)
        except TypeError:
            # str.format needs a '{0}' placeholder; a '%s' placeholder
            # would be left in the message verbatim.
            msg = '{0} requires more parameters to ' \
                  'compute ppf'.format(self.dist.name,)
            raise TypeError(msg)

    @cache_readonly
    def sorted_data(self):
        """sorted data"""
        sorted_data = np.array(self.data, copy=True)
        sorted_data.sort()
        return sorted_data

    @cache_readonly
    def sample_quantiles(self):
        """sample quantiles

        When the distribution was fit, the sorted data are standardized
        with the fitted loc and scale; otherwise the sorted data are
        returned as they are.
        """
        if self.fit:
            # Always standardize fitted data: the comparison distribution
            # is frozen at loc=0, scale=1, so skipping this whenever only
            # one of loc/scale differs from its default would mis-scale
            # the sample.
            return (self.sorted_data - self.loc) / self.scale
        return self.sorted_data

    @cache_readonly
    def sample_percentiles(self):
        """Sample percentiles"""
        # When not fitting, any loc/scale is already part of the frozen
        # `self.dist` built in __init__, so the raw sorted data are passed
        # to cdf. When fitting, `self.dist` is frozen at loc=0, scale=1 and
        # the standardized sample is used. `sample_quantiles` returns
        # exactly these two cases.
        return self.dist.cdf(self.sample_quantiles)

    def ppplot(self, xlabel=None, ylabel=None, line=None, other=None,
               ax=None, **plotkwargs):
        """
        Plot of the percentiles of x versus the percentiles of a distribution.

        Parameters
        ----------
        xlabel : str or None, optional
            User-provided labels for the x-axis. If None (default),
            other values are used depending on the status of the kwarg `other`.
        ylabel : str or None, optional
            User-provided labels for the y-axis. If None (default),
            other values are used depending on the status of the kwarg `other`.
        line : {None, '45', 's', 'r', 'q'}, optional
            Options for the reference line to which the data is compared:

            - '45': 45-degree line
            - 's': standardized line, the expected order statistics are
              scaled by the standard deviation of the given sample and have
              the mean added to them
            - 'r': A regression line is fit
            - 'q': A line is fit through the quartiles.
            - None: by default no reference line is added to the plot.

        other : ProbPlot, array_like, or None, optional
            If provided, ECDF(x) will be plotted against p(x) where x are
            sorted samples from `self`. ECDF is an empirical cumulative
            distribution function estimated from `other` and
            p(x) = 0.5/n, 1.5/n, ..., (n-0.5)/n where n is the number of
            samples in `self`. If an array-object is provided, it will be
            turned into a `ProbPlot` instance using default parameters. If
            not provided (default), `self.dist.cdf(x)` is plotted against
            p(x).

        ax : AxesSubplot, optional
            If given, this subplot is used to plot in instead of a new figure
            being created.
        **plotkwargs
            Additional arguments to be passed to the `plot` command.

        Returns
        -------
        Figure
            If `ax` is None, the created figure.  Otherwise the figure to which
            `ax` is connected.
        """
        if other is not None:
            check_other = isinstance(other, ProbPlot)
            if not check_other:
                other = ProbPlot(other)

            p_x = self.theoretical_percentiles
            ecdf_x = ECDF(other.sample_quantiles)(self.sample_quantiles)

            fig, ax = _do_plot(p_x, ecdf_x, self.dist, ax=ax, line=line,
                               **plotkwargs)

            if xlabel is None:
                xlabel = 'Probabilities of 2nd Sample'
            if ylabel is None:
                ylabel = 'Probabilities of 1st Sample'

        else:
            fig, ax = _do_plot(self.theoretical_percentiles,
                               self.sample_percentiles,
                               self.dist, ax=ax, line=line,
                               **plotkwargs)
            if xlabel is None:
                xlabel = "Theoretical Probabilities"
            if ylabel is None:
                ylabel = "Sample Probabilities"

        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)

        # probabilities live on the unit interval
        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.0])

        return fig

    def qqplot(self, xlabel=None, ylabel=None, line=None, other=None,
               ax=None, **plotkwargs):
        """
        Plot of the quantiles of x versus the quantiles/ppf of a distribution.

        Can also be used to plot against the quantiles of another `ProbPlot`
        instance.

        Parameters
        ----------
        xlabel : {None, str}
            User-provided labels for the x-axis. If None (default),
            other values are used depending on the status of the kwarg `other`.
        ylabel : {None, str}
            User-provided labels for the y-axis. If None (default),
            other values are used depending on the status of the kwarg `other`.
        line : {None, '45', 's', 'r', 'q'}, optional
            Options for the reference line to which the data is compared:

            - '45' - 45-degree line
            - 's' - standardized line, the expected order statistics are scaled
              by the standard deviation of the given sample and have the mean
              added to them
            - 'r' - A regression line is fit
            - 'q' - A line is fit through the quartiles.
            - None - by default no reference line is added to the plot.

        other : {ProbPlot, array_like, None}, optional
            If provided, the sample quantiles of this `ProbPlot` instance are
            plotted against the sample quantiles of the `other` `ProbPlot`
            instance. Sample size of `other` must be equal or larger than
            this `ProbPlot` instance. If the sample size is larger, sample
            quantiles of `other` will be interpolated to match the sample size
            of this `ProbPlot` instance. If an array-like object is provided,
            it will be turned into a `ProbPlot` instance using default
            parameters. If not provided (default), the theoretical quantiles
            are used.
        ax : AxesSubplot, optional
            If given, this subplot is used to plot in instead of a new figure
            being created.
        **plotkwargs
            Additional arguments to be passed to the `plot` command.

        Returns
        -------
        Figure
            If `ax` is None, the created figure.  Otherwise the figure to which
            `ax` is connected.

        Raises
        ------
        ValueError
            If `other` has fewer observations than this instance.
        """
        if other is not None:
            check_other = isinstance(other, ProbPlot)
            if not check_other:
                other = ProbPlot(other)

            s_self = self.sample_quantiles
            s_other = other.sample_quantiles

            if len(s_self) > len(s_other):
                raise ValueError("Sample size of `other` must be equal or " +
                                 "larger than this `ProbPlot` instance")
            elif len(s_self) < len(s_other):
                # Use quantiles of the smaller set and interpolate quantiles of
                # the larger data set
                p = plotting_pos(self.nobs, self.a)
                s_other = stats.mstats.mquantiles(s_other, p)

            # `other` goes on the x-axis, this instance on the y-axis
            fig, ax = _do_plot(s_other, s_self, self.dist, ax=ax, line=line,
                               **plotkwargs)

            if xlabel is None:
                xlabel = 'Quantiles of 2nd Sample'
            if ylabel is None:
                ylabel = 'Quantiles of 1st Sample'

        else:
            fig, ax = _do_plot(self.theoretical_quantiles,
                               self.sample_quantiles,
                               self.dist, ax=ax, line=line,
                               **plotkwargs)
            if xlabel is None:
                xlabel = "Theoretical Quantiles"
            if ylabel is None:
                ylabel = "Sample Quantiles"

        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)

        return fig

    def probplot(self, xlabel=None, ylabel=None, line=None,
                 exceed=False, ax=None, **plotkwargs):
        """
        Plot of unscaled quantiles of x against the prob of a distribution.

        The x-axis is scaled linearly with the quantiles, but the probabilities
        are used to label the axis.

        Parameters
        ----------
        xlabel : {None, str}, optional
            User-provided labels for the x-axis. If None (default),
            'Probability of Exceedance (%)' is used when `exceed` is True
            and 'Non-exceedance Probability (%)' otherwise.
        ylabel : {None, str}, optional
            User-provided labels for the y-axis. If None (default),
            'Sample Quantiles' is used.
        line : {None, '45', 's', 'r', 'q'}, optional
            Options for the reference line to which the data is compared:

            - '45' - 45-degree line
            - 's' - standardized line, the expected order statistics are scaled
              by the standard deviation of the given sample and have the mean
              added to them
            - 'r' - A regression line is fit
            - 'q' - A line is fit through the quartiles.
            - None - by default no reference line is added to the plot.

        exceed : bool, optional
            If False (default) the raw sample quantiles are plotted against
            the theoretical quantiles, show the probability that a sample will
            not exceed a given value. If True, the theoretical quantiles are
            flipped such that the figure displays the probability that a
            sample will exceed a given value.
        ax : AxesSubplot, optional
            If given, this subplot is used to plot in instead of a new figure
            being created.
        **plotkwargs
            Additional arguments to be passed to the `plot` command.

        Returns
        -------
        Figure
            If `ax` is None, the created figure.  Otherwise the figure to which
            `ax` is connected.
        """
        if exceed:
            # reversed theoretical quantiles flip the probability axis
            fig, ax = _do_plot(self.theoretical_quantiles[::-1],
                               self.sorted_data,
                               self.dist, ax=ax, line=line,
                               **plotkwargs)
            if xlabel is None:
                xlabel = 'Probability of Exceedance (%)'

        else:
            fig, ax = _do_plot(self.theoretical_quantiles,
                               self.sorted_data,
                               self.dist, ax=ax, line=line,
                               **plotkwargs)
            if xlabel is None:
                xlabel = 'Non-exceedance Probability (%)'

        if ylabel is None:
            ylabel = "Sample Quantiles"

        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        # relabel the quantile axis with the corresponding probabilities
        _fmt_probplot_axis(ax, self.dist, self.nobs)

        return fig
def qqplot(data, dist=stats.norm, distargs=(), a=0, loc=0, scale=1, fit=False,
           line=None, ax=None, **plotkwargs):
    """
    Q-Q plot of the quantiles of x versus the quantiles/ppf of a distribution.

    Convenience wrapper: builds a `ProbPlot` from the arguments and returns
    the result of its `qqplot` method. The distribution parameters can be
    given explicitly or fit automatically (see `fit`).

    Parameters
    ----------
    data : array_like
        A 1d data array.
    dist : callable
        Comparison distribution. The default is
        scipy.stats.distributions.norm (a standard normal).
    distargs : tuple
        A tuple of arguments passed to dist to specify it fully
        so dist.ppf may be called.
    a : float
        Offset for the plotting position of an expected order statistic, for
        example. The plotting positions are given by (i - a)/(nobs - 2*a + 1)
        for i in range(1, nobs+1)
    loc : float
        Location parameter for dist
    scale : float
        Scale parameter for dist
    fit : bool
        If fit is false, loc, scale, and distargs are passed to the
        distribution. If fit is True then the parameters for dist
        are fit automatically using dist.fit. The quantiles are formed
        from the standardized data, after subtracting the fitted loc
        and dividing by the fitted scale.
    line : {None, '45', 's', 'r', 'q'}
        Options for the reference line to which the data is compared:

        - '45' - 45-degree line
        - 's' - standardized line, the expected order statistics are scaled
          by the standard deviation of the given sample and have the mean
          added to them
        - 'r' - A regression line is fit
        - 'q' - A line is fit through the quartiles.
        - None - by default no reference line is added to the plot.

    ax : AxesSubplot, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    **plotkwargs
        Additional matplotlib arguments to be passed to the `plot` command.

    Returns
    -------
    Figure
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.

    See Also
    --------
    scipy.stats.probplot

    Notes
    -----
    Depends on matplotlib. If `fit` is True then the parameters are fit using
    the distribution's fit() method.

    Examples
    --------
    >>> import statsmodels.api as sm
    >>> from matplotlib import pyplot as plt
    >>> data = sm.datasets.longley.load(as_pandas=False)
    >>> data.exog = sm.add_constant(data.exog)
    >>> mod_fit = sm.OLS(data.endog, data.exog).fit()
    >>> res = mod_fit.resid # residuals
    >>> fig = sm.qqplot(res)
    >>> plt.show()

    qqplot of the residuals against quantiles of t-distribution with 4 degrees
    of freedom:

    >>> import scipy.stats as stats
    >>> fig = sm.qqplot(res, stats.t, distargs=(4,))
    >>> plt.show()

    qqplot against same as above, but with mean 3 and std 10:

    >>> fig = sm.qqplot(res, stats.t, distargs=(4,), loc=3, scale=10)
    >>> plt.show()

    Automatically determine parameters for t distribution including the
    loc and scale:

    >>> fig = sm.qqplot(res, stats.t, fit=True, line='45')
    >>> plt.show()

    The following plot displays some options, follow the link to see the code.

    .. plot:: plots/graphics_gofplots_qqplot.py
    """
    plotter = ProbPlot(data, dist=dist, fit=fit, distargs=distargs,
                       a=a, loc=loc, scale=scale)
    return plotter.qqplot(ax=ax, line=line, **plotkwargs)
def qqplot_2samples(data1, data2, xlabel=None, ylabel=None, line=None,
                    ax=None):
    """
    Q-Q Plot of two samples' quantiles.

    Can take either two `ProbPlot` instances or two array-like objects. In the
    case of the latter, both inputs will be converted to `ProbPlot` instances
    using only the default values - so use `ProbPlot` instances if
    finer-grained control of the quantile computations is required.

    Parameters
    ----------
    data1 : {array_like, ProbPlot}
        First sample. `qqplot` is called on this instance, which draws its
        sample quantiles on the y axis.
    data2 : {array_like, ProbPlot}
        Second sample, passed as `other`; its sample quantiles are drawn on
        the x axis.
    xlabel : {None, str}
        User-provided labels for the x-axis. If None (default),
        other values are used.
    ylabel : {None, str}
        User-provided labels for the y-axis. If None (default),
        other values are used.
    line : {None, '45', 's', 'r', 'q'}
        Options for the reference line to which the data is compared:

        - '45' - 45-degree line
        - 's' - standardized line, the expected order statistics are scaled
          by the standard deviation of the given sample and have the mean
          added to them
        - 'r' - A regression line is fit
        - 'q' - A line is fit through the quartiles.
        - None - by default no reference line is added to the plot.

    ax : AxesSubplot, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.

    Returns
    -------
    Figure
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.

    See Also
    --------
    scipy.stats.probplot

    Notes
    -----
    1) Depends on matplotlib.
    2) If `data1` and `data2` are not `ProbPlot` instances, instances will be
       created using the default parameters. Therefore, it is recommended to
       use `ProbPlot` instance if fine-grained control is needed in the
       computation of the quantiles.

    Examples
    --------
    >>> import statsmodels.api as sm
    >>> import numpy as np
    >>> import matplotlib.pyplot as plt
    >>> from statsmodels.graphics.gofplots import qqplot_2samples
    >>> x = np.random.normal(loc=8.5, scale=2.5, size=37)
    >>> y = np.random.normal(loc=8.0, scale=3.0, size=37)
    >>> pp_x = sm.ProbPlot(x)
    >>> pp_y = sm.ProbPlot(y)
    >>> qqplot_2samples(pp_x, pp_y)
    >>> plt.show()

    .. plot:: plots/graphics_gofplots_qqplot_2samples.py

    >>> fig = qqplot_2samples(pp_x, pp_y, xlabel=None, ylabel=None,
    ...                       line=None, ax=None)
    """
    # Wrap plain array-likes with default ProbPlot settings.
    first = data1 if isinstance(data1, ProbPlot) else ProbPlot(data1)
    second = data2 if isinstance(data2, ProbPlot) else ProbPlot(data2)

    return first.qqplot(xlabel=xlabel, ylabel=ylabel,
                        line=line, other=second, ax=ax)
def qqline(ax, line, x=None, y=None, dist=None, fmt='r-'):
    """
    Plot a reference line for a qqplot.

    Parameters
    ----------
    ax : matplotlib axes instance
        The axes on which to plot the line
    line : str {'45','r','s','q'}
        Options for the reference line to which the data is compared.:

        - '45' - 45-degree line
        - 's'  - standardized line, the expected order statistics are scaled by
                 the standard deviation of the given sample and have the mean
                 added to them
        - 'r'  - A regression line is fit
        - 'q'  - A line is fit through the quartiles.

        Any other value draws nothing.
    x : ndarray
        X data for plot. Not needed if line is '45'.
    y : ndarray
        Y data for plot. Not needed if line is '45'.
    dist : scipy.stats.distribution
        A scipy.stats distribution, needed if line is 'q'.
    fmt : str, optional
        Line format string passed to `plot`.

    Raises
    ------
    ValueError
        If `line` is not '45' and `x` or `y` is None, or if `line` is 'q'
        and `dist` has no ``ppf`` attribute.

    Notes
    -----
    There is no return value. The line is plotted on the given `ax`.

    Examples
    --------
    Import the food expenditure dataset.  Plot annual food expenditure on x-axis
    and household income on y-axis.  Use qqline to add regression line into the
    plot.

    >>> import statsmodels.api as sm
    >>> import numpy as np
    >>> import matplotlib.pyplot as plt
    >>> from statsmodels.graphics.gofplots import qqline

    >>> foodexp = sm.datasets.engel.load(as_pandas=False)
    >>> x = foodexp.exog
    >>> y = foodexp.endog
    >>> ax = plt.subplot(111)
    >>> plt.scatter(x, y)
    >>> ax.set_xlabel(foodexp.exog_name[0])
    >>> ax.set_ylabel(foodexp.endog_name)
    >>> qqline(ax, 'r', x, y)
    >>> plt.show()

    .. plot:: plots/graphics_gofplots_qqplot_qqline.py
    """
    if line == '45':
        # Span the diagonal over the union of the current x and y limits
        # and make both axes share those limits.
        end_pts = lzip(ax.get_xlim(), ax.get_ylim())
        end_pts[0] = min(end_pts[0])
        end_pts[1] = max(end_pts[1])
        ax.plot(end_pts, end_pts, fmt)
        ax.set_xlim(end_pts)
        ax.set_ylim(end_pts)
        return

    # Every remaining line type reads both x and y, so either one missing
    # is an error (checking only "both missing" would let a single None
    # through and fail later with an unrelated exception).
    if x is None or y is None:
        raise ValueError("If line is not 45, x and y cannot be None.")

    if line == 'r':
        # could use ax.lines[0].get_xdata(), get_ydata(),
        # but do not know axes are 'clean'
        y = OLS(y, add_constant(x)).fit().fittedvalues
        ax.plot(x, y, fmt)
    elif line == 's':
        m, b = y.std(), y.mean()
        ref_line = x * m + b
        ax.plot(x, ref_line, fmt)
    elif line == 'q':
        _check_for_ppf(dist)
        # Line through the first and third quartiles of the sample and of
        # the theoretical distribution.
        q25 = stats.scoreatpercentile(y, 25)
        q75 = stats.scoreatpercentile(y, 75)
        theoretical_quartiles = dist.ppf([0.25, 0.75])
        m = (q75 - q25) / np.diff(theoretical_quartiles)
        b = q25 - m * theoretical_quartiles[0]
        ax.plot(x, m * x + b, fmt)
# about 10x faster than plotting_position in sandbox and mstats
def plotting_pos(nobs, a):
    """
    Generates sequence of plotting positions

    Parameters
    ----------
    nobs : int
        Number of probability points to plot
    a : float
        Offset for the plotting position of an expected order statistic, for
        example.

    Returns
    -------
    plotting_positions : ndarray
        The plotting positions

    Notes
    -----
    The plotting positions are given by (i - a)/(nobs - 2*a + 1) for i in
    range(1, nobs+1)

    See Also
    --------
    scipy.stats.mstats.plotting_positions
    """
    ranks = np.arange(1., nobs + 1)
    denominator = nobs - 2 * a + 1
    return (ranks - a) / denominator
def _fmt_probplot_axis(ax, dist, nobs):
    """
    Formats a theoretical quantile axis to display the corresponding
    probabilities on the quantiles' scale.

    Parameters
    ----------
    ax : AxesSubplot
        The axis to be formatted
    dist : scipy.stats.distribution
        A scipy.stats distribution sufficiently specified to implement its
        ppf() method.
    nobs : scalar
        Number of observations in the sample; larger samples get tick
        probabilities that reach further into the tails.

    Returns
    -------
    There is no return value. This operates on `ax` in place
    """
    _check_for_ppf(dist)

    # Tick probabilities in percent, chosen by sample size.
    if nobs < 50:
        percents = [1, 2, 5, 10, 20, 30, 40, 50, 60,
                    70, 80, 90, 95, 98, 99, ]
    elif nobs < 500:
        percents = [0.1, 0.2, 0.5, 1, 2, 5, 10, 20, 30, 40, 50, 60,
                    70, 80, 90, 95, 98, 99, 99.5, 99.8,
                    99.9]
    else:
        percents = [0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10,
                    20, 30, 40, 50, 60, 70, 80, 90, 95, 98, 99,
                    99.5, 99.8, 99.9, 99.95, 99.98, 99.99]
    axis_probs = np.array(percents) / 100.0

    # Ticks sit at the quantiles but are labelled with the probabilities.
    axis_qntls = dist.ppf(axis_probs)
    ax.set_xticks(axis_qntls)
    ax.set_xticklabels(axis_probs*100, rotation=45,
                       rotation_mode='anchor',
                       horizontalalignment='right',
                       verticalalignment='center')
    lower, upper = axis_qntls.min(), axis_qntls.max()
    ax.set_xlim([lower, upper])
def _do_plot(x, y, dist=None, line=False, ax=None, fmt='bo', **kwargs):
    """
    Boiler plate plotting function for the `ppplot`, `qqplot`, and
    `probplot` methods of the `ProbPlot` class

    Parameters
    ----------
    x, y : array_like
        Data to be plotted
    dist : scipy.stats.distribution
        A scipy.stats distribution, needed if `line` is 'q'.
    line : str {'45', 's', 'r', 'q'} or None
        Options for the reference line to which the data is compared.
        Any falsy value means no reference line is drawn.
    ax : AxesSubplot, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    fmt : str, optional
        matplotlib-compatible formatting string for the data markers
    kwargs : keywords
        These are passed to matplotlib.plot

    Returns
    -------
    fig : Figure
        The figure containing `ax`.
    ax : AxesSubplot
        The original axes if provided.  Otherwise a new instance.

    Raises
    ------
    ValueError
        If `line` is truthy but not one of 'r', 'q', '45', 's'.
    """
    fig, ax = utils.create_mpl_ax(ax)
    ax.set_xmargin(0.02)
    ax.plot(x, y, fmt, **kwargs)

    # No reference line requested: the scatter alone is the result.
    if not line:
        return fig, ax

    if line not in ('r', 'q', '45', 's'):
        raise ValueError("%s option for line not understood" % line)

    qqline(ax, line, x=x, y=y, dist=dist)
    return fig, ax
def _check_for_ppf(dist):
    """Raise ValueError unless `dist` has a ``ppf`` attribute."""
    if hasattr(dist, 'ppf'):
        return
    raise ValueError("distribution must have a ppf method")