Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1from statsmodels.compat.python import lzip 

2 

3import numpy as np 

4from scipy import stats 

5 

6from statsmodels.regression.linear_model import OLS 

7from statsmodels.tools.tools import add_constant 

8from statsmodels.tools.decorators import cache_readonly 

9from statsmodels.distributions import ECDF 

10from . import utils 

11 

12__all__ = ['qqplot', 'qqplot_2samples', 'qqline', 'ProbPlot'] 

13 

14 

class ProbPlot(object):
    """
    Q-Q and P-P Probability Plots

    Can take arguments specifying the parameters for dist or fit them
    automatically. (See fit under Parameters.)

    Parameters
    ----------
    data : array_like
        A 1d data array
    dist : callable or str
        Compare x against dist. A scipy.stats or statsmodels distribution. The
        default is scipy.stats.distributions.norm (a standard normal). If a
        string is given, the attribute of that name is looked up in
        scipy.stats.
    fit : bool
        If fit is false, loc, scale, and distargs are passed to the
        distribution. If fit is True then the parameters for dist
        are fit automatically using dist.fit. The quantiles are formed
        from the standardized data, after subtracting the fitted loc
        and dividing by the fitted scale.
    distargs : tuple
        A tuple of arguments passed to dist to specify it fully
        so dist.ppf may be called. distargs must not contain loc
        or scale. These values must be passed using the loc or
        scale inputs.
    a : float
        Offset for the plotting position of an expected order
        statistic, for example. The plotting positions are given
        by (i - a)/(nobs - 2*a + 1) for i in range(1, nobs+1)
    loc : float
        Location parameter for dist
    scale : float
        Scale parameter for dist

    See Also
    --------
    scipy.stats.probplot

    Notes
    -----
    1) Depends on matplotlib.
    2) If `fit` is True then the parameters are fit using the
        distribution's `fit()` method.
    3) The call signatures for the `qqplot`, `ppplot`, and `probplot`
        methods are similar, so examples 1 through 4 apply to all
        three methods.
    4) The three plotting methods are summarized below:
        ppplot : Probability-Probability plot
            Compares the sample and theoretical probabilities (percentiles).
        qqplot : Quantile-Quantile plot
            Compares the sample and theoretical quantiles
        probplot : Probability plot
            Same as a Q-Q plot, however probabilities are shown in the scale of
            the theoretical distribution (x-axis) and the y-axis contains
            unscaled quantiles of the sample data.

    Examples
    --------
    The first example shows a Q-Q plot for regression residuals

    >>> # example 1
    >>> import statsmodels.api as sm
    >>> from matplotlib import pyplot as plt
    >>> data = sm.datasets.longley.load(as_pandas=False)
    >>> data.exog = sm.add_constant(data.exog)
    >>> model = sm.OLS(data.endog, data.exog)
    >>> mod_fit = model.fit()
    >>> res = mod_fit.resid # residuals
    >>> probplot = sm.ProbPlot(res)
    >>> fig = probplot.qqplot()
    >>> h = plt.title('Ex. 1 - qqplot - residuals of OLS fit')
    >>> plt.show()

    qqplot of the residuals against quantiles of t-distribution with 4
    degrees of freedom:

    >>> # example 2
    >>> import scipy.stats as stats
    >>> probplot = sm.ProbPlot(res, stats.t, distargs=(4,))
    >>> fig = probplot.qqplot()
    >>> h = plt.title('Ex. 2 - qqplot - residuals against quantiles of t-dist')
    >>> plt.show()

    qqplot against same as above, but with mean 3 and std 10:

    >>> # example 3
    >>> probplot = sm.ProbPlot(res, stats.t, distargs=(4,), loc=3, scale=10)
    >>> fig = probplot.qqplot()
    >>> h = plt.title('Ex. 3 - qqplot - resids vs quantiles of t-dist')
    >>> plt.show()

    Automatically determine parameters for t distribution including the
    loc and scale:

    >>> # example 4
    >>> probplot = sm.ProbPlot(res, stats.t, fit=True)
    >>> fig = probplot.qqplot(line='45')
    >>> h = plt.title('Ex. 4 - qqplot - resids vs. quantiles of fitted t-dist')
    >>> plt.show()

    A second `ProbPlot` object can be used to compare two separate sample
    sets by using the `other` kwarg in the `qqplot` and `ppplot` methods.

    >>> # example 5
    >>> import numpy as np
    >>> x = np.random.normal(loc=8.25, scale=2.75, size=37)
    >>> y = np.random.normal(loc=8.75, scale=3.25, size=37)
    >>> pp_x = sm.ProbPlot(x, fit=True)
    >>> pp_y = sm.ProbPlot(y, fit=True)
    >>> fig = pp_x.qqplot(line='45', other=pp_y)
    >>> h = plt.title('Ex. 5 - qqplot - compare two sample sets')
    >>> plt.show()

    In qqplot, sample size of `other` can be equal or larger than the first.
    In case of larger, size of `other` samples will be reduced to match the
    size of the first by interpolation

    >>> # example 6
    >>> x = np.random.normal(loc=8.25, scale=2.75, size=37)
    >>> y = np.random.normal(loc=8.75, scale=3.25, size=57)
    >>> pp_x = sm.ProbPlot(x, fit=True)
    >>> pp_y = sm.ProbPlot(y, fit=True)
    >>> fig = pp_x.qqplot(line='45', other=pp_y)
    >>> title = 'Ex. 6 - qqplot - compare different sample sizes'
    >>> h = plt.title(title)
    >>> plt.show()

    In ppplot, sample size of `other` and the first can be different. `other`
    will be used to estimate an empirical cumulative distribution function
    (ECDF). ECDF(x) will be plotted against p(x)=0.5/n, 1.5/n, ..., (n-0.5)/n
    where x are sorted samples from the first.

    >>> # example 7
    >>> x = np.random.normal(loc=8.25, scale=2.75, size=37)
    >>> y = np.random.normal(loc=8.75, scale=3.25, size=57)
    >>> pp_x = sm.ProbPlot(x, fit=True)
    >>> pp_y = sm.ProbPlot(y, fit=True)
    >>> fig = pp_y.ppplot(line='45', other=pp_x)
    >>> h = plt.title('Ex. 7A- ppplot - compare two sample sets, other=pp_x')
    >>> fig = pp_x.ppplot(line='45', other=pp_y)
    >>> h = plt.title('Ex. 7B- ppplot - compare two sample sets, other=pp_y')
    >>> plt.show()

    The following plot displays some options, follow the link to see the
    code.

    .. plot:: plots/graphics_gofplots_qqplot.py
    """

    def __init__(self, data, dist=stats.norm, fit=False, distargs=(), a=0,
                 loc=0, scale=1):

        self.data = data
        self.a = a
        # np.shape also handles plain sequences, which have no `.shape`.
        self.nobs = np.shape(data)[0]
        self.distargs = distargs
        self.fit = fit

        if isinstance(dist, str):
            dist = getattr(stats, dist)

        # True only when `self.dist` is frozen with the caller's loc/scale,
        # i.e. when it already works on the scale of the raw data.
        self._dist_has_loc_scale = False

        if fit:
            self.fit_params = dist.fit(data)
            self.loc = self.fit_params[-2]
            self.scale = self.fit_params[-1]
            # Freeze only the shape parameters; loc/scale are removed from
            # the data instead (see `sample_quantiles`).
            self.dist = dist(*self.fit_params[:-2], loc=0, scale=1)
        elif distargs or loc != 0 or scale != 1:
            try:
                self.dist = dist(*distargs, loc=loc, scale=scale)
            except Exception:
                args_repr = ', '.join([str(da) for da in distargs])
                cmd = 'dist({distargs}, loc={loc}, scale={scale})'
                cmd = cmd.format(distargs=args_repr, loc=loc, scale=scale)
                raise TypeError('Initializing the distribution failed.  This '
                                'can occur if distargs contains loc or scale. '
                                'The distribution initialization command '
                                'is:\n{cmd}'.format(cmd=cmd))
            self.loc = loc
            self.scale = scale
            self.fit_params = np.r_[distargs, loc, scale]
            self._dist_has_loc_scale = True
        else:
            self.dist = dist
            self.loc = loc
            self.scale = scale
            self.fit_params = np.r_[loc, scale]

        # storage used by the cache_readonly properties
        self._cache = {}

    @cache_readonly
    def theoretical_percentiles(self):
        """Theoretical percentiles"""
        return plotting_pos(self.nobs, self.a)

    @cache_readonly
    def theoretical_quantiles(self):
        """Theoretical quantiles"""
        try:
            return self.dist.ppf(self.theoretical_percentiles)
        except TypeError:
            # Typically an unfrozen distribution that still needs shape
            # parameters.  Fall back to the type name when the object has
            # no `name` attribute so this handler cannot fail itself.
            name = getattr(self.dist, 'name', type(self.dist).__name__)
            msg = '{0} requires more parameters to compute ppf'.format(name)
            raise TypeError(msg)
        # Any other exception propagates unchanged.

    @cache_readonly
    def sorted_data(self):
        """sorted data"""
        sorted_data = np.array(self.data, copy=True)
        sorted_data.sort()
        return sorted_data

    @cache_readonly
    def sample_quantiles(self):
        """sample quantiles"""
        # Standardize whenever the fitted loc OR scale is non-trivial; the
        # previous `and` skipped e.g. a fitted loc != 0 with scale == 1.
        if self.fit and (self.loc != 0 or self.scale != 1):
            return (self.sorted_data - self.loc) / self.scale
        else:
            return self.sorted_data

    @cache_readonly
    def sample_percentiles(self):
        """Sample percentiles"""
        if getattr(self, '_dist_has_loc_scale', False):
            # `self.dist` is already frozen with loc/scale, so evaluating
            # its cdf on standardized data would apply them twice.
            return self.dist.cdf(self.sorted_data)
        quantiles = \
            (self.sorted_data - self.fit_params[-2]) / self.fit_params[-1]
        return self.dist.cdf(quantiles)

    def ppplot(self, xlabel=None, ylabel=None, line=None, other=None,
               ax=None, **plotkwargs):
        """
        Plot of the percentiles of x versus the percentiles of a distribution.

        Parameters
        ----------
        xlabel : str or None, optional
            User-provided labels for the x-axis. If None (default),
            other values are used depending on the status of the kwarg `other`.
        ylabel : str or None, optional
            User-provided labels for the y-axis. If None (default),
            other values are used depending on the status of the kwarg `other`.
        line : {None, '45', 's', 'r', 'q'}, optional
            Options for the reference line to which the data is compared:

            - '45': 45-degree line
            - 's': standardized line, the expected order statistics are
              scaled by the standard deviation of the given sample and have
              the mean added to them
            - 'r': A regression line is fit
            - 'q': A line is fit through the quartiles.
            - None: by default no reference line is added to the plot.

        other : ProbPlot, array_like, or None, optional
            If provided, ECDF(x) will be plotted against p(x) where x are
            sorted samples from `self`. ECDF is an empirical cumulative
            distribution function estimated from `other` and
            p(x) = 0.5/n, 1.5/n, ..., (n-0.5)/n where n is the number of
            samples in `self`. If an array-like object is provided, it will
            be turned into a `ProbPlot` instance using default parameters.
            If not provided (default), `self.dist(x)` is plotted against
            p(x).

        ax : AxesSubplot, optional
            If given, this subplot is used to plot in instead of a new figure
            being created.
        **plotkwargs
            Additional arguments to be passed to the `plot` command.

        Returns
        -------
        Figure
            If `ax` is None, the created figure. Otherwise the figure to which
            `ax` is connected.
        """
        if other is not None:
            check_other = isinstance(other, ProbPlot)
            if not check_other:
                other = ProbPlot(other)

            p_x = self.theoretical_percentiles
            ecdf_x = ECDF(other.sample_quantiles)(self.sample_quantiles)

            fig, ax = _do_plot(p_x, ecdf_x, self.dist, ax=ax, line=line,
                               **plotkwargs)

            if xlabel is None:
                xlabel = 'Probabilities of 2nd Sample'
            if ylabel is None:
                ylabel = 'Probabilities of 1st Sample'

        else:
            fig, ax = _do_plot(self.theoretical_percentiles,
                               self.sample_percentiles,
                               self.dist, ax=ax, line=line,
                               **plotkwargs)
            if xlabel is None:
                xlabel = "Theoretical Probabilities"
            if ylabel is None:
                ylabel = "Sample Probabilities"

        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)

        # both axes show probabilities
        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.0])

        return fig

    def qqplot(self, xlabel=None, ylabel=None, line=None, other=None,
               ax=None, **plotkwargs):
        """
        Plot of the quantiles of x versus the quantiles/ppf of a distribution.

        Can also be used to plot against the quantiles of another `ProbPlot`
        instance.

        Parameters
        ----------
        xlabel : {None, str}
            User-provided labels for the x-axis. If None (default),
            other values are used depending on the status of the kwarg `other`.
        ylabel : {None, str}
            User-provided labels for the y-axis. If None (default),
            other values are used depending on the status of the kwarg `other`.
        line : {None, '45', 's', 'r', 'q'}, optional
            Options for the reference line to which the data is compared:

            - '45' - 45-degree line
            - 's' - standardized line, the expected order statistics are scaled
              by the standard deviation of the given sample and have the mean
              added to them
            - 'r' - A regression line is fit
            - 'q' - A line is fit through the quartiles.
            - None - by default no reference line is added to the plot.

        other : {ProbPlot, array_like, None}, optional
            If provided, the sample quantiles of this `ProbPlot` instance are
            plotted (on the y-axis) against the sample quantiles of the
            `other` `ProbPlot` instance (on the x-axis). Sample size of
            `other` must be equal or larger than this `ProbPlot` instance.
            If the sample size is larger, sample quantiles of `other` will
            be interpolated to match the sample size of this `ProbPlot`
            instance. If an array-like object is provided, it will be turned
            into a `ProbPlot` instance using default parameters. If not
            provided (default), the theoretical quantiles are used.
        ax : AxesSubplot, optional
            If given, this subplot is used to plot in instead of a new figure
            being created.
        **plotkwargs
            Additional arguments to be passed to the `plot` command.

        Returns
        -------
        Figure
            If `ax` is None, the created figure. Otherwise the figure to which
            `ax` is connected.

        Raises
        ------
        ValueError
            If `other` has fewer observations than this instance.
        """
        if other is not None:
            check_other = isinstance(other, ProbPlot)
            if not check_other:
                other = ProbPlot(other)

            s_self = self.sample_quantiles
            s_other = other.sample_quantiles

            if len(s_self) > len(s_other):
                raise ValueError("Sample size of `other` must be equal or " +
                                 "larger than this `ProbPlot` instance")
            elif len(s_self) < len(s_other):
                # Use quantiles of the smaller set and interpolate quantiles of
                # the larger data set
                p = plotting_pos(self.nobs, self.a)
                s_other = stats.mstats.mquantiles(s_other, p)

            fig, ax = _do_plot(s_other, s_self, self.dist, ax=ax, line=line,
                               **plotkwargs)

            if xlabel is None:
                xlabel = 'Quantiles of 2nd Sample'
            if ylabel is None:
                ylabel = 'Quantiles of 1st Sample'

        else:
            fig, ax = _do_plot(self.theoretical_quantiles,
                               self.sample_quantiles,
                               self.dist, ax=ax, line=line,
                               **plotkwargs)
            if xlabel is None:
                xlabel = "Theoretical Quantiles"
            if ylabel is None:
                ylabel = "Sample Quantiles"

        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)

        return fig

    def probplot(self, xlabel=None, ylabel=None, line=None,
                 exceed=False, ax=None, **plotkwargs):
        """
        Plot of unscaled quantiles of x against the prob of a distribution.

        The x-axis is scaled linearly with the quantiles, but the probabilities
        are used to label the axis.

        Parameters
        ----------
        xlabel : {None, str}, optional
            User-provided labels for the x-axis. If None (default),
            a label is chosen depending on `exceed`.
        ylabel : {None, str}, optional
            User-provided labels for the y-axis. If None (default),
            "Sample Quantiles" is used.
        line : {None, '45', 's', 'r', 'q'}, optional
            Options for the reference line to which the data is compared:

            - '45' - 45-degree line
            - 's' - standardized line, the expected order statistics are scaled
              by the standard deviation of the given sample and have the mean
              added to them
            - 'r' - A regression line is fit
            - 'q' - A line is fit through the quartiles.
            - None - by default no reference line is added to the plot.

        exceed : bool, optional
            If False (default) the raw sample quantiles are plotted against
            the theoretical quantiles, showing the probability that a sample
            will not exceed a given value. If True, the theoretical quantiles
            are flipped such that the figure displays the probability that a
            sample will exceed a given value.
        ax : AxesSubplot, optional
            If given, this subplot is used to plot in instead of a new figure
            being created.
        **plotkwargs
            Additional arguments to be passed to the `plot` command.

        Returns
        -------
        Figure
            If `ax` is None, the created figure. Otherwise the figure to which
            `ax` is connected.
        """
        if exceed:
            # reversed theoretical quantiles pair the largest sample value
            # with the smallest theoretical quantile
            fig, ax = _do_plot(self.theoretical_quantiles[::-1],
                               self.sorted_data,
                               self.dist, ax=ax, line=line,
                               **plotkwargs)
            if xlabel is None:
                xlabel = 'Probability of Exceedance (%)'

        else:
            fig, ax = _do_plot(self.theoretical_quantiles,
                               self.sorted_data,
                               self.dist, ax=ax, line=line,
                               **plotkwargs)
            if xlabel is None:
                xlabel = 'Non-exceedance Probability (%)'

        if ylabel is None:
            ylabel = "Sample Quantiles"

        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        _fmt_probplot_axis(ax, self.dist, self.nobs)

        return fig

485 

486 

def qqplot(data, dist=stats.norm, distargs=(), a=0, loc=0, scale=1, fit=False,
           line=None, ax=None, **plotkwargs):
    """
    Draw a Q-Q plot of sample quantiles against a distribution's quantiles.

    Convenience wrapper: builds a `ProbPlot` from the arguments and returns
    the result of its `qqplot` method. The distribution parameters can be
    given explicitly or estimated from the data (see `fit`).

    Parameters
    ----------
    data : array_like
        A 1d data array.
    dist : callable
        Comparison distribution. The default is
        scipy.stats.distributions.norm (a standard normal).
    distargs : tuple
        A tuple of arguments passed to dist to specify it fully
        so dist.ppf may be called.
    a : float
        Offset for the plotting position of an expected order statistic, for
        example. The plotting positions are given by (i - a)/(nobs - 2*a + 1)
        for i in range(1, nobs+1)
    loc : float
        Location parameter for dist
    scale : float
        Scale parameter for dist
    fit : bool
        If fit is false, loc, scale, and distargs are passed to the
        distribution. If fit is True then the parameters for dist
        are fit automatically using dist.fit. The quantiles are formed
        from the standardized data, after subtracting the fitted loc
        and dividing by the fitted scale.
    line : {None, '45', 's', 'r', 'q'}
        Options for the reference line to which the data is compared:

        - '45' - 45-degree line
        - 's' - standardized line, the expected order statistics are scaled
          by the standard deviation of the given sample and have the mean
          added to them
        - 'r' - A regression line is fit
        - 'q' - A line is fit through the quartiles.
        - None - by default no reference line is added to the plot.

    ax : AxesSubplot, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    **plotkwargs
        Additional matplotlib arguments to be passed to the `plot` command.

    Returns
    -------
    Figure
        If `ax` is None, the created figure. Otherwise the figure to which
        `ax` is connected.

    See Also
    --------
    scipy.stats.probplot

    Notes
    -----
    Depends on matplotlib. If `fit` is True then the parameters are fit using
    the distribution's fit() method.

    Examples
    --------
    >>> import statsmodels.api as sm
    >>> from matplotlib import pyplot as plt
    >>> data = sm.datasets.longley.load(as_pandas=False)
    >>> data.exog = sm.add_constant(data.exog)
    >>> mod_fit = sm.OLS(data.endog, data.exog).fit()
    >>> res = mod_fit.resid # residuals
    >>> fig = sm.qqplot(res)
    >>> plt.show()

    qqplot of the residuals against quantiles of t-distribution with 4 degrees
    of freedom:

    >>> import scipy.stats as stats
    >>> fig = sm.qqplot(res, stats.t, distargs=(4,))
    >>> plt.show()

    qqplot against same as above, but with mean 3 and std 10:

    >>> fig = sm.qqplot(res, stats.t, distargs=(4,), loc=3, scale=10)
    >>> plt.show()

    Automatically determine parameters for t distribution including the
    loc and scale:

    >>> fig = sm.qqplot(res, stats.t, fit=True, line='45')
    >>> plt.show()

    The following plot displays some options, follow the link to see the code.

    .. plot:: plots/graphics_gofplots_qqplot.py
    """
    plotter = ProbPlot(data, dist=dist, distargs=distargs, fit=fit, a=a,
                       loc=loc, scale=scale)
    return plotter.qqplot(ax=ax, line=line, **plotkwargs)

588 

589 

def qqplot_2samples(data1, data2, xlabel=None, ylabel=None, line=None,
                    ax=None):
    """
    Q-Q Plot of two samples' quantiles.

    Can take either two `ProbPlot` instances or two array-like objects. In the
    case of the latter, both inputs will be converted to `ProbPlot` instances
    using only the default values - so use `ProbPlot` instances if
    finer-grained control of the quantile computations is required.

    The plot is produced by ``data1.qqplot(other=data2, ...)``.

    Parameters
    ----------
    data1 : {array_like, ProbPlot}
        First sample. Its sample quantiles are drawn along the y axis.
    data2 : {array_like, ProbPlot}
        Second sample. Its sample quantiles are drawn along the x axis. Its
        sample size must be equal to or larger than that of `data1`; if
        larger, its quantiles are interpolated to the size of `data1`.
    xlabel : {None, str}
        User-provided labels for the x-axis. If None (default),
        'Quantiles of 2nd Sample' is used.
    ylabel : {None, str}
        User-provided labels for the y-axis. If None (default),
        'Quantiles of 1st Sample' is used.
    line : {None, '45', 's', 'r', 'q'}
        Options for the reference line to which the data is compared:

        - '45' - 45-degree line
        - 's' - standardized line, the expected order statistics are scaled
          by the standard deviation of the given sample and have the mean
          added to them
        - 'r' - A regression line is fit
        - 'q' - A line is fit through the quartiles.
        - None - by default no reference line is added to the plot.

    ax : AxesSubplot, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.

    Returns
    -------
    Figure
        If `ax` is None, the created figure. Otherwise the figure to which
        `ax` is connected.

    Raises
    ------
    ValueError
        If `data1` has more observations than `data2`.

    See Also
    --------
    scipy.stats.probplot

    Notes
    -----
    1) Depends on matplotlib.
    2) If `data1` and `data2` are not `ProbPlot` instances, instances will be
       created using the default parameters. Therefore, it is recommended to
       use `ProbPlot` instance if fine-grained control is needed in the
       computation of the quantiles.

    Examples
    --------
    >>> import statsmodels.api as sm
    >>> import numpy as np
    >>> import matplotlib.pyplot as plt
    >>> from statsmodels.graphics.gofplots import qqplot_2samples
    >>> x = np.random.normal(loc=8.5, scale=2.5, size=37)
    >>> y = np.random.normal(loc=8.0, scale=3.0, size=37)
    >>> pp_x = sm.ProbPlot(x)
    >>> pp_y = sm.ProbPlot(y)
    >>> qqplot_2samples(pp_x, pp_y)
    >>> plt.show()

    .. plot:: plots/graphics_gofplots_qqplot_2samples.py

    >>> fig = qqplot_2samples(pp_x, pp_y, xlabel=None, ylabel=None,
    ...                       line=None, ax=None)
    """
    # Wrap raw arrays with default settings; pass ProbPlot instances through.
    first = data1 if isinstance(data1, ProbPlot) else ProbPlot(data1)
    second = data2 if isinstance(data2, ProbPlot) else ProbPlot(data2)

    return first.qqplot(xlabel=xlabel, ylabel=ylabel, line=line,
                        other=second, ax=ax)

673 

674 

def qqline(ax, line, x=None, y=None, dist=None, fmt='r-'):
    """
    Plot a reference line for a qqplot.

    Parameters
    ----------
    ax : matplotlib axes instance
        The axes on which to plot the line
    line : str {'45','r','s','q'}
        Options for the reference line to which the data is compared:

        - '45' - 45-degree line
        - 's'  - standardized line, the expected order statistics are scaled by
                 the standard deviation of the given sample and have the mean
                 added to them
        - 'r'  - A regression line is fit
        - 'q'  - A line is fit through the quartiles.

        Any other value draws nothing.

    x : ndarray
        X data for plot. Not needed if line is '45'.
    y : ndarray
        Y data for plot. Not needed if line is '45'.
    dist : scipy.stats.distribution
        A scipy.stats distribution, needed if line is 'q'.
    fmt : str, optional
        Line format string passed to `plot`.

    Raises
    ------
    ValueError
        If `line` is not '45' and either `x` or `y` is None, or if `line`
        is 'q' and `dist` has no ``ppf`` method.

    Notes
    -----
    There is no return value. The line is plotted on the given `ax`.
    For line='45' the x and y limits of `ax` are also reset to a common
    range.

    Examples
    --------
    Import the food expenditure dataset.  Plot annual food expenditure on x-axis
    and household income on y-axis.  Use qqline to add regression line into the
    plot.

    >>> import statsmodels.api as sm
    >>> import numpy as np
    >>> import matplotlib.pyplot as plt
    >>> from statsmodels.graphics.gofplots import qqline

    >>> foodexp = sm.datasets.engel.load(as_pandas=False)
    >>> x = foodexp.exog
    >>> y = foodexp.endog
    >>> ax = plt.subplot(111)
    >>> plt.scatter(x, y)
    >>> ax.set_xlabel(foodexp.exog_name[0])
    >>> ax.set_ylabel(foodexp.endog_name)
    >>> qqline(ax, 'r', x, y)
    >>> plt.show()

    .. plot:: plots/graphics_gofplots_qqplot_qqline.py
    """
    if line == '45':
        # Pair the lower limits and the upper limits of both axes, then take
        # the smallest lower / largest upper so the diagonal spans the whole
        # visible area and both axes share the same range.
        lower, upper = list(zip(ax.get_xlim(), ax.get_ylim()))
        end_pts = [min(lower), max(upper)]
        ax.plot(end_pts, end_pts, fmt)
        ax.set_xlim(end_pts)
        ax.set_ylim(end_pts)
        return

    # Every remaining option needs both arrays; checking with `or` reports a
    # single missing input here instead of failing later with an unrelated
    # error.
    if x is None or y is None:
        raise ValueError("If line is not 45, x and y cannot be None.")

    if line == 'r':
        # could use ax.lines[0].get_xdata(), get_ydata(),
        # but do not know axes are 'clean'
        fitted = OLS(y, add_constant(x)).fit().fittedvalues
        ax.plot(x, fitted, fmt)
    elif line == 's':
        slope, intercept = y.std(), y.mean()
        ax.plot(x, x * slope + intercept, fmt)
    elif line == 'q':
        _check_for_ppf(dist)
        q25 = stats.scoreatpercentile(y, 25)
        q75 = stats.scoreatpercentile(y, 75)
        theoretical_quartiles = dist.ppf([0.25, 0.75])
        # scalar slope through the (theoretical, sample) quartile points
        slope = (q75 - q25) / (theoretical_quartiles[1] -
                               theoretical_quartiles[0])
        intercept = q25 - slope * theoretical_quartiles[0]
        ax.plot(x, slope * x + intercept, fmt)

757 

758 

759# about 10x faster than plotting_position in sandbox and mstats 

def plotting_pos(nobs, a):
    """
    Generates sequence of plotting positions

    Parameters
    ----------
    nobs : int
        Number of probability points to plot
    a : float
        Offset for the plotting position of an expected order statistic, for
        example.

    Returns
    -------
    plotting_positions : ndarray
        The plotting positions, one per observation (length `nobs`).

    Notes
    -----
    The plotting positions are given by (i - a)/(nobs - 2*a + 1) for i in
    range(1, nobs+1)

    See Also
    --------
    scipy.stats.mstats.plotting_positions
    """
    ranks = np.arange(1., nobs + 1)
    denominator = nobs - 2 * a + 1
    return (ranks - a) / denominator

787 

788 

def _fmt_probplot_axis(ax, dist, nobs):
    """
    Formats a theoretical quantile axis to display the corresponding
    probabilities on the quantiles' scale.

    Parameters
    ----------
    ax : AxesSubplot
        The axis to be formatted
    dist : scipy.stats.distribution
        A scipy.stats distribution sufficiently specified to implement its
        ppf() method.
    nobs : scalar
        Number of observations in the sample. Larger samples get ticks
        further into the tails (thresholds at 50 and 500).

    Returns
    -------
    There is no return value. This operates on `ax` in place
    """
    _check_for_ppf(dist)

    # Tick probabilities in percent; the set widens with the sample size.
    if nobs < 50:
        percents = [1, 2, 5, 10, 20, 30, 40, 50, 60,
                    70, 80, 90, 95, 98, 99, ]
    elif nobs < 500:
        percents = [0.1, 0.2, 0.5, 1, 2, 5, 10, 20, 30, 40, 50, 60,
                    70, 80, 90, 95, 98, 99, 99.5, 99.8,
                    99.9]
    else:
        percents = [0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10,
                    20, 30, 40, 50, 60, 70, 80, 90, 95, 98, 99,
                    99.5, 99.8, 99.9, 99.95, 99.98, 99.99]
    axis_probs = np.array(percents) / 100.0

    # Ticks sit at the quantiles but are labelled with the probabilities.
    tick_locations = dist.ppf(axis_probs)
    ax.set_xticks(tick_locations)
    ax.set_xticklabels(axis_probs*100, rotation=45,
                       rotation_mode='anchor',
                       horizontalalignment='right',
                       verticalalignment='center')
    ax.set_xlim([tick_locations.min(), tick_locations.max()])

827 

828 

def _do_plot(x, y, dist=None, line=False, ax=None, fmt='bo', **kwargs):
    """
    Boiler plate plotting function for the `ppplot`, `qqplot`, and
    `probplot` methods of the `ProbPlot` class

    Parameters
    ----------
    x, y : array_like
        Data to be plotted
    dist : scipy.stats.distribution
        A scipy.stats distribution, needed if `line` is 'q'.
    line : str {'45', 's', 'r', 'q'} or None
        Options for the reference line to which the data is compared. Any
        falsy value (the default) means no reference line.
    ax : AxesSubplot, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    fmt : str, optional
        matplotlib-compatible formatting string for the data markers
    kwargs : keywords
        These are passed to matplotlib.plot

    Returns
    -------
    fig : Figure
        The figure containing `ax`.
    ax : AxesSubplot
        The original axes if provided.  Otherwise a new instance.

    Raises
    ------
    ValueError
        If `line` is truthy but not one of the supported options. The data
        points have already been drawn on `ax` when this is raised.
    """
    fig, ax = utils.create_mpl_ax(ax)
    ax.set_xmargin(0.02)
    ax.plot(x, y, fmt, **kwargs)

    if not line:
        return fig, ax

    if line not in ('r', 'q', '45', 's'):
        raise ValueError("%s option for line not understood" % line)

    qqline(ax, line, x=x, y=y, dist=dist)
    return fig, ax

868 

869 

870def _check_for_ppf(dist): 

871 if not hasattr(dist, 'ppf'): 

872 raise ValueError("distribution must have a ppf method")