Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1"""Module for functional boxplots.""" 

2from scipy.special import factorial 

3from statsmodels.multivariate.pca import PCA 

4from statsmodels.nonparametric.kernel_density import KDEMultivariate 

5from statsmodels.graphics.utils import _import_mpl 

6from collections import OrderedDict 

7from itertools import combinations 

8import numpy as np 

9try: 

10 from scipy.optimize import differential_evolution, brute, fmin 

11 have_de_optim = True 

12except ImportError: 

13 from scipy.optimize import brute, fmin 

14 have_de_optim = False 

15from multiprocessing import Pool 

16import itertools 

17from . import utils 

18 

19 

20__all__ = ['hdrboxplot', 'fboxplot', 'rainbowplot', 'banddepth'] 

21 

22 

class HdrResults(object):
    """Hold the results of an HDR boxplot and render a readable summary.

    Every key of the mapping given at construction becomes an attribute of
    the instance. The summary produced by ``repr`` reads the attributes
    ``median``, ``hdr_50``, ``hdr_90``, ``extra_quantiles``, ``outliers``
    and ``outliers_idx``.
    """

    def __init__(self, kwds):
        # Expose each entry of the mapping as an instance attribute.
        vars(self).update(kwds)

    def __repr__(self):
        template = ("HDR boxplot summary:\n"
                    "-> median:\n{}\n"
                    "-> 50% HDR (max, min):\n{}\n"
                    "-> 90% HDR (max, min):\n{}\n"
                    "-> Extra quantiles (max, min):\n{}\n"
                    "-> Outliers:\n{}\n"
                    "-> Outliers indices:\n{}\n")
        # Values in the order the template sections appear.
        values = (
            self.median,
            self.hdr_50,
            self.hdr_90,
            self.extra_quantiles,
            self.outliers,
            self.outliers_idx,
        )
        return template.format(*values)

41 

42 

43def _inverse_transform(pca, data): 

44 """ 

45 Inverse transform on PCA. 

46 

47 Use PCA's `project` method by temporary replacing its factors with 

48 `data`. 

49 

50 Parameters 

51 ---------- 

52 pca : statsmodels Principal Component Analysis instance 

53 The PCA object to use. 

54 data : sequence of ndarrays or 2-D ndarray 

55 The vectors of functions to create a functional boxplot from. If a 

56 sequence of 1-D arrays, these should all be the same size. 

57 The first axis is the function index, the second axis the one along 

58 which the function is defined. So ``data[0, :]`` is the first 

59 functional curve. 

60 

61 Returns 

62 ------- 

63 projection : ndarray 

64 nobs by nvar array of the projection onto ncomp factors 

65 """ 

66 factors = pca.factors 

67 pca.factors = data.reshape(-1, factors.shape[1]) 

68 projection = pca.project() 

69 pca.factors = factors 

70 return projection 

71 

72 

73def _curve_constrained(x, idx, sign, band, pca, ks_gaussian): 

74 """Find out if the curve is within the band. 

75 

76 The curve value at :attr:`idx` for a given PDF is only returned if 

77 within bounds defined by the band. Otherwise, 1E6 is returned. 

78 

79 Parameters 

80 ---------- 

81 x : float 

82 Curve in reduced space. 

83 idx : int 

84 Index value of the components to compute. 

85 sign : int 

86 Return positive or negative value. 

87 band : list of float 

88 PDF values `[min_pdf, max_pdf]` to be within. 

89 pca : statsmodels Principal Component Analysis instance 

90 The PCA object to use. 

91 ks_gaussian : KDEMultivariate instance 

92 

93 Returns 

94 ------- 

95 value : float 

96 Curve value at `idx`. 

97 """ 

98 x = x.reshape(1, -1) 

99 pdf = ks_gaussian.pdf(x) 

100 if band[0] < pdf < band[1]: 

101 value = sign * _inverse_transform(pca, x)[0][idx] 

102 else: 

103 value = 1E6 

104 return value 

105 

106 

def _min_max_band(args):
    """
    Min and max values at `idx`.

    Global optimization to find the extrema per component.

    Parameters
    ----------
    args : sequence
        A pair ``(idx, params)``:

        idx : int
            Index value of the components to compute
        params : tuple
            Unpacked, in this order, as:

            band : list of float
                PDF values `[min_pdf, max_pdf]` to be within.
            pca : statsmodels Principal Component Analysis instance
                The PCA object to use.
            bounds : sequence
                ``(min, max)`` pair for each components
            ks_gaussian : KDEMultivariate instance
            use_brute : bool
                When true, `brute` is used instead of
                `differential_evolution`.
            seed : {None, int, np.random.RandomState}
                Passed as ``seed`` to `differential_evolution`.

    Returns
    -------
    band : tuple of float
        ``(max, min)`` curve values at `idx`
    """
    idx, (band, pca, bounds, ks_gaussian, use_brute, seed) = args

    def _argmin(sign):
        # Location in the reduced space minimising sign * curve[idx].
        objective_args = (idx, sign, band, pca, ks_gaussian)
        if have_de_optim and not use_brute:
            result = differential_evolution(_curve_constrained,
                                            bounds=bounds,
                                            args=objective_args,
                                            maxiter=7, seed=seed)
            return result.x
        return brute(_curve_constrained, ranges=bounds, finish=fmin,
                     args=objective_args)

    # Minimising the negated curve yields the maximum; it is searched first.
    at_max = _argmin(-1)
    at_min = _argmin(1)

    highest = _inverse_transform(pca, at_max)[0][idx]
    lowest = _inverse_transform(pca, at_min)[0][idx]
    return (highest, lowest)

151 

152 

def hdrboxplot(data, ncomp=2, alpha=None, threshold=0.95, bw=None,
               xdata=None, labels=None, ax=None, use_brute=False, seed=None):
    """
    High Density Region boxplot

    Parameters
    ----------
    data : sequence of ndarrays or 2-D ndarray
        The vectors of functions to create a functional boxplot from.  If a
        sequence of 1-D arrays, these should all be the same size.
        The first axis is the function index, the second axis the one along
        which the function is defined.  So ``data[0, :]`` is the first
        functional curve.
    ncomp : int, optional
        Number of components to use.  If None, returns the as many as the
        smaller of the number of rows or columns in data.
    alpha : list of floats between 0 and 1, optional
        Extra quantile values to compute. Default is None. The list passed
        in is not modified.
    threshold : float between 0 and 1, optional
        Percentile threshold value for outliers detection. High value means
        a lower sensitivity to outliers. Default is `0.95`.
    bw : array_like or str, optional
        If an array, it is a fixed user-specified bandwidth. If `None`, set to
        `normal_reference`. If a string, should be one of:

            - normal_reference: normal reference rule of thumb (default)
            - cv_ml: cross validation maximum likelihood
            - cv_ls: cross validation least squares

    xdata : ndarray, optional
        The independent variable for the data. If not given, it is assumed to
        be an array of integers 0..N-1 with N the length of the vectors in
        `data`.
    labels : sequence of scalar or str, optional
        The labels or identifiers of the curves in `data`. If not given,
        outliers are labeled in the plot with array indices.
    ax : AxesSubplot, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    use_brute : bool
        Use the brute force optimizer instead of the default differential
        evolution to find the curves. Default is False.
    seed : {None, int, np.random.RandomState}
        Seed value to pass to scipy.optimize.differential_evolution. Can be an
        integer or RandomState instance. If None, then the default RandomState
        provided by np.random is used.

    Returns
    -------
    fig : Figure
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.
    hdr_res : HdrResults instance
        An `HdrResults` instance with the following attributes:

         - 'median', array. Median curve.
         - 'hdr_50', list of array. 50% quantile band. [sup, inf] curves
         - 'hdr_90', list of array. 90% quantile band. [sup, inf]
           curves.
         - 'extra_quantiles', list of array. Extra quantile band.
           [sup, inf] curves.
         - 'outliers', ndarray. Outlier curves.
         - 'outliers_idx', ndarray. Row indices of the outlier curves in
           `data`.

    See Also
    --------
    banddepth, rainbowplot, fboxplot

    Notes
    -----
    The median curve is the curve with the highest probability on the reduced
    space of a Principal Component Analysis (PCA).

    Outliers are defined as curves that fall outside the band corresponding
    to the quantile given by `threshold`.

    The non-outlying region is defined as the band made up of all the
    non-outlying curves.

    Behind the scene, the dataset is represented as a matrix. Each line
    corresponding to a 1D curve. This matrix is then decomposed using Principal
    Components Analysis (PCA). This makes it possible to represent the data
    using a finite number of modes, or components. This compression process
    turns the functional representation into a scalar representation of the
    matrix. In other words, you can visualize each curve from its components.
    Each curve is thus a point in this reduced space. With 2 components, this
    is called a bivariate plot (2D plot).

    In this plot, if some points are adjacent (similar components), it means
    that back in the original space, the curves are similar. Then, finding the
    median curve means finding the higher density region (HDR) in the reduced
    space. Moreover, the more you get away from this HDR, the more the curve is
    unlikely to be similar to the other curves.

    Using a kernel smoothing technique, the probability density function (PDF)
    of the multivariate space can be recovered. From this PDF, it is possible
    to compute the density probability linked to the cluster of points and plot
    its contours.

    Finally, using these contours, the different quantiles can be extracted
    along with the median curve and the outliers.

    Steps to produce the HDR boxplot include:

    1. Compute a multivariate kernel density estimation
    2. Compute contour lines for quantiles 90%, 50% and `alpha` %
    3. Plot the bivariate plot
    4. Compute median curve along with quantiles and outliers curves.

    References
    ----------
    [1] R.J. Hyndman and H.L. Shang, "Rainbow Plots, Bagplots, and Boxplots for
        Functional Data", vol. 19, pp. 29-45, 2010.

    Examples
    --------
    Load the El Nino dataset.  Consists of 60 years worth of Pacific Ocean sea
    surface temperature data.

    >>> import matplotlib.pyplot as plt
    >>> import statsmodels.api as sm
    >>> data = sm.datasets.elnino.load(as_pandas=False)

    Create a functional boxplot.  We see that the years 1982-83 and 1997-98 are
    outliers; these are the years where El Nino (a climate pattern
    characterized by warming up of the sea surface and higher air pressures)
    occurred with unusual intensity.

    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> res = sm.graphics.hdrboxplot(data.raw_data[:, 1:],
    ...                              labels=data.raw_data[:, 0].astype(int),
    ...                              ax=ax)

    >>> ax.set_xlabel("Month of the year")
    >>> ax.set_ylabel("Sea surface temperature (C)")
    >>> ax.set_xticks(np.arange(13, step=3) - 1)
    >>> ax.set_xticklabels(["", "Mar", "Jun", "Sep", "Dec"])
    >>> ax.set_xlim([-0.2, 11.2])

    >>> plt.show()

    .. plot:: plots/graphics_functional_hdrboxplot.py
    """
    fig, ax = utils.create_mpl_ax(ax)

    if labels is None:
        # For use with pandas, get the labels
        if hasattr(data, 'index'):
            labels = data.index
        else:
            labels = np.arange(len(data))

    data = np.asarray(data)
    if xdata is None:
        xdata = np.arange(data.shape[1])

    n_samples, dim = data.shape
    # PCA and bivariate plot
    pca = PCA(data, ncomp=ncomp)
    data_r = pca.factors

    # Create gaussian kernel
    ks_gaussian = KDEMultivariate(data_r, bw=bw,
                                  var_type='c' * data_r.shape[1])

    # Boundaries of the n-variate space
    bounds = np.array([data_r.min(axis=0), data_r.max(axis=0)]).T

    # Quantile levels to evaluate. A new list is built so that the list
    # supplied by the caller is left untouched.
    required = [threshold, 0.9, 0.5]
    if alpha is None:
        alpha = required
    else:
        alpha = list(alpha) + required
    alpha = sorted(set(alpha), reverse=True)

    # Compute contour line of pvalue linked to a given probability level.
    # Linear interpolation is the default of np.percentile, so it is not
    # spelled out (the `interpolation` keyword is deprecated in NumPy).
    pdf_r = ks_gaussian.pdf(data_r).flatten()
    pvalues = [np.percentile(pdf_r, (1 - level) * 100) for level in alpha]

    # Find median, outliers curves
    if have_de_optim and not use_brute:
        median = differential_evolution(lambda x: - ks_gaussian.pdf(x),
                                        bounds=bounds, maxiter=5, seed=seed).x
    else:
        median = brute(lambda x: - ks_gaussian.pdf(x),
                       ranges=bounds, finish=fmin)

    outliers_idx = np.where(pdf_r < pvalues[alpha.index(threshold)])[0]
    labels_outlier = [labels[i] for i in outliers_idx]
    outliers = data[outliers_idx]

    # Find HDR given some quantiles

    def _band_quantiles(band, use_brute=use_brute, seed=seed):
        """
        Find extreme curves for a quantile band.

        From the `band` of quantiles, the associated PDF extrema values
        are computed. If `min_alpha` is not provided (single quantile value),
        `max_pdf` is set to `1E6` in order not to constrain the problem on high
        values.

        An optimization is performed per component in order to find the min and
        max curves. This is done by comparing the PDF value of a given curve
        with the band PDF.

        Parameters
        ----------
        band : array_like
            alpha values ``(max_alpha, min_alpha)`` ex: ``[0.9, 0.5]``
        use_brute : bool
            Use the brute force optimizer instead of the default differential
            evolution to find the curves. Default is False.
        seed : {None, int, np.random.RandomState}
            Seed value to pass to scipy.optimize.differential_evolution. Can
            be an integer or RandomState instance. If None, then the default
            RandomState provided by np.random is used.

        Returns
        -------
        band_quantiles : list of 1-D array
            ``(max_quantile, min_quantile)`` (2, n_features)
        """
        min_pdf = pvalues[alpha.index(band[0])]
        if len(band) > 1:
            max_pdf = pvalues[alpha.index(band[1])]
        else:
            # Single quantile: no upper constraint on the density.
            max_pdf = 1E6
        pdf_band = [min_pdf, max_pdf]

        # The tuple order must match the unpacking in `_min_max_band`:
        # (band, pca, bounds, ks_gaussian, use_brute, seed).
        work = zip(range(dim), itertools.repeat((pdf_band, pca,
                                                 bounds, ks_gaussian,
                                                 use_brute, seed)))
        pool = Pool()
        try:
            band_quantiles = pool.map(_min_max_band, work)
        finally:
            # Release the worker processes even if the optimisation fails.
            pool.terminate()
            pool.join()

        return list(zip(*band_quantiles))

    extra_alpha = [i for i in alpha
                   if 0.5 != i and 0.9 != i and threshold != i]
    extra_quantiles = []
    for level in extra_alpha:
        extra_quantiles.extend(
            _band_quantiles([level], use_brute=use_brute, seed=seed))

    # Inverse transform from n-variate plot to dataset dataset's shape
    median = _inverse_transform(pca, median)[0]
    hdr_90 = _band_quantiles([0.9, 0.5], use_brute=use_brute, seed=seed)
    hdr_50 = _band_quantiles([0.5], use_brute=use_brute, seed=seed)

    hdr_res = HdrResults({
        "median": median,
        "hdr_50": hdr_50,
        "hdr_90": hdr_90,
        "extra_quantiles": extra_quantiles,
        "outliers": outliers,
        "outliers_idx": outliers_idx
    })

    # Plots
    ax.plot(np.array([xdata] * n_samples).T, data.T,
            c='c', alpha=.1, label=None)
    ax.plot(xdata, median, c='k', label='Median')
    fill_betweens = []
    fill_betweens.append(ax.fill_between(xdata, *hdr_50, color='gray',
                                         alpha=.4, label='50% HDR'))
    fill_betweens.append(ax.fill_between(xdata, *hdr_90, color='gray',
                                         alpha=.3, label='90% HDR'))

    if len(extra_quantiles) != 0:
        ax.plot(np.array([xdata] * len(extra_quantiles)).T,
                np.array(extra_quantiles).T,
                c='y', ls='-.', alpha=.4, label='Extra quantiles')

    for outlier, outlier_label in zip(outliers, labels_outlier):
        ax.plot(xdata, outlier, ls='--', alpha=0.7, label=str(outlier_label))

    handles, legend_labels = ax.get_legend_handles_labels()

    # Proxy artist for fill_between legend entry
    # See https://matplotlib.org/1.3.1/users/legend_guide.html
    plt = _import_mpl()
    for label, fill_between in zip(['50% HDR', '90% HDR'], fill_betweens):
        p = plt.Rectangle((0, 0), 1, 1,
                          fc=fill_between.get_facecolor()[0])
        handles.append(p)
        legend_labels.append(label)

    by_label = OrderedDict(zip(legend_labels, handles))
    if len(outliers) != 0:
        # With outliers present only they (and extra quantiles) are listed.
        by_label.pop('Median')
        by_label.pop('50% HDR')
        by_label.pop('90% HDR')

    ax.legend(by_label.values(), by_label.keys(), loc='best')

    return fig, hdr_res

466 

467 

def fboxplot(data, xdata=None, labels=None, depth=None, method='MBD',
             wfactor=1.5, ax=None, plot_opts=None):
    """
    Plot functional boxplot.

    A functional boxplot is the analog of a boxplot for functional data.
    Functional data is any type of data that varies over a continuum, i.e.
    curves, probability distributions, seasonal data, etc.

    The data is first ordered, the order statistic used here is `banddepth`.
    Plotted are then the median curve, the envelope of the 50% central region,
    the maximum non-outlying envelope and the outlier curves.

    Parameters
    ----------
    data : sequence of ndarrays or 2-D ndarray
        The vectors of functions to create a functional boxplot from.  If a
        sequence of 1-D arrays, these should all be the same size.
        The first axis is the function index, the second axis the one along
        which the function is defined.  So ``data[0, :]`` is the first
        functional curve.
    xdata : ndarray, optional
        The independent variable for the data.  If not given, it is assumed to
        be an array of integers 0..N-1 with N the length of the vectors in
        `data`.
    labels : sequence of scalar or str, optional
        The labels or identifiers of the curves in `data`.  If given, outliers
        are labeled in the plot.
    depth : array_like, optional
        A 1-D array of band depths for `data`, or equivalent order statistic.
        If not given, it will be calculated through `banddepth`.
    method : {'MBD', 'BD2'}, optional
        The method to use to calculate the band depth.  Default is 'MBD'.
    wfactor : float, optional
        Factor by which the central 50% region is multiplied to find the outer
        region (analog of "whiskers" of a classical boxplot).
    ax : AxesSubplot, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    plot_opts : dict, optional
        A dictionary with plotting options.  Any of the following can be
        provided, if not present in `plot_opts` the defaults will be used.
        The dictionary is only read, never modified:

          - 'cmap_outliers', a Matplotlib LinearSegmentedColormap instance.
          - 'c_inner', valid MPL color. Color of the central 50% region
          - 'c_outer', valid MPL color. Color of the non-outlying region
          - 'c_median', valid MPL color. Color of the median.
          - 'lw_outliers', scalar.  Linewidth for drawing outlier curves.
          - 'lw_median', scalar.  Linewidth for drawing the median curve.
          - 'draw_nonout', bool.  If True, also draw non-outlying curves.

    Returns
    -------
    fig : Figure
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.
    depth : ndarray
        A 1-D array containing the calculated band depths of the curves.
    ix_depth : ndarray
        A 1-D array of indices needed to order curves (or `depth`) from most to
        least central curve.
    ix_outliers : ndarray
        A 1-D integer array of indices of outlying curves in `data`.

    Raises
    ------
    ValueError
        If `method` is unknown, or if a provided `depth` does not have one
        entry per curve.

    See Also
    --------
    banddepth, rainbowplot

    Notes
    -----
    The median curve is the curve with the highest band depth.

    Outliers are defined as curves that fall outside the band created by
    multiplying the central region by `wfactor`.  Note that the range over
    which they fall outside this band does not matter, a single data point
    outside the band is enough.  If the data is noisy, smoothing may therefore
    be required.

    The non-outlying region is defined as the band made up of all the
    non-outlying curves.

    References
    ----------
    [1] Y. Sun and M.G. Genton, "Functional Boxplots", Journal of Computational
        and Graphical Statistics, vol. 20, pp. 1-19, 2011.
    [2] R.J. Hyndman and H.L. Shang, "Rainbow Plots, Bagplots, and Boxplots for
        Functional Data", vol. 19, pp. 29-45, 2010.

    Examples
    --------
    Load the El Nino dataset.  Consists of 60 years worth of Pacific Ocean sea
    surface temperature data.

    >>> import matplotlib.pyplot as plt
    >>> import statsmodels.api as sm
    >>> data = sm.datasets.elnino.load(as_pandas=False)

    Create a functional boxplot.  We see that the years 1982-83 and 1997-98 are
    outliers; these are the years where El Nino (a climate pattern
    characterized by warming up of the sea surface and higher air pressures)
    occurred with unusual intensity.

    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> res = sm.graphics.fboxplot(data.raw_data[:, 1:], wfactor=2.58,
    ...                            labels=data.raw_data[:, 0].astype(int),
    ...                            ax=ax)

    >>> ax.set_xlabel("Month of the year")
    >>> ax.set_ylabel("Sea surface temperature (C)")
    >>> ax.set_xticks(np.arange(13, step=3) - 1)
    >>> ax.set_xticklabels(["", "Mar", "Jun", "Sep", "Dec"])
    >>> ax.set_xlim([-0.2, 11.2])

    >>> plt.show()

    .. plot:: plots/graphics_functional_fboxplot.py
    """
    fig, ax = utils.create_mpl_ax(ax)

    plot_opts = {} if plot_opts is None else plot_opts
    # Resolve the colormap locally instead of writing the default back into
    # the dictionary owned by the caller.
    cmap = plot_opts.get('cmap_outliers')
    if cmap is None:
        from matplotlib.cm import rainbow_r
        cmap = rainbow_r

    data = np.asarray(data)
    if xdata is None:
        xdata = np.arange(data.shape[1])

    # Calculate band depth if required.
    if depth is None:
        if method not in ['MBD', 'BD2']:
            raise ValueError("Unknown value for parameter `method`.")

        depth = banddepth(data, method=method)
    else:
        depth = np.asarray(depth)
        if depth.size != data.shape[0]:
            raise ValueError("Provided `depth` array is not of correct size.")

    # Inner area is 25%-75% region of band-depth ordered curves.
    ix_depth = np.argsort(depth)[::-1]
    median_curve = data[ix_depth[0], :]
    ix_IQR = data.shape[0] // 2
    central = data[ix_depth[0:ix_IQR], :]
    lower = central.min(axis=0)
    upper = central.max(axis=0)

    # Determine region for outlier detection
    inner_median = np.median(central, axis=0)
    lower_fence = inner_median - (inner_median - lower) * wfactor
    upper_fence = inner_median + (upper - inner_median) * wfactor

    # Find outliers: a single point outside the fences is enough.
    is_outlier = np.any((data > upper_fence) | (data < lower_fence), axis=1)
    ix_outliers = np.flatnonzero(is_outlier)
    ix_nonout = np.flatnonzero(~is_outlier)

    # Plot envelope of all non-outlying data (skipped when every curve is an
    # outlier, as there is then no envelope to draw).
    if ix_nonout.size:
        lower_nonout = data[ix_nonout, :].min(axis=0)
        upper_nonout = data[ix_nonout, :].max(axis=0)
        ax.fill_between(xdata, lower_nonout, upper_nonout,
                        color=plot_opts.get('c_outer', (0.75, 0.75, 0.75)))

    # Plot central 50% region
    ax.fill_between(xdata, lower, upper,
                    color=plot_opts.get('c_inner', (0.5, 0.5, 0.5)))

    # Plot median curve
    ax.plot(xdata, median_curve, color=plot_opts.get('c_median', 'k'),
            lw=plot_opts.get('lw_median', 2))

    # Plot outliers. The colormap is sampled evenly over [0, 1]; with a
    # single outlier the denominator is clamped to avoid dividing by zero.
    color_steps = max(len(ix_outliers) - 1, 1)
    for ii, ix in enumerate(ix_outliers):
        label = str(labels[ix]) if labels is not None else None
        ax.plot(xdata, data[ix, :],
                color=cmap(float(ii) / color_steps), label=label,
                lw=plot_opts.get('lw_outliers', 1))

    if plot_opts.get('draw_nonout', False):
        for ix in ix_nonout:
            ax.plot(xdata, data[ix, :], 'k-', lw=0.5)

    if labels is not None:
        ax.legend()

    return fig, depth, ix_depth, ix_outliers

661 

662 

def rainbowplot(data, xdata=None, depth=None, method='MBD', ax=None,
                cmap=None):
    """
    Create a rainbow plot for a set of curves.

    A rainbow plot contains line plots of all curves in the dataset, colored in
    order of functional depth.  The median curve is shown in black.

    Parameters
    ----------
    data : sequence of ndarrays or 2-D ndarray
        The vectors of functions to create a functional boxplot from.  If a
        sequence of 1-D arrays, these should all be the same size.
        The first axis is the function index, the second axis the one along
        which the function is defined.  So ``data[0, :]`` is the first
        functional curve.
    xdata : ndarray, optional
        The independent variable for the data.  If not given, it is assumed to
        be an array of integers 0..N-1 with N the length of the vectors in
        `data`.
    depth : array_like, optional
        A 1-D array of band depths for `data`, or equivalent order statistic.
        If not given, it will be calculated through `banddepth`.
    method : {'MBD', 'BD2'}, optional
        The method to use to calculate the band depth.  Default is 'MBD'.
    ax : AxesSubplot, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    cmap : Matplotlib LinearSegmentedColormap instance, optional
        The colormap used to color curves with.  Default is a rainbow colormap,
        with red used for the most central and purple for the least central
        curves.

    Returns
    -------
    Figure
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.

    Raises
    ------
    ValueError
        If `method` is unknown, or if a provided `depth` does not have one
        entry per curve.

    See Also
    --------
    banddepth, fboxplot

    References
    ----------
    [1] R.J. Hyndman and H.L. Shang, "Rainbow Plots, Bagplots, and Boxplots for
        Functional Data", vol. 19, pp. 29-45, 2010.

    Examples
    --------
    Load the El Nino dataset.  Consists of 60 years worth of Pacific Ocean sea
    surface temperature data.

    >>> import matplotlib.pyplot as plt
    >>> import statsmodels.api as sm
    >>> data = sm.datasets.elnino.load(as_pandas=False)

    Create a rainbow plot:

    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> res = sm.graphics.rainbowplot(data.raw_data[:, 1:], ax=ax)

    >>> ax.set_xlabel("Month of the year")
    >>> ax.set_ylabel("Sea surface temperature (C)")
    >>> ax.set_xticks(np.arange(13, step=3) - 1)
    >>> ax.set_xticklabels(["", "Mar", "Jun", "Sep", "Dec"])
    >>> ax.set_xlim([-0.2, 11.2])
    >>> plt.show()

    .. plot:: plots/graphics_functional_rainbowplot.py
    """
    fig, ax = utils.create_mpl_ax(ax)

    if cmap is None:
        from matplotlib.cm import rainbow_r
        cmap = rainbow_r

    data = np.asarray(data)
    if xdata is None:
        xdata = np.arange(data.shape[1])

    # Calculate band depth if required.
    if depth is None:
        if method not in ['MBD', 'BD2']:
            raise ValueError("Unknown value for parameter `method`.")

        depth = banddepth(data, method=method)
    else:
        depth = np.asarray(depth)
        if depth.size != data.shape[0]:
            raise ValueError("Provided `depth` array is not of correct size.")

    ix_depth = np.argsort(depth)[::-1]

    # Plot all curves, colored by depth. The colormap is sampled evenly over
    # [0, 1]; with a single curve the denominator is clamped so that the
    # division is well defined.
    num_curves = data.shape[0]
    color_steps = float(max(num_curves - 1, 1))
    for ii in range(num_curves):
        ax.plot(xdata, data[ix_depth[ii], :], c=cmap(ii / color_steps))

    # Plot the median curve
    median_curve = data[ix_depth[0], :]
    ax.plot(xdata, median_curve, 'k-', lw=2)

    return fig

767 

768 

769def banddepth(data, method='MBD'): 

770 """ 

771 Calculate the band depth for a set of functional curves. 

772 

773 Band depth is an order statistic for functional data (see `fboxplot`), with 

774 a higher band depth indicating larger "centrality". In analog to scalar 

775 data, the functional curve with highest band depth is called the median 

776 curve, and the band made up from the first N/2 of N curves is the 50% 

777 central region. 

778 

779 Parameters 

780 ---------- 

781 data : ndarray 

782 The vectors of functions to create a functional boxplot from. 

783 The first axis is the function index, the second axis the one along 

784 which the function is defined. So ``data[0, :]`` is the first 

785 functional curve. 

786 method : {'MBD', 'BD2'}, optional 

787 Whether to use the original band depth (with J=2) of [1]_ or the 

788 modified band depth. See Notes for details. 

789 

790 Returns 

791 ------- 

792 ndarray 

793 Depth values for functional curves. 

794 

795 Notes 

796 ----- 

797 Functional band depth as an order statistic for functional data was 

798 proposed in [1]_ and applied to functional boxplots and bagplots in [2]_. 

799 

800 The method 'BD2' checks for each curve whether it lies completely inside 

801 bands constructed from two curves. All permutations of two curves in the 

802 set of curves are used, and the band depth is normalized to one. Due to 

803 the complete curve having to fall within the band, this method yields a lot 

804 of ties. 

805 

806 The method 'MBD' is similar to 'BD2', but checks the fraction of the curve 

807 falling within the bands. It therefore generates very few ties. 

808 

809 References 

810 ---------- 

811 .. [1] S. Lopez-Pintado and J. Romo, "On the Concept of Depth for 

812 Functional Data", Journal of the American Statistical Association, 

813 vol. 104, pp. 718-734, 2009. 

814 .. [2] Y. Sun and M.G. Genton, "Functional Boxplots", Journal of 

815 Computational and Graphical Statistics, vol. 20, pp. 1-19, 2011. 

816 """ 

817 def _band2(x1, x2, curve): 

818 xb = np.vstack([x1, x2]) 

819 if np.any(curve < xb.min(axis=0)) or np.any(curve > xb.max(axis=0)): 

820 res = 0 

821 else: 

822 res = 1 

823 

824 return res 

825 

826 def _band_mod(x1, x2, curve): 

827 xb = np.vstack([x1, x2]) 

828 res = np.logical_and(curve >= xb.min(axis=0), 

829 curve <= xb.max(axis=0)) 

830 return np.sum(res) / float(res.size) 

831 

832 if method == 'BD2': 

833 band = _band2 

834 elif method == 'MBD': 

835 band = _band_mod 

836 else: 

837 raise ValueError("Unknown input value for parameter `method`.") 

838 

839 num = data.shape[0] 

840 ix = np.arange(num) 

841 depth = [] 

842 for ii in range(num): 

843 res = 0 

844 for ix1, ix2 in combinations(ix, 2): 

845 res += band(data[ix1, :], data[ix2, :], data[ii, :]) 

846 

847 # Normalize by number of combinations to get band depth 

848 normfactor = factorial(num) / 2. / factorial(num - 2) 

849 depth.append(float(res) / normfactor) 

850 

851 return np.asarray(depth)