Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/statsmodels/graphics/functional.py : 11%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""Module for functional boxplots."""
2from scipy.special import factorial
3from statsmodels.multivariate.pca import PCA
4from statsmodels.nonparametric.kernel_density import KDEMultivariate
5from statsmodels.graphics.utils import _import_mpl
6from collections import OrderedDict
7from itertools import combinations
8import numpy as np
9try:
10 from scipy.optimize import differential_evolution, brute, fmin
11 have_de_optim = True
12except ImportError:
13 from scipy.optimize import brute, fmin
14 have_de_optim = False
15from multiprocessing import Pool
16import itertools
17from . import utils
20__all__ = ['hdrboxplot', 'fboxplot', 'rainbowplot', 'banddepth']
class HdrResults(object):
    """Attribute container for HDR boxplot results with a text summary.

    Every key of the mapping given at construction becomes an instance
    attribute. The summary produced by ``repr`` reads the attributes
    ``median``, ``hdr_50``, ``hdr_90``, ``extra_quantiles``, ``outliers``
    and ``outliers_idx``.
    """

    def __init__(self, kwds):
        # Expose each entry of the mapping as an attribute of the same name.
        vars(self).update(kwds)

    def __repr__(self):
        # (heading, value) pairs in the order they appear in the summary.
        sections = (
            ("median", self.median),
            ("50% HDR (max, min)", self.hdr_50),
            ("90% HDR (max, min)", self.hdr_90),
            ("Extra quantiles (max, min)", self.extra_quantiles),
            ("Outliers", self.outliers),
            ("Outliers indices", self.outliers_idx),
        )

        lines = ["HDR boxplot summary:"]
        for heading, value in sections:
            lines.append("-> " + heading + ":")
            lines.append("{}".format(value))

        # The summary is terminated by a newline.
        return "\n".join(lines) + "\n"
43def _inverse_transform(pca, data):
44 """
45 Inverse transform on PCA.
47 Use PCA's `project` method by temporary replacing its factors with
48 `data`.
50 Parameters
51 ----------
52 pca : statsmodels Principal Component Analysis instance
53 The PCA object to use.
54 data : sequence of ndarrays or 2-D ndarray
55 The vectors of functions to create a functional boxplot from. If a
56 sequence of 1-D arrays, these should all be the same size.
57 The first axis is the function index, the second axis the one along
58 which the function is defined. So ``data[0, :]`` is the first
59 functional curve.
61 Returns
62 -------
63 projection : ndarray
64 nobs by nvar array of the projection onto ncomp factors
65 """
66 factors = pca.factors
67 pca.factors = data.reshape(-1, factors.shape[1])
68 projection = pca.project()
69 pca.factors = factors
70 return projection
73def _curve_constrained(x, idx, sign, band, pca, ks_gaussian):
74 """Find out if the curve is within the band.
76 The curve value at :attr:`idx` for a given PDF is only returned if
77 within bounds defined by the band. Otherwise, 1E6 is returned.
79 Parameters
80 ----------
81 x : float
82 Curve in reduced space.
83 idx : int
84 Index value of the components to compute.
85 sign : int
86 Return positive or negative value.
87 band : list of float
88 PDF values `[min_pdf, max_pdf]` to be within.
89 pca : statsmodels Principal Component Analysis instance
90 The PCA object to use.
91 ks_gaussian : KDEMultivariate instance
93 Returns
94 -------
95 value : float
96 Curve value at `idx`.
97 """
98 x = x.reshape(1, -1)
99 pdf = ks_gaussian.pdf(x)
100 if band[0] < pdf < band[1]:
101 value = sign * _inverse_transform(pca, x)[0][idx]
102 else:
103 value = 1E6
104 return value
def _min_max_band(args):
    """
    Min and max values at `idx`.

    Global optimization to find the extrema per component.

    Parameters
    ----------
    args: list
        It is a list of an idx and other arguments as a tuple:
            idx : int
                Index value of the components to compute
        The tuple contains, in this order:
            band : list of float
                PDF values `[min_pdf, max_pdf]` to be within.
            pca : statsmodels Principal Component Analysis instance
                The PCA object to use.
            bounds : sequence
                ``(min, max)`` pair for each components
            ks_gaussian : KDEMultivariate instance
            use_brute : bool
                If true, use ``scipy.optimize.brute`` instead of
                differential evolution.
            seed : {None, int, np.random.RandomState}
                Seed passed to ``differential_evolution``.

    Returns
    -------
    band : tuple of float
        ``(max, min)`` curve values at `idx`
    """
    idx, (band, pca, bounds, ks_gaussian, use_brute, seed) = args
    # Differential evolution is only used when scipy provides it and the
    # caller did not explicitly ask for the brute force search.
    use_de = have_de_optim and not use_brute

    def _extremum(sign):
        """Return the reduced-space point optimizing ``sign * curve[idx]``."""
        objective_args = (idx, sign, band, pca, ks_gaussian)
        if use_de:
            result = differential_evolution(_curve_constrained,
                                            bounds=bounds,
                                            args=objective_args,
                                            maxiter=7, seed=seed)
            return result.x
        return brute(_curve_constrained, ranges=bounds, finish=fmin,
                     args=objective_args)

    # Minimizing the negated curve value yields the maximum; the maximum
    # is searched first, then the minimum.
    upper_point = _extremum(-1)
    lower_point = _extremum(1)

    upper_value = _inverse_transform(pca, upper_point)[0][idx]
    lower_value = _inverse_transform(pca, lower_point)[0][idx]
    return (upper_value, lower_value)
def hdrboxplot(data, ncomp=2, alpha=None, threshold=0.95, bw=None,
               xdata=None, labels=None, ax=None, use_brute=False, seed=None):
    """
    High Density Region boxplot

    Parameters
    ----------
    data : sequence of ndarrays or 2-D ndarray
        The vectors of functions to create a functional boxplot from.  If a
        sequence of 1-D arrays, these should all be the same size.
        The first axis is the function index, the second axis the one along
        which the function is defined.  So ``data[0, :]`` is the first
        functional curve.
    ncomp : int, optional
        Number of components to use.  If None, returns the as many as the
        smaller of the number of rows or columns in data.
    alpha : list of floats between 0 and 1, optional
        Extra quantile values to compute. Default is None. The list is not
        modified.
    threshold : float between 0 and 1, optional
        Percentile threshold value for outliers detection. High value means
        a lower sensitivity to outliers. Default is `0.95`.
    bw : array_like or str, optional
        If an array, it is a fixed user-specified bandwidth. If `None`, set to
        `normal_reference`. If a string, should be one of:

            - normal_reference: normal reference rule of thumb (default)
            - cv_ml: cross validation maximum likelihood
            - cv_ls: cross validation least squares

    xdata : ndarray, optional
        The independent variable for the data. If not given, it is assumed to
        be an array of integers 0..N-1 with N the length of the vectors in
        `data`.
    labels : sequence of scalar or str, optional
        The labels or identifiers of the curves in `data`. If not given,
        outliers are labeled in the plot with array indices.
    ax : AxesSubplot, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    use_brute : bool
        Use the brute force optimizer instead of the default differential
        evolution to find the curves. Default is False.
    seed : {None, int, np.random.RandomState}
        Seed value to pass to scipy.optimize.differential_evolution. Can be an
        integer or RandomState instance. If None, then the default RandomState
        provided by np.random is used.

    Returns
    -------
    fig : Figure
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.
    hdr_res : HdrResults instance
        An `HdrResults` instance with the following attributes:

         - 'median', array. Median curve.
         - 'hdr_50', array. 50% quantile band. [sup, inf] curves
         - 'hdr_90', list of array. 90% quantile band. [sup, inf]
            curves.
         - 'extra_quantiles', list of array. Extra quantile band.
            [sup, inf] curves.
         - 'outliers', ndarray. Outlier curves.
         - 'outliers_idx', ndarray. Indices of the outlier curves in
            `data`.

    See Also
    --------
    banddepth, rainbowplot, fboxplot

    Notes
    -----
    The median curve is the curve with the highest probability on the reduced
    space of a Principal Component Analysis (PCA).

    Outliers are defined as curves that fall outside the band corresponding
    to the quantile given by `threshold`.

    The non-outlying region is defined as the band made up of all the
    non-outlying curves.

    Behind the scene, the dataset is represented as a matrix. Each line
    corresponding to a 1D curve. This matrix is then decomposed using Principal
    Components Analysis (PCA). This allows to represent the data using a finite
    number of modes, or components. This compression process allows to turn the
    functional representation into a scalar representation of the matrix. In
    other words, you can visualize each curve from its components. Each curve
    is thus a point in this reduced space. With 2 components, this is called a
    bivariate plot (2D plot).

    In this plot, if some points are adjacent (similar components), it means
    that back in the original space, the curves are similar. Then, finding the
    median curve means finding the higher density region (HDR) in the reduced
    space. Moreover, the more you get away from this HDR, the more the curve is
    unlikely to be similar to the other curves.

    Using a kernel smoothing technique, the probability density function (PDF)
    of the multivariate space can be recovered. From this PDF, it is possible
    to compute the density probability linked to the cluster of points and plot
    its contours.

    Finally, using these contours, the different quantiles can be extracted
    along with the median curve and the outliers.

    Steps to produce the HDR boxplot include:

    1. Compute a multivariate kernel density estimation
    2. Compute contour lines for quantiles 90%, 50% and `alpha` %
    3. Plot the bivariate plot
    4. Compute median curve along with quantiles and outliers curves.

    References
    ----------
    [1] R.J. Hyndman and H.L. Shang, "Rainbow Plots, Bagplots, and Boxplots for
        Functional Data", vol. 19, pp. 29-45, 2010.

    Examples
    --------
    Load the El Nino dataset.  Consists of 60 years worth of Pacific Ocean sea
    surface temperature data.

    >>> import matplotlib.pyplot as plt
    >>> import statsmodels.api as sm
    >>> data = sm.datasets.elnino.load(as_pandas=False)

    Create a functional boxplot.  We see that the years 1982-83 and 1997-98 are
    outliers; these are the years where El Nino (a climate pattern
    characterized by warming up of the sea surface and higher air pressures)
    occurred with unusual intensity.

    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> res = sm.graphics.hdrboxplot(data.raw_data[:, 1:],
    ...                              labels=data.raw_data[:, 0].astype(int),
    ...                              ax=ax)

    >>> ax.set_xlabel("Month of the year")
    >>> ax.set_ylabel("Sea surface temperature (C)")
    >>> ax.set_xticks(np.arange(13, step=3) - 1)
    >>> ax.set_xticklabels(["", "Mar", "Jun", "Sep", "Dec"])
    >>> ax.set_xlim([-0.2, 11.2])

    >>> plt.show()

    .. plot:: plots/graphics_functional_hdrboxplot.py
    """
    fig, ax = utils.create_mpl_ax(ax)

    if labels is None:
        # For use with pandas, get the labels
        if hasattr(data, 'index'):
            labels = data.index
        else:
            labels = np.arange(len(data))

    data = np.asarray(data)
    if xdata is None:
        xdata = np.arange(data.shape[1])

    n_samples, dim = data.shape
    # PCA and bivariate plot
    pca = PCA(data, ncomp=ncomp)
    data_r = pca.factors

    # Create gaussian kernel
    ks_gaussian = KDEMultivariate(data_r, bw=bw,
                                  var_type='c' * data_r.shape[1])

    # Boundaries of the n-variate space
    bounds = np.array([data_r.min(axis=0), data_r.max(axis=0)]).T

    # Compute contour line of pvalue linked to a given probability level.
    # The quantile list is rebuilt instead of extended in place so the
    # caller's `alpha` list is left untouched.
    if alpha is None:
        alpha = [threshold, 0.9, 0.5]
    else:
        alpha = list(alpha) + [threshold, 0.9, 0.5]
    alpha = sorted(set(alpha), reverse=True)

    pdf_r = ks_gaussian.pdf(data_r).flatten()
    # Linear interpolation is numpy's default; it is not spelled out
    # because the ``interpolation`` keyword is deprecated in recent numpy.
    pvalues = [np.percentile(pdf_r, (1 - a) * 100) for a in alpha]

    # Find median, outliers curves
    if have_de_optim and not use_brute:
        median = differential_evolution(lambda x: - ks_gaussian.pdf(x),
                                        bounds=bounds, maxiter=5, seed=seed).x
    else:
        median = brute(lambda x: - ks_gaussian.pdf(x),
                       ranges=bounds, finish=fmin)

    outliers_idx = np.where(pdf_r < pvalues[alpha.index(threshold)])[0]
    labels_outlier = [labels[i] for i in outliers_idx]
    outliers = data[outliers_idx]

    # Find HDR given some quantiles

    def _band_quantiles(band, use_brute=use_brute, seed=seed):
        """
        Find extreme curves for a quantile band.

        From the `band` of quantiles, the associated PDF extrema values
        are computed. If `min_alpha` is not provided (single quantile value),
        `max_pdf` is set to `1E6` in order not to constrain the problem on high
        values.

        An optimization is performed per component in order to find the min and
        max curves. This is done by comparing the PDF value of a given curve
        with the band PDF.

        Parameters
        ----------
        band : array_like
            alpha values ``(max_alpha, min_alpha)`` ex: ``[0.9, 0.5]``
        use_brute : bool
            Use the brute force optimizer instead of the default differential
            evolution to find the curves. Default is False.
        seed : {None, int, np.random.RandomState}
            Seed value to pass to scipy.optimize.differential_evolution. Can
            be an integer or RandomState instance. If None, then the default
            RandomState provided by np.random is used.


        Returns
        -------
        band_quantiles : list of 1-D array
            ``(max_quantile, min_quantile)`` (2, n_features)
        """
        min_pdf = pvalues[alpha.index(band[0])]
        if len(band) > 1:
            max_pdf = pvalues[alpha.index(band[1])]
        else:
            # Single quantile: leave the band unbounded on the high side.
            max_pdf = 1E6
        pdf_band = [min_pdf, max_pdf]

        # The tuple order must match the unpacking in `_min_max_band`:
        # (band, pca, bounds, ks_gaussian, use_brute, seed).
        tasks = zip(range(dim), itertools.repeat((pdf_band, pca,
                                                  bounds, ks_gaussian,
                                                  use_brute, seed)))
        pool = Pool()
        try:
            band_quantiles = pool.map(_min_max_band, tasks)
        finally:
            # Release the worker processes even if the optimization fails.
            pool.close()
            pool.join()

        band_quantiles = list(zip(*band_quantiles))

        return band_quantiles

    extra_alpha = [i for i in alpha
                   if 0.5 != i and 0.9 != i and threshold != i]
    extra_quantiles = []
    for x in extra_alpha:
        extra_quantiles.extend(
            _band_quantiles([x], use_brute=use_brute, seed=seed))

    # Inverse transform from n-variate plot to dataset dataset's shape
    median = _inverse_transform(pca, median)[0]
    hdr_90 = _band_quantiles([0.9, 0.5], use_brute=use_brute, seed=seed)
    hdr_50 = _band_quantiles([0.5], use_brute=use_brute, seed=seed)

    hdr_res = HdrResults({
        "median": median,
        "hdr_50": hdr_50,
        "hdr_90": hdr_90,
        "extra_quantiles": extra_quantiles,
        "outliers": outliers,
        "outliers_idx": outliers_idx
    })

    # Plots
    ax.plot(np.array([xdata] * n_samples).T, data.T,
            c='c', alpha=.1, label=None)
    ax.plot(xdata, median, c='k', label='Median')
    fill_betweens = []
    fill_betweens.append(ax.fill_between(xdata, *hdr_50, color='gray',
                                         alpha=.4, label='50% HDR'))
    fill_betweens.append(ax.fill_between(xdata, *hdr_90, color='gray',
                                         alpha=.3, label='90% HDR'))

    if len(extra_quantiles) != 0:
        ax.plot(np.array([xdata] * len(extra_quantiles)).T,
                np.array(extra_quantiles).T,
                c='y', ls='-.', alpha=.4, label='Extra quantiles')

    # Each outlier is drawn with its own label (or array index).
    for outlier, outlier_label in zip(outliers, labels_outlier):
        ax.plot(xdata, outlier, ls='--', alpha=0.7, label=str(outlier_label))

    handles, labels = ax.get_legend_handles_labels()

    # Proxy artist for fill_between legend entry
    # See https://matplotlib.org/1.3.1/users/legend_guide.html
    plt = _import_mpl()
    for label, fill_between in zip(['50% HDR', '90% HDR'], fill_betweens):
        p = plt.Rectangle((0, 0), 1, 1,
                          fc=fill_between.get_facecolor()[0])
        handles.append(p)
        labels.append(label)

    by_label = OrderedDict(zip(labels, handles))
    if len(outliers) != 0:
        # With outliers present the legend only lists the outlier curves
        # (and extra quantiles), not the median and HDR bands.
        by_label.pop('Median')
        by_label.pop('50% HDR')
        by_label.pop('90% HDR')

    ax.legend(by_label.values(), by_label.keys(), loc='best')

    return fig, hdr_res
def fboxplot(data, xdata=None, labels=None, depth=None, method='MBD',
             wfactor=1.5, ax=None, plot_opts=None):
    """
    Plot functional boxplot.

    A functional boxplot is the analog of a boxplot for functional data.
    Functional data is any type of data that varies over a continuum, i.e.
    curves, probability distributions, seasonal data, etc.

    The data is first ordered, the order statistic used here is `banddepth`.
    Plotted are then the median curve, the envelope of the 50% central region,
    the maximum non-outlying envelope and the outlier curves.

    Parameters
    ----------
    data : sequence of ndarrays or 2-D ndarray
        The vectors of functions to create a functional boxplot from.  If a
        sequence of 1-D arrays, these should all be the same size.
        The first axis is the function index, the second axis the one along
        which the function is defined.  So ``data[0, :]`` is the first
        functional curve.
    xdata : ndarray, optional
        The independent variable for the data.  If not given, it is assumed to
        be an array of integers 0..N-1 with N the length of the vectors in
        `data`.
    labels : sequence of scalar or str, optional
        The labels or identifiers of the curves in `data`.  If given, outliers
        are labeled in the plot.
    depth : ndarray, optional
        A 1-D array of band depths for `data`, or equivalent order statistic.
        If not given, it will be calculated through `banddepth`.
    method : {'MBD', 'BD2'}, optional
        The method to use to calculate the band depth.  Default is 'MBD'.
    wfactor : float, optional
        Factor by which the central 50% region is multiplied to find the outer
        region (analog of "whiskers" of a classical boxplot).
    ax : AxesSubplot, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    plot_opts : dict, optional
        A dictionary with plotting options.  Any of the following can be
        provided, if not present in `plot_opts` the defaults will be used::

          - 'cmap_outliers', a Matplotlib LinearSegmentedColormap instance.
          - 'c_inner', valid MPL color. Color of the central 50% region
          - 'c_outer', valid MPL color. Color of the non-outlying region
          - 'c_median', valid MPL color. Color of the median.
          - 'lw_outliers', scalar.  Linewidth for drawing outlier curves.
          - 'lw_median', scalar.  Linewidth for drawing the median curve.
          - 'draw_nonout', bool.  If True, also draw non-outlying curves.

        The dictionary is not modified.

    Returns
    -------
    fig : Figure
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.
    depth : ndarray
        A 1-D array containing the calculated band depths of the curves.
    ix_depth : ndarray
        A 1-D array of indices needed to order curves (or `depth`) from most to
        least central curve.
    ix_outliers : ndarray
        A 1-D array of indices of outlying curves in `data`.

    See Also
    --------
    banddepth, rainbowplot

    Notes
    -----
    The median curve is the curve with the highest band depth.

    Outliers are defined as curves that fall outside the band created by
    multiplying the central region by `wfactor`.  Note that the range over
    which they fall outside this band does not matter, a single data point
    outside the band is enough.  If the data is noisy, smoothing may therefore
    be required.

    The non-outlying region is defined as the band made up of all the
    non-outlying curves.

    References
    ----------
    [1] Y. Sun and M.G. Genton, "Functional Boxplots", Journal of Computational
        and Graphical Statistics, vol. 20, pp. 1-19, 2011.
    [2] R.J. Hyndman and H.L. Shang, "Rainbow Plots, Bagplots, and Boxplots for
        Functional Data", vol. 19, pp. 29-45, 2010.

    Examples
    --------
    Load the El Nino dataset.  Consists of 60 years worth of Pacific Ocean sea
    surface temperature data.

    >>> import matplotlib.pyplot as plt
    >>> import statsmodels.api as sm
    >>> data = sm.datasets.elnino.load(as_pandas=False)

    Create a functional boxplot.  We see that the years 1982-83 and 1997-98 are
    outliers; these are the years where El Nino (a climate pattern
    characterized by warming up of the sea surface and higher air pressures)
    occurred with unusual intensity.

    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> res = sm.graphics.fboxplot(data.raw_data[:, 1:], wfactor=2.58,
    ...                            labels=data.raw_data[:, 0].astype(int),
    ...                            ax=ax)

    >>> ax.set_xlabel("Month of the year")
    >>> ax.set_ylabel("Sea surface temperature (C)")
    >>> ax.set_xticks(np.arange(13, step=3) - 1)
    >>> ax.set_xticklabels(["", "Mar", "Jun", "Sep", "Dec"])
    >>> ax.set_xlim([-0.2, 11.2])

    >>> plt.show()

    .. plot:: plots/graphics_functional_fboxplot.py
    """
    fig, ax = utils.create_mpl_ax(ax)

    # Work on a copy so that filling in defaults never alters the
    # dictionary owned by the caller.
    plot_opts = {} if plot_opts is None else dict(plot_opts)
    if plot_opts.get('cmap_outliers') is None:
        from matplotlib.cm import rainbow_r
        plot_opts['cmap_outliers'] = rainbow_r

    data = np.asarray(data)
    if xdata is None:
        xdata = np.arange(data.shape[1])

    # Calculate band depth if required.
    if depth is None:
        if method not in ['MBD', 'BD2']:
            raise ValueError("Unknown value for parameter `method`.")

        depth = banddepth(data, method=method)
    else:
        # Accept any array_like, not only ndarrays.
        depth = np.asarray(depth)
        if depth.size != data.shape[0]:
            raise ValueError("Provided `depth` array is not of correct size.")

    # Inner area is 25%-75% region of band-depth ordered curves.
    ix_depth = np.argsort(depth)[::-1]
    median_curve = data[ix_depth[0], :]
    ix_IQR = data.shape[0] // 2
    central = data[ix_depth[0:ix_IQR], :]
    lower = central.min(axis=0)
    upper = central.max(axis=0)

    # Determine region for outlier detection
    inner_median = np.median(central, axis=0)
    lower_fence = inner_median - (inner_median - lower) * wfactor
    upper_fence = inner_median + (upper - inner_median) * wfactor

    # Find outliers: a single point outside the fences is enough.
    is_outlier = (np.any(data > upper_fence, axis=1) |
                  np.any(data < lower_fence, axis=1))
    ix_outliers = np.flatnonzero(is_outlier)
    ix_nonout = np.flatnonzero(~is_outlier)

    # Plot envelope of all non-outlying data
    lower_nonout = data[ix_nonout, :].min(axis=0)
    upper_nonout = data[ix_nonout, :].max(axis=0)
    ax.fill_between(xdata, lower_nonout, upper_nonout,
                    color=plot_opts.get('c_outer', (0.75, 0.75, 0.75)))

    # Plot central 50% region
    ax.fill_between(xdata, lower, upper,
                    color=plot_opts.get('c_inner', (0.5, 0.5, 0.5)))

    # Plot median curve
    ax.plot(xdata, median_curve, color=plot_opts.get('c_median', 'k'),
            lw=plot_opts.get('lw_median', 2))

    # Plot outliers.  Colors are spread over the whole colormap; with a
    # single outlier the span is clamped to 1 to avoid dividing by zero.
    cmap = plot_opts.get('cmap_outliers')
    color_span = max(len(ix_outliers) - 1, 1)
    for ii, ix in enumerate(ix_outliers):
        label = str(labels[ix]) if labels is not None else None
        ax.plot(xdata, data[ix, :],
                color=cmap(float(ii) / color_span), label=label,
                lw=plot_opts.get('lw_outliers', 1))

    if plot_opts.get('draw_nonout', False):
        for ix in ix_nonout:
            ax.plot(xdata, data[ix, :], 'k-', lw=0.5)

    if labels is not None:
        ax.legend()

    return fig, depth, ix_depth, ix_outliers
def rainbowplot(data, xdata=None, depth=None, method='MBD', ax=None,
                cmap=None):
    """
    Create a rainbow plot for a set of curves.

    A rainbow plot contains line plots of all curves in the dataset, colored in
    order of functional depth.  The median curve is shown in black.

    Parameters
    ----------
    data : sequence of ndarrays or 2-D ndarray
        The vectors of functions to create a functional boxplot from.  If a
        sequence of 1-D arrays, these should all be the same size.
        The first axis is the function index, the second axis the one along
        which the function is defined.  So ``data[0, :]`` is the first
        functional curve.
    xdata : ndarray, optional
        The independent variable for the data.  If not given, it is assumed to
        be an array of integers 0..N-1 with N the length of the vectors in
        `data`.
    depth : ndarray, optional
        A 1-D array of band depths for `data`, or equivalent order statistic.
        If not given, it will be calculated through `banddepth`.
    method : {'MBD', 'BD2'}, optional
        The method to use to calculate the band depth.  Default is 'MBD'.
    ax : AxesSubplot, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    cmap : Matplotlib LinearSegmentedColormap instance, optional
        The colormap used to color curves with.  Default is a rainbow colormap,
        with red used for the most central and purple for the least central
        curves.

    Returns
    -------
    Figure
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.

    See Also
    --------
    banddepth, fboxplot

    References
    ----------
    [1] R.J. Hyndman and H.L. Shang, "Rainbow Plots, Bagplots, and Boxplots for
        Functional Data", vol. 19, pp. 29-45, 2010.

    Examples
    --------
    Load the El Nino dataset.  Consists of 60 years worth of Pacific Ocean sea
    surface temperature data.

    >>> import matplotlib.pyplot as plt
    >>> import statsmodels.api as sm
    >>> data = sm.datasets.elnino.load(as_pandas=False)

    Create a rainbow plot:

    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> res = sm.graphics.rainbowplot(data.raw_data[:, 1:], ax=ax)

    >>> ax.set_xlabel("Month of the year")
    >>> ax.set_ylabel("Sea surface temperature (C)")
    >>> ax.set_xticks(np.arange(13, step=3) - 1)
    >>> ax.set_xticklabels(["", "Mar", "Jun", "Sep", "Dec"])
    >>> ax.set_xlim([-0.2, 11.2])
    >>> plt.show()

    .. plot:: plots/graphics_functional_rainbowplot.py
    """
    fig, ax = utils.create_mpl_ax(ax)

    if cmap is None:
        from matplotlib.cm import rainbow_r
        cmap = rainbow_r

    data = np.asarray(data)
    if xdata is None:
        xdata = np.arange(data.shape[1])

    # Calculate band depth if required.
    if depth is None:
        if method not in ['MBD', 'BD2']:
            raise ValueError("Unknown value for parameter `method`.")

        depth = banddepth(data, method=method)
    else:
        # Accept any array_like, not only ndarrays.
        depth = np.asarray(depth)
        if depth.size != data.shape[0]:
            raise ValueError("Provided `depth` array is not of correct size.")

    ix_depth = np.argsort(depth)[::-1]

    # Plot all curves, colored by depth.  The colormap position runs from
    # 0 (most central) to 1 (least central); with a single curve the span
    # is clamped to 1 to avoid dividing by zero.
    num_curves = data.shape[0]
    color_span = max(num_curves - 1., 1.)
    for ii in range(num_curves):
        ax.plot(xdata, data[ix_depth[ii], :], c=cmap(ii / color_span))

    # Plot the median curve
    median_curve = data[ix_depth[0], :]
    ax.plot(xdata, median_curve, 'k-', lw=2)

    return fig
def banddepth(data, method='MBD'):
    """
    Calculate the band depth for a set of functional curves.

    Band depth is an order statistic for functional data (see `fboxplot`), with
    a higher band depth indicating larger "centrality".  In analog to scalar
    data, the functional curve with highest band depth is called the median
    curve, and the band made up from the first N/2 of N curves is the 50%
    central region.

    Parameters
    ----------
    data : ndarray
        The vectors of functions to create a functional boxplot from.
        The first axis is the function index, the second axis the one along
        which the function is defined.  So ``data[0, :]`` is the first
        functional curve.
    method : {'MBD', 'BD2'}, optional
        Whether to use the original band depth (with J=2) of [1]_ or the
        modified band depth.  See Notes for details.

    Returns
    -------
    ndarray
        Depth values for functional curves.  All zeros when fewer than two
        curves are given, since no band can be formed.

    Raises
    ------
    ValueError
        If `method` is neither 'MBD' nor 'BD2'.

    Notes
    -----
    Functional band depth as an order statistic for functional data was
    proposed in [1]_ and applied to functional boxplots and bagplots in [2]_.

    The method 'BD2' checks for each curve whether it lies completely inside
    bands constructed from two curves.  All permutations of two curves in the
    set of curves are used, and the band depth is normalized to one.  Due to
    the complete curve having to fall within the band, this method yields a lot
    of ties.

    The method 'MBD' is similar to 'BD2', but checks the fraction of the curve
    falling within the bands.  It therefore generates very few ties.

    References
    ----------
    .. [1] S. Lopez-Pintado and J. Romo, "On the Concept of Depth for
           Functional Data", Journal of the American Statistical Association,
           vol. 104, pp. 718-734, 2009.
    .. [2] Y. Sun and M.G. Genton, "Functional Boxplots", Journal of
           Computational and Graphical Statistics, vol. 20, pp. 1-19, 2011.
    """
    if method == 'BD2':
        whole_curve = True
    elif method == 'MBD':
        whole_curve = False
    else:
        raise ValueError("Unknown input value for parameter `method`.")

    data = np.asarray(data)
    num = data.shape[0]
    depth = np.zeros(num)

    # Number of distinct pairs of curves ("num choose 2").  Computed with
    # integers: a ratio of floating point factorials overflows to infinity
    # for more than 170 curves and would turn every depth into zero.
    n_pairs = num * (num - 1) // 2
    if n_pairs == 0:
        return depth

    n_points = float(data.shape[1])
    for ix1, ix2 in combinations(range(num), 2):
        # Band delimited by the two curves; it is built once per pair and
        # then compared against all curves at the same time.
        lower = np.minimum(data[ix1], data[ix2])
        upper = np.maximum(data[ix1], data[ix2])
        if whole_curve:
            # BD2: the curve scores 1 only if no point leaves the band.
            outside = (data < lower) | (data > upper)
            depth += ~outside.any(axis=1)
        else:
            # MBD: fraction of the points of the curve lying in the band.
            inside = (data >= lower) & (data <= upper)
            depth += inside.sum(axis=1) / n_points

    # Normalize by number of combinations to get band depth
    return depth / n_pairs