Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1"""Variations on boxplots.""" 

2 

3# Author: Ralf Gommers 

4# Based on code by Flavio Coelho and Teemu Ikonen. 

5 

6import numpy as np 

7from scipy.stats import gaussian_kde 

8 

9from . import utils 

10 

11 

12__all__ = ['violinplot', 'beanplot'] 

13 

14 

def violinplot(data, ax=None, labels=None, positions=None, side='both',
               show_boxplot=True, plot_opts=None):
    """
    Make a violin plot of each dataset in the `data` sequence.

    A violin plot is a boxplot combined with a kernel density estimate of the
    probability density function per point.

    Parameters
    ----------
    data : sequence[array_like]
        Data arrays, one array per value in `positions`.
    ax : AxesSubplot, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    labels : list[str], optional
        Tick labels for the horizontal axis.  If not given, integers
        ``1..len(data)`` are used.
    positions : array_like, optional
        Position array, used as the horizontal axis of the plot.  If not given,
        spacing of the violins will be equidistant.
    side : {'both', 'left', 'right'}, optional
        How to plot the violin.  Default is 'both'.  The 'left', 'right'
        options can be used to create asymmetric violin plots.
    show_boxplot : bool, optional
        Whether or not to show normal box plots on top of the violins.
        Default is True.
    plot_opts : dict, optional
        A dictionary with plotting options.  Any of the following can be
        provided, if not present in `plot_opts` the defaults will be used::

          - 'violin_fc', MPL color.  Fill color for violins.  Default is
            '#66c2a5'.
          - 'violin_ec', MPL color.  Edge color for violins.  Default is 'k'.
          - 'violin_lw', scalar.  Edge linewidth for violins.  Default is 1.
          - 'violin_alpha', float.  Transparency of violins.  Default is 0.5.
          - 'cutoff', bool.  If True, limit violin range to data range.
                Default is False.
          - 'cutoff_val', scalar.  How far the violins extend beyond the
                data range when `cutoff` is False.  Default is 1.5 standard
                deviations.
          - 'cutoff_type', {'std', 'abs'}.  Whether cutoff value is absolute,
                or in standard deviations.  Default is 'std'.
          - 'violin_width' : float.  Relative width of violins.  Max available
                space is 1, default is 0.8.
          - 'label_fontsize', MPL fontsize.  Adjusts fontsize only if given.
          - 'label_rotation', scalar.  Adjusts label rotation only if given.
                Specify in degrees.
          - 'bw_factor', Adjusts the scipy gaussian_kde kernel (it is passed
                on as `bw_method`).  default: None.
                Options for scalar or callable.

    Returns
    -------
    Figure
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.

    Raises
    ------
    ValueError
        If `data` contains no observations at all, if `side` is not one of
        the accepted values, or if `labels` is given with a length different
        from `data`.

    See Also
    --------
    beanplot : Bean plot, builds on `violinplot`.
    matplotlib.pyplot.boxplot : Standard boxplot.

    Notes
    -----
    The appearance of violins can be customized with `plot_opts`.  If
    customization of boxplot elements is required, set `show_boxplot` to False
    and plot it on top of the violins by calling the Matplotlib `boxplot`
    function directly.  For example::

        violinplot(data, ax=ax, show_boxplot=False)
        ax.boxplot(data, sym='cv', whis=2.5)

    It can happen that the axis labels or tick labels fall outside the plot
    area, especially with rotated labels on the horizontal axis.  With
    Matplotlib 1.1 or higher, this can easily be fixed by calling
    ``fig.tight_layout()``.  With older Matplotlib one has to use ``plt.rc`` or
    ``plt.rcParams`` to fix this, for example::

        plt.rc('figure.subplot', bottom=0.25)
        violinplot(data, ax=ax)

    References
    ----------
    J.L. Hintze and R.D. Nelson, "Violin Plots: A Box Plot-Density Trace
    Synergism", The American Statistician, Vol. 52, pp.181-84, 1998.

    Examples
    --------
    We use the American National Election Survey 1996 dataset, which has Party
    Identification of respondents as independent variable and (among other
    data) age as dependent variable.

    >>> data = sm.datasets.anes96.load_pandas()
    >>> party_ID = np.arange(7)
    >>> labels = ["Strong Democrat", "Weak Democrat", "Independent-Democrat",
    ...           "Independent-Independent", "Independent-Republican",
    ...           "Weak Republican", "Strong Republican"]

    Group age by party ID, and create a violin plot with it:

    >>> plt.rcParams['figure.subplot.bottom'] = 0.23  # keep labels visible
    >>> age = [data.exog['age'][data.endog == id] for id in party_ID]
    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> sm.graphics.violinplot(age, ax=ax, labels=labels,
    ...                        plot_opts={'cutoff_val':5, 'cutoff_type':'abs',
    ...                                   'label_fontsize':'small',
    ...                                   'label_rotation':30})
    >>> ax.set_xlabel("Party identification of respondent.")
    >>> ax.set_ylabel("Age")
    >>> plt.show()

    .. plot:: plots/graphics_boxplot_violinplot.py
    """
    plot_opts = {} if plot_opts is None else plot_opts

    # Materialize the datasets before inspecting them, so that a one-shot
    # iterable (e.g. a generator) is not exhausted by the emptiness check.
    data = list(map(np.asarray, data))

    # Fail early, before any figure is created.  `any` over an empty sequence
    # is False, so an empty `data` gets the same explicit message as a
    # sequence that holds only empty arrays.
    if not any(arr.size for arr in data):
        msg = "No Data to make Violin: Try again!"
        raise ValueError(msg)

    fig, ax = utils.create_mpl_ax(ax)

    if positions is None:
        positions = np.arange(len(data)) + 1

    # Determine available horizontal space for each individual violin.
    pos_span = np.max(positions) - np.min(positions)
    width = np.min([0.15 * np.max([pos_span, 1.]),
                    plot_opts.get('violin_width', 0.8) / 2.])

    # Plot violins.
    for pos_data, pos in zip(data, positions):
        _single_violin(ax, pos, pos_data, width, side, plot_opts)

    if show_boxplot:
        ax.boxplot(data, notch=1, positions=positions, vert=1)

    # Set ticks and tick labels of horizontal axis.
    _set_ticks_labels(ax, data, labels, positions, plot_opts)

    return fig

154 

155 

def _single_violin(ax, pos, pos_data, width, side, plot_opts):
    """
    Draw one violin on `ax` and return its density profile.

    A Gaussian kernel density estimate of `pos_data` is evaluated on 100
    evenly spaced points, rescaled so that its maximum equals `width`, and
    filled in around the horizontal position `pos`.

    Parameters
    ----------
    ax : object
        Axes-like object; only its ``fill_betweenx`` method is used.
    pos : scalar
        Horizontal position of the violin.
    pos_data : array_like
        Observations for this violin.
    width : scalar
        Half-width given to the peak of the density.
    side : {'both', 'left', 'right'}
        Which half (or both halves) of the violin to draw.
    plot_opts : dict
        Plotting options; the keys read here are 'bw_factor', 'cutoff',
        'cutoff_type', 'cutoff_val', 'violin_fc', 'violin_ec', 'violin_lw'
        and 'violin_alpha'.

    Returns
    -------
    xvals : ndarray
        The 100 points at which the density was evaluated.
    violin : ndarray
        The density at `xvals`, scaled to a maximum of `width`.

    Raises
    ------
    ValueError
        If `side` is not one of 'both', 'left' or 'right'.
    """
    samples = np.asarray(pos_data)

    # Kernel density estimate for the data at this position.
    kde = gaussian_kde(samples, bw_method=plot_opts.get('bw_factor', None))

    # With 'cutoff' set the violin stops at the data range; otherwise it is
    # extended on both ends, by a multiple of the standard deviation or by an
    # absolute amount depending on 'cutoff_type'.
    if plot_opts.get('cutoff', False):
        margin = 0.0
    elif plot_opts.get('cutoff_type', 'std') == 'std':
        margin = plot_opts.get('cutoff_val', 1.5) * np.std(samples)
    else:
        margin = plot_opts.get('cutoff_val', 1.5)

    xvals = np.linspace(kde.dataset.min() - margin,
                        kde.dataset.max() + margin, 100)

    # Scale the density so that its peak fills the available width.
    density = kde.evaluate(xvals)
    violin = width * density / density.max()

    if side not in ('both', 'right', 'left'):
        raise ValueError(
            "`side` parameter should be one of {'left', 'right', 'both'}.")

    # A one-sided violin has a straight edge at `pos` on the omitted side.
    left_edge = pos if side == 'right' else -violin + pos
    right_edge = pos if side == 'left' else violin + pos

    # Draw the violin.
    ax.fill_betweenx(xvals, left_edge, right_edge,
                     facecolor=plot_opts.get('violin_fc', '#66c2a5'),
                     edgecolor=plot_opts.get('violin_ec', 'k'),
                     lw=plot_opts.get('violin_lw', 1),
                     alpha=plot_opts.get('violin_alpha', 0.5))

    return xvals, violin

204 

205 

def _set_ticks_labels(ax, data, labels, positions, plot_opts):
    """
    Set ticks, limits and (optionally) tick labels on the horizontal axis.

    The x-limits are set to half a unit beyond the smallest and largest
    entries of `positions`, and one tick is placed at every position.  When
    `labels` is given it must have one entry per dataset in `data`; the
    'label_fontsize' and 'label_rotation' entries of `plot_opts` are then
    applied to the resulting tick labels.  Those two options have no effect
    when `labels` is None.

    Raises
    ------
    ValueError
        If `labels` is given and its length differs from that of `data`.
    """
    # Set xticks and limits.
    ax.set_xlim([np.min(positions) - 0.5, np.max(positions) + 0.5])
    ax.set_xticks(positions)

    font_size = plot_opts.get('label_fontsize')
    rotation = plot_opts.get('label_rotation')
    if font_size or rotation:
        # Only needed when a label property has to be adjusted.
        from matplotlib.artist import setp

    if labels is None:
        return

    if len(labels) != len(data):
        raise ValueError("Length of `labels` should equal length of `data`.")

    tick_texts = ax.set_xticklabels(labels)
    if font_size:
        setp(tick_texts, fontsize=font_size)
    if rotation:
        setp(tick_texts, rotation=rotation)

231 

232 

def beanplot(data, ax=None, labels=None, positions=None, side='both',
             jitter=False, plot_opts=None):
    """
    Bean plot of each dataset in a sequence.

    A bean plot is a combination of a `violinplot` (kernel density estimate of
    the probability density function per point) with a line-scatter plot of all
    individual data points.

    Parameters
    ----------
    data : sequence[array_like]
        Data arrays, one array per value in `positions`.
    ax : AxesSubplot, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    labels : list[str], optional
        Tick labels for the horizontal axis.  If not given, integers
        ``1..len(data)`` are used.
    positions : array_like, optional
        Position array, used as the horizontal axis of the plot.  If not given,
        spacing of the violins will be equidistant.
    side : {'both', 'left', 'right'}, optional
        How to plot the violin.  Default is 'both'.  The 'left', 'right'
        options can be used to create asymmetric violin plots.
    jitter : bool, optional
        If True, jitter markers within violin instead of plotting regular lines
        around the center.  This can be useful if the data is very dense.
    plot_opts : dict, optional
        A dictionary with plotting options.  All the options for `violinplot`
        can be specified, they will simply be passed to `violinplot`.  Options
        specific to `beanplot` are:

          - 'violin_width' : float.  Relative width of violins.  Max available
                space is 1, default is 0.8.
          - 'bean_color', MPL color.  Color of bean plot lines.  Default is 'k'.
                Also used for jitter marker edge color if `jitter` is True.
          - 'bean_size', scalar.  Line length as a fraction of maximum length.
                Default is 0.5.
          - 'bean_lw', scalar.  Linewidth, default is 0.5.
          - 'bean_show_mean', bool.  If True (default), show mean as a line.
          - 'bean_show_median', bool.  If True (default), show median as a
                marker.
          - 'bean_mean_color', MPL color.  Color of mean line.  Default is 'b'.
          - 'bean_mean_lw', scalar.  Linewidth of mean line, default is 2.
          - 'bean_mean_size', scalar.  Line length as a fraction of maximum
                length.  Default is 0.5.
          - 'bean_median_color', MPL color.  Color of median marker.  Default
                is 'r'.
          - 'bean_median_marker', MPL marker.  Marker type, default is '+'.
          - 'jitter_marker', MPL marker.  Marker type for ``jitter=True``.
                Default is 'o'.
          - 'jitter_marker_size', int.  Marker size.  Default is 4.
          - 'jitter_fc', MPL color.  Jitter marker face color.  Default is
                'none' (unfilled markers).
          - 'bean_legend_text', str.  If given, add a legend with given text.

    Returns
    -------
    Figure
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.

    Raises
    ------
    ValueError
        If `data` contains no observations at all, if `side` is not one of
        the accepted values, or if `labels` is given with a length different
        from `data`.

    See Also
    --------
    violinplot : Violin plot, also used internally in `beanplot`.
    matplotlib.pyplot.boxplot : Standard boxplot.

    References
    ----------
    P. Kampstra, "Beanplot: A Boxplot Alternative for Visual Comparison of
    Distributions", J. Stat. Soft., Vol. 28, pp. 1-9, 2008.

    Examples
    --------
    We use the American National Election Survey 1996 dataset, which has Party
    Identification of respondents as independent variable and (among other
    data) age as dependent variable.

    >>> data = sm.datasets.anes96.load_pandas()
    >>> party_ID = np.arange(7)
    >>> labels = ["Strong Democrat", "Weak Democrat", "Independent-Democrat",
    ...           "Independent-Independent", "Independent-Republican",
    ...           "Weak Republican", "Strong Republican"]

    Group age by party ID, and create a bean plot with it:

    >>> plt.rcParams['figure.subplot.bottom'] = 0.23  # keep labels visible
    >>> age = [data.exog['age'][data.endog == id] for id in party_ID]
    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> sm.graphics.beanplot(age, ax=ax, labels=labels,
    ...                      plot_opts={'cutoff_val':5, 'cutoff_type':'abs',
    ...                                 'label_fontsize':'small',
    ...                                 'label_rotation':30})
    >>> ax.set_xlabel("Party identification of respondent.")
    >>> ax.set_ylabel("Age")
    >>> plt.show()

    .. plot:: plots/graphics_boxplot_beanplot.py
    """
    # `None` instead of a shared mutable `{}` default; an explicit
    # ``plot_opts=None`` is accepted as well, like in `violinplot`.
    plot_opts = {} if plot_opts is None else plot_opts

    data = list(map(np.asarray, data))

    # Fail early with a clear message, before any figure is created, rather
    # than with an error raised from inside the density estimation.
    if not any(arr.size for arr in data):
        raise ValueError("No data to make a bean plot.")

    fig, ax = utils.create_mpl_ax(ax)

    if positions is None:
        positions = np.arange(len(data)) + 1

    # Determine available horizontal space for each individual violin.  The
    # same upper bound applies to the violins, the bean lines and the mean
    # lines.
    pos_span = np.max(positions) - np.min(positions)
    max_half_width = 0.15 * np.max([pos_span, 1.])
    violin_width = np.min([max_half_width,
                           plot_opts.get('violin_width', 0.8) / 2.])
    bean_width = np.min([max_half_width,
                         plot_opts.get('bean_size', 0.5) / 2.])
    bean_mean_width = np.min([max_half_width,
                              plot_opts.get('bean_mean_size', 0.5) / 2.])

    legend_txt = plot_opts.get('bean_legend_text', None)
    for pos_data, pos in zip(data, positions):
        # Draw violins.
        xvals, violin = _single_violin(ax, pos, pos_data, violin_width, side,
                                       plot_opts)

        if jitter:
            # Draw data points at random coordinates within violin envelope.
            jitter_coord = pos + _jitter_envelope(pos_data, xvals, violin,
                                                  side)
            ax.plot(jitter_coord, pos_data, ls='',
                    marker=plot_opts.get('jitter_marker', 'o'),
                    ms=plot_opts.get('jitter_marker_size', 4),
                    mec=plot_opts.get('bean_color', 'k'),
                    mew=1, mfc=plot_opts.get('jitter_fc', 'none'),
                    label=legend_txt)
        else:
            # Draw bean lines.
            ax.hlines(pos_data, pos - bean_width, pos + bean_width,
                      lw=plot_opts.get('bean_lw', 0.5),
                      color=plot_opts.get('bean_color', 'k'),
                      label=legend_txt)

        # Show legend if required.
        if legend_txt is not None:
            _show_legend(ax)
            legend_txt = None  # ensure we get one entry per call to beanplot

        # Draw mean line.
        if plot_opts.get('bean_show_mean', True):
            ax.hlines(np.mean(pos_data), pos - bean_mean_width,
                      pos + bean_mean_width,
                      lw=plot_opts.get('bean_mean_lw', 2.),
                      color=plot_opts.get('bean_mean_color', 'b'))

        # Draw median marker.
        if plot_opts.get('bean_show_median', True):
            ax.plot(pos, np.median(pos_data),
                    marker=plot_opts.get('bean_median_marker', '+'),
                    color=plot_opts.get('bean_median_color', 'r'))

    # Set ticks and tick labels of horizontal axis.
    _set_ticks_labels(ax, data, labels, positions, plot_opts)

    return fig

390 

391 

def _jitter_envelope(pos_data, xvals, violin, side):
    """
    Return random horizontal offsets that keep markers inside the violin.

    The violin outline (`violin` as a function of `xvals`) is interpolated at
    every value in `pos_data`, and each interpolated half-width is multiplied
    by a uniform random factor: from [-1, 1) for ``side='both'``, [0, 1) for
    ``'right'`` and [-1, 0) for ``'left'``.

    Raises
    ------
    ValueError
        If `side` is not one of 'both', 'left' or 'right'.
    """
    if side not in ('both', 'right', 'left'):
        raise ValueError("`side` input incorrect: %s" % side)

    # A one-sided violin only leaves room on one side of the center line.
    low = 0 if side == 'right' else -1.
    high = 0 if side == 'left' else 1.

    half_widths = np.interp(pos_data, xvals, violin)
    fractions = np.random.uniform(low=low, high=high, size=pos_data.size)
    return half_widths * fractions

408 

409 

def _show_legend(ax):
    """
    Add a compact legend to `ax`.

    The legend is placed in the upper right corner with a shadowed, rounded
    box; its text is set to a small font and its line handles to a linewidth
    of 1.
    """
    # Imported here so that Matplotlib is only needed once a legend is
    # actually requested.
    from matplotlib.artist import setp

    # 'upper right' is the named form of Matplotlib's numeric location code 1.
    leg = ax.legend(loc='upper right', shadow=True, fancybox=True,
                    labelspacing=0.2, borderpad=0.15)

    setp(leg.get_texts(), fontsize='small')
    setp(leg.get_lines(), linewidth=1)