1"""
2Univariate Kernel Density Estimators
4References
5----------
6Racine, Jeff. (2008) "Nonparametric Econometrics: A Primer," Foundation and
7 Trends in Econometrics: Vol 3: No 1, pp1-88.
8 http://dx.doi.org/10.1561/0800000009
10https://en.wikipedia.org/wiki/Kernel_%28statistics%29
12Silverman, B.W. Density Estimation for Statistics and Data Analysis.
13"""
import numpy as np
from scipy import integrate, stats
from statsmodels.sandbox.nonparametric import kernels
from statsmodels.tools.decorators import cache_readonly
from statsmodels.tools.validation import array_like
from . import bandwidths
from .kdetools import (forrt, revrt, silverman_transform)
from .linbin import fast_linbin

#### Kernels Switch for estimators ####
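
# Maps the `kernel` argument of KDEUnivariate.fit, kdensity, and kdensityfft
# to the kernel classes implemented in statsmodels.sandbox.nonparametric.kernels.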
kernel_switch = dict(gau=kernels.Gaussian, epa=kernels.Epanechnikov,
                     uni=kernels.Uniform, tri=kernels.Triangular,
                     biw=kernels.Biweight, triw=kernels.Triweight,
                     cos=kernels.Cosine, cos2=kernels.Cosine2)

def _checkisfit(self):
    try:
        self.density
    except AttributeError:
        raise ValueError("Call fit to fit the density first")


#### Kernel Density Estimator Class ###


class KDEUnivariate(object):
41 """
42 Univariate Kernel Density Estimator.
44 Parameters
45 ----------
46 endog : array_like
47 The variable for which the density estimate is desired.
49 Notes
50 -----
51 If cdf, sf, cumhazard, or entropy are computed, they are computed based on
52 the definition of the kernel rather than the FFT approximation, even if
53 the density is fit with FFT = True.
55 `KDEUnivariate` is much faster than `KDEMultivariate`, due to its FFT-based
56 implementation. It should be preferred for univariate, continuous data.
57 `KDEMultivariate` also supports mixed data.
59 See Also
60 --------
61 KDEMultivariate
62 kdensity, kdensityfft
64 Examples
65 --------
66 >>> import statsmodels.api as sm
67 >>> import matplotlib.pyplot as plt
69 >>> nobs = 300
70 >>> np.random.seed(1234) # Seed random generator
71 >>> dens = sm.nonparametric.KDEUnivariate(np.random.normal(size=nobs))
72 >>> dens.fit()
73 >>> plt.plot(dens.cdf)
74 >>> plt.show()
75 """

    def __init__(self, endog):
        self.endog = array_like(endog, "endog", ndim=1, contiguous=True)

    def fit(self, kernel="gau", bw="normal_reference", fft=True, weights=None,
            gridsize=None, adjust=1, cut=3, clip=(-np.inf, np.inf)):
        """
        Attach the density estimate to the KDEUnivariate class.

        Parameters
        ----------
        kernel : str
            The Kernel to be used. Choices are:

            - "biw" for biweight
            - "cos" for cosine
            - "epa" for Epanechnikov
            - "gau" for Gaussian
            - "tri" for triangular
            - "triw" for triweight
            - "uni" for uniform

        bw : str, float
            The bandwidth to use. Choices are:

            - "scott" - 1.059 * A * nobs ** (-1/5.), where A is
              `min(std(X), IQR/1.34)`
            - "silverman" - .9 * A * nobs ** (-1/5.), where A is
              `min(std(X), IQR/1.34)`
            - "normal_reference" - C * A * nobs ** (-1/5.), where C is
              calculated from the kernel. Equivalent (up to 2 dp) to the
              "scott" bandwidth for gaussian kernels. See bandwidths.py.
            - If a float is given, it is the bandwidth.

        fft : bool
            Whether or not to use FFT. The FFT implementation is more
            computationally efficient, but only the Gaussian kernel is
            available with it. If fft is False, an intermediate
            'nobs' x 'gridsize' array is created.
        weights : array_like or None
            Optional weights. Not implemented when fft is True. If an X value
            is clipped, its weight is also dropped.
        gridsize : int
            If gridsize is None, max(len(X), 50) is used. When fft is True,
            the grid size is max(len(X), 512), rounded up to the next power
            of 2.
        adjust : float
            An adjustment factor for the bw. Bandwidth becomes bw * adjust.
        cut : float
            Defines the length of the grid past the lowest and highest values
            of X so that the kernel goes to zero. The end points are
            min(X) - cut * bw and max(X) + cut * bw.
        clip : tuple
            Observations in X that are outside of the range given by clip are
            dropped. The number of observations in X is then shortened.
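
        Examples
        --------
        A minimal usage sketch; the bandwidth and adjustment values below are
        illustrative only, not recommendations:

        >>> import numpy as np
        >>> import statsmodels.api as sm
        >>> np.random.seed(12345)
        >>> dens = sm.nonparametric.KDEUnivariate(np.random.standard_normal(500))
        >>> dens.fit(kernel="gau", bw=0.5)        # user-given bandwidth
        >>> support, density = dens.support, dens.density
        >>> dens.fit(bw="silverman", adjust=0.5)  # half the rule-of-thumb bandwidth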
123 """
        try:
            bw = float(bw)
            self.bw_method = "user-given"
        except (ValueError, TypeError):
            self.bw_method = bw
        endog = self.endog

        if fft:
            if kernel != "gau":
                msg = "Only gaussian kernel is available for fft"
                raise NotImplementedError(msg)
            if weights is not None:
                msg = "Weights are not implemented for fft"
                raise NotImplementedError(msg)
            density, grid, bw = kdensityfft(endog, kernel=kernel, bw=bw,
                                            adjust=adjust, weights=weights,
                                            gridsize=gridsize, clip=clip,
                                            cut=cut)
        else:
            density, grid, bw = kdensity(endog, kernel=kernel, bw=bw,
                                         adjust=adjust, weights=weights,
                                         gridsize=gridsize, clip=clip,
                                         cut=cut)
        self.density = density
        self.support = grid
        self.bw = bw
        self.kernel = kernel_switch[kernel](h=bw)  # we instantiate twice;
        # should this be passed to the functions?
        # put here to ensure empty cache after re-fit with new options
        self.kernel.weights = weights
        if weights is not None:
            self.kernel.weights /= weights.sum()
        self._cache = {}

    @cache_readonly
    def cdf(self):
        """
        Returns the cumulative distribution function evaluated at the support.

        Notes
        -----
        Will not work if fit has not been called.
        """
        _checkisfit(self)
        kern = self.kernel
        if kern.domain is None:  # TODO: test for grid point at domain bound
            a, b = -np.inf, np.inf
        else:
            a, b = kern.domain
        func = lambda x, s: kern.density(s, x)

        support = self.support
        support = np.r_[a, support]
        gridsize = len(support)
        endog = self.endog
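        # Numerically integrate the kernel density between successive support
        # points, then accumulate the pieces to obtain the CDF on the support.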
        probs = [integrate.quad(func, support[i - 1], support[i],
                                args=endog)[0] for i in range(1, gridsize)]
        return np.cumsum(probs)

    @cache_readonly
    def cumhazard(self):
        """
        Returns the cumulative hazard, -log(sf), evaluated at the support.

        Notes
        -----
        Will not work if fit has not been called.
        """
        _checkisfit(self)
        return -np.log(self.sf)

    @cache_readonly
    def sf(self):
        """
        Returns the survival function evaluated at the support.

        Notes
        -----
        Will not work if fit has not been called.
        """
        _checkisfit(self)
        return 1 - self.cdf

    @cache_readonly
    def entropy(self):
        """
        Returns the differential entropy of the estimated density.

        Notes
        -----
        Will not work if fit has not been called. 1e-12 is added to each
        probability to ensure that log(0) is not called.
        """
        _checkisfit(self)

        def entr(x, s):
            pdf = kern.density(s, x)
            return pdf * np.log(pdf + 1e-12)

        kern = self.kernel

        if kern.domain is not None:
            a, b = kern.domain
        else:
            a, b = -np.inf, np.inf
        endog = self.endog
        # TODO: below could run into integration problems, cf. stats.dist._entropy
        return -integrate.quad(entr, a, b, args=(endog,))[0]

    @cache_readonly
    def icdf(self):
        """
        Inverse Cumulative Distribution (Quantile) Function

        Notes
        -----
        Will not work if fit has not been called. Uses
        `scipy.stats.mstats.mquantiles`.
        """
        _checkisfit(self)
        gridsize = len(self.density)
        return stats.mstats.mquantiles(self.endog, np.linspace(0, 1, gridsize))

    def evaluate(self, point):
        """
        Evaluate density at a single point.

        Parameters
        ----------
        point : float
            Point at which to evaluate the density.
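
        Returns
        -------
        float
            The estimated density at `point`, computed directly from the
            kernel definition rather than the FFT approximation.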
253 """
254 _checkisfit(self)
255 return self.kernel.density(self.endog, point)


#### Kernel Density Estimator Functions ####

def kdensity(X, kernel="gau", bw="normal_reference", weights=None, gridsize=None,
             adjust=1, clip=(-np.inf, np.inf), cut=3, retgrid=True):
    """
    Rosenblatt-Parzen univariate kernel density estimator.

    Parameters
    ----------
    X : array_like
        The variable for which the density estimate is desired.
    kernel : str
        The Kernel to be used. Choices are:

        - "biw" for biweight
        - "cos" for cosine
        - "epa" for Epanechnikov
        - "gau" for Gaussian
        - "tri" for triangular
        - "triw" for triweight
        - "uni" for uniform

    bw : str, float
        "scott" - 1.059 * A * nobs ** (-1/5.), where A is min(std(X), IQR/1.34)
        "silverman" - .9 * A * nobs ** (-1/5.), where A is min(std(X), IQR/1.34)
        If a float is given, it is the bandwidth.
    weights : array or None
        Optional weights. If the X value is clipped, then this weight is
        also dropped.
    gridsize : int
        If gridsize is None, max(len(X), 50) is used.
    adjust : float
        An adjustment factor for the bw. Bandwidth becomes bw * adjust.
    clip : tuple
        Observations in X that are outside of the range given by clip are
        dropped. The number of observations in X is then shortened.
    cut : float
        Defines the length of the grid past the lowest and highest values of X
        so that the kernel goes to zero. The end points are
        min(X) - cut * bw and max(X) + cut * bw.
    retgrid : bool
        Whether or not to return the grid over which the density is estimated.

    Returns
    -------
    density : ndarray
        The densities estimated at the grid points.
    grid : ndarray, optional
        The grid points at which the density is estimated. Only returned if
        `retgrid` is True.
    bw : float
        The bandwidth that was used.

    Notes
    -----
    Creates an intermediate (`gridsize` x `nobs`) array. Use FFT for a more
    computationally efficient version.
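
    Examples
    --------
    A minimal sketch of direct use (most users should prefer `KDEUnivariate`;
    the kernel and bandwidth choices below are illustrative only):

    >>> import numpy as np
    >>> np.random.seed(12345)
    >>> x = np.random.standard_normal(200)
    >>> dens, grid, bw = kdensity(x, kernel="epa", bw="silverman")
    >>> dens.shape == grid.shape
    True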
310 """
    X = np.asarray(X)
    if X.ndim == 1:
        X = X[:, None]
    clip_x = np.logical_and(X > clip[0], X < clip[1])
    X = X[clip_x]

    nobs = len(X)  # after trim

    if gridsize is None:
        gridsize = max(nobs, 50)  # do not need to resize if no FFT

    # handle weights
    if weights is None:
        weights = np.ones(nobs)
        q = nobs
    else:
        # ensure weights is a numpy array
        weights = np.asarray(weights)

        if len(weights) != len(clip_x):
            msg = "The length of the weights must be the same as the given X."
            raise ValueError(msg)
        weights = weights[clip_x.squeeze()]
        q = weights.sum()

    # Get kernel object corresponding to selection
    kern = kernel_switch[kernel]()

    # if bw is a string, select the optimal bandwidth for the kernel
    try:
        bw = float(bw)
    except (ValueError, TypeError):
        bw = bandwidths.select_bandwidth(X, bw, kern)
    bw *= adjust

    a = np.min(X, axis=0) - cut * bw
    b = np.max(X, axis=0) + cut * bw
    grid = np.linspace(a, b, gridsize)

    k = (X.T - grid[:, None]) / bw  # uses broadcasting to make a gridsize x nobs array

    # set kernel bandwidth
    kern.seth(bw)

    # truncate to domain
    if kern.domain is not None:  # will not work for piecewise kernels like parzen
        z_lo, z_high = kern.domain
        domain_mask = (k < z_lo) | (k > z_high)
        k = kern(k)  # estimate density
        k[domain_mask] = 0
    else:
        k = kern(k)  # estimate density

    k[k < 0] = 0  # get rid of any negative values, do we need this?
    dens = np.dot(k, weights) / (q * bw)

    if retgrid:
        return dens, grid, bw
    else:
        return dens, bw

def kdensityfft(X, kernel="gau", bw="normal_reference", weights=None, gridsize=None,
                adjust=1, clip=(-np.inf, np.inf), cut=3, retgrid=True):
    """
    Rosenblatt-Parzen univariate kernel density estimator.

    Parameters
    ----------
    X : array_like
        The variable for which the density estimate is desired.
    kernel : str
        ONLY GAUSSIAN IS CURRENTLY IMPLEMENTED. Choices are:

        - "biw" for biweight
        - "cos" for cosine
        - "epa" for Epanechnikov
        - "gau" for Gaussian
        - "tri" for triangular
        - "triw" for triweight
        - "uni" for uniform

    bw : str, float
        "scott" - 1.059 * A * nobs ** (-1/5.), where A is min(std(X), IQR/1.34)
        "silverman" - .9 * A * nobs ** (-1/5.), where A is min(std(X), IQR/1.34)
        If a float is given, it is the bandwidth.
    weights : array or None
        WEIGHTS ARE NOT CURRENTLY IMPLEMENTED.
        Optional weights. If the X value is clipped, then this weight is
        also dropped.
    gridsize : int
        If gridsize is None, max(len(X), 512) is used. Note that the provided
        number is rounded up to the next highest power of 2.
    adjust : float
        An adjustment factor for the bw. Bandwidth becomes bw * adjust.
    clip : tuple
        Observations in X that are outside of the range given by clip are
        dropped. The number of observations in X is then shortened.
    cut : float
        Defines the length of the grid past the lowest and highest values of X
        so that the kernel goes to zero. The end points are
        X.min() - cut * bw and X.max() + cut * bw.
    retgrid : bool
        Whether or not to return the grid over which the density is estimated.

    Returns
    -------
    density : ndarray
        The densities estimated at the grid points.
    grid : ndarray, optional
        The grid points at which the density is estimated. Only returned if
        `retgrid` is True.
    bw : float
        The bandwidth that was used.

    Notes
    -----
    Only the default kernel is implemented. Weights are not implemented yet.
    This follows Silverman (1982) with changes suggested by Jones and Lotwick
    (1984). However, the discretization step is replaced by the linear binning
    of Fan and Marron (1994). This should be extended to accept the parts
    that are dependent only on the data to speed things up for
    cross-validation.

    References
    ----------
    Fan, J. and J.S. Marron. (1994) `Fast implementations of nonparametric
        curve estimators`. Journal of Computational and Graphical Statistics.
        3.1, 35-56.
    Jones, M.C. and H.W. Lotwick. (1984) `Remark AS R50: A Remark on Algorithm
        AS 176. Kernel Density Estimation Using the Fast Fourier Transform`.
        Journal of the Royal Statistical Society. Series C. 33.1, 120-2.
    Silverman, B.W. (1982) `Algorithm AS 176. Kernel density estimation using
        the Fast Fourier Transform`. Journal of the Royal Statistical Society.
        Series C. 31.2, 93-9.
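
    Examples
    --------
    A minimal sketch of direct use; `KDEUnivariate` calls this function when
    `fft=True` and is the recommended entry point:

    >>> import numpy as np
    >>> np.random.seed(12345)
    >>> x = np.random.standard_normal(1000)
    >>> dens, grid, bw = kdensityfft(x, kernel="gau", bw="silverman")
    >>> dens.shape == grid.shape  # gridsize rounded up to a power of 2 (1024 here)
    True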
442 """
    X = np.asarray(X)
    X = X[np.logical_and(X > clip[0], X < clip[1])]  # will not work for two columns.
    # will affect underlying data?

    # Get kernel object corresponding to selection
    kern = kernel_switch[kernel]()

    try:
        bw = float(bw)
    except (ValueError, TypeError):
        bw = bandwidths.select_bandwidth(X, bw, kern)  # will cross-val fit this pattern?
    bw *= adjust

    nobs = len(X)  # after trim

    # 1. Make grid and discretize the data
    if gridsize is None:
        gridsize = np.max((nobs, 512.))
    gridsize = 2 ** np.ceil(np.log2(gridsize))  # round to next power of 2

    a = np.min(X) - cut * bw
    b = np.max(X) + cut * bw
    grid, delta = np.linspace(a, b, int(gridsize), retstep=True)
    RANGE = b - a

    # TODO: Fix this?
    # This is the Silverman binning function, but I believe it's buggy (SS)
    # weighting according to Silverman
    # count = counts(X, grid)
    # binned = np.zeros_like(grid)  # xi_{k} in Silverman
    # j = 0
    # for k in range(int(gridsize - 1)):
    #     if count[k] > 0:  # there are points of X in the grid here
    #         Xingrid = X[j:j + count[k]]  # get all these points
    #         # get weights at grid[k], grid[k+1]
    #         binned[k] += np.sum(grid[k + 1] - Xingrid)
    #         binned[k + 1] += np.sum(Xingrid - grid[k])
    #         j += count[k]
    # binned /= (nobs) * delta**2  # normalize binned to sum to 1/delta

    # NOTE: THE ABOVE IS WRONG, JUST TRY WITH LINEAR BINNING
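    # Linear binning (Fan and Marron 1994): each observation splits its unit
    # weight between the two neighbouring grid points in proportion to its
    # distance from each; dividing by (delta * nobs) turns the bin counts
    # into an approximate density on the grid.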
    binned = fast_linbin(X, a, b, gridsize) / (delta * nobs)

    # step 2 compute FFT of the weights, using Munro (1976) FFT convention
    y = forrt(binned)

    # step 3 and 4 for optimal bw compute zstar and the density estimate f
    # do not have to redo the above if just changing bw, i.e., for cross val

    # NOTE: silverman_transform is the closed form solution of the FFT of the
    # gaussian kernel. Not yet sure how to generalize it.
    zstar = silverman_transform(bw, gridsize, RANGE) * y  # 3.49 in Silverman
    # 3.50 w Gaussian kernel
    f = revrt(zstar)
    if retgrid:
        return f, grid, bw
    else:
        return f, bw

if __name__ == "__main__":
    import numpy as np
    np.random.seed(12345)
    xi = np.random.randn(100)
    f, grid, bw1 = kdensity(xi, kernel="gau", bw=.372735, retgrid=True)
    f2, bw2 = kdensityfft(xi, kernel="gau", bw="silverman", retgrid=False)

    # do some checking vs. silverman algo.
    # you need denes.f, http://lib.stat.cmu.edu/apstat/176
    # NOTE: I (SS) made some changes to the Fortran
    # and the FFT stuff from Munro http://lib.stat.cmu.edu/apstat/97o
    # then compile everything and link to denest with f2py
    # Make pyf file as usual, then compile shared object
    # f2py denest.f -m denest2 -h denest.pyf
    # edit pyf
    # -c flag makes it available to other programs, fPIC builds a shared library
    # /usr/bin/gfortran -Wall -c -fPIC fft.f
    # f2py -c denest.pyf ./fft.o denest.f

    try:
        from denest2 import denest  # @UnresolvedImport
        a = -3.4884382032045504
        b = 4.3671504686785605
        RANGE = b - a
        bw = bandwidths.bw_silverman(xi)

        ft, smooth, ifault, weights, smooth1 = denest(xi, a, b, bw,
                                                      np.zeros(512), np.zeros(512),
                                                      0, np.zeros(512), np.zeros(512))
        # We use a different binning algo, so only accurate up to 3 decimal places
        np.testing.assert_almost_equal(f2, smooth, 3)
        # NOTE: for debugging
        # y2 = forrt(weights)
        # RJ = np.arange(512/2 + 1)
        # FAC1 = 2*(np.pi*bw/RANGE)**2
        # RJFAC = RJ**2*FAC1
        # BC = 1 - RJFAC/(6*(bw/((b - a)/M))**2)
        # FAC = np.exp(-RJFAC)/BC
        # SMOOTH = np.r_[FAC, FAC[1:-1]] * y2

        # dens = revrt(SMOOTH)

    except Exception:
        # ft = np.loadtxt('./ft_silver.csv')
        # smooth = np.loadtxt('./smooth_silver.csv')
        print("Did not get the estimates from the Silverman algorithm")