1"""
2Module containing the base object for multivariate kernel density and
3regression, plus some utilities.
4"""
5import copy
7import numpy as np
8from scipy import optimize
9from scipy.stats.mstats import mquantiles
11try:
12 import joblib
13 has_joblib = True
14except ImportError:
15 has_joblib = False
17from . import kernels


kernel_func = dict(wangryzin=kernels.wang_ryzin,
                   aitchisonaitken=kernels.aitchison_aitken,
                   gaussian=kernels.gaussian,
                   aitchison_aitken_reg=kernels.aitchison_aitken_reg,
                   wangryzin_reg=kernels.wang_ryzin_reg,
                   gauss_convolution=kernels.gaussian_convolution,
                   wangryzin_convolution=kernels.wang_ryzin_convolution,
                   aitchisonaitken_convolution=kernels.aitchison_aitken_convolution,
                   gaussian_cdf=kernels.gaussian_cdf,
                   aitchisonaitken_cdf=kernels.aitchison_aitken_cdf,
                   wangryzin_cdf=kernels.wang_ryzin_cdf,
                   d_gaussian=kernels.d_gaussian,
                   tricube=kernels.tricube)


def _compute_min_std_IQR(data):
    """Compute minimum of std and IQR for each variable.
    """
    s1 = np.std(data, axis=0)
    q75 = mquantiles(data, 0.75, axis=0).data[0]
    q25 = mquantiles(data, 0.25, axis=0).data[0]
    s2 = (q75 - q25) / 1.349  # IQR
    dispersion = np.minimum(s1, s2)
    return dispersion


def _compute_subset(class_type, data, bw, co, do, n_cvars, ix_ord,
                    ix_unord, n_sub, class_vars, randomize, bound):
    """Compute bw on a subset of the data.

    Called from ``GenericKDE._compute_efficient``.

    Notes
    -----
    Needs to be outside the class in order for joblib to be able to pickle it.
    """
    if randomize:
        np.random.shuffle(data)
        sub_data = data[:n_sub, :]
    else:
        sub_data = data[bound[0]:bound[1], :]

    if class_type == 'KDEMultivariate':
        from .kernel_density import KDEMultivariate
        var_type = class_vars[0]
        sub_model = KDEMultivariate(sub_data, var_type, bw=bw,
                                    defaults=EstimatorSettings(efficient=False))
    elif class_type == 'KDEMultivariateConditional':
        from .kernel_density import KDEMultivariateConditional
        k_dep, dep_type, indep_type = class_vars
        endog = sub_data[:, :k_dep]
        exog = sub_data[:, k_dep:]
        sub_model = KDEMultivariateConditional(endog, exog, dep_type,
            indep_type, bw=bw, defaults=EstimatorSettings(efficient=False))
    elif class_type == 'KernelReg':
        from .kernel_regression import KernelReg
        var_type, k_vars, reg_type = class_vars
        endog = _adjust_shape(sub_data[:, 0], 1)
        exog = _adjust_shape(sub_data[:, 1:], k_vars)
        sub_model = KernelReg(endog=endog, exog=exog, reg_type=reg_type,
                              var_type=var_type, bw=bw,
                              defaults=EstimatorSettings(efficient=False))
    else:
        raise ValueError("class_type not recognized, should be one of "
                         "{KDEMultivariate, KDEMultivariateConditional, "
                         "KernelReg}")

    # Compute dispersion of the subset (for KernelReg the first column
    # holds the dependent variable, so it is excluded)
    if class_type == 'KernelReg':
        sub_data = sub_data[:, 1:]
    dispersion = _compute_min_std_IQR(sub_data)

    fct = dispersion * n_sub**(-1. / (n_cvars + co))
    fct[ix_unord] = n_sub**(-2. / (n_cvars + do))
    fct[ix_ord] = n_sub**(-2. / (n_cvars + do))
    sample_scale_sub = sub_model.bw / fct  # TODO: check if correct
    bw_sub = sub_model.bw
    return sample_scale_sub, bw_sub


class GenericKDE(object):
    """
    Base class for density estimation and regression KDE classes.
    """
    def _compute_bw(self, bw):
        """
        Computes the bandwidth of the data.

        Parameters
        ----------
        bw : {array_like, str}
            If array_like: user-specified bandwidth.
            If a string, should be one of:

                - cv_ml: cross validation maximum likelihood
                - normal_reference: normal reference rule of thumb
                - cv_ls: cross validation least squares

        Notes
        -----
        The default value for bw is 'normal_reference'.
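
        Examples
        --------
        A hedged usage sketch on a concrete subclass such as
        ``KDEMultivariate`` (random data, so the estimated values vary):

        >>> import numpy as np
        >>> from statsmodels.nonparametric.kernel_density import KDEMultivariate
        >>> data = np.random.normal(size=(200, 2))
        >>> kde = KDEMultivariate(data, var_type='cc', bw='normal_reference')
        >>> kde.bw.shape
        (2,)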
120 """
121 if bw is None:
122 bw = 'normal_reference'
124 if not isinstance(bw, str):
125 self._bw_method = "user-specified"
126 res = np.asarray(bw)
127 else:
128 # The user specified a bandwidth selection method
129 self._bw_method = bw
130 # Workaround to avoid instance methods in __dict__
131 if bw == 'normal_reference':
132 bwfunc = self._normal_reference
133 elif bw == 'cv_ml':
134 bwfunc = self._cv_ml
135 else: # bw == 'cv_ls'
136 bwfunc = self._cv_ls
137 res = bwfunc()
139 return res

    def _compute_dispersion(self, data):
        """
        Computes the measure of dispersion.

        The minimum of the standard deviation and the interquartile range
        divided by 1.349.

        Notes
        -----
        Reimplemented in `KernelReg`, because the first column of `data` has
        to be removed.

        References
        ----------
        See the user guide for the np package in R.  In the notes on the
        bwscaling option in npreg, npudens and npcdens there is a discussion
        of this measure of dispersion.
        """
        return _compute_min_std_IQR(data)

    def _get_class_vars_type(self):
        """Helper method to be able to pass needed vars to _compute_subset.

        Needs to be implemented by subclasses."""
        pass

    def _compute_efficient(self, bw):
        """
        Computes the bandwidth by estimating the scaling factor (c)
        in n_res resamples of size ``n_sub`` (in `randomize` case), or by
        dividing ``nobs`` into as many ``n_sub`` blocks as needed (if
        `randomize` is False).

        References
        ----------
        See p.9 in socserv.mcmaster.ca/racine/np_faq.pdf
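
        Examples
        --------
        A hedged usage sketch: efficient estimation is requested through
        ``EstimatorSettings`` on a subclass (``data`` and ``var_type`` as in
        the ``EstimatorSettings`` example below):

        >>> settings = EstimatorSettings(efficient=True, n_sub=100)
        >>> kde = KDEMultivariate(data, var_type, defaults=settings)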
176 """
178 if bw is None:
179 self._bw_method = 'normal_reference'
180 if isinstance(bw, str):
181 self._bw_method = bw
182 else:
183 self._bw_method = "user-specified"
184 return bw
186 nobs = self.nobs
187 n_sub = self.n_sub
188 data = copy.deepcopy(self.data)
189 n_cvars = self.data_type.count('c')
190 co = 4 # 2*order of continuous kernel
191 do = 4 # 2*order of discrete kernel
192 _, ix_ord, ix_unord = _get_type_pos(self.data_type)
194 # Define bounds for slicing the data
195 if self.randomize:
196 # randomize chooses blocks of size n_sub, independent of nobs
197 bounds = [None] * self.n_res
198 else:
199 bounds = [(i * n_sub, (i+1) * n_sub) for i in range(nobs // n_sub)]
200 if nobs % n_sub > 0:
201 bounds.append((nobs - nobs % n_sub, nobs))
203 n_blocks = self.n_res if self.randomize else len(bounds)
204 sample_scale = np.empty((n_blocks, self.k_vars))
205 only_bw = np.empty((n_blocks, self.k_vars))
207 class_type, class_vars = self._get_class_vars_type()
208 if has_joblib:
209 # `res` is a list of tuples (sample_scale_sub, bw_sub)
210 res = joblib.Parallel(n_jobs=self.n_jobs)(
211 joblib.delayed(_compute_subset)(
212 class_type, data, bw, co, do, n_cvars, ix_ord, ix_unord, \
213 n_sub, class_vars, self.randomize, bounds[i]) \
214 for i in range(n_blocks))
215 else:
216 res = []
217 for i in range(n_blocks):
218 res.append(_compute_subset(class_type, data, bw, co, do,
219 n_cvars, ix_ord, ix_unord, n_sub,
220 class_vars, self.randomize,
221 bounds[i]))
223 for i in range(n_blocks):
224 sample_scale[i, :] = res[i][0]
225 only_bw[i, :] = res[i][1]
227 s = self._compute_dispersion(data)
228 order_func = np.median if self.return_median else np.mean
229 m_scale = order_func(sample_scale, axis=0)
230 # TODO: Check if 1/5 is correct in line below!
231 bw = m_scale * s * nobs**(-1. / (n_cvars + co))
232 bw[ix_ord] = m_scale[ix_ord] * nobs**(-2./ (n_cvars + do))
233 bw[ix_unord] = m_scale[ix_unord] * nobs**(-2./ (n_cvars + do))
235 if self.return_only_bw:
236 bw = np.median(only_bw, axis=0)
238 return bw

    def _set_defaults(self, defaults):
        """Sets the default values for the efficient estimation"""
        self.n_res = defaults.n_res
        self.n_sub = defaults.n_sub
        self.randomize = defaults.randomize
        self.return_median = defaults.return_median
        self.efficient = defaults.efficient
        self.return_only_bw = defaults.return_only_bw
        self.n_jobs = defaults.n_jobs

    def _normal_reference(self):
        r"""
        Returns Scott's normal reference rule of thumb bandwidth parameter.

        Notes
        -----
        See p.13 in [2] for an example and discussion.  The formula for the
        bandwidth of variable ``i`` is

        .. math:: h_i = 1.06 \sigma_i n^{-1/(4+q)}

        where :math:`\sigma_i` is the standard deviation of variable ``i``,
        ``n`` is the number of observations and ``q`` is the number of
        variables.
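
        Examples
        --------
        A worked doctest of the rule for a single variable (q = 1):

        >>> import numpy as np
        >>> x = np.arange(100.).reshape(100, 1)
        >>> h = 1.06 * np.std(x, axis=0) * 100 ** (-1. / (4 + 1))
        >>> round(float(h[0]), 2)
        12.18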
263 """
264 X = np.std(self.data, axis=0)
265 return 1.06 * X * self.nobs ** (- 1. / (4 + self.data.shape[1]))

    def _set_bw_bounds(self, bw):
        """
        Sets the bandwidth lower bound to effectively zero (1e-10), and for
        discrete variables sets the upper bound to 1.
        """
        bw[bw < 0] = 1e-10
        _, ix_ord, ix_unord = _get_type_pos(self.data_type)
        bw[ix_ord] = np.minimum(bw[ix_ord], 1.)
        bw[ix_unord] = np.minimum(bw[ix_unord], 1.)

        return bw

    def _cv_ml(self):
        r"""
        Returns the cross validation maximum likelihood bandwidth parameter.

        Notes
        -----
        For more details see p.16, 18, 27 in Ref. [1] (see module docstring).

        Returns the bandwidth estimate that maximizes the leave-one-out
        likelihood.  The leave-one-out log likelihood function is:

        .. math:: \ln L = \sum_{i=1}^{n} \ln f_{-i}(X_{i})

        The leave-one-out kernel estimator of :math:`f_{-i}` is:

        .. math:: f_{-i}(X_{i}) = \frac{1}{(n-1)h}
                    \sum_{j=1, j \neq i} K_{h}(X_{i}, X_{j})

        where :math:`K_{h}` represents the generalized product kernel
        estimator:

        .. math:: K_{h}(X_{i}, X_{j}) = \prod_{s=1}^{q} h_{s}^{-1}
                    k\left(\frac{X_{is} - X_{js}}{h_{s}}\right)
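
        Examples
        --------
        A hedged usage sketch on a subclass (``data`` as in the
        ``EstimatorSettings`` example below):

        >>> kde = KDEMultivariate(data, var_type='c', bw='cv_ml')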
302 """
303 # the initial value for the optimization is the normal_reference
304 h0 = self._normal_reference()
305 bw = optimize.fmin(self.loo_likelihood, x0=h0, args=(np.log, ),
306 maxiter=1e3, maxfun=1e3, disp=0, xtol=1e-3)
307 bw = self._set_bw_bounds(bw) # bound bw if necessary
308 return bw

    def _cv_ls(self):
        r"""
        Returns the cross-validation least squares bandwidth parameter(s).

        Notes
        -----
        For more details see pp. 16, 27 in Ref. [1] (see module docstring).

        Returns the value of the bandwidth that minimizes the integrated mean
        square error between the estimated and actual distribution.  The
        integrated mean square error (IMSE) is given by:

        .. math:: \int \left[ \hat{f}(x) - f(x) \right]^{2} dx

        This is the general formula for the IMSE.  The IMSE differs for
        conditional (``KDEMultivariateConditional``) and unconditional
        (``KDEMultivariate``) kernel density estimation.
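
        Examples
        --------
        A hedged usage sketch on a subclass (``data`` as in the
        ``EstimatorSettings`` example below):

        >>> kde = KDEMultivariate(data, var_type='c', bw='cv_ls')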
327 """
328 h0 = self._normal_reference()
329 bw = optimize.fmin(self.imse, x0=h0, maxiter=1e3, maxfun=1e3, disp=0,
330 xtol=1e-3)
331 bw = self._set_bw_bounds(bw) # bound bw if necessary
332 return bw

    def loo_likelihood(self):
        raise NotImplementedError


class EstimatorSettings(object):
    """
    Object to specify settings for density estimation or regression.

    `EstimatorSettings` has several properties related to how bandwidth
    estimation for the `KDEMultivariate`, `KDEMultivariateConditional`,
    `KernelReg` and `CensoredKernelReg` classes behaves.

    Parameters
    ----------
    efficient : bool, optional
        If True, the bandwidth estimation is to be performed
        efficiently -- by taking smaller sub-samples and estimating
        the scaling factor of each subsample.  This is useful for large
        samples (nobs >> 300) and/or multiple variables (k_vars > 3).
        If False (default), all data is used at the same time.
    randomize : bool, optional
        If True, the bandwidth estimation is to be performed by
        taking `n_res` random resamples (with replacement) of size `n_sub`
        from the full sample.  If set to False (default), the estimation is
        performed by slicing the full sample in sub-samples of size `n_sub`
        so that all samples are used once.
    n_sub : int, optional
        Size of the sub-samples.  Default is 50.
    n_res : int, optional
        The number of random re-samples used to estimate the bandwidth.
        Only has an effect if ``randomize == True``.  Default value is 25.
    return_median : bool, optional
        If True (default), the estimator uses the median of all scaling
        factors for each sub-sample to estimate the bandwidth of the full
        sample.  If False, the estimator uses the mean.
    return_only_bw : bool, optional
        If True, the estimator is to use the bandwidth and not the
        scaling factor.  This is *not* theoretically justified.
        Should be used only for experimenting.
    n_jobs : int, optional
        The number of jobs to use for parallel estimation with
        ``joblib.Parallel``.  Default is -1, meaning that all available CPU
        cores are used.  See the `joblib documentation
        <https://pythonhosted.org/joblib/parallel.html>`_ for more details.

    Examples
    --------
    >>> settings = EstimatorSettings(randomize=True, n_jobs=3)
    >>> k_dens = KDEMultivariate(data, var_type, defaults=settings)
    """
    def __init__(self, efficient=False, randomize=False, n_res=25, n_sub=50,
                 return_median=True, return_only_bw=False, n_jobs=-1):
        self.efficient = efficient
        self.randomize = randomize
        self.n_res = n_res
        self.n_sub = n_sub
        self.return_median = return_median
        self.return_only_bw = return_only_bw  # TODO: remove this?
        self.n_jobs = n_jobs


class LeaveOneOut(object):
    """
    Generator to give leave-one-out views on X.

    Parameters
    ----------
    X : array_like
        2-D array.

    Examples
    --------
    >>> X = np.random.normal(0, 1, [10, 2])
    >>> loo = LeaveOneOut(X)
    >>> for x in loo:
    ...     print(x)

    Notes
    -----
    A little lighter weight than sklearn LOO.  We do not need the test index.
    Also passes views on X, not the index.
    """
    def __init__(self, X):
        self.X = np.asarray(X)

    def __iter__(self):
        X = self.X
        nobs, k_vars = np.shape(X)

        for i in range(nobs):
            index = np.ones(nobs, dtype=bool)
            index[i] = False
            yield X[index, :]


def _get_type_pos(var_type):
    """Return boolean masks for continuous ('c'), ordered ('o') and
    unordered ('u') variables in `var_type`."""
    ix_cont = np.array([c == 'c' for c in var_type])
    ix_ord = np.array([c == 'o' for c in var_type])
    ix_unord = np.array([c == 'u' for c in var_type])
    return ix_cont, ix_ord, ix_unord


def _adjust_shape(dat, k_vars):
    """Returns an array of shape (nobs, k_vars) for use with `gpke`.
    """
    dat = np.asarray(dat)
    if dat.ndim > 2:
        dat = np.squeeze(dat)
    if dat.ndim == 1 and k_vars > 1:  # one obs, many vars
        nobs = 1
    elif dat.ndim == 1 and k_vars == 1:  # many obs, one var
        nobs = len(dat)
    else:
        if np.shape(dat)[0] == k_vars and np.shape(dat)[1] != k_vars:
            dat = dat.T

        nobs = np.shape(dat)[0]  # ndim > 1, so many obs, many vars

    dat = np.reshape(dat, (nobs, k_vars))
    return dat


def gpke(bw, data, data_predict, var_type, ckertype='gaussian',
         okertype='wangryzin', ukertype='aitchisonaitken', tosum=True):
    r"""
    Returns the non-normalized Generalized Product Kernel Estimator

    Parameters
    ----------
    bw : 1-D ndarray
        The user-specified bandwidth parameters.
    data : 1-D or 2-D ndarray
        The training data.
    data_predict : 1-D ndarray
        The evaluation points at which the kernel estimation is performed.
    var_type : str
        The variable type (continuous, ordered, unordered).
    ckertype : str, optional
        The kernel used for the continuous variables.
    okertype : str, optional
        The kernel used for the ordered discrete variables.
    ukertype : str, optional
        The kernel used for the unordered discrete variables.
    tosum : bool, optional
        Whether or not to sum the calculated array of densities.  Default is
        True.

    Returns
    -------
    dens : array_like
        The generalized product kernel density estimator.

    Notes
    -----
    The formula for the multivariate kernel estimator for the pdf is:

    .. math:: f(x) = \frac{1}{n h_{1} \cdots h_{q}} \sum_{i=1}^{n}
                        K\left(\frac{X_{i}-x}{h}\right)

    where

    .. math:: K\left(\frac{X_{i}-x}{h}\right) =
                k\left(\frac{X_{i1}-x_{1}}{h_{1}}\right) \times
                k\left(\frac{X_{i2}-x_{2}}{h_{2}}\right) \times \cdots \times
                k\left(\frac{X_{iq}-x_{q}}{h_{q}}\right)
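
    Examples
    --------
    A hedged sketch evaluating the estimator at a single point (random data,
    so the value itself varies):

    >>> import numpy as np
    >>> data = np.random.normal(size=(50, 2))
    >>> bw = np.array([0.5, 0.3])
    >>> dens = gpke(bw, data, data_predict=data[0, :], var_type='cc')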
499 """
500 kertypes = dict(c=ckertype, o=okertype, u=ukertype)
501 #Kval = []
502 #for ii, vtype in enumerate(var_type):
503 # func = kernel_func[kertypes[vtype]]
504 # Kval.append(func(bw[ii], data[:, ii], data_predict[ii]))
506 #Kval = np.column_stack(Kval)
508 Kval = np.empty(data.shape)
509 for ii, vtype in enumerate(var_type):
510 func = kernel_func[kertypes[vtype]]
511 Kval[:, ii] = func(bw[ii], data[:, ii], data_predict[ii])
513 iscontinuous = np.array([c == 'c' for c in var_type])
514 dens = Kval.prod(axis=1) / np.prod(bw[iscontinuous])
515 if tosum:
516 return dens.sum(axis=0)
517 else:
518 return dens