1""" 

2Module containing the base object for multivariate kernel density and 

3regression, plus some utilities. 

4""" 

import copy

import numpy as np
from scipy import optimize
from scipy.stats.mstats import mquantiles

try:
    import joblib
    has_joblib = True
except ImportError:
    has_joblib = False

from . import kernels


kernel_func = dict(wangryzin=kernels.wang_ryzin,
                   aitchisonaitken=kernels.aitchison_aitken,
                   gaussian=kernels.gaussian,
                   aitchison_aitken_reg=kernels.aitchison_aitken_reg,
                   wangryzin_reg=kernels.wang_ryzin_reg,
                   gauss_convolution=kernels.gaussian_convolution,
                   wangryzin_convolution=kernels.wang_ryzin_convolution,
                   aitchisonaitken_convolution=kernels.aitchison_aitken_convolution,
                   gaussian_cdf=kernels.gaussian_cdf,
                   aitchisonaitken_cdf=kernels.aitchison_aitken_cdf,
                   wangryzin_cdf=kernels.wang_ryzin_cdf,
                   d_gaussian=kernels.d_gaussian,
                   tricube=kernels.tricube)


def _compute_min_std_IQR(data):
    """Compute the minimum of the std and normalized IQR for each variable."""
    s1 = np.std(data, axis=0)
    q75 = mquantiles(data, 0.75, axis=0).data[0]
    q25 = mquantiles(data, 0.25, axis=0).data[0]
    s2 = (q75 - q25) / 1.349  # IQR normalized to estimate sigma for normal data
    dispersion = np.minimum(s1, s2)
    return dispersion
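
# Quick illustrative check (editor's sketch, not part of the library API):
# for draws from a standard normal, the std and IQR/1.349 both estimate
# sigma = 1, so the dispersion measure is close to 1 in every column.
#
#     >>> x = np.random.RandomState(0).normal(size=(10000, 3))
#     >>> _compute_min_std_IQR(x)  # approximately array([1., 1., 1.])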


def _compute_subset(class_type, data, bw, co, do, n_cvars, ix_ord,
                    ix_unord, n_sub, class_vars, randomize, bound):
    """Compute bw on subset of data.

    Called from ``GenericKDE._compute_efficient_*``.

    Notes
    -----
    Needs to be outside the class in order for joblib to be able to pickle it.
    """
    if randomize:
        np.random.shuffle(data)
        sub_data = data[:n_sub, :]
    else:
        sub_data = data[bound[0]:bound[1], :]

    if class_type == 'KDEMultivariate':
        from .kernel_density import KDEMultivariate
        var_type = class_vars[0]
        sub_model = KDEMultivariate(sub_data, var_type, bw=bw,
                                    defaults=EstimatorSettings(efficient=False))
    elif class_type == 'KDEMultivariateConditional':
        from .kernel_density import KDEMultivariateConditional
        k_dep, dep_type, indep_type = class_vars
        endog = sub_data[:, :k_dep]
        exog = sub_data[:, k_dep:]
        sub_model = KDEMultivariateConditional(
            endog, exog, dep_type, indep_type, bw=bw,
            defaults=EstimatorSettings(efficient=False))
    elif class_type == 'KernelReg':
        from .kernel_regression import KernelReg
        var_type, k_vars, reg_type = class_vars
        endog = _adjust_shape(sub_data[:, 0], 1)
        exog = _adjust_shape(sub_data[:, 1:], k_vars)
        sub_model = KernelReg(endog=endog, exog=exog, reg_type=reg_type,
                              var_type=var_type, bw=bw,
                              defaults=EstimatorSettings(efficient=False))
    else:
        raise ValueError("class_type not recognized, should be one of "
                         "{KDEMultivariate, KDEMultivariateConditional, "
                         "KernelReg}")

    # Compute the dispersion of the sub-sample; for regression the first
    # column holds the dependent variable and is excluded.
    if class_type == 'KernelReg':
        sub_data = sub_data[:, 1:]

    dispersion = _compute_min_std_IQR(sub_data)

    fct = dispersion * n_sub**(-1. / (n_cvars + co))
    fct[ix_unord] = n_sub**(-2. / (n_cvars + do))
    fct[ix_ord] = n_sub**(-2. / (n_cvars + do))
    sample_scale_sub = sub_model.bw / fct  # TODO: check if correct
    bw_sub = sub_model.bw
    return sample_scale_sub, bw_sub


class GenericKDE(object):
    """
    Base class for density estimation and regression KDE classes.
    """

    def _compute_bw(self, bw):
        """
        Computes the bandwidth of the data.

        Parameters
        ----------
        bw : {array_like, str}
            If array_like: user-specified bandwidth.
            If a string, should be one of:

                - cv_ml: cross validation maximum likelihood
                - normal_reference: normal reference rule of thumb
                - cv_ls: cross validation least squares

        Notes
        -----
        The default value for `bw` is 'normal_reference'.
        """
        if bw is None:
            bw = 'normal_reference'

        if not isinstance(bw, str):
            self._bw_method = "user-specified"
            res = np.asarray(bw)
        else:
            # The user specified a bandwidth selection method
            self._bw_method = bw
            # Workaround to avoid instance methods in __dict__
            if bw == 'normal_reference':
                bwfunc = self._normal_reference
            elif bw == 'cv_ml':
                bwfunc = self._cv_ml
            else:  # bw == 'cv_ls'
                bwfunc = self._cv_ls
            res = bwfunc()

        return res
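
    # Sketch of how `_compute_bw` is called by the subclasses (illustrative
    # only; `self` stands for any fitted subclass instance):
    #
    #     self._compute_bw(None)         # defaults to 'normal_reference'
    #     self._compute_bw('cv_ml')      # cross-validation maximum likelihood
    #     self._compute_bw([0.1, 0.2])   # user-specified, returned as ndarray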

    def _compute_dispersion(self, data):
        """
        Computes the measure of dispersion.

        The minimum of the standard deviation and interquartile range / 1.349.

        Notes
        -----
        Reimplemented in `KernelReg`, because the first column of `data` has
        to be removed.

        References
        ----------
        See the user guide for the np package in R.
        In the notes on the bwscaling option in npreg, npudens and npcdens
        there is a discussion on the measure of dispersion.
        """
        return _compute_min_std_IQR(data)

    def _get_class_vars_type(self):
        """Helper method to be able to pass needed vars to _compute_subset.

        Needs to be implemented by subclasses."""
        pass

    def _compute_efficient(self, bw):
        """
        Computes the bandwidth by estimating the scaling factor (c)
        in n_res resamples of size ``n_sub`` (in `randomize` case), or by
        dividing ``nobs`` into as many ``n_sub`` blocks as needed (if
        `randomize` is False).

        References
        ----------
        See p.9 in socserv.mcmaster.ca/racine/np_faq.pdf
        """
        if bw is None:
            self._bw_method = 'normal_reference'
        elif isinstance(bw, str):
            self._bw_method = bw
        else:
            self._bw_method = "user-specified"
            return bw

        nobs = self.nobs
        n_sub = self.n_sub
        data = copy.deepcopy(self.data)
        n_cvars = self.data_type.count('c')
        co = 4  # 2 * order of continuous kernel
        do = 4  # 2 * order of discrete kernel
        _, ix_ord, ix_unord = _get_type_pos(self.data_type)

        # Define bounds for slicing the data
        if self.randomize:
            # randomize chooses blocks of size n_sub, independent of nobs
            bounds = [None] * self.n_res
        else:
            bounds = [(i * n_sub, (i + 1) * n_sub)
                      for i in range(nobs // n_sub)]
            if nobs % n_sub > 0:
                bounds.append((nobs - nobs % n_sub, nobs))

        n_blocks = self.n_res if self.randomize else len(bounds)
        sample_scale = np.empty((n_blocks, self.k_vars))
        only_bw = np.empty((n_blocks, self.k_vars))

        class_type, class_vars = self._get_class_vars_type()
        if has_joblib:
            # `res` is a list of tuples (sample_scale_sub, bw_sub)
            res = joblib.Parallel(n_jobs=self.n_jobs)(
                joblib.delayed(_compute_subset)(
                    class_type, data, bw, co, do, n_cvars, ix_ord, ix_unord,
                    n_sub, class_vars, self.randomize, bounds[i])
                for i in range(n_blocks))
        else:
            res = []
            for i in range(n_blocks):
                res.append(_compute_subset(class_type, data, bw, co, do,
                                           n_cvars, ix_ord, ix_unord, n_sub,
                                           class_vars, self.randomize,
                                           bounds[i]))

        for i in range(n_blocks):
            sample_scale[i, :] = res[i][0]
            only_bw[i, :] = res[i][1]

        s = self._compute_dispersion(data)
        order_func = np.median if self.return_median else np.mean
        m_scale = order_func(sample_scale, axis=0)
        # TODO: Check if 1/5 is correct in line below!
        bw = m_scale * s * nobs**(-1. / (n_cvars + co))
        bw[ix_ord] = m_scale[ix_ord] * nobs**(-2. / (n_cvars + do))
        bw[ix_unord] = m_scale[ix_unord] * nobs**(-2. / (n_cvars + do))

        if self.return_only_bw:
            bw = np.median(only_bw, axis=0)

        return bw

    def _set_defaults(self, defaults):
        """Sets the default values for the efficient estimation."""
        self.n_res = defaults.n_res
        self.n_sub = defaults.n_sub
        self.randomize = defaults.randomize
        self.return_median = defaults.return_median
        self.efficient = defaults.efficient
        self.return_only_bw = defaults.return_only_bw
        self.n_jobs = defaults.n_jobs

    def _normal_reference(self):
        r"""
        Returns Scott's normal reference rule of thumb bandwidth parameter.

        Notes
        -----
        See p.13 in [2] for an example and discussion.  The formula for the
        bandwidth is

        .. math:: h = 1.06 \sigma n^{-1/(4+q)}

        where :math:`\sigma` is the standard deviation of the variable,
        ``n`` is the number of observations and ``q`` is the number of
        variables.
        """
        X = np.std(self.data, axis=0)
        return 1.06 * X * self.nobs ** (- 1. / (4 + self.data.shape[1]))
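
    # Scott's rule written out directly (editor's sketch): for an (nobs, q)
    # array `x` this reproduces `_normal_reference`, one bandwidth per column:
    #
    #     bw_scott = 1.06 * np.std(x, axis=0) \
    #         * len(x) ** (-1. / (4 + x.shape[1]))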

    def _set_bw_bounds(self, bw):
        """
        Sets the bandwidth lower bound to effectively zero (1e-10), and for
        discrete variables the upper bound to 1.
        """
        bw[bw < 0] = 1e-10
        _, ix_ord, ix_unord = _get_type_pos(self.data_type)
        bw[ix_ord] = np.minimum(bw[ix_ord], 1.)
        bw[ix_unord] = np.minimum(bw[ix_unord], 1.)

        return bw

    def _cv_ml(self):
        r"""
        Returns the cross validation maximum likelihood bandwidth parameter.

        Notes
        -----
        For more details see p.16, 18, 27 in Ref. [1] (see module docstring).

        Returns the bandwidth estimate that maximizes the leave-one-out
        likelihood.  The leave-one-out log likelihood function is:

        .. math:: \ln L = \sum_{i=1}^{n} \ln f_{-i}(X_{i})

        The leave-one-out kernel estimator of :math:`f_{-i}` is:

        .. math:: f_{-i}(X_{i}) = \frac{1}{(n-1)h}
                  \sum_{j=1,j\neq i} K_{h}(X_{i}, X_{j})

        where :math:`K_{h}` represents the generalized product kernel
        estimator:

        .. math:: K_{h}(X_{i}, X_{j}) = \prod_{s=1}^{q}
                  h_{s}^{-1} k\left(\frac{X_{is}-X_{js}}{h_{s}}\right)
        """
        # the initial value for the optimization is the normal_reference bw
        h0 = self._normal_reference()
        bw = optimize.fmin(self.loo_likelihood, x0=h0, args=(np.log, ),
                           maxiter=1e3, maxfun=1e3, disp=0, xtol=1e-3)
        bw = self._set_bw_bounds(bw)  # bound bw if necessary
        return bw

    def _cv_ls(self):
        r"""
        Returns the cross-validation least squares bandwidth parameter(s).

        Notes
        -----
        For more details see pp. 16, 27 in Ref. [1] (see module docstring).

        Returns the value of the bandwidth that minimizes the integrated mean
        square error between the estimated and actual distribution.  The
        integrated mean square error (IMSE) is given by:

        .. math:: \int \left[\hat{f}(x) - f(x)\right]^{2} dx

        This is the general formula for the IMSE.  The IMSE differs for
        conditional (``KDEMultivariateConditional``) and unconditional
        (``KDEMultivariate``) kernel density estimation.
        """
        h0 = self._normal_reference()
        bw = optimize.fmin(self.imse, x0=h0, maxiter=1e3, maxfun=1e3, disp=0,
                           xtol=1e-3)
        bw = self._set_bw_bounds(bw)  # bound bw if necessary
        return bw
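
    # Note on the two cross-validation routines above: `optimize.fmin` is a
    # minimizer, so `_cv_ls` minimizes the IMSE criterion directly, while
    # `loo_likelihood` is expected to be defined by the subclasses so that
    # its minimum corresponds to the maximum leave-one-out likelihood (e.g.
    # by returning the negative log-likelihood).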

    def loo_likelihood(self):
        raise NotImplementedError


class EstimatorSettings(object):
    """
    Object to specify settings for density estimation or regression.

    `EstimatorSettings` has several properties related to how bandwidth
    estimation for the `KDEMultivariate`, `KDEMultivariateConditional`,
    `KernelReg` and `CensoredKernelReg` classes behaves.

    Parameters
    ----------
    efficient : bool, optional
        If True, the bandwidth estimation is to be performed
        efficiently -- by taking smaller sub-samples and estimating
        the scaling factor of each subsample.  This is useful for large
        samples (nobs >> 300) and/or multiple variables (k_vars > 3).
        If False (default), all data is used at the same time.
    randomize : bool, optional
        If True, the bandwidth estimation is to be performed by
        taking `n_res` random resamples (with replacement) of size `n_sub`
        from the full sample.  If set to False (default), the estimation is
        performed by slicing the full sample in sub-samples of size `n_sub`
        so that all samples are used once.
    n_sub : int, optional
        Size of the sub-samples.  Default is 50.
    n_res : int, optional
        The number of random re-samples used to estimate the bandwidth.
        Only has an effect if ``randomize == True``.  Default value is 25.
    return_median : bool, optional
        If True (default), the estimator uses the median of all scaling
        factors for each sub-sample to estimate the bandwidth of the full
        sample.  If False, the estimator uses the mean.
    return_only_bw : bool, optional
        If True, the estimator is to use the bandwidth and not the
        scaling factor.  This is *not* theoretically justified.
        Should be used only for experimenting.
    n_jobs : int, optional
        The number of jobs to use for parallel estimation with
        ``joblib.Parallel``.  Default is -1, which uses all available CPU
        cores.  See the `joblib documentation
        <https://pythonhosted.org/joblib/parallel.html>`_ for more details.

    Examples
    --------
    >>> settings = EstimatorSettings(randomize=True, n_jobs=3)
    >>> k_dens = KDEMultivariate(data, var_type, defaults=settings)
    """
    def __init__(self, efficient=False, randomize=False, n_res=25, n_sub=50,
                 return_median=True, return_only_bw=False, n_jobs=-1):
        self.efficient = efficient
        self.randomize = randomize
        self.n_res = n_res
        self.n_sub = n_sub
        self.return_median = return_median
        self.return_only_bw = return_only_bw  # TODO: remove this?
        self.n_jobs = n_jobs


class LeaveOneOut(object):
    """
    Generator to give leave-one-out views on X.

    Parameters
    ----------
    X : array_like
        2-D array.

    Examples
    --------
    >>> X = np.random.normal(0, 1, [10, 2])
    >>> loo = LeaveOneOut(X)
    >>> for x in loo:
    ...     print(x)

    Notes
    -----
    A little lighter weight than sklearn LOO.  We do not need the test
    indices, and the reduced array itself is yielded rather than an index
    array.
    """
    def __init__(self, X):
        self.X = np.asarray(X)

    def __iter__(self):
        X = self.X
        nobs, k_vars = np.shape(X)

        for i in range(nobs):
            index = np.ones(nobs, dtype=bool)
            index[i] = False
            yield X[index, :]
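
# Usage note (illustrative): each iteration of `LeaveOneOut` yields an array
# of shape (nobs - 1, k_vars); for the 10 x 2 `X` in the docstring example,
# every `x` has shape (9, 2).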


def _get_type_pos(var_type):
    """Return boolean masks for the continuous ('c'), ordered ('o') and
    unordered ('u') variables in `var_type`."""
    ix_cont = np.array([c == 'c' for c in var_type])
    ix_ord = np.array([c == 'o' for c in var_type])
    ix_unord = np.array([c == 'u' for c in var_type])
    return ix_cont, ix_ord, ix_unord
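
# Example (editor's sketch): for var_type 'cou' the three masks select the
# continuous, ordered and unordered positions respectively:
#
#     >>> _get_type_pos('cou')
#     (array([ True, False, False]),
#      array([False,  True, False]),
#      array([False, False,  True]))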


def _adjust_shape(dat, k_vars):
    """Returns an array of shape (nobs, k_vars) for use with `gpke`."""
    dat = np.asarray(dat)
    if dat.ndim > 2:
        dat = np.squeeze(dat)
    if dat.ndim == 1 and k_vars > 1:  # one obs, many vars
        nobs = 1
    elif dat.ndim == 1 and k_vars == 1:  # many obs, one var
        nobs = len(dat)
    else:
        if np.shape(dat)[0] == k_vars and np.shape(dat)[1] != k_vars:
            dat = dat.T

        nobs = np.shape(dat)[0]  # ndim > 1, so many obs, many vars

    dat = np.reshape(dat, (nobs, k_vars))
    return dat
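
# Illustrative behavior (editor's sketch): a flat length-5 array is read as
# five observations of one variable when k_vars=1, but as a single
# observation of five variables when k_vars=5:
#
#     >>> _adjust_shape(np.arange(5), 1).shape
#     (5, 1)
#     >>> _adjust_shape(np.arange(5), 5).shape
#     (1, 5)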


def gpke(bw, data, data_predict, var_type, ckertype='gaussian',
         okertype='wangryzin', ukertype='aitchisonaitken', tosum=True):
    r"""
    Returns the non-normalized Generalized Product Kernel Estimator

    Parameters
    ----------
    bw : 1-D ndarray
        The user-specified bandwidth parameters.
    data : 1-D or 2-D ndarray
        The training data.
    data_predict : 1-D ndarray
        The evaluation points at which the kernel estimation is performed.
    var_type : str, optional
        The variable type (continuous, ordered, unordered).
    ckertype : str, optional
        The kernel used for the continuous variables.
    okertype : str, optional
        The kernel used for the ordered discrete variables.
    ukertype : str, optional
        The kernel used for the unordered discrete variables.
    tosum : bool, optional
        Whether or not to sum the calculated array of densities.  Default is
        True.

    Returns
    -------
    dens : array_like
        The generalized product kernel density estimator.

    Notes
    -----
    The formula for the multivariate kernel estimator for the pdf is:

    .. math:: f(x) = \frac{1}{nh_{1}...h_{q}} \sum_{i=1}^{n}
              K\left(\frac{X_{i}-x}{h}\right)

    where

    .. math:: K\left(\frac{X_{i}-x}{h}\right) =
              k\left(\frac{X_{i1}-x_{1}}{h_{1}}\right)\times
              k\left(\frac{X_{i2}-x_{2}}{h_{2}}\right)\times...\times
              k\left(\frac{X_{iq}-x_{q}}{h_{q}}\right)
    """
    kertypes = dict(c=ckertype, o=okertype, u=ukertype)
    #Kval = []
    #for ii, vtype in enumerate(var_type):
    #    func = kernel_func[kertypes[vtype]]
    #    Kval.append(func(bw[ii], data[:, ii], data_predict[ii]))

    #Kval = np.column_stack(Kval)

    Kval = np.empty(data.shape)
    for ii, vtype in enumerate(var_type):
        func = kernel_func[kertypes[vtype]]
        Kval[:, ii] = func(bw[ii], data[:, ii], data_predict[ii])

    iscontinuous = np.array([c == 'c' for c in var_type])
    dens = Kval.prod(axis=1) / np.prod(bw[iscontinuous])
    if tosum:
        return dens.sum(axis=0)
    else:
        return dens
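
# Minimal usage sketch for `gpke` (hypothetical data; bandwidths chosen
# arbitrarily).  With two continuous variables ('cc') this returns the
# non-normalized product-kernel sum at a single evaluation point; dividing
# by nobs gives the density estimate there:
#
#     >>> x = np.random.RandomState(0).normal(size=(500, 2))
#     >>> bw = np.array([0.3, 0.3])
#     >>> gpke(bw, data=x, data_predict=x[0, :], var_type='cc')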