1""" 

2Multivariate Conditional and Unconditional Kernel Density Estimation 

3with Mixed Data Types. 

4 

5References 

6---------- 

7[1] Racine, J., Li, Q. Nonparametric econometrics: theory and practice. 

8 Princeton University Press. (2007) 

9[2] Racine, Jeff. "Nonparametric Econometrics: A Primer," Foundation 

10 and Trends in Econometrics: Vol 3: No 1, pp1-88. (2008) 

11 http://dx.doi.org/10.1561/0800000009 

12[3] Racine, J., Li, Q. "Nonparametric Estimation of Distributions 

13 with Categorical and Continuous Data." Working Paper. (2000) 

14[4] Racine, J. Li, Q. "Kernel Estimation of Multivariate Conditional 

15 Distributions Annals of Economics and Finance 5, 211-235 (2004) 

16[5] Liu, R., Yang, L. "Kernel estimation of multivariate 

17 cumulative distribution function." 

18 Journal of Nonparametric Statistics (2008) 

19[6] Li, R., Ju, G. "Nonparametric Estimation of Multivariate CDF 

20 with Categorical and Continuous Data." Working Paper 

21[7] Li, Q., Racine, J. "Cross-validated local linear nonparametric 

22 regression" Statistica Sinica 14(2004), pp. 485-512 

23[8] Racine, J.: "Consistent Significance Testing for Nonparametric 

24 Regression" Journal of Business & Economics Statistics 

25[9] Racine, J., Hart, J., Li, Q., "Testing the Significance of 

26 Categorical Predictor Variables in Nonparametric Regression 

27 Models", 2006, Econometric Reviews 25, 523-544 

28 

29""" 

# TODO: make default behavior efficient=True above a certain n_obs
import numpy as np

from . import kernels
from ._kernel_base import (GenericKDE, EstimatorSettings, gpke,
                           LeaveOneOut, _adjust_shape)


__all__ = ['KDEMultivariate', 'KDEMultivariateConditional',
           'EstimatorSettings']


class KDEMultivariate(GenericKDE):
    """
    Multivariate kernel density estimator.

    This density estimator can handle univariate as well as multivariate
    data, including mixed continuous / ordered discrete / unordered discrete
    data.  It also provides cross-validated bandwidth selection methods
    (least squares, maximum likelihood).

    Parameters
    ----------
    data : list of ndarrays or 2-D ndarray
        The training data for the Kernel Density Estimation, used to
        determine the bandwidth(s).  If a 2-D array, should be of shape
        (num_observations, num_variables).  If a list, each list element is
        a separate observation.
    var_type : str
        The type of the variables:

        - c : continuous
        - u : unordered (discrete)
        - o : ordered (discrete)

        The string should contain a type specifier for each variable, so
        for example ``var_type='ccuo'``.
    bw : array_like or str, optional
        If an array, it is a fixed user-specified bandwidth.  If a string,
        should be one of:

        - normal_reference: normal reference rule of thumb (default)
        - cv_ml: cross validation maximum likelihood
        - cv_ls: cross validation least squares

    defaults : EstimatorSettings instance, optional
        The default values for (efficient) bandwidth estimation.

    Attributes
    ----------
    bw : array_like
        The bandwidth parameters.

    See Also
    --------
    KDEMultivariateConditional

    Examples
    --------
    >>> import numpy as np
    >>> import statsmodels.api as sm
    >>> nobs = 300
    >>> np.random.seed(1234)  # Seed random generator
    >>> c1 = np.random.normal(size=(nobs, 1))
    >>> c2 = np.random.normal(2, 1, size=(nobs, 1))

    Estimate a bivariate distribution and display the bandwidth found:

    >>> dens_u = sm.nonparametric.KDEMultivariate(data=[c1, c2],
    ...     var_type='cc', bw='normal_reference')
    >>> dens_u.bw
    array([ 0.39967419, 0.38423292])
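
    The fitted estimator can then be evaluated at other points, for
    example on a small grid (a hypothetical grid; exact values depend
    on the simulated data):

    >>> grid = np.column_stack((np.linspace(-2, 2, 5), np.linspace(0, 4, 5)))
    >>> pdf_vals = dens_u.pdf(grid)  # density at each row of ``grid``
    >>> cdf_vals = dens_u.cdf(grid)  # CDF at each row of ``grid``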

100 """ 

101 def __init__(self, data, var_type, bw=None, defaults=None): 

102 self.var_type = var_type 

103 self.k_vars = len(self.var_type) 

104 self.data = _adjust_shape(data, self.k_vars) 

105 self.data_type = var_type 

106 self.nobs, self.k_vars = np.shape(self.data) 

107 if self.nobs <= self.k_vars: 

108 raise ValueError("The number of observations must be larger " \ 

109 "than the number of variables.") 

110 defaults = EstimatorSettings() if defaults is None else defaults 

111 self._set_defaults(defaults) 

112 if not self.efficient: 

113 self.bw = self._compute_bw(bw) 

114 else: 

115 self.bw = self._compute_efficient(bw) 

116 

117 def __repr__(self): 

118 """Provide something sane to print.""" 

119 rpr = "KDE instance\n" 

120 rpr += "Number of variables: k_vars = " + str(self.k_vars) + "\n" 

121 rpr += "Number of samples: nobs = " + str(self.nobs) + "\n" 

122 rpr += "Variable types: " + self.var_type + "\n" 

123 rpr += "BW selection method: " + self._bw_method + "\n" 

124 return rpr 

125 

    def loo_likelihood(self, bw, func=lambda x: x):
        r"""
        Returns the leave-one-out likelihood function.

        The leave-one-out likelihood function for the unconditional KDE.

        Parameters
        ----------
        bw : array_like
            The value for the bandwidth parameter(s).
        func : callable, optional
            Function to transform the likelihood values (before summing);
            for the log likelihood, use ``func=np.log``.  Default is
            ``f(x) = x``.

        Notes
        -----
        The leave-one-out kernel estimator of :math:`f_{-i}` is:

        .. math:: f_{-i}(X_{i})=\frac{1}{(n-1)h}
            \sum_{j=1,j\neq i}K_{h}(X_{i},X_{j})

        where :math:`K_{h}` represents the generalized product kernel
        estimator:

        .. math:: K_{h}(X_{i},X_{j}) =
            \prod_{s=1}^{q}h_{s}^{-1}k\left(\frac{X_{is}-X_{js}}{h_{s}}\right)
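
        As a minimal sketch, the same quantity for a single continuous
        variable can be computed with plain NumPy (hypothetical names;
        ``gpke`` handles the general mixed-data case):

        >>> x = np.random.normal(size=50)
        >>> h = 0.5
        >>> k = lambda u: np.exp(-u**2 / 2) / np.sqrt(2 * np.pi)
        >>> f_0 = k((x[0] - x[1:]) / h).sum() / ((len(x) - 1) * h)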

152 """ 

153 LOO = LeaveOneOut(self.data) 

154 L = 0 

155 for i, X_not_i in enumerate(LOO): 

156 f_i = gpke(bw, data=-X_not_i, data_predict=-self.data[i, :], 

157 var_type=self.var_type) 

158 L += func(f_i) 

159 

160 return -L 

161 

    def pdf(self, data_predict=None):
        r"""
        Evaluate the probability density function.

        Parameters
        ----------
        data_predict : array_like, optional
            Points to evaluate at.  If unspecified, the training data is
            used.

        Returns
        -------
        pdf_est : array_like
            Probability density function evaluated at `data_predict`.

        Notes
        -----
        The probability density is given by the generalized product kernel
        estimator:

        .. math:: K_{h}(X_{i},X_{j}) =
            \prod_{s=1}^{q}h_{s}^{-1}k\left(\frac{X_{is}-X_{js}}{h_{s}}\right)
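
        For example, with ``dens_u`` fitted as in the class docstring
        (a hypothetical evaluation point):

        >>> density_at_origin = dens_u.pdf(np.zeros(2))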

183 """ 

184 if data_predict is None: 

185 data_predict = self.data 

186 else: 

187 data_predict = _adjust_shape(data_predict, self.k_vars) 

188 

189 pdf_est = [] 

190 for i in range(np.shape(data_predict)[0]): 

191 pdf_est.append(gpke(self.bw, data=self.data, 

192 data_predict=data_predict[i, :], 

193 var_type=self.var_type) / self.nobs) 

194 

195 pdf_est = np.squeeze(pdf_est) 

196 return pdf_est 

197 

    def cdf(self, data_predict=None):
        r"""
        Evaluate the cumulative distribution function.

        Parameters
        ----------
        data_predict : array_like, optional
            Points to evaluate at.  If unspecified, the training data is
            used.

        Returns
        -------
        cdf_est : array_like
            The estimate of the cdf.

        Notes
        -----
        See https://en.wikipedia.org/wiki/Cumulative_distribution_function
        for general background.  For more details on the estimation see
        Ref. [5] in the module docstring.

        The multivariate CDF for mixed data (continuous and ordered/unordered
        discrete) is estimated by:

        .. math::

            F(x^{c},x^{d})=n^{-1}\sum_{i=1}^{n}\left[G\left(
            \frac{x^{c}-X_{i}^{c}}{h}\right)\sum_{u\leq x^{d}}
            L(X_{i}^{d},u,\lambda)\right]

        where :math:`G(\cdot)` is the product kernel CDF estimator for the
        continuous variables and :math:`L(\cdot)` for the discrete
        variables.

        The bandwidth used is ``self.bw``.
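
        For example, with ``dens_u`` fitted as in the class docstring
        (a hypothetical evaluation point):

        >>> prob_est = dens_u.cdf(np.array([0.0, 2.0]))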

228 """ 

229 if data_predict is None: 

230 data_predict = self.data 

231 else: 

232 data_predict = _adjust_shape(data_predict, self.k_vars) 

233 

234 cdf_est = [] 

235 for i in range(np.shape(data_predict)[0]): 

236 cdf_est.append(gpke(self.bw, data=self.data, 

237 data_predict=data_predict[i, :], 

238 var_type=self.var_type, 

239 ckertype="gaussian_cdf", 

240 ukertype="aitchisonaitken_cdf", 

241 okertype='wangryzin_cdf') / self.nobs) 

242 

243 cdf_est = np.squeeze(cdf_est) 

244 return cdf_est 

245 

    def imse(self, bw):
        r"""
        Returns the Integrated Mean Square Error for the unconditional KDE.

        Parameters
        ----------
        bw : array_like
            The bandwidth parameter(s).

        Returns
        -------
        CV : float
            The cross-validation objective function.

        Notes
        -----
        See p. 27 in [1]_ for details on the multivariate estimation; for
        how to handle mixed data types see p. 6 in [2]_.

        The formula for the cross-validation objective function is:

        .. math:: CV=\frac{1}{n^{2}}\sum_{i=1}^{n}\sum_{j=1}^{n}
            \bar{K}_{h}(X_{i},X_{j})-\frac{2}{n(n-1)}\sum_{i=1}^{n}
            \sum_{j=1,j\neq i}^{n}K_{h}(X_{i},X_{j})

        where :math:`\bar{K}_{h}` is the multivariate product convolution
        kernel (consult [2]_ for mixed data types).

        References
        ----------
        .. [1] Racine, J., Li, Q. Nonparametric econometrics: theory and
               practice. Princeton University Press. (2007)
        .. [2] Racine, J., Li, Q. "Nonparametric Estimation of Distributions
               with Categorical and Continuous Data." Working Paper. (2000)
        """

        #F = 0
        #for i in range(self.nobs):
        #    k_bar_sum = gpke(bw, data=-self.data,
        #                     data_predict=-self.data[i, :],
        #                     var_type=self.var_type,
        #                     ckertype='gauss_convolution',
        #                     okertype='wangryzin_convolution',
        #                     ukertype='aitchisonaitken_convolution')
        #    F += k_bar_sum
        ## there is a + because loo_likelihood returns the negative
        #return (F / self.nobs**2 + self.loo_likelihood(bw) * \
        #        2 / ((self.nobs) * (self.nobs - 1)))

        # The code below is equivalent to the commented-out code above.  It's
        # about 20% faster due to some code being moved outside the for-loops
        # and shared by gpke() and loo_likelihood().
        F = 0
        kertypes = dict(c=kernels.gaussian_convolution,
                        o=kernels.wang_ryzin_convolution,
                        u=kernels.aitchison_aitken_convolution)
        nobs = self.nobs
        data = -self.data
        var_type = self.var_type
        ix_cont = np.array([c == 'c' for c in var_type])
        _bw_cont_product = bw[ix_cont].prod()
        Kval = np.empty(data.shape)
        for i in range(nobs):
            for ii, vtype in enumerate(var_type):
                Kval[:, ii] = kertypes[vtype](bw[ii],
                                              data[:, ii],
                                              data[i, ii])

            dens = Kval.prod(axis=1) / _bw_cont_product
            k_bar_sum = dens.sum(axis=0)
            F += k_bar_sum  # sum of prod kernel over nobs

        kertypes = dict(c=kernels.gaussian,
                        o=kernels.wang_ryzin,
                        u=kernels.aitchison_aitken)
        LOO = LeaveOneOut(self.data)
        L = 0  # leave-one-out likelihood
        Kval = np.empty((data.shape[0] - 1, data.shape[1]))
        for i, X_not_i in enumerate(LOO):
            for ii, vtype in enumerate(var_type):
                Kval[:, ii] = kertypes[vtype](bw[ii],
                                              -X_not_i[:, ii],
                                              data[i, ii])
            dens = Kval.prod(axis=1) / _bw_cont_product
            L += dens.sum(axis=0)

        # CV objective function, eq. (2.4) of Ref. [3]
        return (F / nobs**2 - 2 * L / (nobs * (nobs - 1)))

    def _get_class_vars_type(self):
        """Helper method to be able to pass needed vars to _compute_subset."""
        class_type = 'KDEMultivariate'
        class_vars = (self.var_type, )
        return class_type, class_vars


class KDEMultivariateConditional(GenericKDE):
    """
    Conditional multivariate kernel density estimator.

    Calculates ``P(Y_1,Y_2,...,Y_n | X_1,X_2,...,X_m) =
    P(Y_1,Y_2,...,Y_n, X_1,X_2,...,X_m) / P(X_1,X_2,...,X_m)``.
    The conditional density is by definition the ratio of the joint and
    marginal densities, see [1]_.

    Parameters
    ----------
    endog : list of ndarrays or 2-D ndarray
        The training data for the dependent variables, used to determine
        the bandwidth(s).  If a 2-D array, should be of shape
        (num_observations, num_variables).  If a list, each list element is
        a separate observation.
    exog : list of ndarrays or 2-D ndarray
        The training data for the independent variables; same shape as
        `endog`.
    dep_type : str
        The type of the dependent variables:

        - c : continuous
        - u : unordered (discrete)
        - o : ordered (discrete)

        The string should contain a type specifier for each variable, so
        for example ``dep_type='ccuo'``.
    indep_type : str
        The type of the independent variables; specified like `dep_type`.
    bw : array_like or str
        If an array, it is a fixed user-specified bandwidth.  If a string,
        should be one of:

        - normal_reference: normal reference rule of thumb (default)
        - cv_ml: cross validation maximum likelihood
        - cv_ls: cross validation least squares

    defaults : EstimatorSettings instance, optional
        The default values for the efficient bandwidth estimation.

    Attributes
    ----------
    bw : array_like
        The bandwidth parameters.

    See Also
    --------
    KDEMultivariate

    References
    ----------
    .. [1] https://en.wikipedia.org/wiki/Conditional_probability_distribution

    Examples
    --------
    >>> import numpy as np
    >>> import statsmodels.api as sm
    >>> nobs = 300
    >>> c1 = np.random.normal(size=(nobs, 1))
    >>> c2 = np.random.normal(2, 1, size=(nobs, 1))

    >>> dens_c = sm.nonparametric.KDEMultivariateConditional(endog=[c1],
    ...     exog=[c2], dep_type='c', indep_type='c', bw='normal_reference')
    >>> dens_c.bw  # show computed bandwidth
    array([ 0.41223484, 0.40976931])
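
    The conditional density can then be evaluated; with no arguments the
    training points are used (a sketch; values depend on the random data):

    >>> p_train = dens_c.pdf()  # f(y|x) at each training observation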

405 """ 

406 

407 def __init__(self, endog, exog, dep_type, indep_type, bw, 

408 defaults=None): 

409 self.dep_type = dep_type 

410 self.indep_type = indep_type 

411 self.data_type = dep_type + indep_type 

412 self.k_dep = len(self.dep_type) 

413 self.k_indep = len(self.indep_type) 

414 self.endog = _adjust_shape(endog, self.k_dep) 

415 self.exog = _adjust_shape(exog, self.k_indep) 

416 self.nobs, self.k_dep = np.shape(self.endog) 

417 self.data = np.column_stack((self.endog, self.exog)) 

418 self.k_vars = np.shape(self.data)[1] 

419 defaults = EstimatorSettings() if defaults is None else defaults 

420 self._set_defaults(defaults) 

421 if not self.efficient: 

422 self.bw = self._compute_bw(bw) 

423 else: 

424 self.bw = self._compute_efficient(bw) 

425 

426 def __repr__(self): 

427 """Provide something sane to print.""" 

428 rpr = "KDEMultivariateConditional instance\n" 

429 rpr += "Number of independent variables: k_indep = " + \ 

430 str(self.k_indep) + "\n" 

431 rpr += "Number of dependent variables: k_dep = " + \ 

432 str(self.k_dep) + "\n" 

433 rpr += "Number of observations: nobs = " + str(self.nobs) + "\n" 

434 rpr += "Independent variable types: " + self.indep_type + "\n" 

435 rpr += "Dependent variable types: " + self.dep_type + "\n" 

436 rpr += "BW selection method: " + self._bw_method + "\n" 

437 return rpr 

438 

    def loo_likelihood(self, bw, func=lambda x: x):
        """
        Returns the leave-one-out conditional likelihood of the data.

        If `func` differs from the default, a transform of the leave-one-out
        conditional likelihood is calculated.

        Parameters
        ----------
        bw : array_like
            The bandwidth parameter(s).
        func : callable, optional
            Function to transform the likelihood values (before summing);
            for the log likelihood, use ``func=np.log``.  Default is
            ``f(x) = x``.

        Returns
        -------
        L : float
            The value of the leave-one-out function for the data.

        Notes
        -----
        Similar to ``KDEMultivariate.loo_likelihood``, but substitute
        ``f(y|x)=f(x,y)/f(x)`` for ``f(x)``.
        """
        yLOO = LeaveOneOut(self.data)
        xLOO = iter(LeaveOneOut(self.exog))
        L = 0
        for i, Y_j in enumerate(yLOO):
            X_not_i = next(xLOO)
            f_yx = gpke(bw, data=-Y_j, data_predict=-self.data[i, :],
                        var_type=(self.dep_type + self.indep_type))
            f_x = gpke(bw[self.k_dep:], data=-X_not_i,
                       data_predict=-self.exog[i, :],
                       var_type=self.indep_type)
            f_i = f_yx / f_x
            L += func(f_i)

        return -L

    def pdf(self, endog_predict=None, exog_predict=None):
        r"""
        Evaluate the probability density function.

        Parameters
        ----------
        endog_predict : array_like, optional
            Evaluation data for the dependent variables.  If unspecified,
            the training data is used.
        exog_predict : array_like, optional
            Evaluation data for the independent variables.  If unspecified,
            the training data is used.

        Returns
        -------
        pdf : array_like
            The value of the probability density at `endog_predict` and
            `exog_predict`.

        Notes
        -----
        The formula for the conditional probability density is:

        .. math:: f(y|x)=\frac{f(x,y)}{f(x)}

        with

        .. math:: f(x)=\prod_{s=1}^{q}h_{s}^{-1}k
            \left(\frac{x_{is}-x_{js}}{h_{s}}\right)

        where :math:`k` is the appropriate kernel for each variable.
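
        For example, with ``dens_c`` fitted as in the class docstring
        (hypothetical evaluation values):

        >>> p_y_given_x = dens_c.pdf(endog_predict=[0.5], exog_predict=[2.0])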

508 """ 

509 if endog_predict is None: 

510 endog_predict = self.endog 

511 else: 

512 endog_predict = _adjust_shape(endog_predict, self.k_dep) 

513 if exog_predict is None: 

514 exog_predict = self.exog 

515 else: 

516 exog_predict = _adjust_shape(exog_predict, self.k_indep) 

517 

518 pdf_est = [] 

519 data_predict = np.column_stack((endog_predict, exog_predict)) 

520 for i in range(np.shape(data_predict)[0]): 

521 f_yx = gpke(self.bw, data=self.data, 

522 data_predict=data_predict[i, :], 

523 var_type=(self.dep_type + self.indep_type)) 

524 f_x = gpke(self.bw[self.k_dep:], data=self.exog, 

525 data_predict=exog_predict[i, :], 

526 var_type=self.indep_type) 

527 pdf_est.append(f_yx / f_x) 

528 

529 return np.squeeze(pdf_est) 

530 

    def cdf(self, endog_predict=None, exog_predict=None):
        r"""
        Cumulative distribution function for the conditional density.

        Parameters
        ----------
        endog_predict : array_like, optional
            The evaluation dependent variables at which the cdf is
            estimated.  If not specified the training dependent variables
            are used.
        exog_predict : array_like, optional
            The evaluation independent variables at which the cdf is
            estimated.  If not specified the training independent variables
            are used.

        Returns
        -------
        cdf_est : array_like
            The estimate of the cdf.

        Notes
        -----
        For more details on the estimation see [2]_, and p. 181 in [1]_.

        The multivariate conditional CDF for mixed data (continuous and
        ordered/unordered discrete) is estimated by:

        .. math::

            F(y|x)=\frac{n^{-1}\sum_{i=1}^{n}G\left(
            \frac{y-Y_{i}}{h_{0}}\right)W_{h}(X_{i},x)}{\widehat{\mu}(x)}

        where :math:`G(\cdot)` is the product kernel CDF estimator for the
        dependent (y) variable(s) and :math:`W(\cdot)` is the product kernel
        CDF estimator for the independent variable(s).
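
        For example, with ``dens_c`` fitted as in the class docstring
        (hypothetical evaluation values):

        >>> F_y_given_x = dens_c.cdf(endog_predict=[0.5], exog_predict=[2.0])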

        References
        ----------
        .. [1] Racine, J., Li, Q. Nonparametric econometrics: theory and
               practice. Princeton University Press. (2007)
        .. [2] Liu, R., Yang, L. "Kernel estimation of multivariate
               cumulative distribution function." Journal of Nonparametric
               Statistics. (2008)
        """
        if endog_predict is None:
            endog_predict = self.endog
        else:
            endog_predict = _adjust_shape(endog_predict, self.k_dep)
        if exog_predict is None:
            exog_predict = self.exog
        else:
            exog_predict = _adjust_shape(exog_predict, self.k_indep)

        N_data_predict = np.shape(exog_predict)[0]
        cdf_est = np.empty(N_data_predict)
        for i in range(N_data_predict):
            mu_x = gpke(self.bw[self.k_dep:], data=self.exog,
                        data_predict=exog_predict[i, :],
                        var_type=self.indep_type) / self.nobs
            mu_x = np.squeeze(mu_x)
            cdf_endog = gpke(self.bw[0:self.k_dep], data=self.endog,
                             data_predict=endog_predict[i, :],
                             var_type=self.dep_type,
                             ckertype="gaussian_cdf",
                             ukertype="aitchisonaitken_cdf",
                             okertype='wangryzin_cdf', tosum=False)

            cdf_exog = gpke(self.bw[self.k_dep:], data=self.exog,
                            data_predict=exog_predict[i, :],
                            var_type=self.indep_type, tosum=False)
            S = (cdf_endog * cdf_exog).sum(axis=0)
            cdf_est[i] = S / (self.nobs * mu_x)

        return cdf_est

    def imse(self, bw):
        r"""
        The integrated mean square error for the conditional KDE.

        Parameters
        ----------
        bw : array_like
            The bandwidth parameter(s).

        Returns
        -------
        CV : float
            The cross-validation objective function.

        Notes
        -----
        For more details see pp. 156-166 in [1]_.  For details on how to
        handle the mixed variable types see [2]_.

        The formula for the cross-validation objective function for mixed
        variable types is:

        .. math:: CV(h,\lambda)=\frac{1}{n}\sum_{l=1}^{n}
            \frac{G_{-l}(X_{l})}{\left[\mu_{-l}(X_{l})\right]^{2}}-
            \frac{2}{n}\sum_{l=1}^{n}\frac{f_{-l}(X_{l},Y_{l})}{\mu_{-l}(X_{l})}

        where

        .. math:: G_{-l}(X_{l}) = n^{-2}\sum_{i\neq l}\sum_{j\neq l}
            K_{X_{i},X_{l}} K_{X_{j},X_{l}}K_{Y_{i},Y_{j}}^{(2)}

        Here :math:`K_{X_{i},X_{l}}` is the multivariate product kernel,
        :math:`\mu_{-l}(X_{l})` is the leave-one-out estimator of the pdf,
        and :math:`K_{Y_{i},Y_{j}}^{(2)}` is the convolution kernel.

        The value of the function is minimized by the ``_cv_ls`` method of
        the `GenericKDE` class to return the bw estimates that minimize the
        distance between the estimated and "true" probability density.

        References
        ----------
        .. [1] Racine, J., Li, Q. Nonparametric econometrics: theory and
               practice. Princeton University Press. (2007)
        .. [2] Racine, J., Li, Q. "Nonparametric Estimation of Distributions
               with Categorical and Continuous Data." Working Paper. (2000)
        """

        zLOO = LeaveOneOut(self.data)
        CV = 0
        nobs = float(self.nobs)
        expander = np.ones((self.nobs - 1, 1))
        for ii, Z in enumerate(zLOO):
            X = Z[:, self.k_dep:]
            Y = Z[:, :self.k_dep]
            Ye_L = np.kron(Y, expander)
            Ye_R = np.kron(expander, Y)
            Xe_L = np.kron(X, expander)
            Xe_R = np.kron(expander, X)
            K_Xi_Xl = gpke(bw[self.k_dep:], data=Xe_L,
                           data_predict=self.exog[ii, :],
                           var_type=self.indep_type, tosum=False)
            K_Xj_Xl = gpke(bw[self.k_dep:], data=Xe_R,
                           data_predict=self.exog[ii, :],
                           var_type=self.indep_type, tosum=False)
            K2_Yi_Yj = gpke(bw[0:self.k_dep], data=Ye_L,
                            data_predict=Ye_R, var_type=self.dep_type,
                            ckertype='gauss_convolution',
                            okertype='wangryzin_convolution',
                            ukertype='aitchisonaitken_convolution',
                            tosum=False)
            G = (K_Xi_Xl * K_Xj_Xl * K2_Yi_Yj).sum() / nobs**2
            f_X_Y = gpke(bw, data=-Z, data_predict=-self.data[ii, :],
                         var_type=(self.dep_type + self.indep_type)) / nobs
            m_x = gpke(bw[self.k_dep:], data=-X,
                       data_predict=-self.exog[ii, :],
                       var_type=self.indep_type) / nobs
            CV += (G / m_x ** 2) - 2 * (f_X_Y / m_x)

        return CV / nobs

    def _get_class_vars_type(self):
        """Helper method to be able to pass needed vars to _compute_subset."""
        class_type = 'KDEMultivariateConditional'
        class_vars = (self.k_dep, self.dep_type, self.indep_type)
        return class_type, class_vars