1""" 

2Multivariate Kernel Regression (Local Constant and Local Linear)

3with Mixed Data Types 

4 

5References 

6---------- 

7[1] Racine, J., Li, Q. Nonparametric econometrics: theory and practice. 

8 Princeton University Press. (2007) 

9[2] Racine, Jeff. "Nonparametric Econometrics: A Primer," Foundations

10 and Trends in Econometrics: Vol 3: No 1, pp. 1-88. (2008)

11 http://dx.doi.org/10.1561/0800000009 

12[3] Racine, J., Li, Q. "Nonparametric Estimation of Distributions 

13 with Categorical and Continuous Data." Working Paper. (2000) 

14[4] Racine, J., Li, Q. "Kernel Estimation of Multivariate Conditional

15 Distributions." Annals of Economics and Finance 5, 211-235 (2004)

16[5] Liu, R., Yang, L. "Kernel estimation of multivariate 

17 cumulative distribution function." 

18 Journal of Nonparametric Statistics (2008) 

19[6] Li, R., Ju, G. "Nonparametric Estimation of Multivariate CDF 

20 with Categorical and Continuous Data." Working Paper 

21[7] Li, Q., Racine, J. "Cross-validated local linear nonparametric 

22 regression" Statistica Sinica 14(2004), pp. 485-512 

23[8] Racine, J.: "Consistent Significance Testing for Nonparametric

24 Regression." Journal of Business & Economic Statistics

25[9] Racine, J., Hart, J., Li, Q., "Testing the Significance of 

26 Categorical Predictor Variables in Nonparametric Regression 

27 Models", 2006, Econometric Reviews 25, 523-544 

28 

29""" 

30 

31# TODO: make default behavior efficient=True above a certain n_obs 

32import copy 

33 

34import numpy as np 

35from scipy import optimize 

36from scipy.stats.mstats import mquantiles 

37 

38from ._kernel_base import GenericKDE, EstimatorSettings, gpke, \ 

39 LeaveOneOut, _get_type_pos, _adjust_shape, _compute_min_std_IQR, kernel_func 

40 

41 

42__all__ = ['KernelReg', 'KernelCensoredReg'] 

43 

44 

45class KernelReg(GenericKDE): 

46 """ 

47 Nonparametric kernel regression class. 

48 

49 Calculates the conditional mean ``E[y|X]`` where ``y = g(X) + e``. 

50 Note that the "local constant" type of regression provided here is also 

51 known as Nadaraya-Watson kernel regression; "local linear" is an extension 

52 of that which suffers less from bias issues at the edge of the support. Note 

53 that specifying a custom kernel works only with "local linear" kernel 

54 regression. For example, a custom ``tricube`` kernel yields LOESS regression. 

55 

56 Parameters 

57 ---------- 

58 endog : array_like 

59 This is the dependent variable. 

60 exog : array_like 

61 The training data for the independent variable(s).

62 Each element in the list is a separate variable.

63 var_type : str 

64 The type of the variables, one character per variable: 

65 

66 - c: continuous 

67 - u: unordered (discrete) 

68 - o: ordered (discrete) 

69 

70 reg_type : {'lc', 'll'}, optional 

71 Type of regression estimator. 'lc' means local constant and 

72 'll' local linear estimator. Default is 'll'.

73 bw : str or array_like, optional 

74 Either a user-specified bandwidth or the method for bandwidth 

75 selection. If a string, valid values are 'cv_ls' (least-squares 

76 cross-validation) and 'aic' (AIC Hurvich bandwidth estimation). 

77 Default is 'cv_ls'. User specified bandwidth must have as many 

78 entries as the number of variables. 

79 ckertype : str, optional 

80 The kernel used for the continuous variables. 

81 okertype : str, optional 

82 The kernel used for the ordered discrete variables. 

83 ukertype : str, optional 

84 The kernel used for the unordered discrete variables. 

85 defaults : EstimatorSettings instance, optional 

86 The default values for the efficient bandwidth estimation. 

87 

88 Attributes 

89 ---------- 

90 bw : array_like 

91 The bandwidth parameters. 
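
    Examples
    --------
    A minimal usage sketch with simulated data (illustrative only; assumes the
    module is importable as ``statsmodels.nonparametric.kernel_regression``).

    >>> import numpy as np
    >>> from statsmodels.nonparametric.kernel_regression import KernelReg
    >>> np.random.seed(12345)
    >>> x = np.random.uniform(-2, 2, size=100)
    >>> y = np.sin(x) + np.random.normal(scale=0.2, size=100)
    >>> model = KernelReg(endog=y, exog=x, var_type='c', reg_type='ll')
    >>> mean, mfx = model.fit()
    >>> mean.shape, mfx.shape
    ((100,), (100, 1))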

92 """ 

93 def __init__(self, endog, exog, var_type, reg_type='ll', bw='cv_ls', 

94 ckertype='gaussian', okertype='wangryzin', 

95 ukertype='aitchisonaitken', defaults=None): 

96 self.var_type = var_type 

97 self.data_type = var_type 

98 self.reg_type = reg_type 

99 self.ckertype = ckertype 

100 self.okertype = okertype 

101 self.ukertype = ukertype 

102 if not (self.ckertype in kernel_func and self.ukertype in kernel_func 

103 and self.okertype in kernel_func): 

104 raise ValueError('user specified kernel must be a supported ' 

105 'kernel from statsmodels.nonparametric.kernels.') 

106 

107 self.k_vars = len(self.var_type) 

108 self.endog = _adjust_shape(endog, 1) 

109 self.exog = _adjust_shape(exog, self.k_vars) 

110 self.data = np.column_stack((self.endog, self.exog)) 

111 self.nobs = np.shape(self.exog)[0] 

112 self.est = dict(lc=self._est_loc_constant, ll=self._est_loc_linear) 

113 defaults = EstimatorSettings() if defaults is None else defaults 

114 self._set_defaults(defaults) 

115 if not isinstance(bw, str): 

116 bw = np.asarray(bw) 

117 if len(bw) != self.k_vars: 

118 raise ValueError('bw must have the same dimension as the ' 

119 'number of variables.') 

120 if not self.efficient: 

121 self.bw = self._compute_reg_bw(bw) 

122 else: 

123 self.bw = self._compute_efficient(bw) 

124 

125 def _compute_reg_bw(self, bw): 

126 if not isinstance(bw, str): 

127 self._bw_method = "user-specified" 

128 return np.asarray(bw) 

129 else: 

130 # The user specified a bandwidth selection method e.g. 'cv_ls' 

131 self._bw_method = bw 

132 # Workaround to avoid instance methods in __dict__ 

133 if bw == 'cv_ls': 

134 res = self.cv_loo 

135 else: # bw == 'aic' 

136 res = self.aic_hurvich 

137 X = np.std(self.exog, axis=0) 
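# Rule-of-thumb (normal-reference) starting value for the optimizer:
# h0 = 1.06 * std(x) * nobs**(-1 / (4 + q)), where q = number of regressors.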

138 h0 = 1.06 * X * \ 

139 self.nobs ** (- 1. / (4 + np.size(self.exog, axis=1))) 

140 

141 func = self.est[self.reg_type] 

142 bw_estimated = optimize.fmin(res, x0=h0, args=(func, ), 

143 maxiter=1e3, maxfun=1e3, disp=0) 

144 return bw_estimated 

145 

146 def _est_loc_linear(self, bw, endog, exog, data_predict): 

147 """ 

148 Local linear estimator of g(x) in the regression ``y = g(x) + e``. 

149 

150 Parameters 

151 ---------- 

152 bw : array_like 

153 Vector of bandwidth value(s). 

154 endog : 1D array_like 

155 The dependent variable. 

156 exog : 1D or 2D array_like 

157 The independent variable(s). 

158 data_predict : 1D array_like of length K, where K is the number of variables. 

159 The point at which the mean is estimated.

160 

161 Returns 

162 ------- 

163 mean, mfx : ndarray

164 The estimates of the conditional mean and the marginal effects at `data_predict`.

165 

166 Notes 

167 ----- 

168 See p. 81 in [1] and p.38 in [2] for the formulas. 

169 Unlike other methods, this one requires that `data_predict` be 1D. 
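
        In standard local linear form (cf. [2], p. 38), the estimate at
        ``x = data_predict`` is the weighted least-squares solution of

        .. math:: \min_{a,b}\sum_{i=1}^{n}
            \left[Y_{i}-a-(X_{i}-x)^{\prime}b\right]^{2}K_{h}(X_{i},x)

        where :math:`K_{h}(X_{i},x)` are the product kernel weights computed by
        `gpke`. The matrices ``M`` and ``V`` built below are the normal
        equations of this problem; the intercept :math:`a` is the returned mean
        and the slope vector :math:`b` gives the marginal effects.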

170 """ 

171 nobs, k_vars = exog.shape 

172 ker = gpke(bw, data=exog, data_predict=data_predict, 

173 var_type=self.var_type, 

174 ckertype=self.ckertype, 

175 ukertype=self.ukertype, 

176 okertype=self.okertype, 

177 tosum=False) / float(nobs) 

178 # Create the matrix on p.492 in [7], after the multiplication w/ K_h,ij 

179 # See also p. 38 in [2] 

180 #ix_cont = np.arange(self.k_vars) # Use all vars instead of continuous only 

181 # Note: because ix_cont was defined here such that it selected all 

182 # columns, I removed the indexing with it from exog/data_predict. 

183 

184 # Convert ker to a 2-D array to make matrix operations below work 

185 ker = ker[:, np.newaxis] 

186 

187 M12 = exog - data_predict 

188 M22 = np.dot(M12.T, M12 * ker) 

189 M12 = (M12 * ker).sum(axis=0) 

190 M = np.empty((k_vars + 1, k_vars + 1)) 

191 M[0, 0] = ker.sum() 

192 M[0, 1:] = M12 

193 M[1:, 0] = M12 

194 M[1:, 1:] = M22 

195 

196 ker_endog = ker * endog 

197 V = np.empty((k_vars + 1, 1)) 

198 V[0, 0] = ker_endog.sum() 

199 V[1:, 0] = ((exog - data_predict) * ker_endog).sum(axis=0) 

200 

201 mean_mfx = np.dot(np.linalg.pinv(M), V) 

202 mean = mean_mfx[0] 

203 mfx = mean_mfx[1:, :] 

204 return mean, mfx 

205 

206 def _est_loc_constant(self, bw, endog, exog, data_predict): 

207 """ 

208 Local constant estimator of g(x) in the regression 

209 y = g(x) + e 

210 

211 Parameters 

212 ---------- 

213 bw : array_like 

214 Array of bandwidth value(s). 

215 endog : 1D array_like 

216 The dependent variable. 

217 exog : 1D or 2D array_like 

218 The independent variable(s). 

219 data_predict : 1D or 2D array_like 

220 The point(s) at which the mean is estimated.

221 

222 Returns 

223 ------- 

224 G : ndarray 

225 The value of the conditional mean at `data_predict`. 

226 B_x : ndarray 

227 The marginal effects. 
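
        Notes
        -----
        This is the Nadaraya-Watson estimator,

        .. math:: \hat{g}(x)=\frac{\sum_{i=1}^{n}K_{h}(X_{i},x)Y_{i}}
            {\sum_{i=1}^{n}K_{h}(X_{i},x)}

        computed below as ``G_numer / G_denom``. The marginal effects ``B_x``
        follow from differentiating this ratio, with the derivative kernel
        (``ckertype='d_gaussian'``) supplying the kernel derivatives for the
        continuous variables.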

228 """ 

229 ker_x = gpke(bw, data=exog, data_predict=data_predict, 

230 var_type=self.var_type, 

231 ckertype=self.ckertype, 

232 ukertype=self.ukertype, 

233 okertype=self.okertype, 

234 tosum=False) 

235 ker_x = np.reshape(ker_x, np.shape(endog)) 

236 G_numer = (ker_x * endog).sum(axis=0) 

237 G_denom = ker_x.sum(axis=0) 

238 G = G_numer / G_denom 

239 nobs = exog.shape[0] 

240 f_x = G_denom / float(nobs) 

241 ker_xc = gpke(bw, data=exog, data_predict=data_predict, 

242 var_type=self.var_type, 

243 ckertype='d_gaussian', 

244 #okertype='wangryzin_reg', 

245 tosum=False) 

246 

247 ker_xc = ker_xc[:, np.newaxis] 

248 d_mx = -(endog * ker_xc).sum(axis=0) / float(nobs) #* np.prod(bw[:, ix_cont])) 

249 d_fx = -ker_xc.sum(axis=0) / float(nobs) #* np.prod(bw[:, ix_cont])) 

250 # B_x = d_mx / f_x - G * d_fx / f_x  # unused: overwritten by the quotient-rule form below

251 B_x = (G_numer * d_fx - G_denom * d_mx) / (G_denom**2) 

252 #B_x = (f_x * d_mx - m_x * d_fx) / (f_x ** 2) 

253 return G, B_x 

254 

255 def aic_hurvich(self, bw, func=None): 

256 """ 

257 Computes the AIC Hurvich criterion for the estimation of the bandwidth.

258 

259 Parameters 

260 ---------- 

261 bw : str or array_like

262 See the ``bw`` parameter of `KernelReg` for details.

263 func : None

264 Unused; kept in the signature only for compatibility with `cv_loo`.

265

266 Returns

267 -------

268 aic : ndarray

269 The AIC Hurvich criterion, one element for each variable.

270 

271 References 

272 ---------- 

273 See ch.2 in [1] and p.35 in [2]. 

274 """ 

275 H = np.empty((self.nobs, self.nobs)) 

276 for j in range(self.nobs): 

277 H[:, j] = gpke(bw, data=self.exog, data_predict=self.exog[j,:], 

278 ckertype=self.ckertype, ukertype=self.ukertype, 

279 okertype=self.okertype, var_type=self.var_type, 

280 tosum=False) 

281 

282 denom = H.sum(axis=1) 

283 H = H / denom 

284 gx = KernelReg(endog=self.endog, exog=self.exog, var_type=self.var_type, 

285 reg_type=self.reg_type, bw=bw, 

286 defaults=EstimatorSettings(efficient=False)).fit()[0] 

287 gx = np.reshape(gx, (self.nobs, 1)) 

288 sigma = ((self.endog - gx)**2).sum(axis=0) / float(self.nobs) 

289 

290 frac = (1 + np.trace(H) / float(self.nobs)) / \ 

291 (1 - (np.trace(H) + 2) / float(self.nobs)) 

292 #siga = np.dot(self.endog.T, (I - H).T) 

293 #sigb = np.dot((I - H), self.endog) 

294 #sigma = np.dot(siga, sigb) / float(self.nobs) 

295 aic = np.log(sigma) + frac 

296 return aic 

297 

298 def cv_loo(self, bw, func): 

299 r""" 

300 The cross-validation function with leave-one-out estimator. 

301 

302 Parameters 

303 ---------- 

304 bw : array_like 

305 Vector of bandwidth values. 

306 func : callable function 

307 Returns the estimator of g(x). Can be either ``_est_loc_constant`` 

308 (local constant) or ``_est_loc_linear`` (local_linear). 

309 

310 Returns 

311 ------- 

312 L : float 

313 The value of the CV function. 

314 

315 Notes 

316 ----- 

317 Calculates the cross-validation least-squares function. This function 

318 is minimized by compute_bw to calculate the optimal value of `bw`. 

319 

320 For details see p.35 in [2] 

321 

322 .. math:: CV(h)=n^{-1}\sum_{i=1}^{n}(Y_{i}-g_{-i}(X_{i}))^{2} 

323 

324 where :math:`g_{-i}(X_{i})` is the leave-one-out estimator of g(X) 

325 and :math:`h` is the vector of bandwidths 

326 """ 

327 LOO_X = LeaveOneOut(self.exog) 

328 LOO_Y = LeaveOneOut(self.endog).__iter__() 

329 L = 0 

330 for ii, X_not_i in enumerate(LOO_X): 

331 Y = next(LOO_Y) 

332 G = func(bw, endog=Y, exog=-X_not_i, 

333 data_predict=-self.exog[ii, :])[0] 

334 L += (self.endog[ii] - G) ** 2 

335 

336 # Note: There might be a way to vectorize this. See p.72 in [1] 

337 return L / self.nobs 

338 

339 def r_squared(self): 

340 r""" 

341 Returns the R-Squared for the nonparametric regression. 

342 

343 Notes 

344 ----- 

345 For more details see p.45 in [2] 

346 The R-Squared is calculated by: 

347 

348 .. math:: R^{2}=\frac{\left[\sum_{i=1}^{n}

349 (Y_{i}-\bar{y})(\hat{Y_{i}}-\bar{y})\right]^{2}}{\sum_{i=1}^{n}

350 (Y_{i}-\bar{y})^{2}\sum_{i=1}^{n}(\hat{Y_{i}}-\bar{y})^{2}},

351 

352 where :math:`\hat{Y_{i}}` is the mean calculated in `fit` at the exog 

353 points. 

354 """ 

355 Y = np.squeeze(self.endog) 

356 Yhat = self.fit()[0] 

357 Y_bar = np.mean(Yhat) 

358 R2_numer = (((Y - Y_bar) * (Yhat - Y_bar)).sum())**2 

359 R2_denom = ((Y - Y_bar)**2).sum(axis=0) * \ 

360 ((Yhat - Y_bar)**2).sum(axis=0) 

361 return R2_numer / R2_denom 

362 

363 def fit(self, data_predict=None): 

364 """ 

365 Returns the mean and marginal effects at the `data_predict` points. 

366 

367 Parameters 

368 ---------- 

369 data_predict : array_like, optional 

370 Points at which to return the mean and marginal effects. If not 

371 given, ``data_predict == exog``. 

372 

373 Returns 

374 ------- 

375 mean : ndarray 

376 The regression result for the mean (i.e. the actual curve). 

377 mfx : ndarray 

378 The marginal effects, i.e. the partial derivatives of the mean. 
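
        Examples
        --------
        A short sketch, reusing the simulated ``model`` from the class-level
        example; ``x_grid`` is a hypothetical evaluation grid.

        >>> x_grid = np.linspace(-2, 2, 25)
        >>> mean_grid, mfx_grid = model.fit(data_predict=x_grid)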

379 """ 

380 func = self.est[self.reg_type] 

381 if data_predict is None: 

382 data_predict = self.exog 

383 else: 

384 data_predict = _adjust_shape(data_predict, self.k_vars) 

385 

386 N_data_predict = np.shape(data_predict)[0] 

387 mean = np.empty((N_data_predict,)) 

388 mfx = np.empty((N_data_predict, self.k_vars)) 

389 for i in range(N_data_predict): 

390 mean_mfx = func(self.bw, self.endog, self.exog, 

391 data_predict=data_predict[i, :]) 

392 mean[i] = mean_mfx[0] 

393 mfx_c = np.squeeze(mean_mfx[1]) 

394 mfx[i, :] = mfx_c 

395 

396 return mean, mfx 

397 

398 def sig_test(self, var_pos, nboot=50, nested_res=25, pivot=False): 

399 """ 

400 Significance test for the variables in the regression. 

401 

402 Parameters 

403 ---------- 

404 var_pos : sequence 

405 The position of the variable in exog to be tested. 

406 

407 Returns 

408 ------- 

409 sig : str 

410 The level of significance: 

411 

412 - `*` : at 90% confidence level 

413 - `**` : at 95% confidence level 

414 - `***` : at 99% confidence level

415 - "Not Significant" : if not significant 

416 """ 

417 var_pos = np.asarray(var_pos) 

418 ix_cont, ix_ord, ix_unord = _get_type_pos(self.var_type) 

419 if np.any(ix_cont[var_pos]): 

420 if np.any(ix_ord[var_pos]) or np.any(ix_unord[var_pos]): 

421 raise ValueError("Discrete variable in hypothesis. Must be continuous") 

422 

423 Sig = TestRegCoefC(self, var_pos, nboot, nested_res, pivot) 

424 else: 

425 Sig = TestRegCoefD(self, var_pos, nboot) 

426 

427 return Sig.sig 

428 

429 def __repr__(self): 

430 """Provide something sane to print.""" 

431 rpr = "KernelReg instance\n" 

432 rpr += "Number of variables: k_vars = " + str(self.k_vars) + "\n" 

433 rpr += "Number of samples: N = " + str(self.nobs) + "\n" 

434 rpr += "Variable types: " + self.var_type + "\n" 

435 rpr += "BW selection method: " + self._bw_method + "\n" 

436 rpr += "Estimator type: " + self.reg_type + "\n" 

437 return rpr 

438 

439 def _get_class_vars_type(self): 

440 """Helper method to be able to pass needed vars to _compute_subset.""" 

441 class_type = 'KernelReg' 

442 class_vars = (self.var_type, self.k_vars, self.reg_type) 

443 return class_type, class_vars 

444 

445 def _compute_dispersion(self, data): 

446 """ 

447 Computes the measure of dispersion. 

448 

449 The minimum of the standard deviation and interquartile range / 1.349 

450 

451 References 

452 ---------- 

453 See the user guide for the ``np`` package in R: the notes on the

454 ``bwscaling`` option in ``npreg``, ``npudens`` and ``npcdens`` discuss

455 this measure of dispersion.

456 """ 

457 data = data[:, 1:] 

458 return _compute_min_std_IQR(data) 

459 

460 

461class KernelCensoredReg(KernelReg): 

462 """ 

463 Nonparametric censored regression. 

464 

465 Calculates the conditional mean ``E[y|X]`` where ``y = g(X) + e``, 

466 where y is left-censored. Left censored variable Y is defined as 

467 ``Y = min {Y', L}`` where ``L`` is the value at which ``Y`` is censored 

468 and ``Y'`` is the true value of the variable. 

469 

470 Parameters 

471 ---------- 

472 endog : list with one element which is array_like 

473 This is the dependent variable. 

474 exog : list 

475 The training data for the independent variable(s) 

476 Each element in the list is a separate variable 

477 var_type : str

478 The type of the independent variable(s), one character per variable:

479 c: continuous

480 u: unordered (discrete)

481 o: ordered (discrete)

482 reg_type : str 

483 Type of regression estimator 

484 lc: Local Constant Estimator 

485 ll: Local Linear Estimator 

486 bw : str or array_like, optional

487 Either a user-specified bandwidth or 

488 the method for bandwidth selection. 

489 cv_ls: cross-validation least squares 

490 aic: AIC Hurvich Estimator 

491 ckertype : str, optional 

492 The kernel used for the continuous variables. 

493 okertype : str, optional 

494 The kernel used for the ordered discrete variables. 

495 ukertype : str, optional 

496 The kernel used for the unordered discrete variables. 

497 censor_val : float 

498 Value at which the dependent variable is censored 

499 defaults : EstimatorSettings instance, optional 

500 The default values for the efficient bandwidth estimation 

501 

502 Attributes 

503 ---------- 

504 bw : array_like 

505 The bandwidth parameters 
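
    Examples
    --------
    A minimal sketch with simulated data censored as ``Y = min(Y', L)``, the
    definition given above (illustrative only; the bandwidths are fixed to
    keep the example cheap).

    >>> import numpy as np
    >>> from statsmodels.nonparametric.kernel_regression import KernelCensoredReg
    >>> np.random.seed(12345)
    >>> x = np.random.uniform(-2, 2, size=(100, 2))
    >>> y_star = x[:, 0] + 0.5 * x[:, 1] + np.random.normal(scale=0.2, size=100)
    >>> y = np.minimum(y_star, 1.0)   # censor at L = 1.0
    >>> model = KernelCensoredReg(endog=y, exog=x, var_type='cc',
    ...                           reg_type='ll', bw=[0.5, 0.5], censor_val=1.0)
    >>> mean, mfx = model.fit()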

506 """ 

507 def __init__(self, endog, exog, var_type, reg_type, bw='cv_ls', 

508 ckertype='gaussian', 

509 ukertype='aitchison_aitken_reg', 

510 okertype='wangryzin_reg', 

511 censor_val=0, defaults=None): 

512 self.var_type = var_type 

513 self.data_type = var_type 

514 self.reg_type = reg_type 

515 self.ckertype = ckertype 

516 self.okertype = okertype 

517 self.ukertype = ukertype 

518 if not (self.ckertype in kernel_func and self.ukertype in kernel_func 

519 and self.okertype in kernel_func): 

520 raise ValueError('user specified kernel must be a supported ' 

521 'kernel from statsmodels.nonparametric.kernels.') 

522 

523 self.k_vars = len(self.var_type) 

524 self.endog = _adjust_shape(endog, 1) 

525 self.exog = _adjust_shape(exog, self.k_vars) 

526 self.data = np.column_stack((self.endog, self.exog)) 

527 self.nobs = np.shape(self.exog)[0] 

528 self.est = dict(lc=self._est_loc_constant, ll=self._est_loc_linear) 

529 defaults = EstimatorSettings() if defaults is None else defaults 

530 self._set_defaults(defaults) 

531 self.censor_val = censor_val 

532 if self.censor_val is not None: 

533 self.censored(censor_val) 

534 else: 

535 self.W_in = np.ones((self.nobs, 1)) 

536 

537 if not self.efficient: 

538 self.bw = self._compute_reg_bw(bw) 

539 else: 

540 self.bw = self._compute_efficient(bw) 

541 

542 def censored(self, censor_val): 

543 # see pp. 341-344 in [1] 

544 self.d = (self.endog != censor_val) * 1. 

545 ix = np.argsort(np.squeeze(self.endog)) 

546 self.sortix = ix 

547 self.sortix_rev = np.zeros(ix.shape, int) 

548 self.sortix_rev[ix] = np.arange(len(ix)) 

549 self.endog = np.squeeze(self.endog[ix]) 

550 self.endog = _adjust_shape(self.endog, 1) 

551 self.exog = np.squeeze(self.exog[ix]) 

552 self.d = np.squeeze(self.d[ix]) 

553 self.W_in = np.empty((self.nobs, 1)) 
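        # W_in holds product-limit (Kaplan-Meier type) weights computed on the
        # sample sorted by endog: uncensored observations (d == 1) receive the
        # Kaplan-Meier jump at their position, censored observations (d == 0)
        # receive zero weight.  See pp. 341-344 in [1].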

554 for i in range(1, self.nobs + 1): 

555 P=1 

556 for j in range(1, i): 

557 P *= ((self.nobs - j)/(float(self.nobs)-j+1))**self.d[j-1] 

558 self.W_in[i-1,0] = P * self.d[i-1] / (float(self.nobs) - i + 1 ) 

559 

560 def __repr__(self): 

561 """Provide something sane to print.""" 

562 rpr = "KernelCensoredReg instance\n" 

563 rpr += "Number of variables: k_vars = " + str(self.k_vars) + "\n" 

564 rpr += "Number of samples: nobs = " + str(self.nobs) + "\n" 

565 rpr += "Variable types: " + self.var_type + "\n" 

566 rpr += "BW selection method: " + self._bw_method + "\n" 

567 rpr += "Estimator type: " + self.reg_type + "\n" 

568 return rpr 

569 

570 def _est_loc_linear(self, bw, endog, exog, data_predict, W): 

571 """ 

572 Local linear estimator of g(x) in the regression ``y = g(x) + e``. 

573 

574 Parameters 

575 ---------- 

576 bw : array_like 

577 Vector of bandwidth value(s) 

578 endog : 1D array_like 

579 The dependent variable 

580 exog : 1D or 2D array_like 

581 The independent variable(s) 

582 data_predict : 1D array_like of length K, where K is 

583 the number of variables. The point at which 

584 the mean is estimated.

585 

586 Returns 

587 ------- 

588 mean, mfx : ndarray

589 The estimates of the conditional mean and the marginal effects at `data_predict`.

590 

591 Notes 

592 ----- 

593 See p. 81 in [1] and p.38 in [2] for the formulas 

594 Unlike other methods, this one requires that data_predict be 1D 

595 """ 

596 nobs, k_vars = exog.shape 

597 ker = gpke(bw, data=exog, data_predict=data_predict, 

598 var_type=self.var_type, 

599 ckertype=self.ckertype, 

600 ukertype=self.ukertype, 

601 okertype=self.okertype, tosum=False) 

602 # Create the matrix on p.492 in [7], after the multiplication w/ K_h,ij 

603 # See also p. 38 in [2] 

604 

605 # Convert ker to a 2-D array to make matrix operations below work 

606 ker = W * ker[:, np.newaxis] 

607 

608 M12 = exog - data_predict 

609 M22 = np.dot(M12.T, M12 * ker) 

610 M12 = (M12 * ker).sum(axis=0) 

611 M = np.empty((k_vars + 1, k_vars + 1)) 

612 M[0, 0] = ker.sum() 

613 M[0, 1:] = M12 

614 M[1:, 0] = M12 

615 M[1:, 1:] = M22 

616 

617 ker_endog = ker * endog 

618 V = np.empty((k_vars + 1, 1)) 

619 V[0, 0] = ker_endog.sum() 

620 V[1:, 0] = ((exog - data_predict) * ker_endog).sum(axis=0) 

621 

622 mean_mfx = np.dot(np.linalg.pinv(M), V) 

623 mean = mean_mfx[0] 

624 mfx = mean_mfx[1:, :] 

625 return mean, mfx 

626 

627 

628 def cv_loo(self, bw, func): 

629 r""" 

630 The cross-validation function with leave-one-out 

631 estimator 

632 

633 Parameters 

634 ---------- 

635 bw : array_like 

636 Vector of bandwidth values 

637 func : callable function 

638 Returns the estimator of g(x). 

639 Can be either ``_est_loc_constant`` (local constant) or 

640 ``_est_loc_linear`` (local_linear). 

641 

642 Returns 

643 ------- 

644 L : float 

645 The value of the CV function 

646 

647 Notes 

648 ----- 

649 Calculates the cross-validation least-squares 

650 function. This function is minimized by compute_bw 

651 to calculate the optimal value of bw 

652 

653 For details see p.35 in [2] 

654 

655 .. math:: CV(h)=n^{-1}\sum_{i=1}^{n}(Y_{i}-g_{-i}(X_{i}))^{2} 

656 

657 where :math:`g_{-i}(X_{i})` is the leave-one-out estimator of g(X) 

658 and :math:`h` is the vector of bandwidths 

659 """ 

660 LOO_X = LeaveOneOut(self.exog) 

661 LOO_Y = LeaveOneOut(self.endog).__iter__() 

662 LOO_W = LeaveOneOut(self.W_in).__iter__() 

663 L = 0 

664 for ii, X_not_i in enumerate(LOO_X): 

665 Y = next(LOO_Y) 

666 w = next(LOO_W) 

667 G = func(bw, endog=Y, exog=-X_not_i, 

668 data_predict=-self.exog[ii, :], W=w)[0] 

669 L += (self.endog[ii] - G) ** 2 

670 

671 # Note: There might be a way to vectorize this. See p.72 in [1] 

672 return L / self.nobs 

673 

674 def fit(self, data_predict=None): 

675 """ 

676 Returns the mean and marginal effects at the `data_predict` points.

677 """ 

678 func = self.est[self.reg_type] 

679 if data_predict is None: 

680 data_predict = self.exog 

681 else: 

682 data_predict = _adjust_shape(data_predict, self.k_vars) 

683 

684 N_data_predict = np.shape(data_predict)[0] 

685 mean = np.empty((N_data_predict,)) 

686 mfx = np.empty((N_data_predict, self.k_vars)) 

687 for i in range(N_data_predict): 

688 mean_mfx = func(self.bw, self.endog, self.exog, 

689 data_predict=data_predict[i, :], 

690 W=self.W_in) 

691 mean[i] = mean_mfx[0] 

692 mfx_c = np.squeeze(mean_mfx[1]) 

693 mfx[i, :] = mfx_c 

694 

695 return mean, mfx 

696 

697 

698class TestRegCoefC(object): 

699 """ 

700 Significance test for continuous variables in a nonparametric regression. 

701 

702 The null hypothesis is ``dE(Y|X)/dX_k = 0`` for the tested variable(s) ``X_k``;

703 the alternative hypothesis is ``dE(Y|X)/dX_k != 0``.

704 

705 Parameters 

706 ---------- 

707 model : KernelReg instance 

708 This is the nonparametric regression model whose elements 

709 are tested for significance. 

710 test_vars : tuple, list of integers, array_like 

711 index of position of the continuous variables to be tested 

712 for significance. E.g. (1,3,5) jointly tests variables at 

713 position 1,3 and 5 for significance. 

714 nboot : int 

715 Number of bootstrap samples used to determine the distribution 

716 of the test statistic in a finite sample. Default is 400 

717 nested_res : int 

718 Number of nested resamples used to calculate lambda. 

719 Only used when the pivot option is enabled.

720 pivot : bool 

721 If True, pivot the test statistic by dividing it by its standard error.

722 This significantly increases computational time, but pivotal statistics

723 have more desirable properties

724 (see references).

725 

726 Attributes 

727 ---------- 

728 sig : str 

729 The significance level of the variable(s) tested 

730 "Not Significant": Not significant at the 90% confidence level 

731 Fails to reject the null 

732 "*": Significant at the 90% confidence level 

733 "**": Significant at the 95% confidence level 

734 "***": Significant at the 99% confidence level 

735 

736 Notes 

737 ----- 

738 This class allows testing of joint hypothesis as long as all variables 

739 are continuous. 
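
    Unpivoted, the statistic is the sample average of the squared estimated
    marginal effects of the tested variable(s) (see ``_compute_lambda``):

    .. math:: \lambda = n^{-1}\sum_{i=1}^{n}\sum_{k}
        \hat{\beta}_{k}(X_{i})^{2}

    where the inner sum runs over the tested variables and
    :math:`\hat{\beta}_{k}(X_{i})` is the estimated partial derivative of the
    conditional mean with respect to the k-th tested regressor at
    :math:`X_{i}`. When ``pivot`` is True, this is divided by a bootstrap
    estimate of its standard error.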

740 

741 References 

742 ---------- 

743 Racine, J.: "Consistent Significance Testing for Nonparametric Regression" 

744 Journal of Business & Economic Statistics.

745 

746 Chapter 12 in [1]. 

747 """ 

748 # Significance of continuous vars in nonparametric regression 

749 # Racine: Consistent Significance Testing for Nonparametric Regression 

750 # Journal of Business & Economic Statistics

751 def __init__(self, model, test_vars, nboot=400, nested_res=400, 

752 pivot=False): 

753 self.nboot = nboot 

754 self.nres = nested_res 

755 self.test_vars = test_vars 

756 self.model = model 

757 self.bw = model.bw 

758 self.var_type = model.var_type 

759 self.k_vars = len(self.var_type) 

760 self.endog = model.endog 

761 self.exog = model.exog 

762 self.gx = model.est[model.reg_type] 

763 self.test_vars = test_vars 

764 self.pivot = pivot 

765 self.run() 

766 

767 def run(self): 

768 self.test_stat = self._compute_test_stat(self.endog, self.exog) 

769 self.sig = self._compute_sig() 

770 

771 def _compute_test_stat(self, Y, X): 

772 """ 

773 Computes the test statistic. See p.371 in [8]. 

774 """ 

775 lam = self._compute_lambda(Y, X) 

776 t = lam 

777 if self.pivot: 

778 se_lam = self._compute_se_lambda(Y, X) 

779 t = lam / float(se_lam) 

780 

781 return t 

782 

783 def _compute_lambda(self, Y, X): 

784 """Computes only lambda -- the main part of the test statistic""" 

785 n = np.shape(X)[0] 

786 Y = _adjust_shape(Y, 1) 

787 X = _adjust_shape(X, self.k_vars) 

788 b = KernelReg(Y, X, self.var_type, self.model.reg_type, self.bw, 

789 defaults = EstimatorSettings(efficient=False)).fit()[1] 

790 

791 b = b[:, self.test_vars] 

792 b = np.reshape(b, (n, len(self.test_vars))) 

793 #fct = np.std(b) # Pivot the statistic by dividing by SE 

794 fct = 1.  # do not pivot here; pivoting (if requested) is applied in _compute_test_stat

795 lam = ((b / fct) ** 2).sum() / float(n) 

796 return lam 

797 

798 def _compute_se_lambda(self, Y, X): 

799 """ 

800 Calculates the SE of lambda by nested resampling 

801 Used to pivot the statistic. 

802 Bootstrapping works better with estimating pivotal statistics 

803 but slows down computation significantly. 

804 """ 

805 n = np.shape(Y)[0] 

806 lam = np.empty(shape=(self.nres,)) 

807 for i in range(self.nres): 

808 ind = np.random.randint(0, n, size=(n, 1)) 

809 Y1 = Y[ind, 0] 

810 X1 = X[ind, :] 

811 lam[i] = self._compute_lambda(Y1, X1) 

812 

813 se_lambda = np.std(lam) 

814 return se_lambda 

815 

816 def _compute_sig(self): 

817 """ 

818 Computes the significance value for the variable(s) tested. 

819 

820 The empirical distribution of the test statistic is obtained through 

821 bootstrapping the sample. The null hypothesis is rejected if the test 

822 statistic is larger than the 90th, 95th or 99th percentile.

823 """ 

824 t_dist = np.empty(shape=(self.nboot, )) 

825 Y = self.endog 

826 X = copy.deepcopy(self.exog) 

827 n = np.shape(Y)[0] 

828 

829 X[:, self.test_vars] = np.mean(X[:, self.test_vars], axis=0) 

830 # Calculate the restricted mean. See p. 372 in [8] 

831 M = KernelReg(Y, X, self.var_type, self.model.reg_type, self.bw, 

832 defaults=EstimatorSettings(efficient=False)).fit()[0] 

833 M = np.reshape(M, (n, 1)) 

834 e = Y - M 

835 e = e - np.mean(e) # recenter residuals 

836 for i in range(self.nboot): 

837 ind = np.random.randint(0, n, size=(n, 1)) 

838 e_boot = e[ind, 0] 

839 Y_boot = M + e_boot 

840 t_dist[i] = self._compute_test_stat(Y_boot, self.exog) 

841 

842 self.t_dist = t_dist 

843 sig = "Not Significant" 

844 if self.test_stat > mquantiles(t_dist, 0.9): 

845 sig = "*" 

846 if self.test_stat > mquantiles(t_dist, 0.95): 

847 sig = "**" 

848 if self.test_stat > mquantiles(t_dist, 0.99): 

849 sig = "***" 

850 

851 return sig 

852 

853 

854class TestRegCoefD(TestRegCoefC): 

855 """ 

856 Significance test for the categorical variables in a nonparametric 

857 regression. 

858 

859 Parameters 

860 ---------- 

861 model : Instance of KernelReg class 

862 This is the nonparametric regression model whose elements 

863 are tested for significance. 

864 test_vars : tuple, list of one element 

865 index of position of the discrete variable to be tested 

866 for significance. E.g. (3) tests variable at 

867 position 3 for significance. 

868 nboot : int 

869 Number of bootstrap samples used to determine the distribution 

870 of the test statistic in a finite sample. Default is 400 

871 

872 Attributes 

873 ---------- 

874 sig : str 

875 The significance level of the variable(s) tested 

876 "Not Significant": Not significant at the 90% confidence level 

877 Fails to reject the null 

878 "*": Significant at the 90% confidence level 

879 "**": Significant at the 95% confidence level 

880 "***": Significant at the 99% confidence level 

881 

882 Notes 

883 ----- 

884 This class currently does not allow joint hypothesis. 

885 Only one variable can be tested at a time 
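
    The test statistic computed in ``_compute_test_stat`` compares the fitted
    conditional mean with the tested discrete variable ``Z`` held at each of
    its observed values against the fit with ``Z`` held at zero:

    .. math:: I_{n}=n^{-1}\sum_{i=1}^{n}\sum_{l}
        \left[\hat{m}(X_{i},Z=l)-\hat{m}(X_{i},Z=0)\right]^{2}

    where the inner sum runs over the observed values of ``Z`` other than the
    smallest one (assumed to be the base value zero). The null distribution is
    approximated by a wild bootstrap around the conditional mean averaged over
    the values of ``Z`` (see ``_est_cond_mean``).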

886 

887 References 

888 ---------- 

889 See [9] and chapter 12 in [1]. 

890 """ 

891 

892 def _compute_test_stat(self, Y, X): 

893 """Computes the test statistic""" 

894 

895 dom_x = np.sort(np.unique(self.exog[:, self.test_vars])) 

896 

897 n = np.shape(X)[0] 

898 model = KernelReg(Y, X, self.var_type, self.model.reg_type, self.bw, 

899 defaults = EstimatorSettings(efficient=False)) 

900 X1 = copy.deepcopy(X) 

901 X1[:, self.test_vars] = 0 

902 

903 m0 = model.fit(data_predict=X1)[0] 

904 m0 = np.reshape(m0, (n, 1)) 

905 zvec = np.zeros((n, 1))

906 for i in dom_x[1:] : 

907 X1[:, self.test_vars] = i 

908 m1 = model.fit(data_predict=X1)[0] 

909 m1 = np.reshape(m1, (n, 1)) 

910 zvec += (m1 - m0) ** 2

911 

912 avg = zvec.sum(axis=0) / float(n) 

913 return avg 

914 

915 def _compute_sig(self): 

916 """Calculates the significance level of the variable tested""" 

917 

918 m = self._est_cond_mean() 

919 Y = self.endog 

920 X = self.exog 

921 n = np.shape(X)[0] 

922 u = Y - m 

923 u = u - np.mean(u) # center 
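        # Wild bootstrap draws using Mammen's two-point distribution: each
        # centered residual is multiplied by (1 - sqrt(5))/2 with probability
        # (1 + sqrt(5)) / (2 * sqrt(5)) and by (1 + sqrt(5))/2 otherwise; this
        # two-point law has mean 0 and variance 1, so the resampled residuals
        # preserve the scale of the originals.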

924 fct1 = (1 - 5**0.5) / 2. 

925 fct2 = (1 + 5**0.5) / 2. 

926 u1 = fct1 * u 

927 u2 = fct2 * u 

928 r = fct2 / (5 ** 0.5) 

929 I_dist = np.empty((self.nboot,1)) 

930 for j in range(self.nboot): 

931 u_boot = copy.deepcopy(u2) 

932 

933 prob = np.random.uniform(0,1, size = (n,1)) 

934 ind = prob < r 

935 u_boot[ind] = u1[ind] 

936 Y_boot = m + u_boot 

937 I_dist[j] = self._compute_test_stat(Y_boot, X) 

938 

939 sig = "Not Significant" 

940 if self.test_stat > mquantiles(I_dist, 0.9): 

941 sig = "*" 

942 if self.test_stat > mquantiles(I_dist, 0.95): 

943 sig = "**" 

944 if self.test_stat > mquantiles(I_dist, 0.99): 

945 sig = "***" 

946 

947 return sig 

948 

949 def _est_cond_mean(self): 

950 """ 

951 Calculates the expected conditional mean 

952 m(X, Z=l) for all possible l 

953 """ 

954 self.dom_x = np.sort(np.unique(self.exog[:, self.test_vars])) 

955 X = copy.deepcopy(self.exog) 

956 m=0 

957 for i in self.dom_x: 

958 X[:, self.test_vars] = i 

959 m += self.model.fit(data_predict = X)[0] 

960 

961 m = m / float(len(self.dom_x)) 

962 m = np.reshape(m, (np.shape(self.exog)[0], 1)) 

963 return m