Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# -*- coding: utf-8 -*- 

2 

3""" 

4This models contains the Kernels for Kernel smoothing. 

5 

6Hopefully in the future they may be reused/extended for other kernel based 

7method 

8 

9References: 

10---------- 

11 

12Pointwise Kernel Confidence Bounds 

13(smoothconf) 

14http://fedc.wiwi.hu-berlin.de/xplore/ebooks/html/anr/anrhtmlframe62.html 

15""" 

16 

17# pylint: disable-msg=C0103 

18# pylint: disable-msg=W0142 

19# pylint: disable-msg=E1101 

20# pylint: disable-msg=E0611 

21from statsmodels.compat.python import lzip, lfilter 

22import numpy as np 

23import scipy.integrate 

24from scipy.special import factorial 

25from numpy import exp, multiply, square, divide, subtract, inf 

26 

27 

28class NdKernel(object): 

29 """Generic N-dimensial kernel 

30 

31 Parameters 

32 ---------- 

33 n : int 

34 The number of series for kernel estimates 

35 kernels : list 

36 kernels 

37 

38 Can be constructed from either 

39 a) a list of n kernels which will be treated as 

40 indepent marginals on a gaussian copula (specified by H) 

41 or b) a single univariate kernel which will be applied radially to the 

42 mahalanobis distance defined by H. 

43 

44 In the case of the Gaussian these are both equivalent, and the second constructiong 

45 is prefered. 

46 """ 

47 def __init__(self, n, kernels = None, H = None): 

48 if kernels is None: 

49 kernels = Gaussian() 

50 

51 self._kernels = kernels 

52 self.weights = None 

53 

54 if H is None: 

55 H = np.matrix( np.identity(n)) 

56 

57 self._H = H 

58 self._Hrootinv = np.linalg.cholesky( H.I ) 

59 

60 def getH(self): 

61 """Getter for kernel bandwidth, H""" 

62 return self._H 

63 

64 def setH(self, value): 

65 """Setter for kernel bandwidth, H""" 

66 self._H = value 

67 

68 H = property(getH, setH, doc="Kernel bandwidth matrix") 

69 

70 def density(self, xs, x): 

71 

72 n = len(xs) 

73 #xs = self.in_domain( xs, xs, x )[0] 

74 

75 if len(xs)>0: ## Need to do product of marginal distributions 

76 #w = np.sum([self(self._Hrootinv * (xx-x).T ) for xx in xs])/n 

77 #vectorized does not work: 

78 if self.weights is not None: 

79 w = np.mean(self((xs-x) * self._Hrootinv).T * self.weights)/sum(self.weights) 

80 else: 

81 w = np.mean(self((xs-x) * self._Hrootinv )) #transposed 

82 #w = np.mean([self(xd) for xd in ((xs-x) * self._Hrootinv)] ) #transposed 

83 return w 

84 else: 

85 return np.nan 

86 

87 def _kernweight(self, x ): 

88 """returns the kernel weight for the independent multivariate kernel""" 

89 if isinstance( self._kernels, CustomKernel ): 

90 ## Radial case 

91 #d = x.T * x 

92 #x is matrix, 2d, element wise sqrt looks wrong 

93 #d = np.sqrt( x.T * x ) 

94 x = np.asarray(x) 

95 #d = np.sqrt( (x * x).sum(-1) ) 

96 d = (x * x).sum(-1) 

97 return self._kernels( np.asarray(d) ) 

98 

99 def __call__(self, x): 

100 """ 

101 This simply returns the value of the kernel function at x 

102 

103 Does the same as weight if the function is normalised 

104 """ 

105 return self._kernweight(x) 

106 

107 

108class CustomKernel(object): 

109 """ 

110 Generic 1D Kernel object. 

111 Can be constructed by selecting a standard named Kernel, 

112 or providing a lambda expression and domain. 

113 The domain allows some algorithms to run faster for finite domain kernels. 

114 """ 

115 # MC: Not sure how this will look in the end - or even still exist. 

116 # Main purpose of this is to allow custom kernels and to allow speed up 

117 # from finite support. 

118 

119 def __init__(self, shape, h = 1.0, domain = None, norm = None): 

120 """ 

121 shape should be a function taking and returning numeric type. 

122 

123 For sanity it should always return positive or zero but this is not 

124 enforced in case you want to do weird things. Bear in mind that the 

125 statistical tests etc. may not be valid for non-positive kernels. 

126 

127 The bandwidth of the kernel is supplied as h. 

128 

129 You may specify a domain as a list of 2 values [min, max], in which case 

130 kernel will be treated as zero outside these values. This will speed up 

131 calculation. 

132 

133 You may also specify the normalisation constant for the supplied Kernel. 

134 If you do this number will be stored and used as the normalisation 

135 without calculation. It is recommended you do this if you know the 

136 constant, to speed up calculation. In particular if the shape function 

137 provided is already normalised you should provide norm = 1.0. 

138 

139 Warning: I think several calculations assume that the kernel is 

140 normalized. No tests for non-normalized kernel. 

141 """ 

142 self._normconst = norm # a value or None, if None, then calculate 

143 self.domain = domain 

144 self.weights = None 

145 if callable(shape): 

146 self._shape = shape 

147 else: 

148 raise TypeError("shape must be a callable object/function") 

149 self._h = h 

150 self._L2Norm = None 

151 self._kernel_var = None 

152 self._normal_reference_constant = None 

153 self._order = None 

154 

155 def geth(self): 

156 """Getter for kernel bandwidth, h""" 

157 return self._h 

158 def seth(self, value): 

159 """Setter for kernel bandwidth, h""" 

160 self._h = value 

161 h = property(geth, seth, doc="Kernel Bandwidth") 

162 

163 def in_domain(self, xs, ys, x): 

164 """ 

165 Returns the filtered (xs, ys) based on the Kernel domain centred on x 

166 """ 

167 # Disable black-list functions: filter used for speed instead of 

168 # list-comprehension 

169 # pylint: disable-msg=W0141 

170 def isInDomain(xy): 

171 """Used for filter to check if point is in the domain""" 

172 u = (xy[0]-x)/self.h 

173 return u >= self.domain[0] and u <= self.domain[1] 

174 

175 if self.domain is None: 

176 return (xs, ys) 

177 else: 

178 filtered = lfilter(isInDomain, lzip(xs, ys)) 

179 if len(filtered) > 0: 

180 xs, ys = lzip(*filtered) 

181 return (xs, ys) 

182 else: 

183 return ([], []) 

184 

185 def density(self, xs, x): 

186 """Returns the kernel density estimate for point x based on x-values 

187 xs 

188 """ 

189 xs = np.asarray(xs) 

190 n = len(xs) # before in_domain? 

191 if self.weights is not None: 

192 xs, weights = self.in_domain( xs, self.weights, x ) 

193 else: 

194 xs = self.in_domain( xs, xs, x )[0] 

195 xs = np.asarray(xs) 

196 #print 'len(xs)', len(xs), x 

197 if xs.ndim == 1: 

198 xs = xs[:,None] 

199 if len(xs)>0: 

200 h = self.h 

201 if self.weights is not None: 

202 w = 1 / h * np.sum(self((xs-x)/h).T * weights, axis=1) 

203 else: 

204 w = 1. / (h * n) * np.sum(self((xs-x)/h), axis=0) 

205 return w 

206 else: 

207 return np.nan 

208 

209 def density_var(self, density, nobs): 

210 """approximate pointwise variance for kernel density 

211 

212 not verified 

213 

214 Parameters 

215 ---------- 

216 density : array_lie 

217 pdf of the kernel density 

218 nobs : int 

219 number of observations used in the KDE estimation 

220 

221 Returns 

222 ------- 

223 kde_var : ndarray 

224 estimated variance of the density estimate 

225 

226 Notes 

227 ----- 

228 This uses the asymptotic normal approximation to the distribution of 

229 the density estimate. 

230 """ 

231 return np.asarray(density) * self.L2Norm / self.h / nobs 

232 

233 def density_confint(self, density, nobs, alpha=0.05): 

234 """approximate pointwise confidence interval for kernel density 

235 

236 The confidence interval is centered at the estimated density and 

237 ignores the bias of the density estimate. 

238 

239 not verified 

240 

241 Parameters 

242 ---------- 

243 density : array_lie 

244 pdf of the kernel density 

245 nobs : int 

246 number of observations used in the KDE estimation 

247 

248 Returns 

249 ------- 

250 conf_int : ndarray 

251 estimated confidence interval of the density estimate, lower bound 

252 in first column and upper bound in second column 

253 

254 Notes 

255 ----- 

256 This uses the asymptotic normal approximation to the distribution of 

257 the density estimate. The lower bound can be negative for density 

258 values close to zero. 

259 """ 

260 from scipy import stats 

261 crit = stats.norm.isf(alpha / 2.) 

262 density = np.asarray(density) 

263 half_width = crit * np.sqrt(self.density_var(density, nobs)) 

264 conf_int = np.column_stack((density - half_width, density + half_width)) 

265 return conf_int 

266 

267 def smooth(self, xs, ys, x): 

268 """Returns the kernel smoothing estimate for point x based on x-values 

269 xs and y-values ys. 

270 Not expected to be called by the user. 

271 """ 

272 xs, ys = self.in_domain(xs, ys, x) 

273 

274 if len(xs)>0: 

275 w = np.sum(self((xs-x)/self.h)) 

276 #TODO: change the below to broadcasting when shape is sorted 

277 v = np.sum([yy*self((xx-x)/self.h) for xx, yy in zip(xs, ys)]) 

278 return v / w 

279 else: 

280 return np.nan 

281 

282 def smoothvar(self, xs, ys, x): 

283 """Returns the kernel smoothing estimate of the variance at point x. 

284 """ 

285 xs, ys = self.in_domain(xs, ys, x) 

286 

287 if len(xs) > 0: 

288 fittedvals = np.array([self.smooth(xs, ys, xx) for xx in xs]) 

289 sqresid = square( subtract(ys, fittedvals) ) 

290 w = np.sum(self((xs-x)/self.h)) 

291 v = np.sum([rr*self((xx-x)/self.h) for xx, rr in zip(xs, sqresid)]) 

292 return v / w 

293 else: 

294 return np.nan 

295 

296 def smoothconf(self, xs, ys, x, alpha=0.05): 

297 """Returns the kernel smoothing estimate with confidence 1sigma bounds 

298 """ 

299 xs, ys = self.in_domain(xs, ys, x) 

300 

301 if len(xs) > 0: 

302 fittedvals = np.array([self.smooth(xs, ys, xx) for xx in xs]) 

303 #fittedvals = self.smooth(xs, ys, x) # x or xs in Haerdle 

304 sqresid = square( 

305 subtract(ys, fittedvals) 

306 ) 

307 w = np.sum(self((xs-x)/self.h)) 

308 #var = sqresid.sum() / (len(sqresid) - 0) # nonlocal var ? JP just trying 

309 v = np.sum([rr*self((xx-x)/self.h) for xx, rr in zip(xs, sqresid)]) 

310 var = v / w 

311 sd = np.sqrt(var) 

312 K = self.L2Norm 

313 yhat = self.smooth(xs, ys, x) 

314 from scipy import stats 

315 crit = stats.norm.isf(alpha / 2) 

316 err = crit * sd * np.sqrt(K) / np.sqrt(w * self.h * self.norm_const) 

317 return (yhat - err, yhat, yhat + err) 

318 else: 

319 return (np.nan, np.nan, np.nan) 

320 

321 @property 

322 def L2Norm(self): 

323 """Returns the integral of the square of the kernal from -inf to inf""" 

324 if self._L2Norm is None: 

325 L2Func = lambda x: (self.norm_const*self._shape(x))**2 

326 if self.domain is None: 

327 self._L2Norm = scipy.integrate.quad(L2Func, -inf, inf)[0] 

328 else: 

329 self._L2Norm = scipy.integrate.quad(L2Func, self.domain[0], 

330 self.domain[1])[0] 

331 return self._L2Norm 

332 

333 @property 

334 def norm_const(self): 

335 """ 

336 Normalising constant for kernel (integral from -inf to inf) 

337 """ 

338 if self._normconst is None: 

339 if self.domain is None: 

340 quadres = scipy.integrate.quad(self._shape, -inf, inf) 

341 else: 

342 quadres = scipy.integrate.quad(self._shape, self.domain[0], 

343 self.domain[1]) 

344 self._normconst = 1.0/(quadres[0]) 

345 return self._normconst 

346 

347 @property 

348 def kernel_var(self): 

349 """Returns the second moment of the kernel""" 

350 if self._kernel_var is None: 

351 func = lambda x: x**2 * self.norm_const * self._shape(x) 

352 if self.domain is None: 

353 self._kernel_var = scipy.integrate.quad(func, -inf, inf)[0] 

354 else: 

355 self._kernel_var = scipy.integrate.quad(func, self.domain[0], 

356 self.domain[1])[0] 

357 return self._kernel_var 

358 

359 def moments(self, n): 

360 

361 if n > 2: 

362 msg = "Only first and second moment currently implemented" 

363 raise NotImplementedError(msg) 

364 

365 if n == 1: 

366 return 0 

367 

368 if n == 2: 

369 return self.kernel_var 

370 

371 @property 

372 def normal_reference_constant(self): 

373 """ 

374 Constant used for silverman normal reference asymtotic bandwidth 

375 calculation. 

376 

377 C = 2((pi^(1/2)*(nu!)^3 R(k))/(2nu(2nu)!kap_nu(k)^2))^(1/(2nu+1)) 

378 nu = kernel order 

379 kap_nu = nu'th moment of kernel 

380 R = kernel roughness (square of L^2 norm) 

381 

382 Note: L2Norm property returns square of norm. 

383 """ 

384 nu = self._order 

385 

386 if not nu == 2: 

387 msg = "Only implemented for second order kernels" 

388 raise NotImplementedError(msg) 

389 

390 if self._normal_reference_constant is None: 

391 C = np.pi**(.5) * factorial(nu)**3 * self.L2Norm 

392 C /= (2 * nu * factorial(2 * nu) * self.moments(nu)**2) 

393 C = 2*C**(1.0/(2*nu+1)) 

394 self._normal_reference_constant = C 

395 

396 return self._normal_reference_constant 

397 

398 

399 def weight(self, x): 

400 """This returns the normalised weight at distance x""" 

401 return self.norm_const*self._shape(x) 

402 

403 def __call__(self, x): 

404 """ 

405 This simply returns the value of the kernel function at x 

406 

407 Does the same as weight if the function is normalised 

408 """ 

409 return self._shape(x) 

410 

411 

412class Uniform(CustomKernel): 

413 def __init__(self, h=1.0): 

414 CustomKernel.__init__(self, shape=lambda x: 0.5 * np.ones(x.shape), h=h, 

415 domain=[-1.0, 1.0], norm = 1.0) 

416 self._L2Norm = 0.5 

417 self._kernel_var = 1. / 3 

418 self._order = 2 

419 

420 

421class Triangular(CustomKernel): 

422 def __init__(self, h=1.0): 

423 CustomKernel.__init__(self, shape=lambda x: 1 - abs(x), h=h, 

424 domain=[-1.0, 1.0], norm = 1.0) 

425 self._L2Norm = 2.0/3.0 

426 self._kernel_var = 1. / 6 

427 self._order = 2 

428 

429 

430class Epanechnikov(CustomKernel): 

431 def __init__(self, h=1.0): 

432 CustomKernel.__init__(self, shape=lambda x: 0.75*(1 - x*x), h=h, 

433 domain=[-1.0, 1.0], norm = 1.0) 

434 self._L2Norm = 0.6 

435 self._kernel_var = 0.2 

436 self._order = 2 

437 

438 

439class Biweight(CustomKernel): 

440 def __init__(self, h=1.0): 

441 CustomKernel.__init__(self, shape=lambda x: 0.9375*(1 - x*x)**2, h=h, 

442 domain=[-1.0, 1.0], norm = 1.0) 

443 self._L2Norm = 5.0/7.0 

444 self._kernel_var = 1. / 7 

445 self._order = 2 

446 

447 def smooth(self, xs, ys, x): 

448 """Returns the kernel smoothing estimate for point x based on x-values 

449 xs and y-values ys. 

450 Not expected to be called by the user. 

451 

452 Special implementation optimised for Biweight. 

453 """ 

454 xs, ys = self.in_domain(xs, ys, x) 

455 

456 if len(xs) > 0: 

457 w = np.sum(square(subtract(1, square(divide(subtract(xs, x), 

458 self.h))))) 

459 v = np.sum(multiply(ys, square(subtract(1, square(divide( 

460 subtract(xs, x), self.h)))))) 

461 return v / w 

462 else: 

463 return np.nan 

464 

465 def smoothvar(self, xs, ys, x): 

466 """ 

467 Returns the kernel smoothing estimate of the variance at point x. 

468 """ 

469 xs, ys = self.in_domain(xs, ys, x) 

470 

471 if len(xs) > 0: 

472 fittedvals = np.array([self.smooth(xs, ys, xx) for xx in xs]) 

473 rs = square(subtract(ys, fittedvals)) 

474 w = np.sum(square(subtract(1.0, square(divide(subtract(xs, x), 

475 self.h))))) 

476 v = np.sum(multiply(rs, square(subtract(1, square(divide( 

477 subtract(xs, x), self.h)))))) 

478 return v / w 

479 else: 

480 return np.nan 

481 

482 def smoothconf_(self, xs, ys, x): 

483 """Returns the kernel smoothing estimate with confidence 1sigma bounds 

484 """ 

485 xs, ys = self.in_domain(xs, ys, x) 

486 

487 if len(xs) > 0: 

488 fittedvals = np.array([self.smooth(xs, ys, xx) for xx in xs]) 

489 rs = square(subtract(ys, fittedvals)) 

490 w = np.sum(square(subtract(1.0, square(divide(subtract(xs, x), 

491 self.h))))) 

492 v = np.sum(multiply(rs, square(subtract(1, square(divide( 

493 subtract(xs, x), self.h)))))) 

494 var = v / w 

495 sd = np.sqrt(var) 

496 K = self.L2Norm 

497 yhat = self.smooth(xs, ys, x) 

498 err = sd * K / np.sqrt(0.9375 * w * self.h) 

499 return (yhat - err, yhat, yhat + err) 

500 else: 

501 return (np.nan, np.nan, np.nan) 

502 

503class Triweight(CustomKernel): 

504 def __init__(self, h=1.0): 

505 CustomKernel.__init__(self, shape=lambda x: 1.09375*(1 - x*x)**3, h=h, 

506 domain=[-1.0, 1.0], norm = 1.0) 

507 self._L2Norm = 350.0/429.0 

508 self._kernel_var = 1. / 9 

509 self._order = 2 

510 

511 

512class Gaussian(CustomKernel): 

513 """ 

514 Gaussian (Normal) Kernel 

515 

516 K(u) = 1 / (sqrt(2*pi)) exp(-0.5 u**2) 

517 """ 

518 def __init__(self, h=1.0): 

519 CustomKernel.__init__(self, shape = lambda x: 0.3989422804014327 * 

520 np.exp(-x**2/2.0), h = h, domain = None, norm = 1.0) 

521 self._L2Norm = 1.0/(2.0*np.sqrt(np.pi)) 

522 self._kernel_var = 1.0 

523 self._order = 2 

524 

525 def smooth(self, xs, ys, x): 

526 """Returns the kernel smoothing estimate for point x based on x-values 

527 xs and y-values ys. 

528 Not expected to be called by the user. 

529 

530 Special implementation optimised for Gaussian. 

531 """ 

532 w = np.sum(exp(multiply(square(divide(subtract(xs, x), 

533 self.h)),-0.5))) 

534 v = np.sum(multiply(ys, exp(multiply(square(divide(subtract(xs, x), 

535 self.h)), -0.5)))) 

536 return v/w 

537 

538class Cosine(CustomKernel): 

539 """ 

540 Cosine Kernel 

541 

542 K(u) = pi/4 cos(0.5 * pi * u) between -1.0 and 1.0 

543 """ 

544 def __init__(self, h=1.0): 

545 CustomKernel.__init__(self, shape=lambda x: 0.78539816339744828 * 

546 np.cos(np.pi/2.0 * x), h=h, domain=[-1.0, 1.0], norm = 1.0) 

547 self._L2Norm = np.pi**2/16.0 

548 self._kernel_var = 0.1894305308612978 # = 1 - 8 / np.pi**2 

549 self._order = 2 

550 

551 

552class Cosine2(CustomKernel): 

553 """ 

554 Cosine2 Kernel 

555 

556 K(u) = 1 + cos(2 * pi * u) between -0.5 and 0.5 

557 

558 Note: this is the same Cosine kernel that Stata uses 

559 """ 

560 def __init__(self, h=1.0): 

561 CustomKernel.__init__(self, shape=lambda x: 1 + np.cos(2.0 * np.pi * x) 

562 , h=h, domain=[-0.5, 0.5], norm = 1.0) 

563 self._L2Norm = 1.5 

564 self._kernel_var = 0.03267274151216444 # = 1/12. - 0.5 / np.pi**2 

565 self._order = 2