# -*- coding: utf-8 -*-
"""
A collection of smooth penalty functions.

Penalties on vectors take a vector argument and return a scalar
penalty. The gradient of the penalty is a vector with the same shape
as the input value.

Penalties on covariance matrices take two arguments: the matrix and
its inverse, both in unpacked (square) form. The returned penalty is
a scalar, and the gradient is returned as a vector that contains the
gradient with respect to the free elements in the lower triangle of
the covariance matrix.

All penalties are subtracted from the log-likelihood, so greater
penalty values correspond to a greater degree of penalization.

The penalties should be smooth so that they can be subtracted from
log-likelihood functions and optimized using standard methods (i.e.
L1 penalties do not belong here).
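
Examples
--------
The general calling pattern, using the ridge penalty ``L2`` defined
below (a minimal sketch; the values in the comments follow from
``sum(weights * alpha * params**2)`` with the defaults ``weights = 1``
and ``alpha = 1``):

>>> import numpy as np
>>> pen = L2()
>>> value = pen.func(np.array([1., 2.]))  # scalar penalty, here 5.0
>>> grad = pen.deriv(np.array([1., 2.]))  # same shape as params, [2., 4.]

The scalar ``value`` is what gets subtracted from the log-likelihood.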

"""
import numpy as np


class Penalty(object):
    """
    A class for representing a scalar-valued penalty.

    Parameters
    ----------
    weights : array_like
        A vector of weights that determines the weight of the penalty
        for each parameter.

    Notes
    -----
    The class has a member called `alpha` that scales the weights.
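
    Examples
    --------
    ``alpha`` rescales the whole penalty. For instance, with the ``L2``
    subclass defined below, halving ``alpha`` halves the penalty value
    and its gradient (a small sketch; the value in the comment follows
    from ``sum(weights * alpha * params**2)``):

    >>> import numpy as np
    >>> pen = L2(weights=2.)
    >>> pen.alpha = 0.5
    >>> value = pen.func(np.array([3.]))  # 2 * 0.5 * 9 = 9.0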

    """

    def __init__(self, weights=1.):
        self.weights = weights
        self.alpha = 1.

    def func(self, params):
        """
        A penalty function on a vector of parameters.

        Parameters
        ----------
        params : array_like
            A vector of parameters.

        Returns
        -------
        A scalar penalty value; greater values imply greater
        penalization.
        """
        raise NotImplementedError

    def deriv(self, params):
        """
        The gradient of a penalty function.

        Parameters
        ----------
        params : array_like
            A vector of parameters.

        Returns
        -------
        The gradient of the penalty with respect to each element in
        `params`.
        """
        raise NotImplementedError

    def grad(self, params):
        import warnings
        warnings.warn('grad method is deprecated. Use `deriv` instead',
                      DeprecationWarning)
        return self.deriv(params)

    def _null_weights(self, params):
        """workaround for the Null model

        This will not be needed anymore when we can use `self._null_drop_keys`
        as in DiscreteModels.
        TODO: check other models
        """
        if np.size(self.weights) > 1:
            if len(params) == 1:
                raise  # raise to identify models where this would be needed
                return 0.

        return self.weights


class NonePenalty(Penalty):
    """
    A penalty that does not penalize.
    """

    def __init__(self, **kwds):
        super().__init__()
        if kwds:
            import warnings
            warnings.warn('keyword arguments are ignored')

    def func(self, params):
        if params.ndim == 2:
            return np.zeros(params.shape[1:])
        else:
            return 0

    def deriv(self, params):
        return np.zeros(params.shape)

    def deriv2(self, params):
        # returns the diagonal of the hessian
        return np.zeros(params.shape[0])


class L2(Penalty):
    """
    The L2 (ridge) penalty.
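
    Examples
    --------
    A minimal usage sketch; the values in the comments follow from
    ``sum(weights * alpha * params**2)`` with the default ``alpha = 1``:

    >>> import numpy as np
    >>> pen = L2(weights=np.array([1., 2.]))
    >>> value = pen.func(np.array([0.5, -1.0]))  # 1 * 0.25 + 2 * 1.0 = 2.25
    >>> grad = pen.deriv(np.array([0.5, -1.0]))  # [1., -4.]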

    """

    def __init__(self, weights=1.):
        super().__init__(weights)

    def func(self, params):
        return np.sum(self.weights * self.alpha * params**2)

    def deriv(self, params):
        return 2 * self.weights * self.alpha * params

    def deriv2(self, params):
        return 2 * self.weights * self.alpha * np.ones(len(params))


class L2Univariate(Penalty):
    """
    The L2 (ridge) penalty applied to each parameter.
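
    Examples
    --------
    Unlike `L2`, `func` returns one penalty value per parameter rather
    than a sum (a small sketch; the values in the comments follow from
    ``weights * params**2`` applied elementwise):

    >>> import numpy as np
    >>> pen = L2Univariate(weights=np.array([1., 2.]))
    >>> values = pen.func(np.array([0.5, -1.0]))  # [0.25, 2.0]
    >>> grad = pen.deriv(np.array([0.5, -1.0]))   # [1., -4.]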

    """

    def __init__(self, weights=None):
        if weights is None:
            self.weights = 1.
        else:
            self.weights = weights

    def func(self, params):
        return self.weights * params**2

    def deriv(self, params):
        return 2 * self.weights * params

    def deriv2(self, params):
        return 2 * self.weights * np.ones(len(params))


class PseudoHuber(Penalty):
    """
    The pseudo-Huber penalty.
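
    Examples
    --------
    From the expression used in `func` below,
    ``dlt**2 * (sqrt(1 + (params / dlt)**2) - 1)``, the penalty is
    approximately quadratic (``params**2 / 2``) for ``|params| << dlt``
    and approximately linear with slope ``dlt`` for ``|params| >> dlt``.
    A small sketch (approximate values in the comments):

    >>> import numpy as np
    >>> pen = PseudoHuber(dlt=1.)
    >>> small = pen.func(np.array([0.1]))   # ~ 0.1**2 / 2 = 0.005
    >>> large = pen.func(np.array([100.]))  # ~ dlt * 100 = 100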

    """

    def __init__(self, dlt, weights=1.):
        super().__init__(weights)
        self.dlt = dlt

    def func(self, params):
        v = np.sqrt(1 + (params / self.dlt)**2)
        v -= 1
        v *= self.dlt**2
        return np.sum(self.weights * self.alpha * v, 0)

    def deriv(self, params):
        v = np.sqrt(1 + (params / self.dlt)**2)
        return params * self.weights * self.alpha / v

    def deriv2(self, params):
        v = np.power(1 + (params / self.dlt)**2, -3/2)
        return self.weights * self.alpha * v


class SCAD(Penalty):
    """
    The SCAD penalty of Fan and Li.

    The SCAD penalty is linear around zero, like an L1 penalty, up to the
    threshold tau, and constant for values larger than c*tau. The middle
    segment is quadratic and connects the two outer segments with a
    continuous derivative. The penalty is symmetric around zero.

    Parameterization follows Buu, Johnson, Li and Tan (2011). Fan and Li
    use lambda instead of tau, and a instead of c. Fan and Li recommend
    setting c=3.7.

    f(x) = { tau |x|                                      if 0 <= |x| < tau
           { -(|x|^2 - 2 c tau |x| + tau^2) / (2 (c - 1)) if tau <= |x| < c tau
           { (c + 1) tau^2 / 2                            if c tau <= |x|

    Parameters
    ----------
    tau : float
        Slope and threshold for the linear segment.
    c : float
        Factor for the second threshold, which is c * tau.
    weights : None or array
        Weights for the penalty of each parameter. If an entry is zero,
        then the corresponding parameter will not be penalized.

    References
    ----------
    Buu, Anne, Norman J. Johnson, Runze Li, and Xianming Tan. "New variable
    selection methods for zero-inflated count data with applications to the
    substance abuse field."
    Statistics in Medicine 30, no. 18 (2011): 2326-2340.

    Fan, Jianqing, and Runze Li. "Variable selection via nonconcave penalized
    likelihood and its oracle properties."
    Journal of the American Statistical Association 96, no. 456 (2001):
    1348-1360.
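
    Examples
    --------
    A small sketch evaluating one point in each of the three segments,
    with tau=1 and the recommended c=3.7 (the values in the comments
    follow from the piecewise formula above):

    >>> import numpy as np
    >>> pen = SCAD(tau=1., c=3.7)
    >>> v1 = pen.func(np.array([0.5]))  # linear segment: 1 * 0.5 = 0.5
    >>> v2 = pen.func(np.array([2.0]))  # quadratic segment: ~ 1.815
    >>> v3 = pen.func(np.array([5.0]))  # constant segment: 4.7 / 2 = 2.35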

    """

    def __init__(self, tau, c=3.7, weights=1.):
        super().__init__(weights)
        self.tau = tau
        self.c = c

    def func(self, params):

        # 3 segments in absolute value
        tau = self.tau
        p_abs = np.atleast_1d(np.abs(params))
        res = np.empty(p_abs.shape, p_abs.dtype)
        res.fill(np.nan)
        mask1 = p_abs < tau
        mask3 = p_abs >= self.c * tau
        res[mask1] = tau * p_abs[mask1]
        mask2 = ~mask1 & ~mask3
        p_abs2 = p_abs[mask2]
        tmp = (p_abs2**2 - 2 * self.c * tau * p_abs2 + tau**2)
        res[mask2] = -tmp / (2 * (self.c - 1))
        res[mask3] = (self.c + 1) * tau**2 / 2.

        return (self.weights * res).sum(0)

    def deriv(self, params):

        # 3 segments in absolute value
        tau = self.tau
        p = np.atleast_1d(params)
        p_abs = np.abs(p)
        p_sign = np.sign(p)
        res = np.empty(p_abs.shape)
        res.fill(np.nan)

        mask1 = p_abs < tau
        mask3 = p_abs >= self.c * tau
        mask2 = ~mask1 & ~mask3
        res[mask1] = p_sign[mask1] * tau
        tmp = p_sign[mask2] * (p_abs[mask2] - self.c * tau)
        res[mask2] = -tmp / (self.c - 1)
        res[mask3] = 0

        return self.weights * res

    def deriv2(self, params):
        """Second derivative of the penalty function

        This returns a scalar or a vector in the same shape as params, not
        a square Hessian. If the return is 1-dimensional, then it is the
        diagonal of the Hessian.
        """

        # 3 segments in absolute value
        tau = self.tau
        p = np.atleast_1d(params)
        p_abs = np.abs(p)
        res = np.zeros(p_abs.shape)

        mask1 = p_abs < tau
        mask3 = p_abs >= self.c * tau
        mask2 = ~mask1 & ~mask3
        res[mask2] = -1 / (self.c - 1)

        return self.weights * res


class SCADSmoothed(SCAD):
    """
    The SCAD penalty of Fan and Li, quadratically smoothed around zero.

    This follows Fan and Li 2001, equation (3.7).

    Parameterization follows Buu, Johnson, Li and Tan (2011); see the
    docstring of SCAD.

    Parameters
    ----------
    tau : float
        Slope and threshold for the linear segment.
    c : float
        Factor for the second threshold.
    c0 : float
        Threshold for the quadratically smoothed segment.
    restriction : None or array
        If not None, a matrix defining linear constraints (a linear
        transformation) that is applied to the parameters before the
        penalty is evaluated.

    Notes
    -----
    TODO: Use delegation instead of subclassing, so smoothing can be added
    to all penalty classes.
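
    Examples
    --------
    A small sketch with tau=1, c=3.7 and the default c0 = 0.1 * tau; with
    these values the smoothed penalty is ``5 * params**2`` for
    ``|params| < 0.1`` and the plain SCAD value shifted down by 0.05
    otherwise (values in the comments):

    >>> import numpy as np
    >>> pen = SCADSmoothed(tau=1., c=3.7)
    >>> v0 = pen.func(np.array([0.]))    # 0.0, zero penalty at zero
    >>> v1 = pen.func(np.array([0.05]))  # 5 * 0.05**2 = 0.0125
    >>> v2 = pen.func(np.array([0.5]))   # 0.5 - 0.05 = 0.45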

    """

    def __init__(self, tau, c=3.7, c0=None, weights=1., restriction=None):
        super().__init__(tau, c=c, weights=weights)
        self.tau = tau
        self.c = c
        self.c0 = c0 if c0 is not None else tau * 0.1
        if self.c0 > tau:
            raise ValueError('c0 cannot be larger than tau')

        # get coefficients for quadratic approximation
        c0 = self.c0
        # need to temporarily override weights for call to super
        weights = self.weights
        self.weights = 1.
        deriv_c0 = super().deriv(c0)
        value_c0 = super().func(c0)
        self.weights = weights

        self.aq1 = value_c0 - 0.5 * deriv_c0 * c0
        self.aq2 = 0.5 * deriv_c0 / c0
        self.restriction = restriction

    def func(self, params):
        # workaround for Null model
        weights = self._null_weights(params)
        # TODO: `and np.size(params) > 1` is a hack for llnull, need better
        # solution
        if self.restriction is not None and np.size(params) > 1:
            params = self.restriction.dot(params)
        # need to temporarily override weights for call to super
        # Note: we have the same problem with `restriction`
        self_weights = self.weights
        self.weights = 1.
        value = super().func(params[None, ...])
        self.weights = self_weights

        # shift down so func(0) == 0
        value -= self.aq1
        # change the segment corresponding to the quadratic approximation
        p_abs = np.atleast_1d(np.abs(params))
        mask = p_abs < self.c0
        p_abs_masked = p_abs[mask]
        value[mask] = self.aq2 * p_abs_masked**2

        return (weights * value).sum(0)

    def deriv(self, params):
        # workaround for Null model
        weights = self._null_weights(params)
        if self.restriction is not None and np.size(params) > 1:
            params = self.restriction.dot(params)
        # need to temporarily override weights for call to super
        self_weights = self.weights
        self.weights = 1.
        value = super().deriv(params)
        self.weights = self_weights

        # change the segment corresponding to the quadratic approximation
        p = np.atleast_1d(params)
        mask = np.abs(p) < self.c0
        value[mask] = 2 * self.aq2 * p[mask]

        if self.restriction is not None and np.size(params) > 1:
            return weights * value.dot(self.restriction)
        else:
            return weights * value

    def deriv2(self, params):
        # workaround for Null model
        weights = self._null_weights(params)
        if self.restriction is not None and np.size(params) > 1:
            params = self.restriction.dot(params)
        # need to temporarily override weights for call to super
        self_weights = self.weights
        self.weights = 1.
        value = super().deriv2(params)
        self.weights = self_weights

        # change the segment corresponding to the quadratic approximation
        p = np.atleast_1d(params)
        mask = np.abs(p) < self.c0
        value[mask] = 2 * self.aq2

        if self.restriction is not None and np.size(params) > 1:
            # note: super returns a 1d array for the diagonal, i.e.
            # hessian_diag
            # TODO: weights are missing
            return (self.restriction.T * (weights * value)
                    ).dot(self.restriction)
        else:
            return weights * value


class ConstraintsPenalty(object):
    """
    Penalty applied to a linear transformation of the parameters.

    Parameters
    ----------
    penalty : instance of penalty function
        Currently this requires an instance of a univariate, vectorized
        penalty class.
    weights : None or ndarray
        Weights for adding the penalties of the transformed parameters.
    restriction : None or ndarray
        If it is not None, then restriction defines a linear transformation
        of the parameters. The penalty function is applied to each
        transformed parameter independently.

    Notes
    -----
    `restriction` allows us to impose penalization on contrasts or
    stochastic constraints of the original parameters. Examples of such
    contrasts are difference penalties or all-pairs penalties.
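
    Examples
    --------
    A small sketch of a first-difference penalty on three parameters,
    combining `L2Univariate` with a difference matrix as `restriction`
    (the values in the comments follow from penalizing the differences
    2 - 1 = 1 and 4 - 2 = 2):

    >>> import numpy as np
    >>> diff = np.array([[-1., 1., 0.], [0., -1., 1.]])
    >>> pen = ConstraintsPenalty(L2Univariate(), restriction=diff)
    >>> value = pen.func(np.array([1., 2., 4.]))  # 1**2 + 2**2 = 5.0
    >>> grad = pen.deriv(np.array([1., 2., 4.]))  # [-2., -2., 4.]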

    """

    def __init__(self, penalty, weights=None, restriction=None):

        self.penalty = penalty
        if weights is None:
            self.weights = 1.
        else:
            self.weights = weights

        if restriction is not None:
            restriction = np.asarray(restriction)

        self.restriction = restriction

    def func(self, params):
        """evaluate penalty function at params

        Parameters
        ----------
        params : ndarray
            array of parameters at which the penalty is evaluated

        Returns
        -------
        func : ndarray
            value(s) of the penalty function
        """
        # TODO: `and np.size(params) > 1` is a hack for llnull, need better
        # solution. Is this still needed? It seems to work without it.
        if self.restriction is not None:
            params = self.restriction.dot(params)

        value = self.penalty.func(params)

        return (self.weights * value.T).T.sum(0)

    def deriv(self, params):
        """first derivative of penalty function w.r.t. params

        Parameters
        ----------
        params : ndarray
            array of parameters at which the derivative is evaluated

        Returns
        -------
        deriv : ndarray
            array of first partial derivatives
        """
        if self.restriction is not None:
            params = self.restriction.dot(params)

        value = self.penalty.deriv(params)

        if self.restriction is not None:
            return self.weights * value.T.dot(self.restriction)
        else:
            return (self.weights * value.T)

    grad = deriv

    def deriv2(self, params):
        """second derivative of penalty function w.r.t. params

        Parameters
        ----------
        params : ndarray
            array of parameters at which the second derivative is evaluated

        Returns
        -------
        deriv2 : ndarray, 2-D
            second derivative matrix
        """

        if self.restriction is not None:
            params = self.restriction.dot(params)

        value = self.penalty.deriv2(params)

        if self.restriction is not None:
            # note: univariate penalty returns a 1d array for the diagonal,
            # i.e. hessian_diag
            v = (self.restriction.T * value * self.weights)
            value = v.dot(self.restriction)
        else:
            value = np.diag(self.weights * value)

        return value


class L2ContraintsPenalty(ConstraintsPenalty):
    """convenience class of ConstraintsPenalty with L2 penalization
    """

    def __init__(self, weights=None, restriction=None, sigma_prior=None):

        if sigma_prior is not None:
            raise NotImplementedError('sigma_prior is not implemented yet')

        penalty = L2Univariate()

        super().__init__(penalty, weights=weights, restriction=restriction)


class CovariancePenalty(object):

    def __init__(self, weight):
        # weight should be scalar
        self.weight = weight

    def func(self, mat, mat_inv):
        """
        Parameters
        ----------
        mat : square matrix
            The matrix to be penalized.
        mat_inv : square matrix
            The inverse of `mat`.

        Returns
        -------
        A scalar penalty value
        """
        raise NotImplementedError

    def deriv(self, mat, mat_inv):
        """
        Parameters
        ----------
        mat : square matrix
            The matrix to be penalized.
        mat_inv : square matrix
            The inverse of `mat`.

        Returns
        -------
        A vector containing the gradient of the penalty
        with respect to each element in the lower triangle
        of `mat`.
        """
        raise NotImplementedError

    def grad(self, mat, mat_inv):
        import warnings
        warnings.warn('grad method is deprecated. Use `deriv` instead',
                      DeprecationWarning)
        return self.deriv(mat, mat_inv)


class PSD(CovariancePenalty):
    """
    A penalty that converges to +infinity as the argument matrix
    approaches the boundary of the domain of symmetric, positive
    definite matrices.
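
    Examples
    --------
    A small sketch; from `func` below the penalty equals
    ``-2 * weight * sum(log(diag(cholesky(mat))))``, i.e. minus the
    weighted log-determinant, and it is infinite when `mat` is not
    positive definite (values in the comments):

    >>> import numpy as np
    >>> pen = PSD(1.)
    >>> mat = 4 * np.eye(2)
    >>> v = pen.func(mat, np.linalg.inv(mat))   # -log(det(mat)) = -log(16)
    >>> w = pen.func(np.diag([1., 0.]), None)   # inf, not positive definite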

    """

    def func(self, mat, mat_inv):
        try:
            cy = np.linalg.cholesky(mat)
        except np.linalg.LinAlgError:
            return np.inf
        return -2 * self.weight * np.sum(np.log(np.diag(cy)))

    def deriv(self, mat, mat_inv):
        cy = mat_inv.copy()
        cy = 2 * cy - np.diag(np.diag(cy))
        i, j = np.tril_indices(mat.shape[0])
        return -self.weight * cy[i, j]