1"""numerical differentiation function, gradient, Jacobian, and Hessian 

2 

3Author : josef-pkt 

4License : BSD 

5 

6Notes 

7----- 

8These are simple forward differentiation, so that we have them available 

9without dependencies. 

10 

11* Jacobian should be faster than numdifftools because it does not use loop over 

12 observations. 

13* numerical precision will vary and depend on the choice of stepsizes 

14""" 

# TODO:
# * some cleanup
# * check numerical accuracy (and bugs) with numdifftools and analytical
#   derivatives
#   - linear least squares case: (hess - 2*X'X) is 1e-8 or so
#   - gradient and Hessian agree with numdifftools when evaluated away from
#     the minimum
#   - forward gradient, Jacobian evaluated at the minimum is inaccurate,
#     centered (+/- epsilon) is ok
# * dot product of Jacobian is different from Hessian, either wrong example or
#   a bug (unlikely), or a real difference
#
#
# What are the conditions under which the Jacobian dot product and the
# Hessian are the same?
#
# See also:
#
# BHHH: Greene p481 17.4.6, MLE Jacobian = d loglike / d beta, where loglike
# is a vector with one element per observation
# see also example 17.4 where J'J is very different from the Hessian
# also: does it hold only at the minimum, and what is the relationship to the
# covariance of the Jacobian matrix?
# http://projects.scipy.org/scipy/ticket/1157
# https://en.wikipedia.org/wiki/Levenberg%E2%80%93Marquardt_algorithm
#   objective: sum((y - f(beta, x))**2), Jacobian = d f/d beta
#   and not d objective/d beta as in MLE Greene
#   similar: http://crsouza.blogspot.com/2009/11/neural-network-learning-by-levenberg_18.html#hessian
#
# in example: if J = d x*beta / d beta then J'J == X'X
#   similar to https://en.wikipedia.org/wiki/Levenberg%E2%80%93Marquardt_algorithm
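#
# Added clarifying note (linear least squares case): if the Jacobian is taken
# as J = d f(beta, x) / d beta of the fitted values, then for a linear mean
# function f = X*beta we have J = X and J'J = X'X, while the exact Hessian of
# sum((y - X*beta)**2) is 2*X'X (the factor 2 comes from squaring without a
# 1/2).  For nonlinear f, J'J is only the Gauss-Newton approximation to the
# Hessian; the exact Hessian has an extra term weighted by the residuals, so
# the two agree only near a minimum with small residuals (or for linear f).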

import numpy as np

from statsmodels.compat.pandas import Appender, Substitution

# NOTE: we only do double precision internally so far
EPS = np.finfo(float).eps

_hessian_docs = """
    Calculate Hessian with finite difference derivative approximation

    Parameters
    ----------
    x : array_like
        value at which function derivative is evaluated
    f : function
        function of one array f(x, `*args`, `**kwargs`)
    epsilon : float or array_like, optional
        Stepsize used, if None, then stepsize is automatically chosen
        according to EPS**(1/%(scale)s)*x.
    args : tuple
        Arguments for function `f`.
    kwargs : dict
        Keyword arguments for function `f`.
    %(extra_params)s

    Returns
    -------
    hess : ndarray
        array of partial second derivatives, Hessian
    %(extra_returns)s

    Notes
    -----
    Equation (%(equation_number)s) in Ridout. Computes the Hessian as::

      %(equation)s

    where e[j] is a vector with element j == 1 and the rest are zero and
    d[i] is epsilon[i].

    References
    ----------
    Ridout, M.S. (2009) Statistical applications of the complex-step method
    of numerical differentiation. The American Statistician, 63, 66-74
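
    Examples
    --------
    A minimal sketch with a simple quadratic test function (the function and
    values below are illustrative assumptions, shown here with
    ``approx_hess3``; they are not part of the library):

    >>> import numpy as np
    >>> def fun(p):
    ...     return p[0]**2 + 3.0 * p[0] * p[1] + 2.0 * p[1]**2
    >>> x = np.array([1.0, 2.0])
    >>> hess = approx_hess3(x, fun)
    >>> np.allclose(hess, [[2.0, 3.0], [3.0, 4.0]], atol=1e-4)
    True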

91""" 

92 

93 

94def _get_epsilon(x, s, epsilon, n): 

95 if epsilon is None: 

96 h = EPS**(1. / s) * np.maximum(np.abs(x), 0.1) 

97 else: 

98 if np.isscalar(epsilon): 

99 h = np.empty(n) 

100 h.fill(epsilon) 

101 else: # pragma : no cover 

102 h = np.asarray(epsilon) 

103 if h.shape != x.shape: 

104 raise ValueError("If h is not a scalar it must have the same" 

105 " shape as x.") 

106 return h 


def approx_fprime(x, f, epsilon=None, args=(), kwargs={}, centered=False):
    '''
    Gradient of function, or Jacobian if function f returns 1d array

    Parameters
    ----------
    x : ndarray
        parameters at which the derivative is evaluated
    f : function
        `f(*((x,)+args), **kwargs)` returning either one value or 1d array
    epsilon : float, optional
        Stepsize, if None, optimal stepsize is used. This is EPS**(1/2)*x for
        `centered` == False and EPS**(1/3)*x for `centered` == True.
    args : tuple
        Tuple of additional arguments for function `f`.
    kwargs : dict
        Dictionary of additional keyword arguments for function `f`.
    centered : bool
        Whether central difference should be returned. If not, does forward
        differencing.

    Returns
    -------
    grad : ndarray
        gradient or Jacobian

    Notes
    -----
    If f returns a 1d array, it returns a Jacobian. If a 2d array is returned
    by f (e.g., with a value for each observation), it returns a 3d array
    with the Jacobian of each observation with shape xk x nobs x xk. I.e.,
    the Jacobian of the first observation would be [:, 0, :]
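
    Examples
    --------
    A minimal sketch with an assumed quadratic test function (illustrative
    values only):

    >>> import numpy as np
    >>> x = np.array([1.0, 2.0, 3.0])
    >>> grad = approx_fprime(x, lambda p: np.sum(p**2), centered=True)
    >>> np.allclose(grad, 2 * x, atol=1e-6)
    True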

    '''
    n = len(x)
    # TODO: add scaled stepsize
    f0 = f(*((x,)+args), **kwargs)
    dim = np.atleast_1d(f0).shape  # it could be a scalar
    grad = np.zeros((n,) + dim, np.promote_types(float, x.dtype))
    ei = np.zeros((n,), float)
    if not centered:
        epsilon = _get_epsilon(x, 2, epsilon, n)
        for k in range(n):
            ei[k] = epsilon[k]
            grad[k, :] = (f(*((x+ei,) + args), **kwargs) - f0)/epsilon[k]
            ei[k] = 0.0
    else:
        epsilon = _get_epsilon(x, 3, epsilon, n) / 2.
        for k in range(n):
            ei[k] = epsilon[k]
            grad[k, :] = (f(*((x+ei,)+args), **kwargs) -
                          f(*((x-ei,)+args), **kwargs))/(2 * epsilon[k])
            ei[k] = 0.0
    return grad.squeeze().T


def approx_fprime_cs(x, f, epsilon=None, args=(), kwargs={}):
    '''
    Calculate gradient or Jacobian with complex-step derivative approximation

    Parameters
    ----------
    x : ndarray
        parameters at which the derivative is evaluated
    f : function
        `f(*((x,)+args), **kwargs)` returning either one value or 1d array
    epsilon : float, optional
        Stepsize, if None, optimal stepsize is used. Optimal step-size is
        EPS*x. See note.
    args : tuple
        Tuple of additional arguments for function `f`.
    kwargs : dict
        Dictionary of additional keyword arguments for function `f`.

    Returns
    -------
    partials : ndarray
        array of partial derivatives, Gradient or Jacobian

    Notes
    -----
    The complex-step derivative has truncation error O(epsilon**2), so
    truncation error can be eliminated by choosing epsilon to be very small.
    The complex-step derivative avoids the problem of round-off error with
    small epsilon because there is no subtraction.
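
    Examples
    --------
    A minimal sketch with an assumed quadratic test function; note that `f`
    must accept complex-valued input for the complex step to work:

    >>> import numpy as np
    >>> x = np.array([1.0, 2.0, 3.0])
    >>> grad = approx_fprime_cs(x, lambda p: np.sum(p**2))
    >>> np.allclose(grad, 2 * x)
    True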

    '''
    # From Guilherme P. de Freitas, numpy mailing list
    # May 04 2010 thread "Improvement of performance"
    # http://mail.scipy.org/pipermail/numpy-discussion/2010-May/050250.html
    n = len(x)
    epsilon = _get_epsilon(x, 1, epsilon, n)
    increments = np.identity(n) * 1j * epsilon
    # TODO: see if this can be vectorized, but usually dim is small
    partials = [f(x+ih, *args, **kwargs).imag / epsilon[i]
                for i, ih in enumerate(increments)]
    return np.array(partials).T


def approx_hess_cs(x, f, epsilon=None, args=(), kwargs={}):
    '''Calculate Hessian with complex-step derivative approximation

    Parameters
    ----------
    x : array_like
        value at which function derivative is evaluated
    f : function
        function of one array f(x)
    epsilon : float
        stepsize, if None, then stepsize is automatically chosen

    Returns
    -------
    hess : ndarray
        array of partial second derivatives, Hessian

    Notes
    -----
    based on equation 10 in
    M. S. Ridout: Statistical Applications of the Complex-step Method
    of Numerical Differentiation, University of Kent, Canterbury, Kent, U.K.

    The stepsize is the same for the complex and the finite difference part.
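
    Examples
    --------
    A minimal sketch with an assumed quadratic test function; `f` must accept
    complex-valued input:

    >>> import numpy as np
    >>> def fun(p):
    ...     return p[0]**2 + 3.0 * p[0] * p[1] + 2.0 * p[1]**2
    >>> hess = approx_hess_cs(np.array([1.0, 2.0]), fun)
    >>> np.allclose(hess, [[2.0, 3.0], [3.0, 4.0]], atol=1e-4)
    True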

    '''
    # TODO: might want to consider lowering the step for pure derivatives
    n = len(x)
    h = _get_epsilon(x, 3, epsilon, n)
    ee = np.diag(h)
    hess = np.outer(h, h)

    for i in range(n):
        for j in range(i, n):
            hess[i, j] = (f(*((x + 1j*ee[i, :] + ee[j, :],) + args), **kwargs)
                          - f(*((x + 1j*ee[i, :] - ee[j, :],) + args),
                              **kwargs)).imag/2./hess[i, j]
            hess[j, i] = hess[i, j]

    return hess


@Substitution(
    scale="3",
    extra_params="""return_grad : bool
        Whether or not to also return the gradient
""",
    extra_returns="""grad : ndarray
        Gradient if return_grad == True
""",
    equation_number="7",
    equation="""1/(d_j*d_k) * ((f(x + d[j]*e[j] + d[k]*e[k]) - f(x + d[j]*e[j])) -
                 (f(x + d[k]*e[k]) - f(x)))
"""
)
@Appender(_hessian_docs)
def approx_hess1(x, f, epsilon=None, args=(), kwargs={}, return_grad=False):
    n = len(x)
    h = _get_epsilon(x, 3, epsilon, n)
    ee = np.diag(h)

    f0 = f(*((x,)+args), **kwargs)
    # Compute forward step
    g = np.zeros(n)
    for i in range(n):
        g[i] = f(*((x+ee[i, :],)+args), **kwargs)

    hess = np.outer(h, h)  # this is now epsilon**2
    # Compute "double" forward step
    for i in range(n):
        for j in range(i, n):
            hess[i, j] = (f(*((x + ee[i, :] + ee[j, :],) + args), **kwargs) -
                          g[i] - g[j] + f0)/hess[i, j]
            hess[j, i] = hess[i, j]
    if return_grad:
        grad = (g - f0)/h
        return hess, grad
    else:
        return hess


@Substitution(
    scale="3",
    extra_params="""return_grad : bool
        Whether or not to also return the gradient
""",
    extra_returns="""grad : ndarray
        Gradient if return_grad == True
""",
    equation_number="8",
    equation="""1/(2*d_j*d_k) * ((f(x + d[j]*e[j] + d[k]*e[k]) - f(x + d[j]*e[j])) -
                 (f(x + d[k]*e[k]) - f(x)) +
                 (f(x - d[j]*e[j] - d[k]*e[k]) - f(x - d[j]*e[j])) -
                 (f(x - d[k]*e[k]) - f(x)))
"""
)
@Appender(_hessian_docs)
def approx_hess2(x, f, epsilon=None, args=(), kwargs={}, return_grad=False):
    n = len(x)
    # NOTE: Ridout suggests using eps**(1/4)*theta
    h = _get_epsilon(x, 3, epsilon, n)
    ee = np.diag(h)
    f0 = f(*((x,)+args), **kwargs)
    # Compute forward and backward steps
    g = np.zeros(n)
    gg = np.zeros(n)
    for i in range(n):
        g[i] = f(*((x+ee[i, :],)+args), **kwargs)
        gg[i] = f(*((x-ee[i, :],)+args), **kwargs)

    hess = np.outer(h, h)  # this is now epsilon**2
    # Compute "double" forward and backward steps
    for i in range(n):
        for j in range(i, n):
            hess[i, j] = (f(*((x + ee[i, :] + ee[j, :],) + args), **kwargs) -
                          g[i] - g[j] + f0 +
                          f(*((x - ee[i, :] - ee[j, :],) + args), **kwargs) -
                          gg[i] - gg[j] + f0)/(2 * hess[i, j])
            hess[j, i] = hess[i, j]
    if return_grad:
        grad = (g - f0)/h
        return hess, grad
    else:
        return hess


@Substitution(
    scale="4",
    extra_params="",
    extra_returns="",
    equation_number="9",
    equation="""1/(4*d_j*d_k) * ((f(x + d[j]*e[j] + d[k]*e[k]) -
                  f(x + d[j]*e[j] - d[k]*e[k])) -
                 (f(x - d[j]*e[j] + d[k]*e[k]) -
                  f(x - d[j]*e[j] - d[k]*e[k])))"""
)
@Appender(_hessian_docs)
def approx_hess3(x, f, epsilon=None, args=(), kwargs={}):
    n = len(x)
    h = _get_epsilon(x, 4, epsilon, n)
    ee = np.diag(h)
    hess = np.outer(h, h)

    for i in range(n):
        for j in range(i, n):
            hess[i, j] = (f(*((x + ee[i, :] + ee[j, :],) + args), **kwargs)
                          - f(*((x + ee[i, :] - ee[j, :],) + args), **kwargs)
                          - (f(*((x - ee[i, :] + ee[j, :],) + args), **kwargs)
                             - f(*((x - ee[i, :] - ee[j, :],) + args),
                                 **kwargs))
                          )/(4.*hess[i, j])
            hess[j, i] = hess[i, j]
    return hess


approx_hess = approx_hess3
approx_hess.__doc__ += "\n    This is an alias for approx_hess3"
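

if __name__ == "__main__":
    # Small self-check sketch (added for illustration; not part of the
    # library API).  It compares forward, centered, and complex-step
    # gradients, and the finite-difference and complex-step Hessians of an
    # assumed quadratic test function against the known analytic answer.
    def _demo_fun(p):
        return p[0]**2 + 3.0 * p[0] * p[1] + 2.0 * p[1]**2

    x0 = np.array([1.0, 2.0])
    analytic_hess = np.array([[2.0, 3.0], [3.0, 4.0]])

    print("forward gradient :", approx_fprime(x0, _demo_fun))
    print("centered gradient:", approx_fprime(x0, _demo_fun, centered=True))
    print("complex-step grad:", approx_fprime_cs(x0, _demo_fun))

    h1 = approx_hess1(x0, _demo_fun)
    h3 = approx_hess3(x0, _demo_fun)
    hcs = approx_hess_cs(x0, _demo_fun)
    print("hess1 max abs error  :", np.max(np.abs(h1 - analytic_hess)))
    print("hess3 max abs error  :", np.max(np.abs(h3 - analytic_hess)))
    print("hess_cs max abs error:", np.max(np.abs(hcs - analytic_hess)))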