1""" 

2Functions 

3--------- 

4.. autosummary:: 

5 :toctree: generated/ 

6 

7 fmin_l_bfgs_b 

8 

9""" 

10 

11## License for the Python wrapper 

12## ============================== 

13 

14## Copyright (c) 2004 David M. Cooke <cookedm@physics.mcmaster.ca> 

15 

16## Permission is hereby granted, free of charge, to any person obtaining a 

17## copy of this software and associated documentation files (the "Software"), 

18## to deal in the Software without restriction, including without limitation 

19## the rights to use, copy, modify, merge, publish, distribute, sublicense, 

20## and/or sell copies of the Software, and to permit persons to whom the 

21## Software is furnished to do so, subject to the following conditions: 

22 

23## The above copyright notice and this permission notice shall be included in 

24## all copies or substantial portions of the Software. 

25 

26## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 

27## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 

28## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 

29## AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 

30## LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 

31## FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 

32## DEALINGS IN THE SOFTWARE. 

33 

34## Modifications by Travis Oliphant and Enthought, Inc. for inclusion in SciPy 

35 

import numpy as np
from numpy import array, asarray, float64, zeros
from . import _lbfgsb
from .optimize import (MemoizeJac, OptimizeResult,
                       _check_unknown_options, _prepare_scalar_function)
from ._constraints import old_bound_to_new

from scipy.sparse.linalg import LinearOperator

__all__ = ['fmin_l_bfgs_b', 'LbfgsInvHessProduct']


def fmin_l_bfgs_b(func, x0, fprime=None, args=(),
                  approx_grad=0,
                  bounds=None, m=10, factr=1e7, pgtol=1e-5,
                  epsilon=1e-8,
                  iprint=-1, maxfun=15000, maxiter=15000, disp=None,
                  callback=None, maxls=20):
54 """ 

55 Minimize a function func using the L-BFGS-B algorithm. 

56 

57 Parameters 

58 ---------- 

59 func : callable f(x,*args) 

60 Function to minimize. 

61 x0 : ndarray 

62 Initial guess. 

63 fprime : callable fprime(x,*args), optional 

64 The gradient of `func`. If None, then `func` returns the function 

65 value and the gradient (``f, g = func(x, *args)``), unless 

66 `approx_grad` is True in which case `func` returns only ``f``. 

67 args : sequence, optional 

68 Arguments to pass to `func` and `fprime`. 

69 approx_grad : bool, optional 

70 Whether to approximate the gradient numerically (in which case 

71 `func` returns only the function value). 

72 bounds : list, optional 

73 ``(min, max)`` pairs for each element in ``x``, defining 

74 the bounds on that parameter. Use None or +-inf for one of ``min`` or 

75 ``max`` when there is no bound in that direction. 

76 m : int, optional 

77 The maximum number of variable metric corrections 

78 used to define the limited memory matrix. (The limited memory BFGS 

79 method does not store the full hessian but uses this many terms in an 

80 approximation to it.) 

81 factr : float, optional 

82 The iteration stops when 

83 ``(f^k - f^{k+1})/max{|f^k|,|f^{k+1}|,1} <= factr * eps``, 

84 where ``eps`` is the machine precision, which is automatically 

85 generated by the code. Typical values for `factr` are: 1e12 for 

86 low accuracy; 1e7 for moderate accuracy; 10.0 for extremely 

87 high accuracy. See Notes for relationship to `ftol`, which is exposed 

88 (instead of `factr`) by the `scipy.optimize.minimize` interface to 

89 L-BFGS-B. 

90 pgtol : float, optional 

91 The iteration will stop when 

92 ``max{|proj g_i | i = 1, ..., n} <= pgtol`` 

93 where ``pg_i`` is the i-th component of the projected gradient. 

94 epsilon : float, optional 

95 Step size used when `approx_grad` is True, for numerically 

96 calculating the gradient 

97 iprint : int, optional 

98 Controls the frequency of output. ``iprint < 0`` means no output; 

99 ``iprint = 0`` print only one line at the last iteration; 

100 ``0 < iprint < 99`` print also f and ``|proj g|`` every iprint iterations; 

101 ``iprint = 99`` print details of every iteration except n-vectors; 

102 ``iprint = 100`` print also the changes of active set and final x; 

103 ``iprint > 100`` print details of every iteration including x and g. 

104 disp : int, optional 

105 If zero, then no output. If a positive number, then this over-rides 

106 `iprint` (i.e., `iprint` gets the value of `disp`). 

107 maxfun : int, optional 

108 Maximum number of function evaluations. 

109 maxiter : int, optional 

110 Maximum number of iterations. 

111 callback : callable, optional 

112 Called after each iteration, as ``callback(xk)``, where ``xk`` is the 

113 current parameter vector. 

114 maxls : int, optional 

115 Maximum number of line search steps (per iteration). Default is 20. 

116 

117 Returns 

118 ------- 

119 x : array_like 

120 Estimated position of the minimum. 

121 f : float 

122 Value of `func` at the minimum. 

123 d : dict 

124 Information dictionary. 

125 

126 * d['warnflag'] is 

127 

128 - 0 if converged, 

129 - 1 if too many function evaluations or too many iterations, 

130 - 2 if stopped for another reason, given in d['task'] 

131 

132 * d['grad'] is the gradient at the minimum (should be 0 ish) 

133 * d['funcalls'] is the number of function calls made. 

134 * d['nit'] is the number of iterations. 

135 

136 See also 

137 -------- 

138 minimize: Interface to minimization algorithms for multivariate 

139 functions. See the 'L-BFGS-B' `method` in particular. Note that the 

140 `ftol` option is made available via that interface, while `factr` is 

141 provided via this interface, where `factr` is the factor multiplying 

142 the default machine floating-point precision to arrive at `ftol`: 

143 ``ftol = factr * numpy.finfo(float).eps``. 

144 

145 Notes 

146 ----- 

147 License of L-BFGS-B (FORTRAN code): 

148 

149 The version included here (in fortran code) is 3.0 

150 (released April 25, 2011). It was written by Ciyou Zhu, Richard Byrd, 

151 and Jorge Nocedal <nocedal@ece.nwu.edu>. It carries the following 

152 condition for use: 

153 

154 This software is freely available, but we expect that all publications 

155 describing work using this software, or all commercial products using it, 

156 quote at least one of the references given below. This software is released 

157 under the BSD License. 

158 

159 References 

160 ---------- 

161 * R. H. Byrd, P. Lu and J. Nocedal. A Limited Memory Algorithm for Bound 

162 Constrained Optimization, (1995), SIAM Journal on Scientific and 

163 Statistical Computing, 16, 5, pp. 1190-1208. 

164 * C. Zhu, R. H. Byrd and J. Nocedal. L-BFGS-B: Algorithm 778: L-BFGS-B, 

165 FORTRAN routines for large scale bound constrained optimization (1997), 

166 ACM Transactions on Mathematical Software, 23, 4, pp. 550 - 560. 

167 * J.L. Morales and J. Nocedal. L-BFGS-B: Remark on Algorithm 778: L-BFGS-B, 

168 FORTRAN routines for large scale bound constrained optimization (2011), 

169 ACM Transactions on Mathematical Software, 38, 1. 

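    Examples
    --------
    A minimal, illustrative sketch: minimizing a convex quadratic with an
    analytic gradient (the objective and starting point below are arbitrary
    choices for demonstration):

    >>> import numpy as np
    >>> from scipy.optimize import fmin_l_bfgs_b
    >>> def f(x):
    ...     return np.sum((x - 3.0)**2)
    >>> def grad(x):
    ...     return 2.0 * (x - 3.0)
    >>> x_opt, f_opt, info = fmin_l_bfgs_b(f, np.zeros(2), fprime=grad)
    >>> bool(np.allclose(x_opt, 3.0))
    True
    >>> info['warnflag']
    0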

171 """ 

172 # handle fprime/approx_grad 

173 if approx_grad: 

174 fun = func 

175 jac = None 

176 elif fprime is None: 

177 fun = MemoizeJac(func) 

178 jac = fun.derivative 

179 else: 

180 fun = func 

181 jac = fprime 

182 

183 # build options 

184 if disp is None: 

185 disp = iprint 

186 opts = {'disp': disp, 

187 'iprint': iprint, 

188 'maxcor': m, 

189 'ftol': factr * np.finfo(float).eps, 

190 'gtol': pgtol, 

191 'eps': epsilon, 

192 'maxfun': maxfun, 

193 'maxiter': maxiter, 

194 'callback': callback, 

195 'maxls': maxls} 

196 

197 res = _minimize_lbfgsb(fun, x0, args=args, jac=jac, bounds=bounds, 

198 **opts) 

199 d = {'grad': res['jac'], 

200 'task': res['message'], 

201 'funcalls': res['nfev'], 

202 'nit': res['nit'], 

203 'warnflag': res['status']} 

204 f = res['fun'] 

205 x = res['x'] 

206 

207 return x, f, d 


def _minimize_lbfgsb(fun, x0, args=(), jac=None, bounds=None,
                     disp=None, maxcor=10, ftol=2.2204460492503131e-09,
                     gtol=1e-5, eps=1e-8, maxfun=15000, maxiter=15000,
                     iprint=-1, callback=None, maxls=20,
                     finite_diff_rel_step=None, **unknown_options):
215 """ 

216 Minimize a scalar function of one or more variables using the L-BFGS-B 

217 algorithm. 

218 

219 Options 

220 ------- 

221 disp : None or int 

222 If `disp is None` (the default), then the supplied version of `iprint` 

223 is used. If `disp is not None`, then it overrides the supplied version 

224 of `iprint` with the behaviour you outlined. 

225 maxcor : int 

226 The maximum number of variable metric corrections used to 

227 define the limited memory matrix. (The limited memory BFGS 

228 method does not store the full hessian but uses this many terms 

229 in an approximation to it.) 

230 ftol : float 

231 The iteration stops when ``(f^k - 

232 f^{k+1})/max{|f^k|,|f^{k+1}|,1} <= ftol``. 

233 gtol : float 

234 The iteration will stop when ``max{|proj g_i | i = 1, ..., n} 

235 <= gtol`` where ``pg_i`` is the i-th component of the 

236 projected gradient. 

237 eps : float or ndarray 

238 If `jac is None` the absolute step size used for numerical 

239 approximation of the jacobian via forward differences. 

240 maxfun : int 

241 Maximum number of function evaluations. 

242 maxiter : int 

243 Maximum number of iterations. 

244 iprint : int, optional 

245 Controls the frequency of output. ``iprint < 0`` means no output; 

246 ``iprint = 0`` print only one line at the last iteration; 

247 ``0 < iprint < 99`` print also f and ``|proj g|`` every iprint iterations; 

248 ``iprint = 99`` print details of every iteration except n-vectors; 

249 ``iprint = 100`` print also the changes of active set and final x; 

250 ``iprint > 100`` print details of every iteration including x and g. 

251 callback : callable, optional 

252 Called after each iteration, as ``callback(xk)``, where ``xk`` is the 

253 current parameter vector. 

254 maxls : int, optional 

255 Maximum number of line search steps (per iteration). Default is 20. 

256 finite_diff_rel_step : None or array_like, optional 

257 If `jac in ['2-point', '3-point', 'cs']` the relative step size to 

258 use for numerical approximation of the jacobian. The absolute step 

259 size is computed as ``h = rel_step * sign(x0) * max(1, abs(x0))``, 

260 possibly adjusted to fit into the bounds. For ``method='3-point'`` 

261 the sign of `h` is ignored. If None (default) then step is selected 

262 automatically. 

263 

264 Notes 

265 ----- 

266 The option `ftol` is exposed via the `scipy.optimize.minimize` interface, 

267 but calling `scipy.optimize.fmin_l_bfgs_b` directly exposes `factr`. The 

268 relationship between the two is ``ftol = factr * numpy.finfo(float).eps``. 

269 I.e., `factr` multiplies the default machine floating-point precision to 

270 arrive at `ftol`. 

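    For example (illustrative arithmetic only), the default value of `ftol`
    corresponds to ``factr = 1e7``:

    >>> import numpy as np
    >>> bool(np.isclose(1e7 * np.finfo(float).eps, 2.2204460492503131e-09))
    True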

272 """ 

273 _check_unknown_options(unknown_options) 

274 m = maxcor 

275 pgtol = gtol 

276 factr = ftol / np.finfo(float).eps 

277 

278 x0 = asarray(x0).ravel() 

279 n, = x0.shape 

280 

281 if bounds is None: 

282 bounds = [(None, None)] * n 

283 if len(bounds) != n: 

284 raise ValueError('length of x0 != length of bounds') 

285 

286 # unbounded variables must use None, not +-inf, for optimizer to work properly 

287 bounds = [(None if l == -np.inf else l, None if u == np.inf else u) for l, u in bounds] 

288 # LBFGSB is sent 'old-style' bounds, 'new-style' bounds are required by 

289 # approx_derivative and ScalarFunction 

290 new_bounds = old_bound_to_new(bounds) 

291 

292 # check bounds 

293 if (new_bounds[0] > new_bounds[1]).any(): 

294 raise ValueError("LBFGSB - one of the lower bounds is greater than an upper bound.") 

295 

296 # initial vector must lie within the bounds. Otherwise ScalarFunction and 

297 # approx_derivative will cause problems 

298 x0 = np.clip(x0, new_bounds[0], new_bounds[1]) 

299 

300 if disp is not None: 

301 if disp == 0: 

302 iprint = -1 

303 else: 

304 iprint = disp 

305 

306 sf = _prepare_scalar_function(fun, x0, jac=jac, args=args, epsilon=eps, 

307 bounds=new_bounds, 

308 finite_diff_rel_step=finite_diff_rel_step) 

309 

310 func_and_grad = sf.fun_and_grad 

311 

312 fortran_int = _lbfgsb.types.intvar.dtype 

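    # The Fortran routine encodes the bound type of each variable in nbd:
    # 0 = unbounded, 1 = lower bound only, 2 = both bounds, 3 = upper bound
    # only. bounds_map translates (l, u) presence flags to these codes.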
    nbd = zeros(n, fortran_int)
    low_bnd = zeros(n, float64)
    upper_bnd = zeros(n, float64)
    bounds_map = {(None, None): 0,
                  (1, None): 1,
                  (1, 1): 2,
                  (None, 1): 3}
    for i in range(n):
        l, u = bounds[i]
        if l is not None:
            low_bnd[i] = l
            l = 1
        if u is not None:
            upper_bnd[i] = u
            u = 1
        nbd[i] = bounds_map[l, u]

    if not maxls > 0:
        raise ValueError('maxls must be positive.')

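    # Allocate the working arrays expected by the Fortran routine. The sizes
    # follow the L-BFGS-B 3.0 documentation: wa holds
    # 2*m*n + 5*n + 11*m*m + 8*m doubles and iwa holds 3*n integers.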
    x = array(x0, float64)
    f = array(0.0, float64)
    g = zeros((n,), float64)
    wa = zeros(2*m*n + 5*n + 11*m*m + 8*m, float64)
    iwa = zeros(3*n, fortran_int)
    task = zeros(1, 'S60')
    csave = zeros(1, 'S60')
    lsave = zeros(4, fortran_int)
    isave = zeros(44, fortran_int)
    dsave = zeros(29, float64)

    task[:] = 'START'

    n_iterations = 0

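    # The Fortran routine uses reverse communication: each call to setulb
    # either requests new values of f and g (task 'FG...'), reports a
    # completed iteration (task 'NEW_X'), or signals convergence or an error.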
    while True:
        # setulb updates x, f, g, wa, iwa, task, csave, lsave, isave and
        # dsave in place on each call.
        _lbfgsb.setulb(m, x, low_bnd, upper_bnd, nbd, f, g, factr,
                       pgtol, wa, iwa, task, iprint, csave, lsave,
                       isave, dsave, maxls)
        task_str = task.tobytes()
        if task_str.startswith(b'FG'):
            # The minimization routine wants f and g at the current x.
            # Note that interruptions due to maxfun are postponed
            # until the completion of the current minimization iteration.
            # Overwrite f and g:
            f, g = func_and_grad(x)
        elif task_str.startswith(b'NEW_X'):
            # new iteration
            n_iterations += 1
            if callback is not None:
                callback(np.copy(x))

            if n_iterations >= maxiter:
                task[:] = 'STOP: TOTAL NO. of ITERATIONS REACHED LIMIT'
            elif sf.nfev > maxfun:
                task[:] = ('STOP: TOTAL NO. of f AND g EVALUATIONS '
                           'EXCEEDS LIMIT')
        else:
            break

    task_str = task.tobytes().strip(b'\x00').strip()
    if task_str.startswith(b'CONV'):
        warnflag = 0
    elif sf.nfev > maxfun or n_iterations >= maxiter:
        warnflag = 1
    else:
        warnflag = 2

    # These two portions of the workspace are described in the mainlb
    # subroutine in lbfgsb.f. See line 363.
    s = wa[0: m*n].reshape(m, n)
    y = wa[m*n: 2*m*n].reshape(m, n)

    # See lbfgsb.f line 160 for this portion of the workspace.
    # isave(31) = the total number of BFGS updates prior to the current
    # iteration
    n_bfgs_updates = isave[30]

    n_corrs = min(n_bfgs_updates, maxcor)
    hess_inv = LbfgsInvHessProduct(s[:n_corrs], y[:n_corrs])

    return OptimizeResult(fun=f, jac=g, nfev=sf.nfev,
                          njev=sf.ngev,
                          nit=n_iterations, status=warnflag, message=task_str,
                          x=x, success=(warnflag == 0), hess_inv=hess_inv)


class LbfgsInvHessProduct(LinearOperator):
    """Linear operator for the L-BFGS approximate inverse Hessian.

    This operator computes the product of a vector with the approximate
    inverse of the Hessian of the objective function, using the L-BFGS
    limited memory approximation to the inverse Hessian, accumulated during
    the optimization.

    Objects of this class implement the ``scipy.sparse.linalg.LinearOperator``
    interface.

    Parameters
    ----------
    sk : array_like, shape=(n_corr, n)
        Array of `n_corr` most recent updates to the solution vector.
        (See [1]).
    yk : array_like, shape=(n_corr, n)
        Array of `n_corr` most recent updates to the gradient. (See [1]).

    References
    ----------
    .. [1] Nocedal, Jorge. "Updating quasi-Newton matrices with limited
       storage." Mathematics of Computation 35.151 (1980): 773-782.

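    Examples
    --------
    A small illustrative sketch; the correction vectors below are random
    stand-ins, not the output of a real optimization run:

    >>> import numpy as np
    >>> from scipy.optimize import LbfgsInvHessProduct
    >>> rng = np.random.RandomState(0)
    >>> sk = rng.rand(3, 5)
    >>> yk = rng.rand(3, 5)
    >>> hess_inv = LbfgsInvHessProduct(sk, yk)
    >>> v = np.ones(5)
    >>> bool(np.allclose(hess_inv.matvec(v), hess_inv.todense().dot(v)))
    True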

425 """ 

426 

427 def __init__(self, sk, yk): 

428 """Construct the operator.""" 

429 if sk.shape != yk.shape or sk.ndim != 2: 

430 raise ValueError('sk and yk must have matching shape, (n_corrs, n)') 

431 n_corrs, n = sk.shape 

432 

433 super(LbfgsInvHessProduct, self).__init__( 

434 dtype=np.float64, shape=(n, n)) 

435 

436 self.sk = sk 

437 self.yk = yk 

438 self.n_corrs = n_corrs 

439 self.rho = 1 / np.einsum('ij,ij->i', sk, yk) 

    def _matvec(self, x):
        """Efficient matrix-vector multiply with the BFGS matrices.

        This calculation is described in Section (4) of [1].

        Parameters
        ----------
        x : ndarray
            An array with shape (n,) or (n,1).

        Returns
        -------
        y : ndarray
            The matrix-vector product.

        """
        s, y, n_corrs, rho = self.sk, self.yk, self.n_corrs, self.rho
        q = np.array(x, dtype=self.dtype, copy=True)
        if q.ndim == 2 and q.shape[1] == 1:
            q = q.reshape(-1)

        alpha = np.zeros(n_corrs)

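        # Two-loop recursion for H*q (Nocedal 1980): the backward pass below
        # computes the alpha coefficients; the forward pass that follows
        # applies the corrections, with the initial Hessian approximation
        # taken as the identity.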
        for i in range(n_corrs - 1, -1, -1):
            alpha[i] = rho[i] * np.dot(s[i], q)
            q = q - alpha[i]*y[i]

        r = q
        for i in range(n_corrs):
            beta = rho[i] * np.dot(y[i], r)
            r = r + s[i] * (alpha[i] - beta)

        return r

    def todense(self):
        """Return a dense array representation of this operator.

        Returns
        -------
        arr : ndarray, shape=(n, n)
            An array with the same shape and containing
            the same data represented by this `LinearOperator`.

        """
        s, y, n_corrs, rho = self.sk, self.yk, self.n_corrs, self.rho
        I = np.eye(*self.shape, dtype=self.dtype)
        Hk = I

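        # BFGS inverse update, applied once per stored correction pair:
        # H <- (I - rho*s*y^T) H (I - rho*y*s^T) + rho*s*s^T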
        for i in range(n_corrs):
            A1 = I - s[i][:, np.newaxis] * y[i][np.newaxis, :] * rho[i]
            A2 = I - y[i][:, np.newaxis] * s[i][np.newaxis, :] * rho[i]

            Hk = np.dot(A1, np.dot(Hk, A2)) + (rho[i] * s[i][:, np.newaxis] *
                                               s[i][np.newaxis, :])
        return Hk