# line_search_strong.py
import numpy as np
# Below is an implementation of line search enforcing the strong Wolfe conditions
def line_search(f, fprime, xk, pk):
    """
    Line search enforcing the strong Wolfe conditions (the Armijo sufficient
    decrease condition plus the strong curvature condition) by bracketing
    and bisection.
    """
    alpha = 0.0      # lower end of the bracket
    beta = np.inf    # upper end of the bracket (unbounded until first shrink)
    t = 1.0          # trial step length
    c1 = 1e-4        # Armijo (sufficient decrease) parameter
    c2 = 0.7         # curvature parameter
    num_iter = 0
    while num_iter <= 50:
        if not S(f, fprime, xk, pk, t, c1):
            # Sufficient decrease fails: t is too long, shrink the bracket.
            beta = t
        elif not C(f, fprime, xk, pk, t, c2):
            # Curvature condition fails: t is too short, grow the bracket.
            alpha = t
        else:
            # Both strong Wolfe conditions hold: accept t.
            break
        if np.isfinite(beta):
            t = (alpha + beta) / 2   # bisect the current bracket
        else:
            t = 2 * alpha            # no upper bound yet: double the step
        num_iter += 1
    print("line search iterations:", num_iter)
    return t, num_iter

def S(f, fprime, xk, pk, t, c1):
    """Armijo sufficient decrease: f(xk + t*pk) <= f(xk) + c1*t*<f'(xk), pk>."""
    f_xk = f(xk)
    f_xk1 = f(xk + t * pk)
    df_xk = fprime(xk)
    return f_xk1 <= f_xk + c1 * t * np.dot(df_xk, pk)

def C(f, fprime, xk, pk, t, c2):
    """Strong curvature: |<f'(xk + t*pk), pk>| <= c2 * |<f'(xk), pk>|."""
    df_xk = fprime(xk)
    df_xk1 = fprime(xk + t * pk)
    return np.absolute(np.dot(pk, df_xk1)) <= c2 * np.absolute(np.dot(pk, df_xk))
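
# A minimal sanity check for the bisection search above; the quadratic test
# function, its gradient, and the starting point are illustrative assumptions,
# not part of the original file.
def _demo_bisection():
    f = lambda x: np.dot(x, x)    # f(x) = ||x||^2, minimized at the origin
    fprime = lambda x: 2 * x      # exact gradient
    xk = np.array([3.0, -4.0])
    pk = -fprime(xk)              # steepest-descent direction
    t, n = line_search(f, fprime, xk, pk)
    # The accepted step must satisfy both strong Wolfe conditions.
    assert S(f, fprime, xk, pk, t, 1e-4) and C(f, fprime, xk, pk, t, 0.7)
    return t, n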
# SciPy implementation #
def line_search1(f, myfprime, xk, pk, old_fval=None,
old_old_fval=None, args=(), c1=1e-4, c2=0.9, amax=None,
extra_condition=None, maxiter=10):
"""Find alpha that satisfies strong Wolfe conditions.
Parameters
----------
f : callable f(x,*args)
Objective function.
myfprime : callable f'(x,*args)
Objective function gradient.
xk : ndarray
Starting point.
pk : ndarray
Search direction.
old_fval : float, optional
Function value for x=xk. Will be recomputed if omitted.
old_old_fval : float, optional
Function value for the point preceding x=xk.
args : tuple, optional
Additional arguments passed to objective function.
c1 : float, optional
Parameter for Armijo condition rule.
c2 : float, optional
Parameter for curvature condition rule.
amax : float, optional
        Maximum step size.
extra_condition : callable, optional
A callable of the form ``extra_condition(alpha, x, f, g)``
returning a boolean. Arguments are the proposed step ``alpha``
and the corresponding ``x``, ``f`` and ``g`` values. The line search
accepts the value of ``alpha`` only if this
callable returns ``True``. If the callable returns ``False``
for the step length, the algorithm will continue with
new iterates. The callable is only called for iterates
satisfying the strong Wolfe conditions.
maxiter : int, optional
Maximum number of iterations to perform.
Returns
-------
alpha : float or None
Alpha for which ``x_new = x0 + alpha * pk``,
or None if the line search algorithm did not converge.
fc : int
Number of function evaluations made.
gc : int
Number of gradient evaluations made.
new_fval : float or None
New function value ``f(x_new)=f(x0+alpha*pk)``,
or None if the line search algorithm did not converge.
old_fval : float
Old function value ``f(x0)``.
new_slope : float or None
The local slope along the search direction at the
new value ``<myfprime(x_new), pk>``,
or None if the line search algorithm did not converge.
"""
fc = [0] # num function evaluations made
gc = [0] # num gradient evaluations made
gval = [None]
gval_alpha = [None]
def phi(alpha):
fc[0] += 1
return f(xk + alpha * pk, *args)
fprime = myfprime
def derphi(alpha):
gc[0] += 1
gval[0] = fprime(xk + alpha * pk, *args) # store for later use
gval_alpha[0] = alpha
return np.dot(gval[0], pk)
gfk = fprime(xk, *args)
derphi0 = np.dot(gfk, pk)
if extra_condition is not None:
# Add the current gradient as argument, to avoid needless
# re-evaluation
def extra_condition2(alpha, phi):
if gval_alpha[0] != alpha:
derphi(alpha)
x = xk + alpha * pk
return extra_condition(alpha, x, phi, gval[0])
else:
extra_condition2 = None
alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
phi, derphi, old_fval, old_old_fval, derphi0, c1, c2, amax,
extra_condition2, maxiter=maxiter)
if derphi_star is None:
print('The line search algorithm did not converge')
else:
        # derphi_star is a scalar (the directional derivative derphi = gfk*pk),
        # but gval[0] holds the most recently evaluated gradient, which is the
        # gradient at the new point; return it so the outer loop need not
        # recompute it.
derphi_star = gval[0]
return alpha_star, fc[0], gc[0], phi_star, old_fval, derphi_star
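
# A sketch of calling the SciPy-style wrapper on a simple quadratic; the test
# function, starting point, and the extra_condition hook are illustrative
# assumptions, not part of the original file.
def _demo_line_search1():
    f = lambda x: np.dot(x, x)    # f(x) = ||x||^2
    fprime = lambda x: 2 * x      # exact gradient
    xk = np.array([1.0, 2.0])
    pk = -fprime(xk)              # steepest-descent direction
    # Accept a step only if its function value is finite (always true here;
    # included just to show how extra_condition is invoked).
    cond = lambda alpha, x, fval, g: bool(np.isfinite(fval))
    alpha, fc, gc, new_fval, old_fval, new_slope = line_search1(
        f, fprime, xk, pk, extra_condition=cond)
    # For this quadratic the quadratic interpolant in _zoom is exact,
    # so the search returns the true minimizing step alpha = 0.5.
    assert abs(alpha - 0.5) < 1e-12
    return alpha, fc, gc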
def scalar_search_wolfe2(phi, derphi, phi0=None,
old_phi0=None, derphi0=None,
c1=1e-4, c2=0.9, amax=None,
extra_condition=None, maxiter=10):
"""Find alpha that satisfies strong Wolfe conditions.
    The search direction is assumed to be a descent direction (derphi0 < 0),
    and alpha > 0.
Parameters
----------
phi : callable phi(alpha)
Objective scalar function.
derphi : callable phi'(alpha)
Objective function derivative. Returns a scalar.
phi0 : float, optional
Value of phi at 0.
old_phi0 : float, optional
Value of phi at previous point.
derphi0 : float, optional
        Value of derphi at 0.
c1 : float, optional
Parameter for Armijo condition rule.
c2 : float, optional
Parameter for curvature condition rule.
amax : float, optional
Maximum step size.
extra_condition : callable, optional
A callable of the form ``extra_condition(alpha, phi_value)``
returning a boolean. The line search accepts the value
of ``alpha`` only if this callable returns ``True``.
If the callable returns ``False`` for the step length,
the algorithm will continue with new iterates.
The callable is only called for iterates satisfying
the strong Wolfe conditions.
maxiter : int, optional
Maximum number of iterations to perform.
Returns
-------
alpha_star : float or None
Best alpha, or None if the line search algorithm did not converge.
phi_star : float
phi at alpha_star.
phi0 : float
phi at 0.
derphi_star : float or None
derphi at alpha_star, or None if the line search algorithm
did not converge.
"""
if phi0 is None:
phi0 = phi(0.)
if derphi0 is None:
derphi0 = derphi(0.)
alpha0 = 0
if old_phi0 is not None and derphi0 != 0:
alpha1 = min(1.0, 1.01*2*(phi0 - old_phi0)/derphi0)
else:
alpha1 = 1.0
if alpha1 < 0:
alpha1 = 1.0
if amax is not None:
alpha1 = min(alpha1, amax)
phi_a1 = phi(alpha1)
# derphi_a1 = derphi(alpha1) evaluated below
phi_a0 = phi0
derphi_a0 = derphi0
if extra_condition is None:
def extra_condition(alpha, phi): return True
for i in range(maxiter):
if alpha1 == 0 or (amax is not None and alpha0 == amax):
# alpha1 == 0: This shouldn't happen. Perhaps the increment has
# slipped below machine precision?
alpha_star = None
phi_star = phi0
phi0 = old_phi0
derphi_star = None
if alpha1 == 0:
msg = 'Rounding errors prevent the line search from converging'
else:
msg = "The line search algorithm could not find a solution " + \
"less than or equal to amax: %s" % amax
print(msg)
break
not_first_iteration = i > 0
if (phi_a1 > phi0 + c1 * alpha1 * derphi0) or \
((phi_a1 >= phi_a0) and not_first_iteration):
alpha_star, phi_star, derphi_star = \
_zoom(alpha0, alpha1, phi_a0,
phi_a1, derphi_a0, phi, derphi,
phi0, derphi0, c1, c2, extra_condition)
break
derphi_a1 = derphi(alpha1)
if (abs(derphi_a1) <= -c2*derphi0):
if extra_condition(alpha1, phi_a1):
alpha_star = alpha1
phi_star = phi_a1
derphi_star = derphi_a1
break
if (derphi_a1 >= 0):
alpha_star, phi_star, derphi_star = \
_zoom(alpha1, alpha0, phi_a1,
phi_a0, derphi_a1, phi, derphi,
phi0, derphi0, c1, c2, extra_condition)
break
alpha2 = 2 * alpha1 # increase by factor of two on each iteration
if amax is not None:
alpha2 = min(alpha2, amax)
alpha0 = alpha1
alpha1 = alpha2
phi_a0 = phi_a1
phi_a1 = phi(alpha1)
derphi_a0 = derphi_a1
else:
# stopping test maxiter reached
alpha_star = alpha1
phi_star = phi_a1
derphi_star = None
print('The line search algorithm did not converge')
return alpha_star, phi_star, phi0, derphi_star
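
# Running the scalar-level search directly on phi(alpha) = (alpha - 1)^2;
# the test function is an illustrative assumption, not part of the original file.
def _demo_scalar_search_wolfe2():
    phi = lambda a: (a - 1.0) ** 2
    derphi = lambda a: 2.0 * (a - 1.0)
    alpha_star, phi_star, phi0, derphi_star = scalar_search_wolfe2(phi, derphi)
    # The default initial trial step alpha1 = 1.0 is the exact minimizer here,
    # so it satisfies the strong Wolfe conditions immediately.
    assert abs(alpha_star - 1.0) < 1e-12
    return alpha_star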
def _cubicmin(a, fa, fpa, b, fb, c, fc):
"""
Finds the minimizer for a cubic polynomial that goes through the
points (a,fa), (b,fb), and (c,fc) with derivative at a of fpa.
If no minimizer can be found, return None.
"""
# f(x) = A *(x-a)^3 + B*(x-a)^2 + C*(x-a) + D
with np.errstate(divide='raise', over='raise', invalid='raise'):
try:
C = fpa
db = b - a
dc = c - a
denom = (db * dc) ** 2 * (db - dc)
d1 = np.empty((2, 2))
d1[0, 0] = dc ** 2
d1[0, 1] = -db ** 2
d1[1, 0] = -dc ** 3
d1[1, 1] = db ** 3
[A, B] = np.dot(d1, np.asarray([fb - fa - C * db,
fc - fa - C * dc]).flatten())
A /= denom
B /= denom
radical = B * B - 3 * A * C
xmin = a + (-B + np.sqrt(radical)) / (3 * A)
except ArithmeticError:
return None
if not np.isfinite(xmin):
return None
return xmin
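
# Checking the cubic interpolation helper against a cubic with a known local
# minimizer; h(x) = (x - 2)^2 * (x + 1) = x^3 - 3x^2 + 4 has its local minimum
# at x = 2. The sample points are illustrative assumptions.
def _demo_cubicmin():
    h = lambda x: (x - 2.0) ** 2 * (x + 1.0)
    hp = lambda x: 3.0 * x ** 2 - 6.0 * x    # h'(x)
    xmin = _cubicmin(1.0, h(1.0), hp(1.0), 3.0, h(3.0), 0.0, h(0.0))
    assert abs(xmin - 2.0) < 1e-12   # the interpolant through a cubic is exact
    return xmin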
def _quadmin(a, fa, fpa, b, fb):
"""
Finds the minimizer for a quadratic polynomial that goes through
the points (a,fa), (b,fb) with derivative at a of fpa.
"""
# f(x) = B*(x-a)^2 + C*(x-a) + D
with np.errstate(divide='raise', over='raise', invalid='raise'):
try:
D = fa
C = fpa
db = b - a * 1.0
B = (fb - D - C * db) / (db * db)
xmin = a - C / (2.0 * B)
except ArithmeticError:
return None
if not np.isfinite(xmin):
return None
return xmin
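
# Checking the quadratic interpolation helper against a parabola with a known
# vertex; g(x) = (x - 3)^2 is minimized at x = 3. The sample points are
# illustrative assumptions.
def _demo_quadmin():
    g = lambda x: (x - 3.0) ** 2
    gp = lambda x: 2.0 * (x - 3.0)    # g'(x)
    xmin = _quadmin(0.0, g(0.0), gp(0.0), 1.0, g(1.0))
    assert abs(xmin - 3.0) < 1e-12   # the interpolant through a parabola is exact
    return xmin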
def _zoom(a_lo, a_hi, phi_lo, phi_hi, derphi_lo,
phi, derphi, phi0, derphi0, c1, c2, extra_condition):
"""Zoom stage of approximate linesearch satisfying strong Wolfe conditions.
Part of the optimization algorithm in `scalar_search_wolfe2`.
"""
maxiter = 10
i = 0
delta1 = 0.2 # cubic interpolant check
delta2 = 0.1 # quadratic interpolant check
phi_rec = phi0
a_rec = 0
while True:
        # Interpolate to find a trial step length between a_lo and a_hi.
        # Try cubic interpolation first; if the result lies within
        # delta1 * dalpha of the endpoints, or outside the interval
        # bounded by a_lo and a_hi, fall back to quadratic interpolation;
        # if that is still too close to the endpoints, bisect.
dalpha = a_hi - a_lo
if dalpha < 0:
a, b = a_hi, a_lo
else:
a, b = a_lo, a_hi
        # Minimizer of the cubic interpolant through phi_lo, derphi_lo,
        # phi_hi, and the most recent value of phi (phi_rec at a_rec),
        # with the fallbacks described above.
if (i > 0):
cchk = delta1 * dalpha
a_j = _cubicmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi,
a_rec, phi_rec)
if (i == 0) or (a_j is None) or (a_j > b - cchk) or (a_j < a + cchk):
qchk = delta2 * dalpha
a_j = _quadmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi)
if (a_j is None) or (a_j > b-qchk) or (a_j < a+qchk):
a_j = a_lo + 0.5*dalpha
# Check new value of a_j
phi_aj = phi(a_j)
if (phi_aj > phi0 + c1*a_j*derphi0) or (phi_aj >= phi_lo):
phi_rec = phi_hi
a_rec = a_hi
a_hi = a_j
phi_hi = phi_aj
else:
derphi_aj = derphi(a_j)
if abs(derphi_aj) <= -c2*derphi0 and extra_condition(a_j, phi_aj):
a_star = a_j
val_star = phi_aj
valprime_star = derphi_aj
break
if derphi_aj*(a_hi - a_lo) >= 0:
phi_rec = phi_hi
a_rec = a_hi
a_hi = a_lo
phi_hi = phi_lo
else:
phi_rec = phi_lo
a_rec = a_lo
a_lo = a_j
phi_lo = phi_aj
derphi_lo = derphi_aj
i += 1
if (i > maxiter):
# Failed to find a conforming step size
a_star = None
val_star = None
valprime_star = None
break
return a_star, val_star, valprime_star
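
# A direct exercise of the zoom stage on phi(a) = (a - 1)^2 with the bracket
# [0, 2]; the scalar function and bracket values are illustrative assumptions,
# not part of the original file.
def _demo_zoom():
    phi = lambda a: (a - 1.0) ** 2
    derphi = lambda a: 2.0 * (a - 1.0)
    a_star, val_star, valprime_star = _zoom(
        0.0, 2.0, phi(0.0), phi(2.0), derphi(0.0),
        phi, derphi, phi(0.0), derphi(0.0), 1e-4, 0.9,
        lambda alpha, phi_value: True)
    # On the first iteration _quadmin recovers the exact vertex a = 1.
    assert abs(a_star - 1.0) < 1e-12
    return a_star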