Switch over to bfgs coded ourselves, instead of using scipy
danpovey committed May 16, 2016
1 parent b091f80 commit 215909d
Showing 3 changed files with 320 additions and 15 deletions.
5 changes: 3 additions & 2 deletions .gitignore
@@ -5,9 +5,10 @@
 !/src/*.*
 !/src/Makefile
 
-# compiled object files, and directories automatically created on Darwin OS
-# for debug symbols.
+# compiled object files and python files, and directories automatically created
+# on Darwin OS for debug symbols.
 *.o
+*.pyc
 *.dSYM
 
 # emacs saves
298 changes: 298 additions & 0 deletions scripts/internal/bfgs.py
@@ -0,0 +1,298 @@
#!/usr/bin/env python

from __future__ import print_function
import math, sys
import numpy as np
import numpy.linalg as la


"""
This is a version of BFGS specialized for the case where the function
is constrained to a particular convex region via a barrier function,
and where we can efficiently evaluate (via calling f_finite(x), which
returns bool) whether the function is finite at the given point.
x0 The value to start the optimization at.
f The function being minimized. f(x) returns a pair (value, gradient).
f_finite f_finite(x) returns true if f(x) would be finite, and false otherwise.
init_hessian This gives you a way to specify a "better guess" at the initial
Hessian.
return value Returns a 4-tuple (x, f(x), f'(x), inverse-hessian-approximation).
"""
def Bfgs(x0, f, f_finite, init_inv_hessian = None):
    b = __bfgs(x0, f, f_finite, init_inv_hessian)
    return b.Minimize()
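# A minimal usage sketch (not part of the module proper): 'my_objf' below is a
# hypothetical caller-supplied function returning a (value, gradient) pair, and
# the trivial f_finite is appropriate when there is no barrier function:
#
#   x0 = np.zeros(10)
#   (x, value, deriv, inv_hessian) = Bfgs(x0, my_objf, lambda x: True)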




class __bfgs:
    def __init__(self, x0, f, f_finite, init_inv_hessian = None,
                 gradient_tolerance = 0.0005, progress_tolerance = 1.0e-06,
                 progress_tolerance_num_iters = 3):
        self.c1 = 1.0e-04  # constant used in line search
        self.c2 = 0.9      # constant used in line search
        assert len(x0.shape) == 1
        self.dim = x0.shape[0]
        self.f = f
        self.f_finite = f_finite
        self.gradient_tolerance = gradient_tolerance
        self.progress_tolerance = progress_tolerance
        assert progress_tolerance_num_iters >= 1
        self.progress_tolerance_num_iters = progress_tolerance_num_iters

        if not self.f_finite(x0):
            self.LogMessage("Function is not finite at initial point {0}".format(x0))
            sys.exit(1)

        # cached_evaluations will be a list of 3-tuples (x, function-value f(x),
        # function-derivative f'(x)).  it's written to and read from by the
        # function self.FunctionValueAndDerivative().
        self.cached_evaluations = [ ]

        self.x = [ x0 ]
        (value0, deriv0) = self.FunctionValueAndDerivative(x0)
        self.value = [ value0 ]
        self.deriv = [ deriv0 ]

        deriv_magnitude = math.sqrt(np.dot(deriv0, deriv0))
        self.LogMessage("On iteration 0, value is {0}, deriv-magnitude {1}".format(
            value0, deriv_magnitude))

        # note: self.inv_hessian is referred to as H_k in the Nocedal
        # and Wright textbook.
        if init_inv_hessian is None:
            self.inv_hessian = np.identity(self.dim)
        else:
            self.inv_hessian = init_inv_hessian

    def Minimize(self):
        while not self.Converged():
            self.Iterate()
        self.FinalDebugOutput()
        return (self.x[-1], self.value[-1], self.deriv[-1], self.inv_hessian)


    def FinalDebugOutput(self):
        # currently this does nothing.
        pass

    # This does one iteration of update.
    def Iterate(self):
        self.p = -np.dot(self.inv_hessian, self.deriv[-1])
        alpha = self.LineSearch()
        if alpha is None:
            self.LogMessage("Restarting BFGS with unit Hessian since line search failed")
            self.inv_hessian = np.identity(self.dim)
            return
        cur_x = self.x[-1]
        next_x = cur_x + alpha * self.p
        (next_value, next_deriv) = self.FunctionValueAndDerivative(next_x)
        next_deriv_magnitude = math.sqrt(np.dot(next_deriv, next_deriv))
        self.LogMessage("On iteration {0}, value is {1}, deriv-magnitude {2}".format(
            len(self.x), next_value, next_deriv_magnitude))

        # obtain s_k = x_{k+1} - x_k, y_k = gradient_{k+1} - gradient_k;
        # see eq. 6.5 in Nocedal and Wright.
        self.x.append(next_x)
        self.value.append(next_value)
        self.deriv.append(next_deriv)
        s_k = alpha * self.p
        y_k = self.deriv[-1] - self.deriv[-2]
        ysdot = np.dot(s_k, y_k)
        if not ysdot > 0:
            self.LogMessage("Restarting BFGS with unit Hessian since curvature "
                            "condition failed [likely a bug in the optimization code]")
            self.inv_hessian = np.identity(self.dim)
            return
        rho_k = 1.0 / ysdot  # eq. 6.14 in Nocedal and Wright.
        # the next equation is eq. 6.17 in Nocedal and Wright.
        # we don't bother rearranging it for efficiency because the dimension is small.
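        # for reference, eq. 6.17 is the inverse-Hessian update
        #   H_{k+1} = (I - rho_k s_k y_k^T) H_k (I - rho_k y_k s_k^T) + rho_k s_k s_k^T
        # where all the products are matrix products.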
        I = np.identity(self.dim)
        # note: these must be matrix products (np.dot), not the elementwise
        # products that '*' gives on numpy arrays.
        self.inv_hessian = np.dot(np.dot(I - np.outer(s_k, y_k) * rho_k,
                                         self.inv_hessian),
                                  I - np.outer(y_k, s_k) * rho_k) + np.outer(s_k, s_k) * rho_k
        # todo: maybe make the line above more efficient.

    # LineSearch is to be called after you have set self.x and self.p.  It
    # returns an alpha value satisfying the strong Wolfe conditions, or None
    # if the line search failed.  It is Algorithm 3.5 of Nocedal and Wright.
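    # For reference: writing phi(alpha) = f(x + alpha * p), the strong Wolfe
    # conditions (eqs. 3.7a and 3.7b in Nocedal and Wright) are
    #   phi(alpha) <= phi(0) + c1 * alpha * phi'(0)    [sufficient decrease]
    #   |phi'(alpha)| <= c2 * |phi'(0)|                [curvature]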
    def LineSearch(self):
        alpha_max = 1.0e+10
        alpha1 = self.GetDefaultAlpha()
        increase_factor = 2.0  # amount by which we increase alpha if
                               # needed... after the 1st time we make it 4.
        if alpha1 is None:
            self.LogMessage("Line search failed unexpectedly in making sure "
                            "f(x) is finite.")
            return None

        alpha = [ 0.0, alpha1 ]
        (phi_0, phi_dash_0) = self.FunctionValueAndDerivativeForAlpha(0.0)
        phi = [ phi_0 ]
        phi_dash = [ phi_dash_0 ]

        if phi_dash_0 >= 0.0:
            self.LogMessage("Line search failed unexpectedly: not a descent "
                            "direction")
            return None

        while True:
            i = len(phi)
            alpha_i = alpha[-1]
            (phi_i, phi_dash_i) = self.FunctionValueAndDerivativeForAlpha(alpha_i)
            phi.append(phi_i)
            phi_dash.append(phi_dash_i)
            if (phi_i > phi_0 + self.c1 * alpha_i * phi_dash_0 or
                (i > 1 and phi_i > phi[-2])):
                return self.Zoom(alpha[-2], alpha_i)
            if abs(phi_dash_i) <= -self.c2 * phi_dash_0:
                self.LogMessage("Line search: accepting default alpha = {0}".format(alpha_i))
                return alpha_i
            if phi_dash_i >= 0:
                return self.Zoom(alpha_i, alpha[-2])

            # the algorithm says "choose alpha_{i+1} in (alpha_i, alpha_max)";
            # the rest of this block is implementing that.
            next_alpha = alpha_i * increase_factor
            increase_factor = 4.0  # after we double once, we get more aggressive.
            if next_alpha > alpha_max:
                # something went wrong if alpha needed to get this large;
                # most likely we'll restart BFGS.
                self.LogMessage("Line search failed unexpectedly, went "
                                "past the max.")
                return None
            # make sure the function is finite at the next alpha, if possible.
            # we don't need to worry about efficiency too much, as this check
            # for finiteness is very fast.
            while next_alpha > alpha_i * 1.2 and not self.IsFiniteForAlpha(next_alpha):
                next_alpha *= 0.9
            while next_alpha > alpha_i * 1.02 and not self.IsFiniteForAlpha(next_alpha):
                next_alpha *= 0.99
            self.LogMessage("Increasing alpha from {0} to {1} in line search".format(
                alpha_i, next_alpha))
            alpha.append(next_alpha)

    # This function, from Nocedal and Wright (alg. 3.6), is called from
    # LineSearch.  It returns an alpha value satisfying the strong Wolfe
    # conditions, or None if there was an error.
    def Zoom(self, alpha_lo, alpha_hi):
        # these function evaluations don't really happen, we use caching.
        (phi_0, phi_dash_0) = self.FunctionValueAndDerivativeForAlpha(0.0)
        (phi_lo, phi_dash_lo) = self.FunctionValueAndDerivativeForAlpha(alpha_lo)
        (phi_hi, phi_dash_hi) = self.FunctionValueAndDerivativeForAlpha(alpha_hi)

        min_diff = 1.0e-10
        while True:
            if abs(alpha_lo - alpha_hi) < min_diff:
                self.LogMessage("Line search failed, interval is too small: [{0},{1}]".format(
                    alpha_lo, alpha_hi))
                return None

            # the algorithm says "Interpolate (using quadratic, cubic or
            # bisection) to find a trial step length between alpha_lo and
            # alpha_hi".  We basically choose bisection, but because alpha_lo is
            # guaranteed to always have a "better" (lower) function value than
            # alpha_hi, we actually want to be a little bit closer to alpha_lo,
            # so we go one third of the distance between alpha_lo and alpha_hi.
            alpha_j = alpha_lo + 0.3333 * (alpha_hi - alpha_lo)
            (phi_j, phi_dash_j) = self.FunctionValueAndDerivativeForAlpha(alpha_j)
            if phi_j > phi_0 + self.c1 * alpha_j * phi_dash_0 or phi_j >= phi_lo:
                (alpha_hi, phi_hi, phi_dash_hi) = (alpha_j, phi_j, phi_dash_j)
            else:
                if abs(phi_dash_j) <= -self.c2 * phi_dash_0:
                    self.LogMessage("Acceptable alpha is {0}".format(alpha_j))
                    return alpha_j
                if phi_dash_j * (alpha_hi - alpha_lo) >= 0.0:
                    (alpha_hi, phi_hi, phi_dash_hi) = (alpha_lo, phi_lo, phi_dash_lo)
                (alpha_lo, phi_lo, phi_dash_lo) = (alpha_j, phi_j, phi_dash_j)


    # GetDefaultAlpha(), called from LineSearch(), is to be called after you
    # have set self.x and self.p.  It normally returns 1.0, but it will reduce
    # it by factors of 0.9 until the function evaluated at 2*alpha is finite.
    # This is because, generally speaking, approaching the edge of the barrier
    # function too rapidly will lead to poor function values.  Note: evaluating
    # whether the function is finite is very efficient.
    # If the function was not finite even at very tiny alpha, then something
    # probably went wrong; we'll restart BFGS in this case.
    def GetDefaultAlpha(self):
        min_alpha = 1.0e-10
        alpha = 1.0
        while alpha > min_alpha and not self.IsFiniteForAlpha(alpha * 2.0):
            alpha *= 0.9
        return alpha if alpha > min_alpha else None

    # this function, called during the line search, returns true if the function
    # is finite at the given alpha value, i.e. at self.x[-1] + alpha * self.p.
    def IsFiniteForAlpha(self, alpha):
        x = self.x[-1] + self.p * alpha
        return self.f_finite(x)

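    # returns the pair (phi(alpha), phi'(alpha)), where phi(alpha) = f(x + alpha * p);
    # by the chain rule, phi'(alpha) is the dot product of p with f'(x + alpha * p).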
    def FunctionValueAndDerivativeForAlpha(self, alpha):
        x = self.x[-1] + self.p * alpha
        (value, deriv) = self.FunctionValueAndDerivative(x)
        return (value, np.dot(self.p, deriv))

    def Converged(self):
        # we say that we're converged if either the gradient magnitude falls
        # below self.gradient_tolerance, or the amortized objective-function
        # change per iteration falls below self.progress_tolerance.
        current_gradient = self.deriv[-1]
        gradient_magnitude = math.sqrt(np.dot(current_gradient, current_gradient))
        if gradient_magnitude < self.gradient_tolerance:
            self.LogMessage("BFGS converged on iteration {0} due to gradient magnitude {1} "
                            "less than gradient tolerance {2}".format(
                len(self.x), gradient_magnitude, self.gradient_tolerance))
            return True
        n = self.progress_tolerance_num_iters
        if len(self.x) > n:
            cur_value = self.value[-1]
            prev_value = self.value[-1 - n]
            # the following should be nonnegative, since the line search
            # guarantees descent.
            change_per_iter_amortized = (prev_value - cur_value) / n
            if change_per_iter_amortized < self.progress_tolerance:
                self.LogMessage("BFGS converged on iteration {0} due to objf-change per "
                                "iteration amortized over {1} iterations = {2} < "
                                "threshold = {3}.".format(
                    len(self.x), n, change_per_iter_amortized, self.progress_tolerance))
                return True
        return False

    # this returns the function value and derivative for x, as a tuple; it
    # does caching.
    def FunctionValueAndDerivative(self, x):
        for (cached_x, value, deriv) in self.cached_evaluations:
            if np.array_equal(x, cached_x):
                return (value, deriv)
        # we didn't find it cached, so we need to actually evaluate the
        # function.  this is where it gets slow.
        (value, deriv) = self.f(x)
        self.cached_evaluations.append((x, value, deriv))
        return (value, deriv)

    def LogMessage(self, message):
        print(sys.argv[0] + ": " + message, file=sys.stderr)




def __TestFunction(x):
    dim = 15
    a = np.array(range(1, dim + 1))
    B = np.diag(range(5, dim + 5))

    # define a function f(x) = x.a + x^T B x
    value = np.dot(x, a) + np.dot(x, np.dot(B, x))

    # derivative is a + 2 B x (using the fact that B is symmetric).
    deriv = a + np.dot(B, x) * 2.0
    return (value, deriv)
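# For reference: since B is symmetric positive definite, f has the unique
# minimizer x* = -(1/2) B^{-1} a (set the gradient a + 2 B x to zero), which
# is the point the test below should converge to.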


def __TestBfgs():
    dim = 15
    x0 = np.array(range(10, dim + 10))
    (x, value, deriv, inv_hessian) = Bfgs(x0, __TestFunction, lambda x: True)

#__TestBfgs()
32 changes: 19 additions & 13 deletions scripts/optimize_metaparameters.py
@@ -3,11 +3,15 @@
 # we're using python 3.x style print but want it to work in python 2.x,
 from __future__ import print_function
 import numpy as np
-from scipy.optimize import minimize
 
 import re, os, argparse, sys, math, warnings, shutil
 from math import log
 
+# we need to add the ./internal/ subdirectory to the pythonpath before importing
+# 'bfgs'.
+sys.path = [ os.path.abspath(os.path.dirname(sys.argv[0])) + "/internal" ] + sys.path
+import bfgs
+
 parser = argparse.ArgumentParser(description="Optimizes metaparameters for LM estimation; "
                                  "this utility uses derivatives from get_objf_and_derivs.py or "
                                  "get_objf_and_derivs_split.py");
@@ -171,7 +175,8 @@ def ModifyWithBarrierFunction(x, objf, derivs):
 
 
 # this will return a 2-tuple (objf, deriv).  note, the objective function and
-# derivative are both negated because scipy only supports minimization.
+# derivative are both negated because conventionally optimization problems are
+# framed as minimization problems.
 def GetObjfAndDeriv(x):
     global iteration
     if not MetaparametersAreAllowed(x):
@@ -242,10 +247,8 @@ def GetObjfAndDeriv(x):
     print("Iteration {0}: objf={1}, deriv-magnitude={2} (with barrier function)".format(
         iteration, objf, math.sqrt(np.vdot(derivs, derivs))), file=sys.stderr)
 
-    # we need to negate the objective function and derivatives, since
-    # scipy only supports minimizing functions and we want to maximize.
-    # we actually apply a negative scale not equal to -1.0, as otherwise
-    # the first step is too small.
+    # we need to negate the objective function and derivatives, since we are
+    # minimizing.
     scale = -1.0
     return (objf * scale, derivs * scale)
 
@@ -256,20 +259,23 @@ def GetObjfAndDeriv(x):
 # and derivatives.
 iteration = 0
 
-result = minimize(GetObjfAndDeriv, x0, method='BFGS', jac=True,
-                  options={'disp': True, 'gtol': args.gradient_tolerance, 'mls':50})
-
-print("result is ", result, file=sys.stderr)
+(x, value, deriv, inv_hessian) = bfgs.Bfgs(x0, GetObjfAndDeriv, MetaparametersAreAllowed)
+
+
+#result = minimize(GetObjfAndDeriv, x0, method='BFGS', jac=True,
+#                  options={'disp': True, 'gtol': args.gradient_tolerance, 'mls':50})
+
+print("optimize_metaparameters: final x value is ", x, file=sys.stderr)
 
-WriteMetaparameters("{0}/final.metaparams".format(args.optimize_dir),
-                    result.x)
+WriteMetaparameters("{0}/final.metaparams".format(args.optimize_dir), x)
 
 old_objf = ReadObjf("{0}/0.objf".format(args.optimize_dir))
-new_objf = ReadObjf("{0}/{1}.objf".format(args.optimize_dir, iteration - 1))
+new_objf = -1.0 * value
 
 print("optimize_metaparameters.py: log-prob on dev data increased "
       "from {0} to {1} over {2} passes of derivative estimation (perplexity: {3}->{4}".format(
-          old_objf, new_objf, iteration, math.exp(-old_objf), math.exp(-new_objf)),
+      old_objf, new_objf, iteration, math.exp(-old_objf), math.exp(-new_objf)),
       file=sys.stderr)
 
 print("optimize_metaparameters.py: do `diff -y {0}/{{0,final}}.metaparams` "
