diff --git a/docs/make.jl b/docs/make.jl
index 55a0ae8b..464e7675 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -20,6 +20,7 @@ makedocs(
     "Tutorials" => [
         "Multi-precision" => "tutorials/multiprecision.md",
         "Warm-start" => "tutorials/warmstart.md",
+        "LBFGS" => "tutorials/lbfgs.md",
     ],
     "Manual" => [
         "IPM solver" => "man/solver.md",
diff --git a/docs/src/tutorials/lbfgs.md b/docs/src/tutorials/lbfgs.md
new file mode 100644
index 00000000..e4df9858
--- /dev/null
+++ b/docs/src/tutorials/lbfgs.md
@@ -0,0 +1,156 @@
# Limited-memory BFGS

```@meta
CurrentModule = MadNLP
```
```@setup lbfgs
using NLPModels
using MadNLP
using Random
using ExaModels
```

Sometimes, the second-order derivatives are simply too expensive to evaluate.
In that case, it is often a good idea to approximate the Hessian matrix.
The BFGS algorithm uses only first-order derivatives (the gradient and
transposed-Jacobian vector products) to approximate the Hessian of the
Lagrangian. LBFGS is a variant of BFGS that builds a low-rank approximation
of the Hessian matrix from the past iterates. That way, LBFGS does not have
to store a large dense matrix in memory, making the algorithm suitable for
the large-scale regime.

MadNLP implements several quasi-Newton approximations of the Hessian matrix.
On this page, we focus on the limited-memory version of the BFGS algorithm,
commonly known as LBFGS. We refer to
[this article](https://epubs.siam.org/doi/abs/10.1137/0916069)
for a detailed description of the method.

## How to set up LBFGS in MadNLP?

We look at the `elec` optimization problem from the
[COPS benchmark](https://www.mcs.anl.gov/~more/cops/):

```@example lbfgs
function elec_model(np)
    Random.seed!(1)
    # Set the starting point to a quasi-uniform distribution
    # of electrons on a unit sphere.
    theta = (2pi) .* rand(np)
    phi = pi .* rand(np)

    core = ExaModels.ExaCore(Float64)
    x = ExaModels.variable(core, 1:np; start = [cos(theta[i])*sin(phi[i]) for i=1:np])
    y = ExaModels.variable(core, 1:np; start = [sin(theta[i])*sin(phi[i]) for i=1:np])
    z = ExaModels.variable(core, 1:np; start = [cos(phi[i]) for i=1:np])
    # Coulomb potential
    itr = [(i,j) for i in 1:np-1 for j in i+1:np]
    ExaModels.objective(core, 1.0 / sqrt((x[i] - x[j])^2 + (y[i] - y[j])^2 + (z[i] - z[j])^2) for (i,j) in itr)
    # Unit-ball constraint
    ExaModels.constraint(core, x[i]^2 + y[i]^2 + z[i]^2 - 1 for i=1:np)

    return ExaModels.ExaModel(core)
end
```

The problem computes the positions of the electrons in an atom. The potential
of each electron depends on the positions of all the other electrons: all the
variables in the problem are coupled, resulting in a dense Hessian matrix.
Hence, the problem is a good candidate for a quasi-Newton algorithm.

We start by solving the problem with the default options in MadNLP,
using the dense linear solver Lapack:

```@example lbfgs
np = 10
nlp = elec_model(np)
results_hess = madnlp(
    nlp;
    linear_solver=LapackCPUSolver,
)
nothing
```

We observe that MadNLP converges in 21 iterations.

To replace the second-order derivatives with an LBFGS approximation,
pass the option `hessian_approximation=MadNLP.CompactLBFGS` to MadNLP:

```@example lbfgs
results_qn = madnlp(
    nlp;
    linear_solver=LapackCPUSolver,
    hessian_approximation=MadNLP.CompactLBFGS,
)
nothing
```

We observe that MadNLP converges in 93 iterations. As expected, the number of
Hessian evaluations is 0.
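If you want to compare the two runs programmatically, you can inspect the
statistics returned by `madnlp`. A minimal sketch, assuming the result object
exposes an `iter` field following the usual NLPModels solver-stats convention:

```julia
# Compare the iteration counts of the exact-Hessian and LBFGS runs.
# The `iter` field is assumed to follow the NLPModels solver-stats convention.
println("Exact Hessian: ", results_hess.iter, " iterations")
println("LBFGS:         ", results_qn.iter, " iterations")
```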
## How to tune the options in LBFGS?

You can tune the LBFGS algorithm with the option `quasi_newton_options`.
The option takes as input a `QuasiNewtonOptions` structure, with the following
attributes (their default values are given on the right):
- `init_strategy::BFGSInitStrategy = SCALAR1`
- `max_history::Int = 6`
- `init_value::Float64 = 1.0`
- `sigma_min::Float64 = 1e-8`
- `sigma_max::Float64 = 1e+8`

The most important parameter is `max_history`, which sets the number of vectors
used in the low-rank approximation. For instance, we can increase the history
to use the last 20 iterates:

```@example lbfgs
qn_options = MadNLP.QuasiNewtonOptions(; max_history=20)
results_qn = madnlp(
    nlp;
    linear_solver=LapackCPUSolver,
    hessian_approximation=MadNLP.CompactLBFGS,
    quasi_newton_options=qn_options,
)
nothing
```

We observe that the total number of iterations has been reduced from 93 to 60.

## How is LBFGS implemented in MadNLP?

MadNLP implements the compact LBFGS algorithm described
[in this article](https://link.springer.com/article/10.1007/bf01582063).
At each iteration, the Hessian ``W_k`` is approximated by a low-rank
positive definite matrix ``B_k``, defined as
```math
B_k = \xi_k I + U_k V_k^\top
```
with ``\xi_k > 0`` a scaling factor and ``U_k``, ``V_k`` two ``n \times 2p``
matrices. The number ``p`` is the number of vectors used in the limited-memory
updates (the parameter `max_history` in MadNLP): the larger it is, the more
accurate the low-rank approximation.

Replacing the Hessian of the Lagrangian ``W_k`` with the low-rank matrix
``B_k``, the KKT system solved in MadNLP can be rewritten as
```math
\begin{bmatrix}
\xi I + U V^\top + \Sigma_x & 0 & A^\top \\
0 & \Sigma_s & -I \\
A & -I & 0
\end{bmatrix}
\begin{bmatrix}
\Delta x \\ \Delta s \\ \Delta y
\end{bmatrix}
=
\begin{bmatrix}
r_1 \\ r_2 \\ r_3
\end{bmatrix}
```
The KKT system has a low-rank structure we can exploit using the
Sherman-Morrison formula. The method is detailed, e.g., in Section 3.8 of
[this paper](https://link.springer.com/article/10.1007/s10107-004-0560-5).

!!! info
    As MadNLP is designed to solve constrained optimization problems,
    it does not approximate the inverse of the Hessian matrix, as is done
    in most implementations of the LBFGS algorithm.
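To make the Sherman-Morrison step concrete, below is a minimal standalone
sketch (not MadNLP's internal code; all names are illustrative) of how a
system ``(D + UV^\top)\Delta x = r`` with diagonal ``D = \xi I + \Sigma_x``
reduces to one diagonal solve plus the factorization of a small
``2p \times 2p`` capacitance matrix:

```julia
using LinearAlgebra

# Solve (D + U*V')x = r via Sherman-Morrison-Woodbury, where D is diagonal
# (stored as the vector d) and U, V are n × 2p with p ≪ n:
#   (D + UV')⁻¹r = D⁻¹r - D⁻¹U (I + V'D⁻¹U)⁻¹ V'D⁻¹r
function smw_solve(d, U, V, r)
    Dinv_r = r ./ d           # apply D⁻¹ to the right-hand side
    Dinv_U = U ./ d           # apply D⁻¹ to each column of U
    C = I + V' * Dinv_U       # small 2p × 2p capacitance matrix
    return Dinv_r - Dinv_U * (C \ (V' * Dinv_r))
end

n, p = 1000, 6
d = rand(n) .+ 1.0            # diagonal of D = ξI + Σₓ, kept positive
U, V = randn(n, 2p), randn(n, 2p)
r = randn(n)
x = smw_solve(d, U, V, r)
norm((Diagonal(d) + U * V') * x - r)  # ≈ 0
```

Only the ``2p \times 2p`` capacitance matrix is factorized densely, which is
why a moderate `max_history` keeps the per-iteration cost close to that of a
diagonal solve.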
diff --git a/src/IPM/IPM.jl b/src/IPM/IPM.jl
index 65fa4e74..72d9ef72 100644
--- a/src/IPM/IPM.jl
+++ b/src/IPM/IPM.jl
@@ -159,6 +159,7 @@ function MadNLPSolver(nlp::AbstractNLPModel{T,VT}; kwargs...) where {T, VT}
         ipm_opt.linear_solver;
         hessian_approximation=ipm_opt.hessian_approximation,
         opt_linear_solver=options.linear_solver,
+        qn_options=ipm_opt.quasi_newton_options,
     )

     @trace(logger,"Initializing iterative solver.")
diff --git a/src/KKT/Dense/augmented.jl b/src/KKT/Dense/augmented.jl
index 5ecee833..8f66adbf 100644
--- a/src/KKT/Dense/augmented.jl
+++ b/src/KKT/Dense/augmented.jl
@@ -46,6 +46,7 @@ function create_kkt_system(
     linear_solver::Type;
     opt_linear_solver=default_options(linear_solver),
     hessian_approximation=ExactHessian,
+    qn_options=QuasiNewtonOptions(),
 ) where {T, VT}

     ind_ineq = ind_cons.ind_ineq
@@ -80,7 +81,7 @@ function create_kkt_system(
     fill!(du_diag, zero(T))
     fill!(diag_hess, zero(T))

-    quasi_newton = create_quasi_newton(hessian_approximation, cb, n)
+    quasi_newton = create_quasi_newton(hessian_approximation, cb, n; options=qn_options)
     _linear_solver = linear_solver(aug_com; opt = opt_linear_solver)

     return DenseKKTSystem(
diff --git a/src/KKT/Dense/condensed.jl b/src/KKT/Dense/condensed.jl
index f6ae1a17..a8246e06 100644
--- a/src/KKT/Dense/condensed.jl
+++ b/src/KKT/Dense/condensed.jl
@@ -56,6 +56,7 @@ function create_kkt_system(
     linear_solver::Type;
     opt_linear_solver=default_options(linear_solver),
     hessian_approximation=ExactHessian,
+    qn_options=QuasiNewtonOptions(),
 ) where {T, VT}

     n = cb.nvar
@@ -93,7 +94,7 @@ function create_kkt_system(
     ind_eq_shifted = ind_cons.ind_eq .+ n .+ ns
     ind_ineq_shifted = ind_cons.ind_ineq .+ n .+ ns

-    quasi_newton = create_quasi_newton(hessian_approximation, cb, n)
+    quasi_newton = create_quasi_newton(hessian_approximation, cb, n; options=qn_options)
     _linear_solver = linear_solver(aug_com; opt = opt_linear_solver)

     return DenseCondensedKKTSystem(
diff --git a/src/KKT/Sparse/augmented.jl b/src/KKT/Sparse/augmented.jl
index a54d0464..28a3ed65 100644
--- a/src/KKT/Sparse/augmented.jl
+++ b/src/KKT/Sparse/augmented.jl
@@ -44,6 +44,7 @@ function create_kkt_system(
     linear_solver::Type;
     opt_linear_solver=default_options(linear_solver),
     hessian_approximation=ExactHessian,
+    qn_options=QuasiNewtonOptions(),
 ) where {T,VT}

     n_slack = length(ind_cons.ind_ineq)
@@ -56,7 +57,7 @@ function create_kkt_system(
     jac_sparsity_I = create_array(cb, Int32, cb.nnzj)
     jac_sparsity_J = create_array(cb, Int32, cb.nnzj)
     _jac_sparsity_wrapper!(cb,jac_sparsity_I, jac_sparsity_J)

-    quasi_newton = create_quasi_newton(hessian_approximation, cb, n)
+    quasi_newton = create_quasi_newton(hessian_approximation, cb, n; options=qn_options)
     hess_sparsity_I, hess_sparsity_J = build_hessian_structure(cb, hessian_approximation)

     nlb = length(ind_cons.ind_lb)
diff --git a/src/KKT/Sparse/condensed.jl b/src/KKT/Sparse/condensed.jl
index 694231c7..32634b41 100644
--- a/src/KKT/Sparse/condensed.jl
+++ b/src/KKT/Sparse/condensed.jl
@@ -59,6 +59,7 @@ function create_kkt_system(
     linear_solver::Type;
     opt_linear_solver=default_options(linear_solver),
     hessian_approximation=ExactHessian,
+    qn_options=QuasiNewtonOptions(),
 ) where {T, VT}
     ind_ineq = ind_cons.ind_ineq
     n = cb.nvar
@@ -74,7 +75,7 @@ function create_kkt_system(
     jac_sparsity_I = create_array(cb, Int32, cb.nnzj)
     jac_sparsity_J = create_array(cb, Int32, cb.nnzj)
     _jac_sparsity_wrapper!(cb,jac_sparsity_I, jac_sparsity_J)

-    quasi_newton = create_quasi_newton(hessian_approximation, cb, n)
+    quasi_newton = create_quasi_newton(hessian_approximation, cb, n; options=qn_options)
     hess_sparsity_I, hess_sparsity_J = build_hessian_structure(cb, hessian_approximation)
     force_lower_triangular!(hess_sparsity_I,hess_sparsity_J)
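The same one-line change recurs in every `create_kkt_system` backend in this
patch: a `qn_options` keyword defaulting to `QuasiNewtonOptions()`, threaded
through to `create_quasi_newton`, so existing callers are unaffected. For
reference, a hedged sketch of calling the updated API directly (it mirrors
the test added at the end of this diff; `HS15Model` comes from MadNLPTests):

```julia
using MadNLP, MadNLPTests

# Build a sparse KKT system by hand with custom LBFGS options, using the
# new `qn_options` keyword (signature taken from this diff).
nlp = MadNLPTests.HS15Model()
ind_cons = MadNLP.get_index_constraints(nlp)
cb = MadNLP.create_callback(MadNLP.SparseCallback, nlp)
kkt = MadNLP.create_kkt_system(
    MadNLP.SparseKKTSystem,
    cb,
    ind_cons,
    MadNLP.LapackCPUSolver;
    hessian_approximation=MadNLP.CompactLBFGS,
    qn_options=MadNLP.QuasiNewtonOptions(; max_history=10),
)
```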
diff --git a/src/KKT/Sparse/scaled_augmented.jl b/src/KKT/Sparse/scaled_augmented.jl
index b7316a30..25c27233 100644
--- a/src/KKT/Sparse/scaled_augmented.jl
+++ b/src/KKT/Sparse/scaled_augmented.jl
@@ -73,6 +73,7 @@ function create_kkt_system(
     linear_solver::Type;
     opt_linear_solver=default_options(linear_solver),
     hessian_approximation=ExactHessian,
+    qn_options=QuasiNewtonOptions(),
 ) where {T,VT}

     n_slack = length(ind_cons.ind_ineq)
@@ -85,7 +86,7 @@ function create_kkt_system(
     jac_sparsity_I = create_array(cb, Int32, cb.nnzj)
     jac_sparsity_J = create_array(cb, Int32, cb.nnzj)
     _jac_sparsity_wrapper!(cb,jac_sparsity_I, jac_sparsity_J)

-    quasi_newton = create_quasi_newton(hessian_approximation, cb, n)
+    quasi_newton = create_quasi_newton(hessian_approximation, cb, n; options=qn_options)
     hess_sparsity_I, hess_sparsity_J = build_hessian_structure(cb, hessian_approximation)

     nlb = length(ind_cons.ind_lb)
diff --git a/src/KKT/Sparse/unreduced.jl b/src/KKT/Sparse/unreduced.jl
index 8cd48ae1..9fee6c11 100644
--- a/src/KKT/Sparse/unreduced.jl
+++ b/src/KKT/Sparse/unreduced.jl
@@ -51,6 +51,7 @@ function create_kkt_system(
     linear_solver::Type;
     opt_linear_solver=default_options(linear_solver),
     hessian_approximation=ExactHessian,
+    qn_options=QuasiNewtonOptions(),
 ) where {T, VT}
     ind_ineq = ind_cons.ind_ineq
     ind_lb = ind_cons.ind_lb
@@ -64,7 +65,7 @@ function create_kkt_system(
     m = cb.ncon

     # Quasi-newton
-    quasi_newton = create_quasi_newton(hessian_approximation, cb, n)
+    quasi_newton = create_quasi_newton(hessian_approximation, cb, n; options=qn_options)

     # Evaluate sparsity pattern
     jac_sparsity_I = create_array(cb, Int32, cb.nnzj)
diff --git a/test/kkt_test.jl b/test/kkt_test.jl
index 4686d3cf..41a2fa0c 100644
--- a/test/kkt_test.jl
+++ b/test/kkt_test.jl
@@ -34,7 +34,6 @@ end
         (MadNLP.DenseCondensedKKTSystem, MadNLP.DenseCallback),
     ]
     linear_solver = MadNLP.LapackCPUSolver
-    cnt = MadNLP.MadNLPCounters(; start_time=time())

     nlp = MadNLPTests.HS15Model()
     ind_cons = MadNLP.get_index_constraints(nlp)
@@ -52,3 +51,30 @@ end
     MadNLPTests.test_kkt_system(kkt, cb)
 end
+
+@testset "[KKT system] $(KKTSystem)+LBFGS" for (KKTSystem, Callback) in [
+    (MadNLP.SparseKKTSystem, MadNLP.SparseCallback),
+    (MadNLP.SparseUnreducedKKTSystem, MadNLP.SparseCallback),
+    (MadNLP.SparseCondensedKKTSystem, MadNLP.SparseCallback),
+    (MadNLP.ScaledSparseKKTSystem, MadNLP.SparseCallback),
+]
+    linear_solver = MadNLP.LapackCPUSolver
+    nlp = MadNLPTests.HS15Model()
+    ind_cons = MadNLP.get_index_constraints(nlp)
+    cb = MadNLP.create_callback(Callback, nlp)
+    # Define options for LBFGS
+    p = 20
+    qn_options = MadNLP.QuasiNewtonOptions(; max_history=p)
+
+    kkt = MadNLP.create_kkt_system(
+        KKTSystem,
+        cb,
+        ind_cons,
+        linear_solver;
+        hessian_approximation=MadNLP.CompactLBFGS,
+        qn_options=qn_options,
+    )
+    @test isa(kkt.quasi_newton, MadNLP.CompactLBFGS)
+    # Check that the options are correctly passed to the quasi-Newton object
+    @test kkt.quasi_newton.max_mem == p
+end