From 5a6f0b9177c9fbf4e809692c570c18f61d789c91 Mon Sep 17 00:00:00 2001 From: Guillaume Dalle <22795598+gdalle@users.noreply.github.com> Date: Wed, 17 Jul 2024 09:55:31 +0200 Subject: [PATCH 1/4] Switch from AbstractDifferentiation to DifferentiationInterface --- Project.toml | 6 ++- README.md | 2 +- benchmark/benchmarks.jl | 48 +++++++++---------- docs/Project.toml | 2 +- docs/src/examples/sparse_linear_regression.jl | 10 ++-- docs/src/guide/custom_objectives.jl | 47 +++++++++--------- docs/src/guide/getting_started.jl | 11 ++--- docs/src/index.md | 2 +- src/ProximalAlgorithms.jl | 26 +++++----- src/algorithms/davis_yin.jl | 7 ++- src/algorithms/fast_forward_backward.jl | 7 ++- src/algorithms/forward_backward.jl | 7 ++- src/algorithms/li_lin.jl | 10 ++-- src/algorithms/panoc.jl | 15 +++--- src/algorithms/panocplus.jl | 15 +++--- src/algorithms/primal_dual.jl | 8 ++-- src/algorithms/sfista.jl | 7 ++- src/algorithms/zerofpr.jl | 11 ++--- src/utilities/fb_tools.jl | 36 ++++++-------- test/Project.toml | 2 +- test/problems/test_elasticnet.jl | 8 ++-- test/problems/test_equivalence.jl | 6 +-- test/problems/test_lasso_small.jl | 6 +-- .../test_lasso_small_strongly_convex.jl | 4 +- test/problems/test_linear_programs.jl | 8 ++-- test/problems/test_nonconvex_qp.jl | 6 +-- test/problems/test_sparse_logistic_small.jl | 6 +-- test/problems/test_verbose.jl | 6 +-- test/runtests.jl | 8 ++-- test/utilities/test_ad.jl | 10 ++-- test/utilities/test_fb_tools.jl | 2 +- 31 files changed, 167 insertions(+), 182 deletions(-) diff --git a/Project.toml b/Project.toml index e8e7aab..41a6248 100644 --- a/Project.toml +++ b/Project.toml @@ -3,13 +3,15 @@ uuid = "140ffc9f-1907-541a-a177-7475e0a401e9" version = "0.6.0" [deps] -AbstractDifferentiation = "c29ec348-61ec-40c8-8164-b8c60e9d9f3d" +ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b" +DifferentiationInterface = "a0c0ee7d-e4b9-4e03-894e-1c5f64a51d63" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" ProximalCore = "dc4f5ac2-75d1-4f31-931e-60435d74994b" [compat] -AbstractDifferentiation = "0.6" +ADTypes = "1.5.3" +DifferentiationInterface = "0.5.8" LinearAlgebra = "1.2" Printf = "1.2" ProximalCore = "0.1" diff --git a/README.md b/README.md index 1ce9dd2..c52c6a3 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ Implemented algorithms include: Check out [this section](https://juliafirstorder.github.io/ProximalAlgorithms.jl/stable/guide/implemented_algorithms/) for an overview of the available algorithms. Algorithms rely on: -- [AbstractDifferentiation.jl](https://github.com/JuliaDiff/AbstractDifferentiation.jl) for automatic differentiation (but you can easily bring your own gradients) +- [DifferentiationInterface.jl](https://github.com/gdalle/DifferentiationInterface.jl) for automatic differentiation (but you can easily bring your own gradients) - the [ProximalCore API](https://github.com/JuliaFirstOrder/ProximalCore.jl) for proximal mappings, projections, etc, to handle non-differentiable terms (see for example [ProximalOperators](https://github.com/JuliaFirstOrder/ProximalOperators.jl) for an extensive collection of functions). 
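For orientation before diving into the diff: under the new setup, both routes named above coexist. A minimal sketch of each (not part of the patch; the `MyLeastSquares` type and the random data are purely illustrative), showing that a custom gradient method now returns the gradient itself rather than a closure:

```julia
using LinearAlgebra
using ProximalAlgorithms
using Zygote
using DifferentiationInterface: AutoZygote

A, b = randn(5, 3), randn(5)

# Route 1: wrap the function so DifferentiationInterface drives the chosen AD backend.
f_ad = ProximalAlgorithms.AutoDifferentiable(x -> norm(A * x - b)^2 / 2, AutoZygote())

# Route 2: bring your own gradient by adding a method to `value_and_gradient`.
struct MyLeastSquares{TA,Tb}
    A::TA
    b::Tb
end

(f::MyLeastSquares)(x) = norm(f.A * x - f.b)^2 / 2

function ProximalAlgorithms.value_and_gradient(f::MyLeastSquares, x)
    res = f.A * x - f.b
    return norm(res)^2 / 2, f.A' * res  # value and gradient of x -> ||Ax - b||^2 / 2
end

f_manual = MyLeastSquares(A, b)
```

Either object can then be passed as the smooth term `f` of any solver in the package.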
## Documentation diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index 035b35d..27032f2 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -8,12 +8,12 @@ using FileIO const SUITE = BenchmarkGroup() -function ProximalAlgorithms.value_and_gradient_closure( +function ProximalAlgorithms.value_and_gradient( f::ProximalOperators.LeastSquaresDirect, x, ) res = f.A * x - f.b - norm(res)^2, () -> f.A' * res + norm(res)^2, f.A' * res end struct SquaredDistance{Tb} @@ -22,9 +22,9 @@ end (f::SquaredDistance)(x) = norm(x - f.b)^2 -function ProximalAlgorithms.value_and_gradient_closure(f::SquaredDistance, x) +function ProximalAlgorithms.value_and_gradient(f::SquaredDistance, x) diff = x - f.b - norm(diff)^2, () -> diff + norm(diff)^2, diff end for (benchmark_name, file_name) in [ @@ -45,56 +45,56 @@ for (benchmark_name, file_name) in [ m, n = size(A) SUITE[k]["ForwardBackward"] = - @benchmarkable solver(x0 = x0, f = f, g = g) setup = begin - solver = ProximalAlgorithms.ForwardBackward(tol = 1e-6) + @benchmarkable solver(x0=x0, f=f, g=g) setup = begin + solver = ProximalAlgorithms.ForwardBackward(tol=1e-6) x0 = zeros($T, size($A, 2)) f = LeastSquares($A, $b) g = NormL1($lam) end SUITE[k]["FastForwardBackward"] = - @benchmarkable solver(x0 = x0, f = f, g = g) setup = begin - solver = ProximalAlgorithms.FastForwardBackward(tol = 1e-6) + @benchmarkable solver(x0=x0, f=f, g=g) setup = begin + solver = ProximalAlgorithms.FastForwardBackward(tol=1e-6) x0 = zeros($T, size($A, 2)) f = LeastSquares($A, $b) g = NormL1($lam) end SUITE[k]["ZeroFPR"] = - @benchmarkable solver(x0 = x0, f = f, A = $A, g = g) setup = begin - solver = ProximalAlgorithms.ZeroFPR(tol = 1e-6) + @benchmarkable solver(x0=x0, f=f, A=$A, g=g) setup = begin + solver = ProximalAlgorithms.ZeroFPR(tol=1e-6) x0 = zeros($T, size($A, 2)) f = SquaredDistance($b) g = NormL1($lam) end SUITE[k]["PANOC"] = - @benchmarkable solver(x0 = x0, f = f, A = $A, g = g) setup = begin - solver = ProximalAlgorithms.PANOC(tol = 1e-6) + @benchmarkable solver(x0=x0, f=f, A=$A, g=g) setup = begin + solver = ProximalAlgorithms.PANOC(tol=1e-6) x0 = zeros($T, size($A, 2)) f = SquaredDistance($b) g = NormL1($lam) end SUITE[k]["PANOCplus"] = - @benchmarkable solver(x0 = x0, f = f, A = $A, g = g) setup = begin - solver = ProximalAlgorithms.PANOCplus(tol = 1e-6) + @benchmarkable solver(x0=x0, f=f, A=$A, g=g) setup = begin + solver = ProximalAlgorithms.PANOCplus(tol=1e-6) x0 = zeros($T, size($A, 2)) f = SquaredDistance($b) g = NormL1($lam) end SUITE[k]["DouglasRachford"] = - @benchmarkable solver(x0 = x0, f = f, g = g, gamma = $R(1)) setup = begin - solver = ProximalAlgorithms.DouglasRachford(tol = 1e-6) + @benchmarkable solver(x0=x0, f=f, g=g, gamma=$R(1)) setup = begin + solver = ProximalAlgorithms.DouglasRachford(tol=1e-6) x0 = zeros($T, size($A, 2)) f = LeastSquares($A, $b) g = NormL1($lam) end SUITE[k]["DRLS"] = - @benchmarkable solver(x0 = x0, f = f, g = g, Lf = Lf) setup = begin - solver = ProximalAlgorithms.DRLS(tol = 1e-6) + @benchmarkable solver(x0=x0, f=f, g=g, Lf=Lf) setup = begin + solver = ProximalAlgorithms.DRLS(tol=1e-6) x0 = zeros($T, size($A, 2)) f = LeastSquares($A, $b) Lf = opnorm(($A)' * $A) @@ -102,11 +102,11 @@ for (benchmark_name, file_name) in [ end SUITE[k]["AFBA-1"] = - @benchmarkable solver(x0 = x0, y0 = y0, f = f, g = g, beta_f = beta_f) setup = + @benchmarkable solver(x0=x0, y0=y0, f=f, g=g, beta_f=beta_f) setup = begin beta_f = opnorm($A)^2 solver = - ProximalAlgorithms.AFBA(theta = $R(1), mu = $R(1), tol = 
$R(1e-6)) + ProximalAlgorithms.AFBA(theta=$R(1), mu=$R(1), tol=$R(1e-6)) x0 = zeros($T, size($A, 2)) y0 = zeros($T, size($A, 2)) f = LeastSquares($A, $b) @@ -114,10 +114,10 @@ for (benchmark_name, file_name) in [ end SUITE[k]["AFBA-2"] = - @benchmarkable solver(x0 = x0, y0 = y0, h = h, L = $A, g = g) setup = begin + @benchmarkable solver(x0=x0, y0=y0, h=h, L=$A, g=g) setup = begin beta_f = opnorm($A)^2 solver = - ProximalAlgorithms.AFBA(theta = $R(1), mu = $R(1), tol = $R(1e-6)) + ProximalAlgorithms.AFBA(theta=$R(1), mu=$R(1), tol=$R(1e-6)) x0 = zeros($T, size($A, 2)) y0 = zeros($T, size($A, 1)) h = Translate(SqrNormL2(), -$b) @@ -125,8 +125,8 @@ for (benchmark_name, file_name) in [ end SUITE[k]["SFISTA"] = - @benchmarkable solver(x0 = x0, f = f, Lf = Lf, g = g) setup = begin - solver = ProximalAlgorithms.SFISTA(tol = $R(1e-3)) + @benchmarkable solver(x0=x0, f=f, Lf=Lf, g=g) setup = begin + solver = ProximalAlgorithms.SFISTA(tol=$R(1e-3)) x0 = zeros($T, size($A, 2)) f = LeastSquares($A, $b) g = NormL1($lam) diff --git a/docs/Project.toml b/docs/Project.toml index f38368c..64d2acd 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,5 +1,5 @@ [deps] -AbstractDifferentiation = "c29ec348-61ec-40c8-8164-b8c60e9d9f3d" +DifferentiationInterface = "a0c0ee7d-e4b9-4e03-894e-1c5f64a51d63" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" DocumenterCitations = "daee34ce-89f3-4625-b898-19384cb65244" HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" diff --git a/docs/src/examples/sparse_linear_regression.jl b/docs/src/examples/sparse_linear_regression.jl index 771a596..8952a86 100644 --- a/docs/src/examples/sparse_linear_regression.jl +++ b/docs/src/examples/sparse_linear_regression.jl @@ -35,8 +35,8 @@ n_training, n_features = size(training_input) using LinearAlgebra using Statistics -input_loc = mean(training_input, dims = 1) |> vec -input_scale = std(training_input, dims = 1) |> vec +input_loc = mean(training_input, dims=1) |> vec +input_scale = std(training_input, dims=1) |> vec linear_model(wb, input) = input * wb[1:end-1] .+ wb[end] @@ -53,12 +53,12 @@ end mean_squared_error(label, output) = mean((output .- label) .^ 2) / 2 using Zygote -using AbstractDifferentiation: ZygoteBackend +using DifferentiationInterface: AutoZygote using ProximalAlgorithms training_loss = ProximalAlgorithms.AutoDifferentiable( wb -> mean_squared_error(training_label, standardized_linear_model(wb, training_input)), - ZygoteBackend(), + AutoZygote(), ) # As regularization we will use the L1 norm, implemented in [ProximalOperators](https://github.com/JuliaFirstOrder/ProximalOperators.jl): @@ -73,7 +73,7 @@ reg = ProximalOperators.NormL1(1) # and the objective terms `f=training_loss` (smooth) and `g=reg` (non smooth). ffb = ProximalAlgorithms.FastForwardBackward() -solution, iterations = ffb(x0 = zeros(n_features + 1), f = training_loss, g = reg) +solution, iterations = ffb(x0=zeros(n_features + 1), f=training_loss, g=reg) # We can now check how well the trained model performs on the test portion of our data. diff --git a/docs/src/guide/custom_objectives.jl b/docs/src/guide/custom_objectives.jl index 95adc32..cf61cc8 100644 --- a/docs/src/guide/custom_objectives.jl +++ b/docs/src/guide/custom_objectives.jl @@ -12,18 +12,18 @@ # # Defining the proximal mapping for a custom function type requires adding a method for [`ProximalCore.prox!`](@ref). 
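As a concrete illustration of the sentence above (not part of the patch: ProximalOperators already provides `NormL1`, so the `MyNormL1` type below is purely didactic and assumes real-valued arguments), a `prox!` method writes the proximal point into `y` and returns the function value at that point:

```julia
using ProximalCore

struct MyNormL1{R}
    lambda::R
end

(f::MyNormL1)(x) = f.lambda * sum(abs, x)

# prox_{gamma*f}(x) for f = lambda * ||.||_1 is componentwise soft-thresholding at gamma * lambda.
function ProximalCore.prox!(y, f::MyNormL1, x, gamma)
    t = gamma * f.lambda
    y .= sign.(x) .* max.(abs.(x) .- t, zero(t))
    return f(y)
end
```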
# -# To compute gradients, algorithms use [`value_and_gradient_closure`](@ref): -# this relies on [AbstractDifferentiation](https://github.com/JuliaDiff/AbstractDifferentiation.jl), for automatic differentiation +# To compute gradients, algorithms use [`value_and_gradient`](@ref): +# this relies on [DifferentiationInterface.jl](https://github.com/gdalle/DifferentiationInterface.jl), for automatic differentiation # with any of its supported backends, when functions are wrapped in [`AutoDifferentiable`](@ref), # as the examples below show. # # If however you would like to provide your own gradient implementation (e.g. for efficiency reasons), -# you can simply implement a method for [`value_and_gradient_closure`](@ref) on your own function type. +# you can simply implement a method for [`value_and_gradient`](@ref) on your own function type. # # ```@docs # ProximalCore.prox # ProximalCore.prox! -# ProximalAlgorithms.value_and_gradient_closure +# ProximalAlgorithms.value_and_gradient # ProximalAlgorithms.AutoDifferentiable # ``` # @@ -32,12 +32,12 @@ # Let's try to minimize the celebrated Rosenbrock function, but constrained to the unit norm ball. The cost function is using Zygote -using AbstractDifferentiation: ZygoteBackend +using DifferentiationInterface: AutoZygote using ProximalAlgorithms rosenbrock2D = ProximalAlgorithms.AutoDifferentiable( x -> 100 * (x[2] - x[1]^2)^2 + (1 - x[1])^2, - ZygoteBackend(), + AutoZygote(), ) # To enforce the constraint, we define the indicator of the unit ball, together with its proximal mapping: @@ -63,7 +63,7 @@ end # We can now minimize the function, for which we will use [`PANOC`](@ref), which is a Newton-type method: panoc = ProximalAlgorithms.PANOC() -solution, iterations = panoc(x0 = -ones(2), f = rosenbrock2D, g = IndUnitBall()) +solution, iterations = panoc(x0=-ones(2), f=rosenbrock2D, g=IndUnitBall()) # Plotting the solution against the cost function contour and constraint, gives an idea of its correctness. 
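A quick numerical sanity check is also possible (illustrative, not part of the patch; it reuses `rosenbrock2D` and `solution` from the code above together with the new `value_and_gradient`): since the unconstrained minimizer of the Rosenbrock function lies outside the unit ball, the computed point should sit on the boundary with the negative gradient parallel to the outward normal, which for the unit ball is the point itself.

```julia
using LinearAlgebra

_, grad = ProximalAlgorithms.value_and_gradient(rosenbrock2D, solution)
norm(solution)  # should be numerically equal to 1: the constraint is active
# cosine between -grad and the outward normal; close to 1 at a stationary boundary point
dot(-grad, solution) / (norm(grad) * norm(solution))
```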
@@ -73,17 +73,17 @@ contour( -2:0.1:2, -2:0.1:2, (x, y) -> rosenbrock2D([x, y]), - fill = true, - framestyle = :none, - background = nothing, + fill=true, + framestyle=:none, + background=nothing, ) -plot!(Shape(cos.(0:0.01:2*pi), sin.(0:0.01:2*pi)), opacity = 0.5, label = "feasible set") +plot!(Shape(cos.(0:0.01:2*pi), sin.(0:0.01:2*pi)), opacity=0.5, label="feasible set") scatter!( [solution[1]], [solution[2]], - color = :red, - markershape = :star5, - label = "computed solution", + color=:red, + markershape=:star5, + label="computed solution", ) # ## Example: counting operations @@ -105,16 +105,17 @@ end Counting(f::T) where {T} = Counting{T}(f, 0, 0, 0) -# Now we only need to intercept any call to [`value_and_gradient_closure`](@ref) and [`prox!`](@ref) and increase counters there: +function (f::Counting)(x) + f.eval_count += 1 + return f.f(x) +end -function ProximalAlgorithms.value_and_gradient_closure(f::Counting, x) +# Now we only need to intercept any call to [`value_and_gradient`](@ref) and [`prox!`](@ref) and increase counters there: + +function ProximalAlgorithms.value_and_gradient(f::Counting, x) f.eval_count += 1 - fx, pb = ProximalAlgorithms.value_and_gradient_closure(f.f, x) - function counting_pullback() - f.gradient_count += 1 - return pb() - end - return fx, counting_pullback + f.gradient_count += 1 + return ProximalAlgorithms.value_and_gradient(f.f, x) end function ProximalCore.prox!(y, f::Counting, x, gamma) @@ -127,7 +128,7 @@ end f = Counting(rosenbrock2D) g = Counting(IndUnitBall()) -solution, iterations = panoc(x0 = -ones(2), f = f, g = g) +solution, iterations = panoc(x0=-ones(2), f=f, g=g) # and check how many operations where actually performed: diff --git a/docs/src/guide/getting_started.jl b/docs/src/guide/getting_started.jl index 2b4c870..71defe7 100644 --- a/docs/src/guide/getting_started.jl +++ b/docs/src/guide/getting_started.jl @@ -20,7 +20,7 @@ # The literature on proximal operators and algorithms is vast: for an overview, one can refer to [Parikh2014](@cite), [Beck2017](@cite). # # To evaluate these first-order primitives, in ProximalAlgorithms: -# * ``\nabla f_i`` falls back to using automatic differentiation (as provided by [AbstractDifferentiation](https://github.com/JuliaDiff/AbstractDifferentiation.jl) and all of its backends). +# * ``\nabla f_i`` falls back to using automatic differentiation (as provided by [DifferentiationInterface.jl](https://github.com/gdalle/DifferentiationInterface.jl) and all of its backends). # * ``\operatorname{prox}_{f_i}`` relies on the intereface of [ProximalOperators](https://github.com/JuliaFirstOrder/ProximalOperators.jl) (>= 0.15). # Both of the above can be implemented for custom function types, as [documented here](@ref custom_terms). 
# @@ -52,13 +52,13 @@ using LinearAlgebra using Zygote -using AbstractDifferentiation: ZygoteBackend +using DifferentiationInterface: AutoZygote using ProximalOperators using ProximalAlgorithms quadratic_cost = ProximalAlgorithms.AutoDifferentiable( x -> dot([3.4 1.2; 1.2 4.5] * x, x) / 2 + dot([-2.3, 9.9], x), - ZygoteBackend(), + AutoZygote(), ) box_indicator = ProximalOperators.IndBox(0, 1) @@ -72,10 +72,9 @@ ffb = ProximalAlgorithms.FastForwardBackward(maxit = 1000, tol = 1e-5, verbose = solution, iterations = ffb(x0 = ones(2), f = quadratic_cost, g = box_indicator) # We can verify the correctness of the solution by checking that the negative gradient is orthogonal to the constraints, pointing outwards: -# for this, we just evaluate the closure `cl` returned as second output of [`value_and_gradient_closure`](@ref). +# for this, we just evaluate the second output of [`value_and_gradient`](@ref). -v, cl = ProximalAlgorithms.value_and_gradient_closure(quadratic_cost, solution) --cl() +last(ProximalAlgorithms.value_and_gradient(quadratic_cost, solution)) # Or by plotting the solution against the cost function and constraint: diff --git a/docs/src/index.md b/docs/src/index.md index ef8c44a..ef7354d 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -14,7 +14,7 @@ Implemented algorithms include: Check out [this section](@ref problems_algorithms) for an overview of the available algorithms. Algorithms rely on: -- [AbstractDifferentiation.jl](https://github.com/JuliaDiff/AbstractDifferentiation.jl) for automatic differentiation (but you can easily bring your own gradients), +- [DifferentiationInterface.jl](https://github.com/gdalle/DifferentiationInterface.jl) for automatic differentiation (but you can easily bring your own gradients), - the [ProximalCore API](https://github.com/JuliaFirstOrder/ProximalCore.jl) for proximal mappings, projections, etc, to handle non-differentiable terms (see for example [ProximalOperators](https://github.com/JuliaFirstOrder/ProximalOperators.jl) for an extensive collection of functions). !!! note diff --git a/src/ProximalAlgorithms.jl b/src/ProximalAlgorithms.jl index fb43534..e248613 100644 --- a/src/ProximalAlgorithms.jl +++ b/src/ProximalAlgorithms.jl @@ -1,6 +1,7 @@ module ProximalAlgorithms -using AbstractDifferentiation +using ADTypes: ADTypes +using DifferentiationInterface: DifferentiationInterface using ProximalCore using ProximalCore: prox, prox! @@ -12,11 +13,11 @@ const Maybe{T} = Union{T,Nothing} Callable struct wrapping function `f` to be auto-differentiated using `backend`. -When called, it evaluates the same as `f`, while [`value_and_gradient_closure`](@ref) +When called, it evaluates the same as `f`, while its gradient is implemented using `backend` for automatic differentiation. -The backend can be any from [AbstractDifferentiation](https://github.com/JuliaDiff/AbstractDifferentiation.jl). +The backend can be any of those supported by [DifferentiationInterface.jl](https://github.com/gdalle/DifferentiationInterface.jl). """ -struct AutoDifferentiable{F,B} +struct AutoDifferentiable{F,B<:ADTypes.AbstractADType} f::F backend::B end @@ -24,21 +25,18 @@ end (f::AutoDifferentiable)(x) = f.f(x) """ - value_and_gradient_closure(f, x) + value_and_gradient(f, x) -Return a tuple containing the value of `f` at `x`, and a closure `cl`. - -Function `cl`, once called, yields the gradient of `f` at `x`. +Return a tuple containing the value of `f` at `x` and the gradient of `f` at `x`. 
""" -value_and_gradient_closure +value_and_gradient -function value_and_gradient_closure(f::AutoDifferentiable, x) - fx, pb = AbstractDifferentiation.value_and_pullback_function(f.backend, f.f, x) - return fx, () -> pb(one(fx))[1] +function value_and_gradient(f::AutoDifferentiable, x) + return DifferentiationInterface.value_and_gradient(f.f, f.backend, x) end -function value_and_gradient_closure(f::ProximalCore.Zero, x) - f(x), () -> zero(x) +function value_and_gradient(f::ProximalCore.Zero, x) + return f(x), zero(x) end # various utilities diff --git a/src/algorithms/davis_yin.jl b/src/algorithms/davis_yin.jl index 1c633ee..55a4c79 100644 --- a/src/algorithms/davis_yin.jl +++ b/src/algorithms/davis_yin.jl @@ -56,8 +56,7 @@ end function Base.iterate(iter::DavisYinIteration) z = copy(iter.x0) xg, = prox(iter.g, z, iter.gamma) - f_xg, cl = value_and_gradient_closure(iter.f, xg) - grad_f_xg = cl() + f_xg, grad_f_xg = value_and_gradient(iter.f, xg) z_half = 2 .* xg .- z .- iter.gamma .* grad_f_xg xh, = prox(iter.h, z_half, iter.gamma) res = xh - xg @@ -68,8 +67,8 @@ end function Base.iterate(iter::DavisYinIteration, state::DavisYinState) prox!(state.xg, iter.g, state.z, iter.gamma) - f_xg, cl = value_and_gradient_closure(iter.f, state.xg) - state.grad_f_xg .= cl() + f_xg, grad_f_xg = value_and_gradient(iter.f, state.xg) + state.grad_f_xg .= grad_f_xg state.z_half .= 2 .* state.xg .- state.z .- iter.gamma .* state.grad_f_xg prox!(state.xh, iter.h, state.z_half, iter.gamma) state.res .= state.xh .- state.xg diff --git a/src/algorithms/fast_forward_backward.jl b/src/algorithms/fast_forward_backward.jl index adf4c64..c4ccb3f 100644 --- a/src/algorithms/fast_forward_backward.jl +++ b/src/algorithms/fast_forward_backward.jl @@ -72,8 +72,7 @@ end function Base.iterate(iter::FastForwardBackwardIteration) x = copy(iter.x0) - f_x, cl = value_and_gradient_closure(iter.f, x) - grad_f_x = cl() + f_x, grad_f_x = value_and_gradient(iter.f, x) gamma = iter.gamma === nothing ? 1 / lower_bound_smoothness_constant(iter.f, I, x, grad_f_x) : iter.gamma @@ -136,8 +135,8 @@ function Base.iterate( state.x .= state.z .+ beta .* (state.z .- state.z_prev) state.z_prev, state.z = state.z, state.z_prev - state.f_x, cl = value_and_gradient_closure(iter.f, state.x) - state.grad_f_x .= cl() + state.f_x, grad_f_x = value_and_gradient(iter.f, state.x) + state.grad_f_x .= grad_f_x state.y .= state.x .- state.gamma .* state.grad_f_x state.g_z = prox!(state.z, iter.g, state.y, state.gamma) state.res .= state.x .- state.z diff --git a/src/algorithms/forward_backward.jl b/src/algorithms/forward_backward.jl index 2ba7f8c..574389b 100644 --- a/src/algorithms/forward_backward.jl +++ b/src/algorithms/forward_backward.jl @@ -64,8 +64,7 @@ end function Base.iterate(iter::ForwardBackwardIteration) x = copy(iter.x0) - f_x, cl = value_and_gradient_closure(iter.f, x) - grad_f_x = cl() + f_x, grad_f_x = value_and_gradient(iter.f, x) gamma = iter.gamma === nothing ? 
1 / lower_bound_smoothness_constant(iter.f, I, x, grad_f_x) : iter.gamma @@ -111,8 +110,8 @@ function Base.iterate( state.grad_f_x, state.grad_f_z = state.grad_f_z, state.grad_f_x else state.x, state.z = state.z, state.x - state.f_x, cl = value_and_gradient_closure(iter.f, state.x) - state.grad_f_x .= cl() + state.f_x, grad_f_x = value_and_gradient(iter.f, state.x) + state.grad_f_x .= grad_f_x end state.y .= state.x .- state.gamma .* state.grad_f_x diff --git a/src/algorithms/li_lin.jl b/src/algorithms/li_lin.jl index 6fbcde3..0889024 100644 --- a/src/algorithms/li_lin.jl +++ b/src/algorithms/li_lin.jl @@ -62,8 +62,7 @@ end function Base.iterate(iter::LiLinIteration{R}) where {R} y = copy(iter.x0) - f_y, cl = value_and_gradient_closure(iter.f, y) - grad_f_y = cl() + f_y, grad_f_y = value_and_gradient(iter.f, y) # TODO: initialize gamma if not provided # TODO: authors suggest Barzilai-Borwein rule? @@ -110,8 +109,7 @@ function Base.iterate(iter::LiLinIteration{R}, state::LiLinState{R,Tx}) where {R else # TODO: re-use available space in state? # TODO: backtrack gamma at x - f_x, cl = value_and_gradient_closure(iter.f, x) - grad_f_x = cl() + f_x, grad_f_x = value_and_gradient(iter.f, x) x_forward = state.x - state.gamma .* grad_f_x v, g_v = prox(iter.g, x_forward, state.gamma) Fv = iter.f(v) + g_v @@ -130,8 +128,8 @@ function Base.iterate(iter::LiLinIteration{R}, state::LiLinState{R,Tx}) where {R Fx = Fv end - state.f_y, cl = value_and_gradient_closure(iter.f, state.y) - state.grad_f_y .= cl() + state.f_y, grad_f_y = value_and_gradient(iter.f, state.y) + state.grad_f_y .= grad_f_y state.y_forward .= state.y .- state.gamma .* state.grad_f_y state.g_z = prox!(state.z, iter.g, state.y_forward, state.gamma) diff --git a/src/algorithms/panoc.jl b/src/algorithms/panoc.jl index bea282e..c7f2558 100644 --- a/src/algorithms/panoc.jl +++ b/src/algorithms/panoc.jl @@ -87,8 +87,7 @@ f_model(iter::PANOCIteration, state::PANOCState) = function Base.iterate(iter::PANOCIteration{R}) where {R} x = copy(iter.x0) Ax = iter.A * x - f_Ax, cl = value_and_gradient_closure(iter.f, Ax) - grad_f_Ax = cl() + f_Ax, grad_f_Ax = value_and_gradient(iter.f, Ax) gamma = iter.gamma === nothing ? 
iter.alpha / lower_bound_smoothness_constant(iter.f, iter.A, x, grad_f_Ax) : @@ -182,8 +181,8 @@ function Base.iterate(iter::PANOCIteration{R,Tx,Tf}, state::PANOCState) where {R state.x_d .= state.x .+ state.d state.Ax_d .= state.Ax .+ state.Ad - state.f_Ax_d, cl = value_and_gradient_closure(iter.f, state.Ax_d) - state.grad_f_Ax_d .= cl() + state.f_Ax_d, grad_f_Ax_d = value_and_gradient(iter.f, state.Ax_d) + state.grad_f_Ax_d .= grad_f_Ax_d mul!(state.At_grad_f_Ax_d, adjoint(iter.A), state.grad_f_Ax_d) copyto!(state.x, state.x_d) @@ -220,8 +219,8 @@ function Base.iterate(iter::PANOCIteration{R,Tx,Tf}, state::PANOCState) where {R # along a line using interpolation and linear combinations # this allows saving operations if isinf(f_Az) - f_Az, cl = value_and_gradient_closure(iter.f, state.Az) - state.grad_f_Az .= cl() + f_Az, grad_f_Az = value_and_gradient(iter.f, state.Az) + state.grad_f_Az .= grad_f_Az end if isinf(c) mul!(state.At_grad_f_Az, iter.A', state.grad_f_Az) @@ -239,8 +238,8 @@ function Base.iterate(iter::PANOCIteration{R,Tx,Tf}, state::PANOCState) where {R else # otherwise, in the general case where f is only smooth, we compute # one gradient and matvec per backtracking step - state.f_Ax, cl = value_and_gradient_closure(iter.f, state.Ax) - state.grad_f_Ax .= cl() + state.f_Ax, grad_f_Ax = value_and_gradient(iter.f, state.Ax) + state.grad_f_Ax .= grad_f_Ax mul!(state.At_grad_f_Ax, adjoint(iter.A), state.grad_f_Ax) end diff --git a/src/algorithms/panocplus.jl b/src/algorithms/panocplus.jl index 553d0a4..d407039 100644 --- a/src/algorithms/panocplus.jl +++ b/src/algorithms/panocplus.jl @@ -80,8 +80,7 @@ f_model(iter::PANOCplusIteration, state::PANOCplusState) = function Base.iterate(iter::PANOCplusIteration{R}) where {R} x = copy(iter.x0) Ax = iter.A * x - f_Ax, cl = value_and_gradient_closure(iter.f, Ax) - grad_f_Ax = cl() + f_Ax, grad_f_Ax = value_and_gradient(iter.f, Ax) gamma = iter.gamma === nothing ? 
iter.alpha / lower_bound_smoothness_constant(iter.f, iter.A, x, grad_f_Ax) : @@ -122,8 +121,8 @@ function Base.iterate(iter::PANOCplusIteration{R}) where {R} ) else mul!(state.Az, iter.A, state.z) - f_Az, cl = value_and_gradient_closure(iter.f, state.Az) - state.grad_f_Az = cl() + f_Az, grad_f_Az = value_and_gradient(iter.f, state.Az) + state.grad_f_Az = grad_f_Az end mul!(state.At_grad_f_Az, adjoint(iter.A), state.grad_f_Az) return state, state @@ -198,8 +197,8 @@ function Base.iterate(iter::PANOCplusIteration{R}, state::PANOCplusState) where end mul!(state.Ax, iter.A, state.x) - state.f_Ax, cl = value_and_gradient_closure(iter.f, state.Ax) - state.grad_f_Ax .= cl() + state.f_Ax, grad_f_Ax = value_and_gradient(iter.f, state.Ax) + state.grad_f_Ax .= grad_f_Ax mul!(state.At_grad_f_Ax, adjoint(iter.A), state.grad_f_Ax) state.y .= state.x .- state.gamma .* state.At_grad_f_Ax @@ -209,8 +208,8 @@ function Base.iterate(iter::PANOCplusIteration{R}, state::PANOCplusState) where f_Az_upp = f_model(iter, state) mul!(state.Az, iter.A, state.z) - f_Az, cl = value_and_gradient_closure(iter.f, state.Az) - state.grad_f_Az .= cl() + f_Az, grad_f_Az = value_and_gradient(iter.f, state.Az) + state.grad_f_Az .= grad_f_Az if (iter.gamma === nothing || iter.adaptive == true) tol = 10 * eps(R) * (1 + abs(f_Az)) if f_Az > f_Az_upp + tol && state.gamma >= iter.minimum_gamma diff --git a/src/algorithms/primal_dual.jl b/src/algorithms/primal_dual.jl index 15c0375..9077da7 100644 --- a/src/algorithms/primal_dual.jl +++ b/src/algorithms/primal_dual.jl @@ -175,8 +175,8 @@ function Base.iterate( state::AFBAState = AFBAState(x = copy(iter.x0), y = copy(iter.y0)), ) # perform xbar-update step - f_x, cl = value_and_gradient_closure(iter.f, state.x) - state.gradf .= cl() + f_x, gradf = value_and_gradient(iter.f, state.x) + state.gradf .= gradf mul!(state.temp_x, iter.L', state.y) state.temp_x .+= state.gradf state.temp_x .*= -iter.gamma[1] @@ -184,8 +184,8 @@ function Base.iterate( prox!(state.xbar, iter.g, state.temp_x, iter.gamma[1]) # perform ybar-update step - lc_y, cl = value_and_gradient_closure(convex_conjugate(iter.l), state.y) - state.gradl .= cl() + lc_y, gradl = value_and_gradient(convex_conjugate(iter.l), state.y) + state.gradl .= gradl state.temp_x .= iter.theta .* state.xbar .+ (1 - iter.theta) .* state.x mul!(state.temp_y, iter.L, state.temp_x) state.temp_y .-= state.gradl diff --git a/src/algorithms/sfista.jl b/src/algorithms/sfista.jl index 1c6c6c6..3e9458e 100644 --- a/src/algorithms/sfista.jl +++ b/src/algorithms/sfista.jl @@ -71,8 +71,8 @@ function Base.iterate( state.a = (state.τ + sqrt(state.τ^2 + 4 * state.τ * state.APrev)) / 2 state.A = state.APrev + state.a state.xt .= (state.APrev / state.A) .* state.yPrev + (state.a / state.A) .* state.xPrev - f_xt, cl = value_and_gradient_closure(iter.f, state.xt) - state.gradf_xt .= cl() + f_xt, gradf_xt = value_and_gradient(iter.f, state.xt) + state.gradf_xt .= gradf_xt λ2 = state.λ / (1 + state.λ * iter.mf) # FISTA acceleration steps. prox!(state.y, iter.g, state.xt - λ2 * state.gradf_xt, λ2) @@ -97,8 +97,7 @@ function check_sc(state::SFISTAState, iter::SFISTAIteration, tol, termination_ty else # Classic (approximate) first-order stationary point [4]. The main inclusion is: r ∈ ∇f(y) + ∂h(y). 
λ2 = state.λ / (1 + state.λ * iter.mf) - f_y, cl = value_and_gradient_closure(iter.f, state.y) - gradf_y = cl() + f_y, gradf_y = value_and_gradient(iter.f, state.y) r = gradf_y - state.gradf_xt + (state.xt - state.y) / λ2 res = norm(r) end diff --git a/src/algorithms/zerofpr.jl b/src/algorithms/zerofpr.jl index cf49257..8830969 100644 --- a/src/algorithms/zerofpr.jl +++ b/src/algorithms/zerofpr.jl @@ -85,8 +85,7 @@ f_model(iter::ZeroFPRIteration, state::ZeroFPRState) = function Base.iterate(iter::ZeroFPRIteration{R}) where {R} x = copy(iter.x0) Ax = iter.A * x - f_Ax, cl = value_and_gradient_closure(iter.f, Ax) - grad_f_Ax = cl() + f_Ax, grad_f_Ax = value_and_gradient(iter.f, Ax) gamma = iter.gamma === nothing ? iter.alpha / lower_bound_smoothness_constant(iter.f, iter.A, x, grad_f_Ax) : @@ -166,8 +165,8 @@ function Base.iterate(iter::ZeroFPRIteration{R}, state::ZeroFPRState) where {R} f_Axbar_upp, f_Axbar else mul!(state.Axbar, iter.A, state.xbar) - f_Axbar, cl = value_and_gradient_closure(iter.f, state.Axbar) - state.grad_f_Axbar .= cl() + f_Axbar, grad_f_Axbar = value_and_gradient(iter.f, state.Axbar) + state.grad_f_Axbar .= grad_f_Axbar f_model(iter, state), f_Axbar end @@ -202,8 +201,8 @@ function Base.iterate(iter::ZeroFPRIteration{R}, state::ZeroFPRState) where {R} state.x .= state.xbar_prev .+ state.tau .* state.d state.Ax .= state.Axbar .+ state.tau .* state.Ad # TODO: can precompute most of next line in case f is quadratic - state.f_Ax, cl = value_and_gradient_closure(iter.f, state.Ax) - state.grad_f_Ax .= cl() + state.f_Ax, grad_f_Ax = value_and_gradient(iter.f, state.Ax) + state.grad_f_Ax .= grad_f_Ax mul!(state.At_grad_f_Ax, iter.A', state.grad_f_Ax) state.y .= state.x .- state.gamma .* state.At_grad_f_Ax state.g_xbar = prox!(state.xbar, iter.g, state.y, state.gamma) diff --git a/src/utilities/fb_tools.jl b/src/utilities/fb_tools.jl index 6ebe71d..0c88c8a 100644 --- a/src/utilities/fb_tools.jl +++ b/src/utilities/fb_tools.jl @@ -7,16 +7,14 @@ end function lower_bound_smoothness_constant(f, A, x, grad_f_Ax) R = real(eltype(x)) xeps = x .+ 1 - f_Axeps, cl = value_and_gradient_closure(f, A * xeps) - grad_f_Axeps = cl() + f_Axeps, grad_f_Axeps = value_and_gradient(f, A * xeps) return norm(A' * (grad_f_Axeps - grad_f_Ax)) / R(sqrt(length(x))) end function lower_bound_smoothness_constant(f, A, x) R = real(eltype(x)) Ax = A * x - f_Ax, cl = value_and_gradient_closure(f, Ax) - grad_f_Ax = cl() + f_Ax, grad_f_Ax = value_and_gradient(f, Ax) return lower_bound_smoothness_constant(f, A, x, grad_f_Ax) end @@ -36,14 +34,14 @@ function backtrack_stepsize!( g_z::R, res, Az, - grad_f_Az = nothing; - alpha = R(1), - minimum_gamma = R(1e-7), - reduce_gamma = R(0.5), + grad_f_Az=nothing; + alpha=R(1), + minimum_gamma=R(1e-7), + reduce_gamma=R(0.5), ) where {R} f_Az_upp = f_model(f_Ax, At_grad_f_Ax, res, alpha / gamma) _mul!(Az, A, z) - f_Az, cl = value_and_gradient_closure(f, Az) + f_Az, grad_f_Az = value_and_gradient(f, Az) tol = 10 * eps(R) * (1 + abs(f_Az)) while f_Az > f_Az_upp + tol && gamma >= minimum_gamma gamma *= reduce_gamma @@ -52,12 +50,9 @@ function backtrack_stepsize!( res .= x .- z f_Az_upp = f_model(f_Ax, At_grad_f_Ax, res, alpha / gamma) _mul!(Az, A, z) - f_Az, cl = value_and_gradient_closure(f, Az) + f_Az, grad_f_Az = value_and_gradient(f, Az) tol = 10 * eps(R) * (1 + abs(f_Az)) end - if grad_f_Az !== nothing - grad_f_Az .= cl() - end if gamma < minimum_gamma @warn "stepsize `gamma` became too small ($(gamma))" end @@ -70,13 +65,12 @@ function backtrack_stepsize!( A, g, x; - 
alpha = R(1), - minimum_gamma = R(1e-7), - reduce_gamma = R(0.5), + alpha=R(1), + minimum_gamma=R(1e-7), + reduce_gamma=R(0.5), ) where {R} Ax = A * x - f_Ax, cl = value_and_gradient_closure(f, Ax) - grad_f_Ax = cl() + f_Ax, grad_f_Ax = value_and_gradient(f, Ax) At_grad_f_Ax = A' * grad_f_Ax y = x - gamma .* At_grad_f_Ax z, g_z = prox(g, y, gamma) @@ -94,8 +88,8 @@ function backtrack_stepsize!( x - z, Ax, grad_f_Ax; - alpha = alpha, - minimum_gamma = minimum_gamma, - reduce_gamma = reduce_gamma, + alpha=alpha, + minimum_gamma=minimum_gamma, + reduce_gamma=reduce_gamma, ) end diff --git a/test/Project.toml b/test/Project.toml index 7241417..004bce7 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,7 +1,7 @@ [deps] -AbstractDifferentiation = "c29ec348-61ec-40c8-8164-b8c60e9d9f3d" AbstractOperators = "d9c5613a-d543-52d8-9afd-8f241a8c3f1c" Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" +DifferentiationInterface = "a0c0ee7d-e4b9-4e03-894e-1c5f64a51d63" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" diff --git a/test/problems/test_elasticnet.jl b/test/problems/test_elasticnet.jl index 3db580a..274d8ea 100644 --- a/test/problems/test_elasticnet.jl +++ b/test/problems/test_elasticnet.jl @@ -2,7 +2,7 @@ using LinearAlgebra using ProximalOperators: NormL1, SqrNormL2, ElasticNet, Translate using ProximalAlgorithms using Zygote -using AbstractDifferentiation: ZygoteBackend +using DifferentiationInterface: AutoZygote @testset "Elastic net ($T)" for T in [Float32, Float64, ComplexF32, ComplexF64] A = T[ @@ -22,7 +22,7 @@ using AbstractDifferentiation: ZygoteBackend reg2 = SqrNormL2(R(1)) loss = Translate(SqrNormL2(R(1)), -b) cost = - ProximalAlgorithms.AutoDifferentiable(x -> (norm(A * x - b)^2) / 2, ZygoteBackend()) + ProximalAlgorithms.AutoDifferentiable(x -> (norm(A * x - b)^2) / 2, AutoZygote()) L = opnorm(A)^2 @@ -68,7 +68,7 @@ using AbstractDifferentiation: ZygoteBackend (x_afba, y_afba), it_afba = solver( x0 = x0, y0 = y0, - f = ProximalAlgorithms.AutoDifferentiable(reg2, ZygoteBackend()), + f = ProximalAlgorithms.AutoDifferentiable(reg2, AutoZygote()), g = reg1, h = loss, L = A, @@ -92,7 +92,7 @@ using AbstractDifferentiation: ZygoteBackend (x_afba, y_afba), it_afba = solver( x0 = x0, y0 = y0, - f = ProximalAlgorithms.AutoDifferentiable(reg2, ZygoteBackend()), + f = ProximalAlgorithms.AutoDifferentiable(reg2, AutoZygote()), g = reg1, h = loss, L = A, diff --git a/test/problems/test_equivalence.jl b/test/problems/test_equivalence.jl index 42d73ad..54c96f7 100644 --- a/test/problems/test_equivalence.jl +++ b/test/problems/test_equivalence.jl @@ -1,7 +1,7 @@ using LinearAlgebra using Test using Zygote -using AbstractDifferentiation: ZygoteBackend +using DifferentiationInterface: AutoZygote using ProximalOperators: LeastSquares, NormL1 using ProximalAlgorithms: DouglasRachfordIteration, @@ -63,7 +63,7 @@ end lam = R(0.1) * norm(A' * b, Inf) - f = ProximalAlgorithms.AutoDifferentiable(x -> (norm(A * x - b)^2) / 2, ZygoteBackend()) + f = ProximalAlgorithms.AutoDifferentiable(x -> (norm(A * x - b)^2) / 2, AutoZygote()) g = NormL1(lam) x0 = zeros(R, n) @@ -98,7 +98,7 @@ end lam = R(0.1) * norm(A' * b, Inf) - f = ProximalAlgorithms.AutoDifferentiable(x -> (norm(A * x - b)^2) / 2, ZygoteBackend()) + f = ProximalAlgorithms.AutoDifferentiable(x -> (norm(A * x - b)^2) / 2, AutoZygote()) g = NormL1(lam) x0 = zeros(R, n) diff --git a/test/problems/test_lasso_small.jl 
b/test/problems/test_lasso_small.jl index 2c2faec..8ffa2df 100644 --- a/test/problems/test_lasso_small.jl +++ b/test/problems/test_lasso_small.jl @@ -2,7 +2,7 @@ using LinearAlgebra using Test using Zygote -using AbstractDifferentiation: ZygoteBackend +using DifferentiationInterface: AutoZygote using ProximalOperators: NormL1, LeastSquares using ProximalAlgorithms using ProximalAlgorithms: @@ -30,9 +30,9 @@ using ProximalAlgorithms: @test typeof(lam) == R f_autodiff = - ProximalAlgorithms.AutoDifferentiable(x -> (norm(x - b)^2) / 2, ZygoteBackend()) + ProximalAlgorithms.AutoDifferentiable(x -> (norm(x - b)^2) / 2, AutoZygote()) fA_autodiff = - ProximalAlgorithms.AutoDifferentiable(x -> (norm(A * x - b)^2) / 2, ZygoteBackend()) + ProximalAlgorithms.AutoDifferentiable(x -> (norm(A * x - b)^2) / 2, AutoZygote()) f_prox = Translate(SqrNormL2(R(1)), -b) fA_prox = LeastSquares(A, b) g = NormL1(lam) diff --git a/test/problems/test_lasso_small_strongly_convex.jl b/test/problems/test_lasso_small_strongly_convex.jl index 476efec..b282c57 100644 --- a/test/problems/test_lasso_small_strongly_convex.jl +++ b/test/problems/test_lasso_small_strongly_convex.jl @@ -2,7 +2,7 @@ using LinearAlgebra using Test using Zygote -using AbstractDifferentiation: ZygoteBackend +using DifferentiationInterface: AutoZygote using ProximalOperators: NormL1, LeastSquares using ProximalAlgorithms @@ -45,7 +45,7 @@ using ProximalAlgorithms fA_prox = LeastSquares(A, b) fA_autodiff = - ProximalAlgorithms.AutoDifferentiable(x -> (norm(A * x - b)^2) / 2, ZygoteBackend()) + ProximalAlgorithms.AutoDifferentiable(x -> (norm(A * x - b)^2) / 2, AutoZygote()) g = NormL1(lam) TOL = T(1e-4) diff --git a/test/problems/test_linear_programs.jl b/test/problems/test_linear_programs.jl index d0aea87..54a908a 100644 --- a/test/problems/test_linear_programs.jl +++ b/test/problems/test_linear_programs.jl @@ -1,5 +1,5 @@ using Zygote -using AbstractDifferentiation: ZygoteBackend +using DifferentiationInterface: AutoZygote using ProximalOperators: Linear, IndNonnegative, IndPoint, IndAffine, SlicedSeparableSum using ProximalAlgorithms using LinearAlgebra @@ -101,7 +101,7 @@ end @testset "AFBA" begin - f = ProximalAlgorithms.AutoDifferentiable(x -> dot(c, x), ZygoteBackend()) + f = ProximalAlgorithms.AutoDifferentiable(x -> dot(c, x), AutoZygote()) g = IndNonnegative() h = IndPoint(b) @@ -127,7 +127,7 @@ end @testset "VuCondat" begin - f = ProximalAlgorithms.AutoDifferentiable(x -> dot(c, x), ZygoteBackend()) + f = ProximalAlgorithms.AutoDifferentiable(x -> dot(c, x), AutoZygote()) g = IndNonnegative() h = IndPoint(b) @@ -176,7 +176,7 @@ end @testset "DavisYin" begin - f = ProximalAlgorithms.AutoDifferentiable(x -> dot(c, x), ZygoteBackend()) + f = ProximalAlgorithms.AutoDifferentiable(x -> dot(c, x), AutoZygote()) g = IndNonnegative() h = IndAffine(A, b) diff --git a/test/problems/test_nonconvex_qp.jl b/test/problems/test_nonconvex_qp.jl index 60558f2..fadc5f2 100644 --- a/test/problems/test_nonconvex_qp.jl +++ b/test/problems/test_nonconvex_qp.jl @@ -1,5 +1,5 @@ using Zygote -using AbstractDifferentiation: ZygoteBackend +using DifferentiationInterface: AutoZygote using ProximalAlgorithms using ProximalOperators: IndBox using LinearAlgebra @@ -14,7 +14,7 @@ using Test f = ProximalAlgorithms.AutoDifferentiable( x -> dot(Q * x, x) / 2 + dot(q, x), - ZygoteBackend(), + AutoZygote(), ) g = IndBox(low, upp) @@ -83,7 +83,7 @@ end f = ProximalAlgorithms.AutoDifferentiable( x -> dot(Q * x, x) / 2 + dot(q, x), - ZygoteBackend(), + AutoZygote(), ) g = 
IndBox(low, upp) diff --git a/test/problems/test_sparse_logistic_small.jl b/test/problems/test_sparse_logistic_small.jl index 8c854e5..5103120 100644 --- a/test/problems/test_sparse_logistic_small.jl +++ b/test/problems/test_sparse_logistic_small.jl @@ -1,5 +1,5 @@ using Zygote -using AbstractDifferentiation: ZygoteBackend +using DifferentiationInterface: AutoZygote using ProximalOperators: NormL1 using ProximalAlgorithms using LinearAlgebra @@ -23,10 +23,10 @@ using LinearAlgebra end f_autodiff = - ProximalAlgorithms.AutoDifferentiable(x -> logistic_loss(x - b), ZygoteBackend()) + ProximalAlgorithms.AutoDifferentiable(x -> logistic_loss(x - b), AutoZygote()) fA_autodiff = ProximalAlgorithms.AutoDifferentiable( x -> logistic_loss(A * x - b), - ZygoteBackend(), + AutoZygote(), ) lam = R(0.1) g = NormL1(lam) diff --git a/test/problems/test_verbose.jl b/test/problems/test_verbose.jl index ee3f318..ce2da75 100644 --- a/test/problems/test_verbose.jl +++ b/test/problems/test_verbose.jl @@ -1,5 +1,5 @@ using Zygote -using AbstractDifferentiation: ZygoteBackend +using DifferentiationInterface: AutoZygote using ProximalOperators: LeastSquares, NormL1 using ProximalAlgorithms using LinearAlgebra @@ -21,9 +21,9 @@ using LinearAlgebra @test typeof(lam) == R f_autodiff = - ProximalAlgorithms.AutoDifferentiable(x -> (norm(x - b)^2) / 2, ZygoteBackend()) + ProximalAlgorithms.AutoDifferentiable(x -> (norm(x - b)^2) / 2, AutoZygote()) fA_autodiff = - ProximalAlgorithms.AutoDifferentiable(x -> (norm(A * x - b)^2) / 2, ZygoteBackend()) + ProximalAlgorithms.AutoDifferentiable(x -> (norm(A * x - b)^2) / 2, AutoZygote()) fA_prox = LeastSquares(A, b) g = NormL1(lam) diff --git a/test/runtests.jl b/test/runtests.jl index 957c189..42b4be9 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,6 +1,6 @@ using Test using Aqua -using AbstractDifferentiation +using DifferentiationInterface using ProximalAlgorithms struct Quadratic{M,V} @@ -10,13 +10,13 @@ end (f::Quadratic)(x) = dot(x, f.Q * x) / 2 + dot(f.q, x) -function ProximalAlgorithms.value_and_gradient_closure(f::Quadratic, x) +function ProximalAlgorithms.value_and_gradient(f::Quadratic, x) grad = f.Q * x + f.q - return dot(grad, x) / 2 + dot(f.q, x), () -> grad + return dot(grad, x) / 2 + dot(f.q, x), grad end @testset "Aqua" begin - Aqua.test_all(ProximalAlgorithms; ambiguities = false) + Aqua.test_all(ProximalAlgorithms; ambiguities=false) end include("utilities/test_ad.jl") diff --git a/test/utilities/test_ad.jl b/test/utilities/test_ad.jl index e8f9c0b..17a21ff 100644 --- a/test/utilities/test_ad.jl +++ b/test/utilities/test_ad.jl @@ -4,13 +4,13 @@ using ProximalOperators: NormL1 using ProximalAlgorithms using Zygote using ReverseDiff -using AbstractDifferentiation: ZygoteBackend, ReverseDiffBackend +using DifferentiationInterface: AutoZygote, AutoReverseDiff @testset "Autodiff backend ($B on $T)" for (T, B) in Iterators.product( [Float32, Float64, ComplexF32, ComplexF64], - [ZygoteBackend, ReverseDiffBackend], + [AutoZygote, AutoReverseDiff], ) - if T <: Complex && B == ReverseDiffBackend + if T <: Complex && B == AutoReverseDiff continue end @@ -28,8 +28,8 @@ using AbstractDifferentiation: ZygoteBackend, ReverseDiffBackend x0 = zeros(T, n) - f_x0, cl = ProximalAlgorithms.value_and_gradient_closure(f, x0) - grad_f_x0 = @inferred cl() + # TODO: I removed the @inferred below, Zygote can infer the output type of the closure once it has the closure but it cannot infer the whole procedure of computing the gradient anyway + f_x0, grad_f_x0 = 
ProximalAlgorithms.value_and_gradient(f, x0) lam = R(0.1) * norm(A' * b, Inf) @test typeof(lam) == R diff --git a/test/utilities/test_fb_tools.jl b/test/utilities/test_fb_tools.jl index a2c6509..392b20c 100644 --- a/test/utilities/test_fb_tools.jl +++ b/test/utilities/test_fb_tools.jl @@ -2,7 +2,7 @@ using Test using LinearAlgebra using ProximalCore: Zero using ProximalAlgorithms -using AbstractDifferentiation +using DifferentiationInterface @testset "Lipschitz constant estimation" for R in [Float32, Float64] From 92a12049463419baecb404a6f9e44db2fb2ff917 Mon Sep 17 00:00:00 2001 From: Guillaume Dalle <22795598+gdalle@users.noreply.github.com> Date: Wed, 17 Jul 2024 10:08:56 +0200 Subject: [PATCH 2/4] Reduce diff --- benchmark/benchmarks.jl | 40 +++++++++---------- docs/src/examples/sparse_linear_regression.jl | 6 +-- docs/src/guide/custom_objectives.jl | 18 ++++----- src/utilities/fb_tools.jl | 20 +++++----- test/runtests.jl | 2 +- 5 files changed, 43 insertions(+), 43 deletions(-) diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index 27032f2..7c29af6 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -45,56 +45,56 @@ for (benchmark_name, file_name) in [ m, n = size(A) SUITE[k]["ForwardBackward"] = - @benchmarkable solver(x0=x0, f=f, g=g) setup = begin - solver = ProximalAlgorithms.ForwardBackward(tol=1e-6) + @benchmarkable solver(x0 = x0, f = f, g = g) setup = begin + solver = ProximalAlgorithms.ForwardBackward(tol = 1e-6) x0 = zeros($T, size($A, 2)) f = LeastSquares($A, $b) g = NormL1($lam) end SUITE[k]["FastForwardBackward"] = - @benchmarkable solver(x0=x0, f=f, g=g) setup = begin - solver = ProximalAlgorithms.FastForwardBackward(tol=1e-6) + @benchmarkable solver(x0 = x0, f = f, g = g) setup = begin + solver = ProximalAlgorithms.FastForwardBackward(tol = 1e-6) x0 = zeros($T, size($A, 2)) f = LeastSquares($A, $b) g = NormL1($lam) end SUITE[k]["ZeroFPR"] = - @benchmarkable solver(x0=x0, f=f, A=$A, g=g) setup = begin - solver = ProximalAlgorithms.ZeroFPR(tol=1e-6) + @benchmarkable solver(x0 = x0, f = f, A = $A, g = g) setup = begin + solver = ProximalAlgorithms.ZeroFPR(tol = 1e-6) x0 = zeros($T, size($A, 2)) f = SquaredDistance($b) g = NormL1($lam) end SUITE[k]["PANOC"] = - @benchmarkable solver(x0=x0, f=f, A=$A, g=g) setup = begin - solver = ProximalAlgorithms.PANOC(tol=1e-6) + @benchmarkable solver(x0 = x0, f = f, A = $A, g = g) setup = begin + solver = ProximalAlgorithms.PANOC(tol = 1e-6) x0 = zeros($T, size($A, 2)) f = SquaredDistance($b) g = NormL1($lam) end SUITE[k]["PANOCplus"] = - @benchmarkable solver(x0=x0, f=f, A=$A, g=g) setup = begin - solver = ProximalAlgorithms.PANOCplus(tol=1e-6) + @benchmarkable solver(x0 = x0, f = f, A = $A, g = g) setup = begin + solver = ProximalAlgorithms.PANOCplus(tol = 1e-6) x0 = zeros($T, size($A, 2)) f = SquaredDistance($b) g = NormL1($lam) end SUITE[k]["DouglasRachford"] = - @benchmarkable solver(x0=x0, f=f, g=g, gamma=$R(1)) setup = begin - solver = ProximalAlgorithms.DouglasRachford(tol=1e-6) + @benchmarkable solver(x0 = x0, f = f, g = g, gamma = $R(1)) setup = begin + solver = ProximalAlgorithms.DouglasRachford(tol = 1e-6) x0 = zeros($T, size($A, 2)) f = LeastSquares($A, $b) g = NormL1($lam) end SUITE[k]["DRLS"] = - @benchmarkable solver(x0=x0, f=f, g=g, Lf=Lf) setup = begin - solver = ProximalAlgorithms.DRLS(tol=1e-6) + @benchmarkable solver(x0 = x0, f = f, g = g, Lf = Lf) setup = begin + solver = ProximalAlgorithms.DRLS(tol = 1e-6) x0 = zeros($T, size($A, 2)) f = LeastSquares($A, $b) Lf = opnorm(($A)' * $A) 
@@ -102,11 +102,11 @@ for (benchmark_name, file_name) in [ end SUITE[k]["AFBA-1"] = - @benchmarkable solver(x0=x0, y0=y0, f=f, g=g, beta_f=beta_f) setup = + @benchmarkable solver(x0 = x0, y0 = y0, f = f, g = g, beta_f = beta_f) setup = begin beta_f = opnorm($A)^2 solver = - ProximalAlgorithms.AFBA(theta=$R(1), mu=$R(1), tol=$R(1e-6)) + ProximalAlgorithms.AFBA(theta = $R(1), mu = $R(1), tol = $R(1e-6)) x0 = zeros($T, size($A, 2)) y0 = zeros($T, size($A, 2)) f = LeastSquares($A, $b) @@ -114,10 +114,10 @@ for (benchmark_name, file_name) in [ end SUITE[k]["AFBA-2"] = - @benchmarkable solver(x0=x0, y0=y0, h=h, L=$A, g=g) setup = begin + @benchmarkable solver(x0 = x0, y0 = y0, h = h, L = $A, g = g) setup = begin beta_f = opnorm($A)^2 solver = - ProximalAlgorithms.AFBA(theta=$R(1), mu=$R(1), tol=$R(1e-6)) + ProximalAlgorithms.AFBA(theta = $R(1), mu = $R(1), tol = $R(1e-6)) x0 = zeros($T, size($A, 2)) y0 = zeros($T, size($A, 1)) h = Translate(SqrNormL2(), -$b) @@ -125,8 +125,8 @@ for (benchmark_name, file_name) in [ end SUITE[k]["SFISTA"] = - @benchmarkable solver(x0=x0, f=f, Lf=Lf, g=g) setup = begin - solver = ProximalAlgorithms.SFISTA(tol=$R(1e-3)) + @benchmarkable solver(x0 = x0, f = f, Lf = Lf, g = g) setup = begin + solver = ProximalAlgorithms.SFISTA(tol = $R(1e-3)) x0 = zeros($T, size($A, 2)) f = LeastSquares($A, $b) g = NormL1($lam) diff --git a/docs/src/examples/sparse_linear_regression.jl b/docs/src/examples/sparse_linear_regression.jl index 8952a86..e087e6a 100644 --- a/docs/src/examples/sparse_linear_regression.jl +++ b/docs/src/examples/sparse_linear_regression.jl @@ -35,8 +35,8 @@ n_training, n_features = size(training_input) using LinearAlgebra using Statistics -input_loc = mean(training_input, dims=1) |> vec -input_scale = std(training_input, dims=1) |> vec +input_loc = mean(training_input, dims = 1) |> vec +input_scale = std(training_input, dims = 1) |> vec linear_model(wb, input) = input * wb[1:end-1] .+ wb[end] @@ -73,7 +73,7 @@ reg = ProximalOperators.NormL1(1) # and the objective terms `f=training_loss` (smooth) and `g=reg` (non smooth). ffb = ProximalAlgorithms.FastForwardBackward() -solution, iterations = ffb(x0=zeros(n_features + 1), f=training_loss, g=reg) +solution, iterations = ffb(x0 = zeros(n_features + 1), f = training_loss, g = reg) # We can now check how well the trained model performs on the test portion of our data. diff --git a/docs/src/guide/custom_objectives.jl b/docs/src/guide/custom_objectives.jl index cf61cc8..dfdee82 100644 --- a/docs/src/guide/custom_objectives.jl +++ b/docs/src/guide/custom_objectives.jl @@ -63,7 +63,7 @@ end # We can now minimize the function, for which we will use [`PANOC`](@ref), which is a Newton-type method: panoc = ProximalAlgorithms.PANOC() -solution, iterations = panoc(x0=-ones(2), f=rosenbrock2D, g=IndUnitBall()) +solution, iterations = panoc(x0 = -ones(2), f = rosenbrock2D, g = IndUnitBall()) # Plotting the solution against the cost function contour and constraint, gives an idea of its correctness. 
@@ -73,17 +73,17 @@ contour( -2:0.1:2, -2:0.1:2, (x, y) -> rosenbrock2D([x, y]), - fill=true, - framestyle=:none, - background=nothing, + fill = true, + framestyle = :none, + background = nothing, ) -plot!(Shape(cos.(0:0.01:2*pi), sin.(0:0.01:2*pi)), opacity=0.5, label="feasible set") +plot!(Shape(cos.(0:0.01:2*pi), sin.(0:0.01:2*pi)), opacity = 0.5, label = "feasible set") scatter!( [solution[1]], [solution[2]], - color=:red, - markershape=:star5, - label="computed solution", + color = :red, + markershape = :star5, + label = "computed solution", ) # ## Example: counting operations @@ -128,7 +128,7 @@ end f = Counting(rosenbrock2D) g = Counting(IndUnitBall()) -solution, iterations = panoc(x0=-ones(2), f=f, g=g) +solution, iterations = panoc(x0 = -ones(2), f = f, g = g) # and check how many operations where actually performed: diff --git a/src/utilities/fb_tools.jl b/src/utilities/fb_tools.jl index 0c88c8a..42c46f3 100644 --- a/src/utilities/fb_tools.jl +++ b/src/utilities/fb_tools.jl @@ -34,10 +34,10 @@ function backtrack_stepsize!( g_z::R, res, Az, - grad_f_Az=nothing; - alpha=R(1), - minimum_gamma=R(1e-7), - reduce_gamma=R(0.5), + grad_f_Az = nothing; + alpha = R(1), + minimum_gamma = R(1e-7), + reduce_gamma = R(0.5), ) where {R} f_Az_upp = f_model(f_Ax, At_grad_f_Ax, res, alpha / gamma) _mul!(Az, A, z) @@ -65,9 +65,9 @@ function backtrack_stepsize!( A, g, x; - alpha=R(1), - minimum_gamma=R(1e-7), - reduce_gamma=R(0.5), + alpha = R(1), + minimum_gamma = R(1e-7), + reduce_gamma = R(0.5), ) where {R} Ax = A * x f_Ax, grad_f_Ax = value_and_gradient(f, Ax) @@ -88,8 +88,8 @@ function backtrack_stepsize!( x - z, Ax, grad_f_Ax; - alpha=alpha, - minimum_gamma=minimum_gamma, - reduce_gamma=reduce_gamma, + alpha = alpha, + minimum_gamma = minimum_gamma, + reduce_gamma = reduce_gamma, ) end diff --git a/test/runtests.jl b/test/runtests.jl index 42b4be9..d61d66e 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -16,7 +16,7 @@ function ProximalAlgorithms.value_and_gradient(f::Quadratic, x) end @testset "Aqua" begin - Aqua.test_all(ProximalAlgorithms; ambiguities=false) + Aqua.test_all(ProximalAlgorithms; ambiguities = false) end include("utilities/test_ad.jl") From 1836ee6bfb8f128c243f538c933f708f3ca41486 Mon Sep 17 00:00:00 2001 From: Guillaume Dalle <22795598+gdalle@users.noreply.github.com> Date: Wed, 17 Jul 2024 10:12:43 +0200 Subject: [PATCH 3/4] Fix backtracking --- src/utilities/fb_tools.jl | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/utilities/fb_tools.jl b/src/utilities/fb_tools.jl index 42c46f3..4058641 100644 --- a/src/utilities/fb_tools.jl +++ b/src/utilities/fb_tools.jl @@ -41,7 +41,7 @@ function backtrack_stepsize!( ) where {R} f_Az_upp = f_model(f_Ax, At_grad_f_Ax, res, alpha / gamma) _mul!(Az, A, z) - f_Az, grad_f_Az = value_and_gradient(f, Az) + f_Az, grad_f_Az_tmp = value_and_gradient(f, Az) tol = 10 * eps(R) * (1 + abs(f_Az)) while f_Az > f_Az_upp + tol && gamma >= minimum_gamma gamma *= reduce_gamma @@ -50,9 +50,12 @@ function backtrack_stepsize!( res .= x .- z f_Az_upp = f_model(f_Ax, At_grad_f_Ax, res, alpha / gamma) _mul!(Az, A, z) - f_Az, grad_f_Az = value_and_gradient(f, Az) + f_Az, grad_f_Az_tmp = value_and_gradient(f, Az) tol = 10 * eps(R) * (1 + abs(f_Az)) end + if grad_f_Az !== nothing + grad_f_Az .= grad_f_Az_tmp + end if gamma < minimum_gamma @warn "stepsize `gamma` became too small ($(gamma))" end From 5db374c37ca61e49fd8e6980c15234963aee6962 Mon Sep 17 00:00:00 2001 From: Guillaume Dalle <22795598+gdalle@users.noreply.github.com> 
Date: Wed, 24 Jul 2024 21:45:02 +0200 Subject: [PATCH 4/4] Update Project.toml Co-authored-by: Lorenzo Stella --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 41a6248..744eb5b 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "ProximalAlgorithms" uuid = "140ffc9f-1907-541a-a177-7475e0a401e9" -version = "0.6.0" +version = "0.7.0" [deps] ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
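Since the API renames in this series are breaking for anyone who called the gradient interface or the backend wrappers directly, a hedged before/after sketch of the user-facing change behind the 0.6 → 0.7 version bump above (the `cost` function is illustrative; the 0.6-era lines are kept as comments for comparison only):

```julia
using Zygote
using DifferentiationInterface: AutoZygote
using ProximalAlgorithms

cost(x) = sum(abs2, x) / 2  # some smooth term
x0 = ones(3)

# ProximalAlgorithms 0.6 (AbstractDifferentiation era):
#   using AbstractDifferentiation: ZygoteBackend
#   f = ProximalAlgorithms.AutoDifferentiable(cost, ZygoteBackend())
#   fx, cl = ProximalAlgorithms.value_and_gradient_closure(f, x0)
#   grad_fx = cl()

# ProximalAlgorithms 0.7 (DifferentiationInterface era):
f = ProximalAlgorithms.AutoDifferentiable(cost, AutoZygote())
fx, grad_fx = ProximalAlgorithms.value_and_gradient(f, x0)
```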