improved efficiency of ValueGradientKernel by introducing specialized value_gradient_covariance function
SebastianAment committed May 18, 2022
1 parent 8f8641f commit c4dd8ba
Showing 7 changed files with 157 additions and 104 deletions.
5 changes: 4 additions & 1 deletion README.md
@@ -241,8 +241,11 @@ b = zero(a);
@time mul!(b, G, a); # multiplying with G allocates little memory
0.394388 seconds (67 allocations: 86.516 KiB)
```
Note that the last multiplication was with a **million by million matrix**,
The last multiplication was with a **million by million matrix**,
which would be impossible without CovarianceFunctions.jl's lazy and structured representation of the gradient kernel matrix.
Note that `GradientKernel` only computes covariances of gradient observations;
to get the `(d+1) × (d+1)` covariance kernel that also includes value observations,
use `ValueGradientKernel`.
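For concreteness, a minimal usage sketch of the distinction (not part of this diff; it assumes the `EQ` kernel and the `gramian` constructor used elsewhere in the README, so exact names may differ):
```julia
using CovarianceFunctions: EQ, GradientKernel, ValueGradientKernel, gramian
using LinearAlgebra

d, n = 3, 100
x = [randn(d) for _ in 1:n]            # n points in d dimensions

k = EQ()                                # base kernel (assumed name, see the rest of the README)
G = gramian(GradientKernel(k), x)       # lazy (n*d) × (n*d) matrix of gradient covariances
K = gramian(ValueGradientKernel(k), x)  # lazy (n*(d+1)) × (n*(d+1)) matrix, values included

a = randn(n * (d + 1)); b = zero(a)
mul!(b, K, a)                           # structured multiply, analogous to the timing above
```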

To highlight the scalability of this MVM algorithm, we compare against the implementation in [GPyTorch](https://docs.gpytorch.ai/en/stable/kernels.html?highlight=kernels#rbfkernelgrad) and the fast *approximate* MVM provided by [D-SKIP](https://github.com/ericlee0803/GP_Derivatives).

2 changes: 2 additions & 0 deletions src/CovarianceFunctions.jl
@@ -27,6 +27,8 @@ using IterativeSolvers
using ToeplitzMatrices
using FFTW

# IDEA: AbstractKernel{T, IT}, where IT isa InputType
# then const IsotropicKernel{T} = AbstractKernel{T, IsotropicInput} ...
abstract type AbstractKernel{T} end
abstract type MercerKernel{T} <: AbstractKernel{T} end
abstract type StationaryKernel{T} <: MercerKernel{T} end
29 changes: 14 additions & 15 deletions src/algebra.jl
@@ -2,16 +2,15 @@
# NOTE: output type inference of product, sum, and power not supported for
# user-defined kernels unless Base.eltype is defined for them
################################ Product #######################################
# IDEA: constructors which merge products and sums
struct Product{T, AT<:Union{Tuple, AbstractVector}} <: AbstractKernel{T}
struct Product{T, AT<:Union{Tuple, AbstractVector}, IT} <: AbstractKernel{T}
args::AT
# input_traits : # could keep track of input_trait.(args)
# input_trait # could keep track of the overall input trait
input_trait::IT # could keep track of the overall input trait
end
@functor Product
function Product(k::Union{Tuple, AbstractVector})
T = promote_type(eltype.(k)...)
Product{T, typeof(k)}(k)
IT = sum_and_product_input_trait(k)
Product{T, typeof(k), typeof(IT)}(k, IT)
end
Product(k...) = Product(k)
(P::Product)(τ) = prod(k->k(τ), P.args) # IDEA could check for isotropy here
@@ -26,47 +25,48 @@ Base.:*(c::Number, k::AbstractKernel) = Constant(c) * k
Base.:*(k::AbstractKernel, c::Number) = Constant(c) * k

################################### Sum ########################################
struct Sum{T, AT<:Union{Tuple, AbstractVector}} <: AbstractKernel{T}
struct Sum{T, AT<:Union{Tuple, AbstractVector}, IT} <: AbstractKernel{T}
args::AT
# input_trait # could keep track of the overall input trait
input_trait::IT # could keep track of the overall input trait
end
@functor Sum
function Sum(k::Union{Tuple, AbstractVector})
T = promote_type(eltype.(k)...)
Sum{T, typeof(k)}(k)
IT = sum_and_product_input_trait(k)
Sum{T, typeof(k), typeof(IT)}(k, IT)
end
Sum(k...) = Sum(k)
(S::Sum)(τ) = sum(k->k(τ), S.args) # should only be called if S is stationary
(S::Sum)(x, y) = sum(k->k(x, y), S.args)
# (S::Sum)(τ) = isstationary(S) ? sum(k->k(τ), S.args) : error("One argument evaluation not possible for non-stationary kernel")
# (S::Sum)(x, y) = isstationary(S) ? S(difference(x, y)) : sum(k->k(x, y), S.args)
Sum(k...) = Sum(k)
Base.sum(k::AbstractVector{<:AbstractKernel}) = Sum(k)

Base.:+(k::AbstractKernel...) = Sum(k)
Base.:+(k::AbstractKernel, c::Number) = k + Constant(c)
Base.:+(c::Number, k::AbstractKernel) = k + Constant(c)

################################## Power #######################################
struct Power{T, K<:AbstractKernel} <: AbstractKernel{T}
struct Power{T, K, IT} <: AbstractKernel{T}
k::K
p::Int
# input_trait # could keep track of the overall input trait
input_trait::IT # could keep track of the overall input trait
end
@functor Power
function Power(k, p::Int)
T = promote_type(eltype(k))
Power{T, typeof(k)}(k, p)
IT = input_trait(k)
Power{T, typeof(k), typeof(IT)}(k, p, IT)
end
(P::Power)(τ) = P.k(τ)^P.p
(P::Power)(x, y) = P.k(x, y)^P.p
Base.:^(k::AbstractKernel, p::Number) = Power(k, p)

############################ Separable Product #################################
# product kernel, but separately evaluates component kernels on different parts of the input
# NOTE: input_trait(::SeparableProduct) defaults to GenericInput
struct SeparableProduct{T, K} <: AbstractKernel{T}
args::K # kernel for input covariances
# input_trait # could keep track of the overall input trait
end
@functor SeparableProduct
SeparableProduct(k...) = SeparableProduct(k)
@@ -101,17 +101,16 @@ end
# useful for "Additive Gaussian Processes" - Duvenaud 2011
# https://papers.nips.cc/paper/2011/file/4c5bde74a8f110656874902f07378009-Paper.pdf
# IDEA: could introduce have AdditiveKernel with "order" argument, that adds higher order interactions
# NOTE: input_trait(::SeparableSum) defaults to GenericInput
struct SeparableSum{T, K} <: AbstractKernel{T}
args::K # kernel for input covariances
end
@functor SeparableSum

SeparableSum(k...) = SeparableSum(k)
function SeparableSum(k::Union{Tuple, AbstractVector})
T = promote_type(eltype.(k)...)
SeparableSum{T, typeof(k)}(k)
end

function (k::SeparableSum)(x::AbstractVector, y::AbstractVector)
d = checklength(x, y)
length(k.args) == d || throw(DimensionMismatch("SeparableSum needs d = $d kernels but has r = $(length(k.args))"))
113 changes: 88 additions & 25 deletions src/gradient.jl
@@ -431,39 +431,32 @@ function (G::ValueGradientKernel)(x, y)
value_gradient_kernel(G.k, x, y, input_trait(G.k))
end

function value_gradient_kernel(k, x, y, T::InputTrait = input_trait(k))
function value_gradient_kernel(k, x, y, IT::InputTrait = input_trait(k))
d = length(x)
kxy = k(x, y)
value_value = kxy
value_gradient = MVector{d, typeof(kxy)}(undef)
gradient_value = MVector{d, typeof(kxy)}(undef)
gradient_gradient = gradient_kernel(k, x, y, T)
value_gradient = zeros(typeof(kxy), d)
gradient_value = zeros(typeof(kxy), d)
gradient_gradient = gradient_kernel(k, x, y, IT)
K = DerivativeKernelElement(value_value, value_gradient, gradient_value, gradient_gradient)
value_gradient_kernel!(K, k, x, y, T)
end
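For orientation, the `DerivativeKernelElement` assembled above represents the (d+1) × (d+1) covariance block between value and gradient observations at x and y; schematically (a sketch consistent with the field names):
```latex
K(x, y) = \begin{pmatrix}
  k(x, y) & \nabla_y k(x, y)^\top \\
  \nabla_x k(x, y) & \nabla_x \nabla_y^\top k(x, y)
\end{pmatrix}
```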

# IDEA: specialize first_gradient!(g, k, x, y) = ForwardDiff.gradient!(g, z->k(z, y), x)
# computes covariance between value and gradient
# function value_gradient_covariance!(gx, gy, k, x, y, ::IsotropicInput)
# r² = sum(abs2, difference(x, y))
# g .= derivative(k, r²)
# end
#
# function value_gradient_covariance!(gx, gy, k, x, y, ::GenericInput())
# r² = sum(abs2, difference(x, y))
# g .= derivative(k, r²)
# end

# IDEA: specialize evaluate for IsotropicInput, DotProductInput
# returns block matrix
function value_gradient_kernel!(K::DerivativeKernelElement, k, x, y, T::InputTrait)
value_gradient_kernel!(K, k, x, y, IT)
end

# IDEA: for Sum and Product kernels, if input_trait is not passed, could default to Generic
# No, should keep track of input type in kernel
function value_gradient_kernel!(K::DerivativeKernelElement, k, x, y)
value_gradient_kernel!(K, k, x, y, input_trait(k))
end
function value_gradient_kernel!(K::DerivativeKernelElement, k, x, y, IT::InputTrait)
K.value_value = k(x, y)
K.value_gradient .= ForwardDiff.gradient(z->k(x, z), y) # ForwardDiff.gradient!(r, z->k(z, y), x)
K.gradient_value .= ForwardDiff.gradient(z->k(z, y), x)
K.gradient_gradient = gradient_kernel!(K.gradient_gradient, k, x, y, T)
value_gradient_covariance!(K.gradient_value, K.value_gradient, k, x, y, IT)
K.gradient_gradient = gradient_kernel!(K.gradient_gradient, k, x, y, IT)
return K
end

# NOTE: since value_gradient_covariance! was added, specializations of value_gradient_kernel!
# only yield marginal performance improvements (<2x)
# could be removed to reduce LOC
function value_gradient_kernel!(K::DerivativeKernelElement, k, x, y, T::DotProductInput)
xy = dot(x, y)
k0, k1 = value_derivative(k, xy)
@@ -493,6 +486,76 @@ function evaluate_block!(Gij::DerivativeKernelElement, k::ValueGradientKernel, x
value_gradient_kernel!(Gij, k.k, x, y, IT)
end

################################################################################
# computes covariance between value and gradient, needed for value-gradient kernel
# technically, the complexity would be O(d) even with a generic implementation;
# in practice, at least an order of magnitude can be gained by specialized implementations
function value_gradient_covariance!(gx, gy, k, x, y, ::GenericInput)
# GradientConfig() # for generic version, this could be pre-computed for efficiency gains
ForwardDiff.gradient!(gx, z->k(z, y), x) # these are bottlenecks
ForwardDiff.gradient!(gy, z->k(x, z), y)
return gx, gy
end

function value_gradient_covariance!(gx, gy, k, x, y, ::GenericInput, α::Real, β::Real)
tx, ty = ForwardDiff.gradient(z->k(z, y), x), ForwardDiff.gradient(z->k(x, z), y)
@. gx = α * tx + β * gx
@. gy = α * ty + β * gy
return gx, gy
end

function value_gradient_covariance!(gx, gy, k::Sum, x, y, ::GenericInput, α::Real = 1, β::Real = 0)
@. gx *= β
@. gy *= β
for h in k.args # for higher efficiency, input_trait(h) should not be called if h is a composite kernel
value_gradient_covariance!(gx, gy, h, x, y, input_trait(h), α, 1)
end
return gx, gy
end

# this is trickier to do without additional allocations
# could do it with one more vector
function value_gradient_covariance!(gx, gy, k::Product, x, y, ::GenericInput, α::Real = 1, β::Real = 0)
@. gx *= β
@. gy *= β
prod_k_xy = k(x, y)
if !iszero(prod_k_xy)
for i in eachindex(k.args)
ki = k.args[i]
αi = α * prod_k_xy / ki(x, y)
value_gradient_covariance!(gx, gy, ki, x, y, input_trait(ki), αi, 1)
end
else # if prod_k_xy is zero, a less efficient O(dr²) special case would be required; for now, throw an error
throw(DomainError("value_gradient_covariance! of Product not supported for products that are zero"))
end
return gx, gy
end
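The scaling `αi = α * prod_k_xy / ki(x, y)` implements the product rule; a sketch of the identity being used, valid only when no factor vanishes (hence the error in the zero case), with the analogous statement holding for the gradient with respect to y:
```latex
\nabla_x \prod_{i=1}^r k_i(x, y)
  = \sum_{i=1}^r \Big( \prod_{j \neq i} k_j(x, y) \Big) \nabla_x k_i(x, y)
  = \Big( \prod_{j=1}^r k_j(x, y) \Big) \sum_{i=1}^r \frac{\nabla_x k_i(x, y)}{k_i(x, y)}
```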

function value_gradient_covariance!(gx, gy, k, x, y, ::IsotropicInput, α::Real = 1, β::Real = 0)
r = difference(x, y)
r² = sum(abs2, r)
k1 = ForwardDiff.derivative(k, r²) # IDEA: this computation could be pooled with the gradient computation
@. gx = α * 2k1 * r + β * gx
@. gy = -α * 2k1 * r + β * gy
return gx, gy
end
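A quick self-contained check of the isotropic chain rule used above, written against a hypothetical toy profile `f` and plain ForwardDiff rather than the package's kernel types:
```julia
using ForwardDiff

f(r²) = exp(-r² / 2)                  # toy isotropic profile, so k(x, y) = f(‖x - y‖²)
k(x, y) = f(sum(abs2, x - y))

x, y = randn(3), randn(3)
r = x - y                             # difference(x, y)
r² = sum(abs2, r)
k1 = ForwardDiff.derivative(f, r²)

@assert 2k1 * r ≈ ForwardDiff.gradient(z -> k(z, y), x)    # matches the gx update
@assert -2k1 * r ≈ ForwardDiff.gradient(z -> k(x, z), y)   # matches the gy update
```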

function value_gradient_covariance!(gx, gy, k, x, y, ::DotProductInput, α::Real = 1, β::Real = 0)
xy = dot(x, y)
k1 = ForwardDiff.derivative(k, xy) # IDEA: this computation could be pooled with the gradient computation
@. gx = α * k1 * y + β * gx
@. gy = α * k1 * x + β * gy
return gx, gy
end

function value_gradient_covariance!(gx, gy, k, x, y, ::StationaryLinearFunctionalInput, α::Real = 1, β::Real = 0)
cr = dot(k.c, difference(x, y))
k1 = ForwardDiff.derivative(k, cr) # IDEA: this computation could be pooled with the gradient computation
@. gx = α * k1 * k.c + β * gx
@. gy = -α * k1 * k.c + β * gy
return gx, gy
end

################################################################################
# for 1d inputs
struct DerivativeKernel{T, K} <: AbstractKernel{T}
54 changes: 24 additions & 30 deletions src/gradient_algebra.jl
@@ -35,39 +35,33 @@ function gradient_kernel(k::Product, x, y, ::GenericInput)
gradient_kernel!(W, k, x, y, GenericInput())
end

# IDEA: could have kernel element for heterogeneous product kernels, problem: would need to
# pre-allocate storage for Jacobian matrix or be allocating
# struct ProductGradientKernelElement{T, K, X, Y} <: Factorization{T}
# k::K
# x::X
# y::Y
# end
function gradient_kernel!(W::Woodbury, k::Product, x::AbstractVector, y::AbstractVector, ::GenericInput = input_trait(k))
A = W.A # this is a LazyMatrixSum of LazyMatrixProducts
prod_k_j = k(x, y)

# k_vec(x, y) = [h(x, y) for h in k.args] # include in loop
# ForwardDiff.jacobian!(W.U', z->k_vec(z, y), x) # this is actually less allocating than the gradient! option
# ForwardDiff.jacobian!(W.V, z->k_vec(x, z), y)
# GradientConfig() # for generic version, this could be pre-computed for efficiency gains
r = length(k.args)
for i in 1:r # parallelize this?
h, H = k.args[i], A.args[i]
hxy = h(x, y)
D = H.args[1]
D.diag .= prod_k_j / hxy
# input_trait(h) could be pre-computed, or should not be passed here, because the factors might be composite kernels themselves
H.args[2] = gradient_kernel!(H.args[2], h, x, y, input_trait(h))

ui, vi = @views W.U[:, i], W.V[i, :]
ForwardDiff.gradient!(ui, z->h(z, y), x) # these are bottlenecks
ForwardDiff.gradient!(vi, z->h(x, z), y) # TODO: replace by value_gradient_covariance!
@. ui *= prod_k_j / hxy
@. vi /= hxy
A = W.A # this is a LazyMatrixSum of LazyMatrixProducts
prod_k_xy = k(x, y)
if !iszero(prod_k_xy) # main case
for i in 1:r # parallelize this?
h, H = k.args[i], A.args[i]
ui, vi = @views W.U[:, i], W.V[i, :]
product_gradient_kernel_helper!(h, H, prod_k_xy, ui, vi, x, y)
end
else # if prod_k_xy is zero, a less efficient O(dr²) special case would be required; for now, throw an error
throw(DomainError("GradientKernel of Product not supported for products that are zero, since it would require a O(dr²) special case."))
end
return W
end

@inline function product_gradient_kernel_helper!(ki, Ki, prod_k_xy, ui, vi, x, y, IT::InputTrait = input_trait(ki))
ki_xy = ki(x, y)
D = Ki.args[1]
@. D.diag = prod_k_xy / ki_xy
Ki.args[2] = gradient_kernel!(Ki.args[2], ki, x, y, IT)
value_gradient_covariance!(ui, vi, ki, x, y, IT)
@. ui *= prod_k_xy / ki_xy
@. vi /= ki_xy
return Ki
end
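For context, the diagonal scaling and the `U`/`V` columns filled in by this helper come from differentiating the product twice; a sketch of the identity with P = ∏ᵢ kᵢ(x, y), again assuming no factor vanishes:
```latex
\nabla_x \nabla_y^\top P
  = \sum_{i=1}^r \frac{P}{k_i} \, \nabla_x \nabla_y^\top k_i
  + \sum_{i \neq j} \frac{P}{k_i k_j} \, \nabla_x k_i \, (\nabla_y k_j)^\top
```
The first sum is the `LazyMatrixSum` of diagonally scaled Hessian blocks built above; roughly, the cross terms correspond to the low-rank `U`/`V` correction of the Woodbury representation.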

############################# Separable Product ################################
# for product kernel with generic input
function allocate_gradient_kernel(k::SeparableProduct, x::AbstractVector{<:Number},
@@ -90,16 +84,16 @@

function gradient_kernel!(W::Woodbury, k::SeparableProduct, x::AbstractVector, y::AbstractVector, ::GenericInput = input_trait(k))
A = W.A # this is a LazyMatrixProducts of Diagonals
prod_k_j = k(x, y)
prod_k_xy = k(x, y)
D, H = A.args # first is scaling matrix by leave_one_out_products, second is diagonal of derivative kernels
for (i, ki) in enumerate(k.args)
xi, yi = x[i], y[i]
kixy = ki(xi, yi)
# D[i, i] = prod_k_j / kixy
# D[i, i] = prod_k_xy / kixy
D[i, i] = ki(xi, yi)
W.U[i, i] = ForwardDiff.derivative(z->ki(z, yi), xi)
W.V[i, i] = ForwardDiff.derivative(z->ki(xi, z), yi)
W.U[i, i] *= prod_k_j / kixy
W.U[i, i] *= prod_k_xy / kixy
W.V[i, i] /= kixy
H[i, i] = DerivativeKernel(ki)(xi, yi)
end
11 changes: 6 additions & 5 deletions src/properties.jl
@@ -42,15 +42,16 @@ input_trait(::IsotropicKernel) = IsotropicInput()
input_trait(::Union{Dot, ExponentialDot}) = DotProductInput()
input_trait(P::Power) = input_trait(P.k)

input_trait(S::Union{Product, Sum}) = S.input_trait
# special treatment for constant kernel, since it can function for any input
function input_trait(S::ProductsAndSums)
i = findfirst(x->!isa(x, Constant), S.args)
function sum_and_product_input_trait(args)
i = findfirst(x->!isa(x, Constant), args)
if isnothing(i) # all constant kernels
return IsotropicInput()
else
trait = input_trait(S.args[i]) # first non-constant kernel
for j in i+1:length(S.args)
k = S.args[j]
trait = input_trait(args[i]) # first non-constant kernel
for j in i+1:length(args)
k = args[j]
if k isa Constant # ignore constants, since they can function as any input type
continue
elseif input_trait(k) != trait # if the non-constant kernels don't have the same input type,