diff --git a/README.md b/README.md
index 47878a0..fa44fbd 100644
--- a/README.md
+++ b/README.md
@@ -241,8 +241,11 @@ b = zero(a);
 @time mul!(b, G, a); # multiplying with G allocates little memory
   0.394388 seconds (67 allocations: 86.516 KiB)
 ```
-Note that the last multiplication was with a **million by million matrix**,
+The last multiplication was with a **million by million matrix**,
 which would be impossible without CovarianceFunctions.jl's lazy and structured representation of the gradient kernel matrix.
+Note that `GradientKernel` only computes covariances of gradient observations;
+to get the `(d+1) × (d+1)` covariance kernel that also includes value observations,
+use `ValueGradientKernel`.
 To highlight the scalability of this MVM algorithm, we compare against the implementation in
 [GPyTorch](https://docs.gpytorch.ai/en/stable/kernels.html?highlight=kernels#rbfkernelgrad)
 and the fast *approximate* MVM provided by [D-SKIP](https://github.com/ericlee0803/GP_Derivatives).
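As a minimal sketch of the `ValueGradientKernel` note in the README hunk above (the constructor and `gramian` usage mirror the tests later in this diff; the explicit import list and exact block layout are assumptions):

```julia
using CovarianceFunctions
using CovarianceFunctions: EQ, ValueGradientKernel, gramian  # names defined in this package; export status assumed

k = EQ()                    # isotropic base kernel
G = ValueGradientKernel(k)  # covariances of joint (value, gradient) observations
d, n = 3, 8
X = randn(d, n)             # n points in d dimensions, one per column
K = gramian(G, X)           # lazy kernel matrix built from (d + 1) × (d + 1) blocks
```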
diff --git a/src/CovarianceFunctions.jl b/src/CovarianceFunctions.jl
index dcb1582..a72c7db 100644
--- a/src/CovarianceFunctions.jl
+++ b/src/CovarianceFunctions.jl
@@ -27,6 +27,8 @@ using IterativeSolvers
 using ToeplitzMatrices
 using FFTW
 
+# IDEA: AbstractKernel{T, IT}, where IT isa InputType
+# then const IsotropicKernel{T} = AbstractKernel{T, IsotropicInput} ...
 abstract type AbstractKernel{T} end
 abstract type MercerKernel{T} <: AbstractKernel{T} end
 abstract type StationaryKernel{T} <: MercerKernel{T} end
diff --git a/src/algebra.jl b/src/algebra.jl
index 11586e7..6cffb49 100644
--- a/src/algebra.jl
+++ b/src/algebra.jl
@@ -2,16 +2,15 @@
 # NOTE: output type inference of product, sum, and power not supported for
 # user-defined kernels unless Base.eltype is defined for them
 ################################ Product #######################################
-# IDEA: constructors which merge products and sums
-struct Product{T, AT<:Union{Tuple, AbstractVector}} <: AbstractKernel{T}
+struct Product{T, AT<:Union{Tuple, AbstractVector}, IT} <: AbstractKernel{T}
     args::AT
-    # input_traits : # could keep track of input_trait.(args)
-    # input_trait # could keep track of the overall input trait
+    input_trait::IT # keeps track of the overall input trait
 end
 @functor Product
 function Product(k::Union{Tuple, AbstractVector})
     T = promote_type(eltype.(k)...)
-    Product{T, typeof(k)}(k)
+    IT = sum_and_product_input_trait(k)
+    Product{T, typeof(k), typeof(IT)}(k, IT)
 end
 Product(k...) = Product(k)
 (P::Product)(τ) = prod(k->k(τ), P.args) # IDEA could check for isotropy here
@@ -26,21 +25,21 @@ Base.:*(c::Number, k::AbstractKernel) = Constant(c) * k
 Base.:*(k::AbstractKernel, c::Number) = Constant(c) * k
 
 ################################### Sum ########################################
-struct Sum{T, AT<:Union{Tuple, AbstractVector}} <: AbstractKernel{T}
+struct Sum{T, AT<:Union{Tuple, AbstractVector}, IT} <: AbstractKernel{T}
     args::AT
-    # input_trait # could keep track of the overall input trait
+    input_trait::IT # keeps track of the overall input trait
 end
 @functor Sum
 function Sum(k::Union{Tuple, AbstractVector})
     T = promote_type(eltype.(k)...)
-    Sum{T, typeof(k)}(k)
+    IT = sum_and_product_input_trait(k)
+    Sum{T, typeof(k), typeof(IT)}(k, IT)
 end
 Sum(k...) = Sum(k)
 (S::Sum)(τ) = sum(k->k(τ), S.args) # should only be called if S is stationary
 (S::Sum)(x, y) = sum(k->k(x, y), S.args)
 # (S::Sum)(τ) = isstationary(S) ? sum(k->k(τ), S.args) : error("One argument evaluation not possible for non-stationary kernel")
 # (S::Sum)(x, y) = isstationary(S) ? S(difference(x, y)) : sum(k->k(x, y), S.args)
-Sum(k...) = Sum(k)
 
 Base.sum(k::AbstractVector{<:AbstractKernel}) = Sum(k)
 Base.:+(k::AbstractKernel...) = Sum(k)
@@ -48,15 +47,16 @@ Base.:+(k::AbstractKernel, c::Number) = k + Constant(c)
 Base.:+(c::Number, k::AbstractKernel) = k + Constant(c)
 
 ################################## Power #######################################
-struct Power{T, K<:AbstractKernel} <: AbstractKernel{T}
+struct Power{T, K, IT} <: AbstractKernel{T}
     k::K
     p::Int
-    # input_trait # could keep track of the overall input trait
+    input_trait::IT # keeps track of the overall input trait
 end
 @functor Power
 function Power(k, p::Int)
     T = promote_type(eltype(k))
-    Power{T, typeof(k)}(k, p)
+    IT = input_trait(k)
+    Power{T, typeof(k), typeof(IT)}(k, p, IT)
 end
 (P::Power)(τ) = P.k(τ)^P.p
 (P::Power)(x, y) = P.k(x, y)^P.p
@@ -64,9 +64,9 @@ Base.:^(k::AbstractKernel, p::Number) = Power(k, p)
 
 ############################ Separable Product #################################
 # product kernel, but separately evaluates component kernels on different parts of the input
+# NOTE: input_trait(::SeparableProduct) defaults to GenericInput
 struct SeparableProduct{T, K} <: AbstractKernel{T}
     args::K # kernel for input covariances
-    # input_trait # could keep track of the overall input trait
 end
 @functor SeparableProduct
 SeparableProduct(k...) = SeparableProduct(k)
@@ -101,17 +101,16 @@ end
 # useful for "Additive Gaussian Processes" - Duvenaud 2011
 # https://papers.nips.cc/paper/2011/file/4c5bde74a8f110656874902f07378009-Paper.pdf
 # IDEA: could introduce have AdditiveKernel with "order" argument, that adds higher order interactions
+# NOTE: input_trait(::SeparableSum) defaults to GenericInput
 struct SeparableSum{T, K} <: AbstractKernel{T}
     args::K # kernel for input covariances
 end
 @functor SeparableSum
-
 SeparableSum(k...) = SeparableSum(k)
 function SeparableSum(k::Union{Tuple, AbstractVector})
     T = promote_type(eltype.(k)...)
     SeparableSum{T, typeof(k)}(k)
 end
-
 function (k::SeparableSum)(x::AbstractVector, y::AbstractVector)
     d = checklength(x, y)
     length(k.args) == d || throw(DimensionMismatch("SeparableProduct needs d = $d kernels but has r = $(length(k.args))"))
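The point of the new `IT` type parameter in `Product`, `Sum`, and `Power` is that the overall input trait is computed once at construction (via `sum_and_product_input_trait`, defined in `src/properties.jl` further down) and then read from the struct instead of being re-derived on every call. A hedged sketch of the resulting behavior, assuming the unexported names can be imported directly:

```julia
using CovarianceFunctions
using CovarianceFunctions: EQ, input_trait, IsotropicInput  # unexported internals; availability assumed

k = EQ() * EQ() + 1   # product and sum of isotropic kernels plus a constant
# the Constant summand is ignored by sum_and_product_input_trait, so the composite
# kernel still reports an isotropic input trait, now read from the cached field
input_trait(k) isa IsotropicInput   # expected to be true
```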
diff --git a/src/gradient.jl b/src/gradient.jl
index 48f7d2c..df692bd 100644
--- a/src/gradient.jl
+++ b/src/gradient.jl
@@ -431,39 +431,32 @@ function (G::ValueGradientKernel)(x, y)
     value_gradient_kernel(G.k, x, y, input_trait(G.k))
 end
 
-function value_gradient_kernel(k, x, y, T::InputTrait = input_trait(G.k))
+function value_gradient_kernel(k, x, y, IT::InputTrait = input_trait(k))
     d = length(x)
     kxy = k(x, y)
     value_value = kxy
-    value_gradient = MVector{d, typeof(kxy)}(undef)
-    gradient_value = MVector{d, typeof(kxy)}(undef)
-    gradient_gradient = gradient_kernel(k, x, y, T)
+    value_gradient = zeros(typeof(kxy), d)
+    gradient_value = zeros(typeof(kxy), d)
+    gradient_gradient = gradient_kernel(k, x, y, IT)
     K = DerivativeKernelElement(value_value, value_gradient, gradient_value, gradient_gradient)
-    value_gradient_kernel!(K, k, x, y, T)
-end
-
-# IDEA: specialize first_gradient!(g, k, x, y) = ForwardDiff.gradient!(g, z->k(z, y), x)
-# computes covariance between value and gradient
-# function value_gradient_covariance!(gx, gy, k, x, y, ::IsotropicInput)
-#     r² = sum(abs2, difference(x, y))
-#     g .= derivative(k, r²)
-# end
-#
-# function value_gradient_covariance!(gx, gy, k, x, y, ::GenericInput())
-#     r² = sum(abs2, difference(x, y))
-#     g .= derivative(k, r²)
-# end
-
-# IDEA: specialize evaluate for IsotropicInput, DotProductInput
-# returns block matrix
-function value_gradient_kernel!(K::DerivativeKernelElement, k, x, y, T::InputTrait)
+    value_gradient_kernel!(K, k, x, y, IT)
+end
+
+# IDEA: for Sum and Product kernels, if input_trait is not passed, could default to Generic
+# No, should keep track of input type in kernel
+function value_gradient_kernel!(K::DerivativeKernelElement, k, x, y)
+    value_gradient_kernel!(K, k, x, y, input_trait(k))
+end
+function value_gradient_kernel!(K::DerivativeKernelElement, k, x, y, IT::InputTrait)
     K.value_value = k(x, y)
-    K.value_gradient .= ForwardDiff.gradient(z->k(x, z), y) # ForwardDiff.gradient!(r, z->k(z, y), x)
-    K.gradient_value .= ForwardDiff.gradient(z->k(z, y), x)
-    K.gradient_gradient = gradient_kernel!(K.gradient_gradient, k, x, y, T)
+    value_gradient_covariance!(K.gradient_value, K.value_gradient, k, x, y, IT)
+    K.gradient_gradient = gradient_kernel!(K.gradient_gradient, k, x, y, IT)
     return K
 end
+# NOTE: since value_gradient_covariance! was added, specializations of value_gradient_kernel!
+# only yield marginal performance improvements (<2x)
+# and could be removed to reduce LOC
 function value_gradient_kernel!(K::DerivativeKernelElement, k, x, y, T::DotProductInput)
     xy = dot(x, y)
     k0, k1 = value_derivative(k, xy)
@@ -493,6 +486,76 @@ function evaluate_block!(Gij::DerivativeKernelElement, k::ValueGradientKernel, x
     value_gradient_kernel!(Gij, k.k, x, y, IT)
 end
 
+################################################################################
+# computes the covariance between value and gradient observations, needed for the value-gradient kernel
+# technically, the complexity would be O(d) even with the generic implementation;
+# in practice, at least an order of magnitude can be gained by specialized implementations
+function value_gradient_covariance!(gx, gy, k, x, y, ::GenericInput)
+    # GradientConfig() # for the generic version, this could be pre-computed for efficiency gains
+    ForwardDiff.gradient!(gx, z->k(z, y), x) # these are bottlenecks
+    ForwardDiff.gradient!(gy, z->k(x, z), y)
+    return gx, gy
+end
+
+function value_gradient_covariance!(gx, gy, k, x, y, ::GenericInput, α::Real, β::Real)
+    tx, ty = ForwardDiff.gradient(z->k(z, y), x), ForwardDiff.gradient(z->k(x, z), y)
+    @. gx = α * tx + β * gx
+    @. gy = α * ty + β * gy
+    return gx, gy
+end
+
+function value_gradient_covariance!(gx, gy, k::Sum, x, y, ::GenericInput, α::Real = 1, β::Real = 0)
+    @. gx *= β
+    @. gy *= β
+    for h in k.args # for higher efficiency, input_trait(h) should not be called here if h is a composite kernel
+        value_gradient_covariance!(gx, gy, h, x, y, input_trait(h), α, 1)
+    end
+    return gx, gy
+end
+
+# this is trickier to do without additional allocations
+# could do it with one more vector
+function value_gradient_covariance!(gx, gy, k::Product, x, y, ::GenericInput, α::Real = 1, β::Real = 0)
+    @. gx *= β
+    @. gy *= β
+    prod_k_xy = k(x, y)
+    if !iszero(prod_k_xy)
+        for i in eachindex(k.args)
+            ki = k.args[i]
+            αi = α * prod_k_xy / ki(x, y)
+            value_gradient_covariance!(gx, gy, ki, x, y, input_trait(ki), αi, 1)
+        end
+    else # requires a less efficient O(dr²) special case if prod_k_xy is zero; for now, throw an error
+        throw(DomainError("value_gradient_covariance! of Product not supported for products that are zero"))
+    end
+    return gx, gy
+end
+
+function value_gradient_covariance!(gx, gy, k, x, y, ::IsotropicInput, α::Real = 1, β::Real = 0)
+    r = difference(x, y)
+    r² = sum(abs2, r)
+    k1 = ForwardDiff.derivative(k, r²) # IDEA: this computation could be pooled with the gradient computation
+    @. gx = α * 2k1 * r + β * gx
+    @. gy = -α * 2k1 * r + β * gy
+    return gx, gy
+end
+
+function value_gradient_covariance!(gx, gy, k, x, y, ::DotProductInput, α::Real = 1, β::Real = 0)
+    xy = dot(x, y)
+    k1 = ForwardDiff.derivative(k, xy) # IDEA: this computation could be pooled with the gradient computation
+    @. gx = α * k1 * y + β * gx
+    @. gy = α * k1 * x + β * gy
+    return gx, gy
+end
+
+function value_gradient_covariance!(gx, gy, k, x, y, ::StationaryLinearFunctionalInput, α::Real = 1, β::Real = 0)
+    cr = dot(k.c, difference(x, y))
+    k1 = ForwardDiff.derivative(k, cr) # IDEA: this computation could be pooled with the gradient computation
+    @. gx = α * k1 * k.c + β * gx
+    @. gy = -α * k1 * k.c + β * gy
+    return gx, gy
+end
+
 ################################################################################
 # for 1d inputs
 struct DerivativeKernel{T, K} <: AbstractKernel{T}
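The `IsotropicInput` method above uses the chain rule for k(x, y) = f(|x − y|²), namely ∇ₓ k = 2 f′(r²)(x − y) and ∇_y k = −∇ₓ k, with `α` and `β` acting as axpy-style scalings of the new contribution and the existing buffer contents. A self-contained check of that identity with throwaway functions (not the package's API):

```julia
# check of the identity used by the IsotropicInput specialization:
# for k(x, y) = f(|x - y|²):  ∇ₓ k = 2 f'(r²) (x - y)  and  ∇_y k = -2 f'(r²) (x - y)
using ForwardDiff, LinearAlgebra

f(r²) = exp(-r² / 2)                  # any smooth radial profile (EQ-like, for illustration)
k(x, y) = f(sum(abs2, x - y))

x, y = randn(3), randn(3)
r = x - y
g = 2 * ForwardDiff.derivative(f, dot(r, r)) * r

ForwardDiff.gradient(z -> k(z, y), x) ≈  g   # true
ForwardDiff.gradient(z -> k(x, z), y) ≈ -g   # true
```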
diff --git a/src/gradient_algebra.jl b/src/gradient_algebra.jl
index 1df7b93..f30862e 100644
--- a/src/gradient_algebra.jl
+++ b/src/gradient_algebra.jl
@@ -35,39 +35,33 @@ function gradient_kernel(k::Product, x, y, ::GenericInput)
     gradient_kernel!(W, k, x, y, GenericInput())
 end
 
-# IDEA: could have kernel element for heterogeneous product kernels, problem: would need to
-# pre-allocate storage for Jacobian matrix or be allocating
-# struct ProductGradientKernelElement{T, K, X, Y} <: Factorization{T}
-#     k::K
-#     x::X
-#     y::Y
-# end
 function gradient_kernel!(W::Woodbury, k::Product, x::AbstractVector, y::AbstractVector, ::GenericInput = input_trait(k))
-    A = W.A # this is a LazyMatrixSum of LazyMatrixProducts
-    prod_k_j = k(x, y)
-
-    # k_vec(x, y) = [h(x, y) for h in k.args] # include in loop
-    # ForwardDiff.jacobian!(W.U', z->k_vec(z, y), x) # this is actually less allocating than the gradient! option
-    # ForwardDiff.jacobian!(W.V, z->k_vec(x, z), y)
-    # GradientConfig() # for generic version, this could be pre-computed for efficiency gains
     r = length(k.args)
-    for i in 1:r # parallelize this?
-        h, H = k.args[i], A.args[i]
-        hxy = h(x, y)
-        D = H.args[1]
-        D.diag .= prod_k_j / hxy
-        # input_trait(h) could be pre-computed, or should not be passed here, because the factors might be composite kernels themselves
-        H.args[2] = gradient_kernel!(H.args[2], h, x, y, input_trait(h))
-
-        ui, vi = @views W.U[:, i], W.V[i, :]
-        ForwardDiff.gradient!(ui, z->h(z, y), x) # these are bottlenecks
-        ForwardDiff.gradient!(vi, z->h(x, z), y) # TODO: replace by value_gradient_covariance!
-        @. ui *= prod_k_j / hxy
-        @. vi /= hxy
+    A = W.A # this is a LazyMatrixSum of LazyMatrixProducts
+    prod_k_xy = k(x, y)
+    if !iszero(prod_k_xy) # main case
+        for i in 1:r # parallelize this?
+            h, H = k.args[i], A.args[i]
+            ui, vi = @views W.U[:, i], W.V[i, :]
+            product_gradient_kernel_helper!(h, H, prod_k_xy, ui, vi, x, y)
+        end
+    else # requires less efficient O(dr²) special case if prod_k_xy is zero, for now, throw error
+        throw(DomainError("GradientKernel of Product not supported for products that are zero, since it would require a O(dr²) special case."))
     end
     return W
 end
 
+@inline function product_gradient_kernel_helper!(ki, Ki, prod_k_xy, ui, vi, x, y, IT::InputTrait = input_trait(ki))
+    ki_xy = ki(x, y)
+    D = Ki.args[1]
+    @. D.diag = prod_k_xy / ki_xy
+    Ki.args[2] = gradient_kernel!(Ki.args[2], ki, x, y, IT)
+    value_gradient_covariance!(ui, vi, ki, x, y, IT)
+    @. ui *= prod_k_xy / ki_xy
+    @. vi /= ki_xy
+    return Ki
+end
+
 ############################# Separable Product ################################
 # for product kernel with generic input
 function allocate_gradient_kernel(k::SeparableProduct, x::AbstractVector{<:Number},
@@ -90,16 +84,16 @@ end
 function gradient_kernel!(W::Woodbury, k::SeparableProduct, x::AbstractVector, y::AbstractVector, ::GenericInput = input_trait(k))
     A = W.A # this is a LazyMatrixProducts of Diagonals
-    prod_k_j = k(x, y)
+    prod_k_xy = k(x, y)
     D, H = A.args # first is scaling matrix by leave_one_out_products, second is diagonal of derivative kernels
     for (i, ki) in enumerate(k.args)
         xi, yi = x[i], y[i]
         kixy = ki(xi, yi)
-        # D[i, i] = prod_k_j / kixy
+        # D[i, i] = prod_k_xy / kixy
         D[i, i] = ki(xi, yi)
         W.U[i, i] = ForwardDiff.derivative(z->ki(z, yi), xi)
         W.V[i, i] = ForwardDiff.derivative(z->ki(xi, z), yi)
-        W.U[i, i] *= prod_k_j / kixy
+        W.U[i, i] *= prod_k_xy / kixy
         W.V[i, i] /= kixy
         H[i, i] = DerivativeKernel(ki)(xi, yi)
     end
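The Woodbury matrix that `gradient_kernel!` fills for a `Product` encodes the product rule for the mixed second derivatives: diagonally scaled Hessian blocks for each factor plus a rank-r correction built from the cross gradients, which is what `product_gradient_kernel_helper!` writes into `D`, `Ki.args[2]`, `ui`, and `vi`. A self-contained check of the two-factor identity with throwaway functions (not the package's API):

```julia
# check of the two-factor product rule that the Woodbury representation encodes:
# ∂x∂y' (k1 k2) = k2 ∂x∂y'k1 + k1 ∂x∂y'k2 + ∇x k1 (∇y k2)' + ∇x k2 (∇y k1)'
using ForwardDiff, LinearAlgebra

k1(x, y) = exp(-sum(abs2, x - y) / 2)     # EQ-like factor
k2(x, y) = (dot(x, y) + 1)^2              # quadratic (dot-product) factor
k(x, y)  = k1(x, y) * k2(x, y)

x, y = randn(3), randn(3)
dxdy(f) = ForwardDiff.jacobian(yy -> ForwardDiff.gradient(xx -> f(xx, yy), x), y)
gx(f) = ForwardDiff.gradient(xx -> f(xx, y), x)
gy(f) = ForwardDiff.gradient(yy -> f(x, yy), y)

lhs = dxdy(k)
rhs = k2(x, y) * dxdy(k1) + k1(x, y) * dxdy(k2) + gx(k1) * gy(k2)' + gx(k2) * gy(k1)'
lhs ≈ rhs   # true
```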
diff --git a/src/properties.jl b/src/properties.jl
index 90fd9bf..4e05215 100644
--- a/src/properties.jl
+++ b/src/properties.jl
@@ -42,15 +42,16 @@ input_trait(::IsotropicKernel) = IsotropicInput()
 input_trait(::Union{Dot, ExponentialDot}) = DotProductInput()
 input_trait(P::Power) = input_trait(P.k)
+input_trait(S::Union{Product, Sum}) = S.input_trait
 # special treatment for constant kernel, since it can function for any input
-function input_trait(S::ProductsAndSums)
-    i = findfirst(x->!isa(x, Constant), S.args)
+function sum_and_product_input_trait(args)
+    i = findfirst(x->!isa(x, Constant), args)
     if isnothing(i) # all constant kernels
         return IsotropicInput()
     else
-        trait = input_trait(S.args[i]) # first non-constant kernel
-        for j in i+1:length(S.args)
-            k = S.args[j]
+        trait = input_trait(args[i]) # first non-constant kernel
+        for j in i+1:length(args)
+            k = args[j]
             if k isa Constant # ignore constants, since they can function as any input type
                 continue
             elseif input_trait(k) != trait # if the non-constant kernels don't have the same input type,
diff --git a/test/gradient_algebra.jl b/test/gradient_algebra.jl
index 13ba819..763bbca 100644
--- a/test/gradient_algebra.jl
+++ b/test/gradient_algebra.jl
@@ -31,7 +31,6 @@ const AbstractMatOrFac = Union{AbstractMatrix, Factorization}
 end
 
 @testset "gradient algebra" begin
-    # TODO: test for ValueGradientKernel
     # test data
     d, n = 3, 7
     X = randn(d, n)
@@ -48,34 +47,26 @@ end
     @test MK ≈ Mkk + Mkh
 
     # sum of heterogeneous kernel types
-    k = CovarianceFunctions.EQ() # EQ
-    h = CovarianceFunctions.Dot()^2 + 1 # quadratic
-    gk, gh = GradientKernel(k), GradientKernel(h)
-    G = GradientKernel(k + h)
-    K, Kk, Kh = gramian(G, X), gramian(gk, X), gramian(gh, X)
-
-    MK, Mkk, Mkh = Matrix(K), Matrix(Kk), Matrix(Kh)
-    @test K isa BlockFactorization
-    @test K.A[1, 1] isa LazyMatrixSum
-    @test MK ≈ Mkk + Mkh
-    a = randn(d*n)
-    @test K*a ≈ MK*a
-    @test K*a ≈ Mkk*a + Mkh*a
-    @test K*a ≈ Kk*a + Kh*a
-
+    eq = CovarianceFunctions.EQ() # EQ
+    line = CovarianceFunctions.Dot() # dot
+    quad = line^2 + 1
+    cosine = CovarianceFunctions.Cosine(randn(d))
     x, y = X[:, 1], X[:, 2]
-    # could add k+h
-    # TODO: Chained, VerticalScaling, Warped, SeparableSum, SeparableProduct
-    algebraic_combinations = [k * h, SeparableProduct(k, h, k)]
-    for kh in algebraic_combinations
-        # testing product gradient kernel
-        G = GradientKernel(kh)
-        kh_control = (x, y) -> (kh)(x, y)
-        G_control = GradientKernel(kh_control)
-        Gxy = G(x, y)
-        Gxy_control = G_control(x, y)
-        @test typeof(Gxy) != typeof(Gxy_control) # means special structure was discovered
-        @test Matrix(Gxy) ≈ Gxy_control
+    # TODO: Chained, VerticalScaling, Warped, SeparableSum
+    heterogeneous_combinations = [eq + line, eq + quad, eq * cosine, eq * quad,
+                                  SeparableProduct(eq, quad, eq), line * line]
+    gradient_kernels = [GradientKernel, ValueGradientKernel]
+    for gradient_kernel in gradient_kernels
+        for kh in heterogeneous_combinations
+            # compare the structured kernel element against a generic control
+            G = gradient_kernel(kh)
+            kh_control = (x, y) -> (kh)(x, y)
+            G_control = gradient_kernel(kh_control)
+            Gxy = G(x, y)
+            Gxy_control = G_control(x, y)
+            @test typeof(Gxy) != typeof(Gxy_control) # means special structure was discovered
+            @test Matrix(Gxy) ≈ Gxy_control
+        end
     end
 end
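To close, a hedged usage sketch that mirrors the updated test loop: build a structured gradient kernel for one heterogeneous product and compare it against a generic control that only sees an opaque function (names are taken from the test file above; their export status is an assumption):

```julia
using CovarianceFunctions
using CovarianceFunctions: EQ, Dot, GradientKernel

eq   = EQ()
quad = Dot()^2 + 1
G    = GradientKernel(eq * quad)   # structured d × d gradient-gradient covariance

x, y = randn(3), randn(3)
Gxy = G(x, y)                      # lazy element with Woodbury structure
# generic control, as in the test: wrapping the kernel in a plain function hides its structure
Matrix(Gxy) ≈ GradientKernel((a, b) -> eq(a, b) * quad(a, b))(x, y)   # expected to be true
```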