diff --git a/README.md b/README.md
index 47878a0..fa44fbd 100644
--- a/README.md
+++ b/README.md
@@ -241,8 +241,11 @@ b = zero(a);
 @time mul!(b, G, a); # multiplying with G allocates little memory
   0.394388 seconds (67 allocations: 86.516 KiB)
 ```
-Note that the last multiplication was with a **million by million matrix**,
+The last multiplication was with a **million by million matrix**,
 which would be impossible without CovarianceFunctions.jl's lazy and structured representation of the gradient kernel matrix.
+Note that `GradientKernel` only computes covariances of gradient observations;
+to get the `(d+1) × (d+1)` covariance kernel that also includes value observations,
+use `ValueGradientKernel`.
 To highlight the scalability of this MVM algorithm, we compare against the implementation in
 [GPyTorch](https://docs.gpytorch.ai/en/stable/kernels.html?highlight=kernels#rbfkernelgrad)
 and the fast *approximate* MVM provided by [D-SKIP](https://github.com/ericlee0803/GP_Derivatives).
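As a minimal sketch of the `ValueGradientKernel` note in the README hunk above (the constructor and `gramian` usage mirror the tests later in this diff; the explicit import list and exact block layout are assumptions):

```julia
using CovarianceFunctions
using CovarianceFunctions: EQ, ValueGradientKernel, gramian  # names defined in this package; export status assumed

k = EQ()                    # isotropic base kernel
G = ValueGradientKernel(k)  # covariances of joint (value, gradient) observations
d, n = 3, 8
X = randn(d, n)             # n points in d dimensions, one per column
K = gramian(G, X)           # lazy kernel matrix built from (d + 1) × (d + 1) blocks
```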
diff --git a/src/CovarianceFunctions.jl b/src/CovarianceFunctions.jl
index dcb1582..a72c7db 100644
--- a/src/CovarianceFunctions.jl
+++ b/src/CovarianceFunctions.jl
@@ -27,6 +27,8 @@ using IterativeSolvers
 using ToeplitzMatrices
 using FFTW
 
+# IDEA: AbstractKernel{T, IT}, where IT isa InputType
+# then const IsotropicKernel{T} = AbstractKernel{T, IsotropicInput} ...
 abstract type AbstractKernel{T} end
 abstract type MercerKernel{T} <: AbstractKernel{T} end
 abstract type StationaryKernel{T} <: MercerKernel{T} end
diff --git a/src/algebra.jl b/src/algebra.jl
index 11586e7..6cffb49 100644
--- a/src/algebra.jl
+++ b/src/algebra.jl
@@ -2,16 +2,15 @@
 # NOTE: output type inference of product, sum, and power not supported for
 # user-defined kernels unless Base.eltype is defined for them
 ################################ Product #######################################
-# IDEA: constructors which merge products and sums
-struct Product{T, AT<:Union{Tuple, AbstractVector}} <: AbstractKernel{T}
+struct Product{T, AT<:Union{Tuple, AbstractVector}, IT} <: AbstractKernel{T}
     args::AT
-    # input_traits : # could keep track of input_trait.(args)
-    # input_trait # could keep track of the overall input trait
+    input_trait::IT # keeps track of the overall input trait
 end
 @functor Product
 function Product(k::Union{Tuple, AbstractVector})
     T = promote_type(eltype.(k)...)
-    Product{T, typeof(k)}(k)
+    IT = sum_and_product_input_trait(k)
+    Product{T, typeof(k), typeof(IT)}(k, IT)
 end
 Product(k...) = Product(k)
 (P::Product)(τ) = prod(k->k(τ), P.args) # IDEA could check for isotropy here
@@ -26,21 +25,21 @@ Base.:*(c::Number, k::AbstractKernel) = Constant(c) * k
 Base.:*(k::AbstractKernel, c::Number) = Constant(c) * k
 
 ################################### Sum ########################################
-struct Sum{T, AT<:Union{Tuple, AbstractVector}} <: AbstractKernel{T}
+struct Sum{T, AT<:Union{Tuple, AbstractVector}, IT} <: AbstractKernel{T}
     args::AT
-    # input_trait # could keep track of the overall input trait
+    input_trait::IT # keeps track of the overall input trait
 end
 @functor Sum
 function Sum(k::Union{Tuple, AbstractVector})
     T = promote_type(eltype.(k)...)
-    Sum{T, typeof(k)}(k)
+    IT = sum_and_product_input_trait(k)
+    Sum{T, typeof(k), typeof(IT)}(k, IT)
 end
 Sum(k...) = Sum(k)
 (S::Sum)(τ) = sum(k->k(τ), S.args) # should only be called if S is stationary
 (S::Sum)(x, y) = sum(k->k(x, y), S.args)
 # (S::Sum)(τ) = isstationary(S) ? sum(k->k(τ), S.args) : error("One argument evaluation not possible for non-stationary kernel")
 # (S::Sum)(x, y) = isstationary(S) ? S(difference(x, y)) : sum(k->k(x, y), S.args)
-Sum(k...) = Sum(k)
 
 Base.sum(k::AbstractVector{<:AbstractKernel}) = Sum(k)
 Base.:+(k::AbstractKernel...) = Sum(k)
@@ -48,15 +47,16 @@ Base.:+(k::AbstractKernel, c::Number) = k + Constant(c)
 Base.:+(c::Number, k::AbstractKernel) = k + Constant(c)
 
 ################################## Power #######################################
-struct Power{T, K<:AbstractKernel} <: AbstractKernel{T}
+struct Power{T, K, IT} <: AbstractKernel{T}
     k::K
     p::Int
-    # input_trait # could keep track of the overall input trait
+    input_trait::IT # keeps track of the overall input trait
 end
 @functor Power
 function Power(k, p::Int)
     T = promote_type(eltype(k))
-    Power{T, typeof(k)}(k, p)
+    IT = input_trait(k)
+    Power{T, typeof(k), typeof(IT)}(k, p, IT)
 end
 (P::Power)(τ) = P.k(τ)^P.p
 (P::Power)(x, y) = P.k(x, y)^P.p
@@ -64,9 +64,9 @@ Base.:^(k::AbstractKernel, p::Number) = Power(k, p)
 
 ############################ Separable Product #################################
 # product kernel, but separately evaluates component kernels on different parts of the input
+# NOTE: input_trait(::SeparableProduct) defaults to GenericInput
 struct SeparableProduct{T, K} <: AbstractKernel{T}
     args::K # kernel for input covariances
-    # input_trait # could keep track of the overall input trait
 end
 @functor SeparableProduct
 SeparableProduct(k...) = SeparableProduct(k)
@@ -101,17 +101,16 @@ end
 # useful for "Additive Gaussian Processes" - Duvenaud 2011
 # https://papers.nips.cc/paper/2011/file/4c5bde74a8f110656874902f07378009-Paper.pdf
 # IDEA: could introduce have AdditiveKernel with "order" argument, that adds higher order interactions
+# NOTE: input_trait(::SeparableSum) defaults to GenericInput
 struct SeparableSum{T, K} <: AbstractKernel{T}
     args::K # kernel for input covariances
 end
 @functor SeparableSum
-
 SeparableSum(k...) = SeparableSum(k)
 function SeparableSum(k::Union{Tuple, AbstractVector})
     T = promote_type(eltype.(k)...)
     SeparableSum{T, typeof(k)}(k)
 end
-
 function (k::SeparableSum)(x::AbstractVector, y::AbstractVector)
     d = checklength(x, y)
     length(k.args) == d || throw(DimensionMismatch("SeparableProduct needs d = $d kernels but has r = $(length(k.args))"))
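The point of the new `IT` type parameter in `Product`, `Sum`, and `Power` is that the overall input trait is computed once at construction (via `sum_and_product_input_trait`, defined in `src/properties.jl` further down) and then read from the struct instead of being re-derived on every call. A hedged sketch of the resulting behavior, assuming the unexported names can be imported directly:

```julia
using CovarianceFunctions
using CovarianceFunctions: EQ, input_trait, IsotropicInput  # unexported internals; availability assumed

k = EQ() * EQ() + 1   # product and sum of isotropic kernels plus a constant
# the Constant summand is ignored by sum_and_product_input_trait, so the composite
# kernel still reports an isotropic input trait, now read from the cached field
input_trait(k) isa IsotropicInput   # expected to be true
```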
diff --git a/src/gradient.jl b/src/gradient.jl
index 48f7d2c..df692bd 100644
--- a/src/gradient.jl
+++ b/src/gradient.jl
@@ -431,39 +431,32 @@ function (G::ValueGradientKernel)(x, y)
     value_gradient_kernel(G.k, x, y, input_trait(G.k))
 end
 
-function value_gradient_kernel(k, x, y, T::InputTrait = input_trait(G.k))
+function value_gradient_kernel(k, x, y, IT::InputTrait = input_trait(k))
     d = length(x)
     kxy = k(x, y)
     value_value = kxy
-    value_gradient = MVector{d, typeof(kxy)}(undef)
-    gradient_value = MVector{d, typeof(kxy)}(undef)
-    gradient_gradient = gradient_kernel(k, x, y, T)
+    value_gradient = zeros(typeof(kxy), d)
+    gradient_value = zeros(typeof(kxy), d)
+    gradient_gradient = gradient_kernel(k, x, y, IT)
     K = DerivativeKernelElement(value_value, value_gradient, gradient_value, gradient_gradient)
-    value_gradient_kernel!(K, k, x, y, T)
-end
-
-# IDEA: specialize first_gradient!(g, k, x, y) = ForwardDiff.gradient!(g, z->k(z, y), x)
-# computes covariance between value and gradient
-# function value_gradient_covariance!(gx, gy, k, x, y, ::IsotropicInput)
-#     r² = sum(abs2, difference(x, y))
-#     g .= derivative(k, r²)
-# end
-#
-# function value_gradient_covariance!(gx, gy, k, x, y, ::GenericInput())
-#     r² = sum(abs2, difference(x, y))
-#     g .= derivative(k, r²)
-# end
-
-# IDEA: specialize evaluate for IsotropicInput, DotProductInput
-# returns block matrix
-function value_gradient_kernel!(K::DerivativeKernelElement, k, x, y, T::InputTrait)
+    value_gradient_kernel!(K, k, x, y, IT)
+end
+
+# IDEA: for Sum and Product kernels, if input_trait is not passed, could default to Generic
+# No, should keep track of input type in kernel
+function value_gradient_kernel!(K::DerivativeKernelElement, k, x, y)
+    value_gradient_kernel!(K, k, x, y, input_trait(k))
+end
+function value_gradient_kernel!(K::DerivativeKernelElement, k, x, y, IT::InputTrait)
     K.value_value = k(x, y)
-    K.value_gradient .= ForwardDiff.gradient(z->k(x, z), y) # ForwardDiff.gradient!(r, z->k(z, y), x)
-    K.gradient_value .= ForwardDiff.gradient(z->k(z, y), x)
-    K.gradient_gradient = gradient_kernel!(K.gradient_gradient, k, x, y, T)
+    value_gradient_covariance!(K.gradient_value, K.value_gradient, k, x, y, IT)
+    K.gradient_gradient = gradient_kernel!(K.gradient_gradient, k, x, y, IT)
     return K
 end
+# NOTE: since value_gradient_covariance! was added, specializations of value_gradient_kernel!
+# only yield marginal performance improvements (<2x)
+# and could be removed to reduce LOC
 function value_gradient_kernel!(K::DerivativeKernelElement, k, x, y, T::DotProductInput)
     xy = dot(x, y)
     k0, k1 = value_derivative(k, xy)
@@ -493,6 +486,76 @@ function evaluate_block!(Gij::DerivativeKernelElement, k::ValueGradientKernel, x
     value_gradient_kernel!(Gij, k.k, x, y, IT)
 end
 
+################################################################################
+# computes the covariance between value and gradient observations, needed for the value-gradient kernel
+# technically, the complexity would be O(d) even with the generic implementation;
+# in practice, at least an order of magnitude can be gained by specialized implementations
+function value_gradient_covariance!(gx, gy, k, x, y, ::GenericInput)
+    # GradientConfig() # for the generic version, this could be pre-computed for efficiency gains
+    ForwardDiff.gradient!(gx, z->k(z, y), x) # these are bottlenecks
+    ForwardDiff.gradient!(gy, z->k(x, z), y)
+    return gx, gy
+end
+
+function value_gradient_covariance!(gx, gy, k, x, y, ::GenericInput, α::Real, β::Real)
+    tx, ty = ForwardDiff.gradient(z->k(z, y), x), ForwardDiff.gradient(z->k(x, z), y)
+    @. gx = α * tx + β * gx
+    @. gy = α * ty + β * gy
+    return gx, gy
+end
+
+function value_gradient_covariance!(gx, gy, k::Sum, x, y, ::GenericInput, α::Real = 1, β::Real = 0)
+    @. gx *= β
+    @. gy *= β
+    for h in k.args # for higher efficiency, input_trait(h) should not be called here if h is a composite kernel
+        value_gradient_covariance!(gx, gy, h, x, y, input_trait(h), α, 1)
+    end
+    return gx, gy
+end
+
+# this is trickier to do without additional allocations
+# could do it with one more vector
+function value_gradient_covariance!(gx, gy, k::Product, x, y, ::GenericInput, α::Real = 1, β::Real = 0)
+    @. gx *= β
+    @. gy *= β
+    prod_k_xy = k(x, y)
+    if !iszero(prod_k_xy)
+        for i in eachindex(k.args)
+            ki = k.args[i]
+            αi = α * prod_k_xy / ki(x, y)
+            value_gradient_covariance!(gx, gy, ki, x, y, input_trait(ki), αi, 1)
+        end
+    else # requires a less efficient O(dr²) special case if prod_k_xy is zero; for now, throw an error
+        throw(DomainError("value_gradient_covariance! of Product not supported for products that are zero"))
+    end
+    return gx, gy
+end
+
+function value_gradient_covariance!(gx, gy, k, x, y, ::IsotropicInput, α::Real = 1, β::Real = 0)
+    r = difference(x, y)
+    r² = sum(abs2, r)
+    k1 = ForwardDiff.derivative(k, r²) # IDEA: this computation could be pooled with the gradient computation
+    @. gx = α * 2k1 * r + β * gx
+    @. gy = -α * 2k1 * r + β * gy
+    return gx, gy
+end
+
+function value_gradient_covariance!(gx, gy, k, x, y, ::DotProductInput, α::Real = 1, β::Real = 0)
+    xy = dot(x, y)
+    k1 = ForwardDiff.derivative(k, xy) # IDEA: this computation could be pooled with the gradient computation
+    @. gx = α * k1 * y + β * gx
+    @. gy = α * k1 * x + β * gy
+    return gx, gy
+end
+
+function value_gradient_covariance!(gx, gy, k, x, y, ::StationaryLinearFunctionalInput, α::Real = 1, β::Real = 0)
+    cr = dot(k.c, difference(x, y))
+    k1 = ForwardDiff.derivative(k, cr) # IDEA: this computation could be pooled with the gradient computation
+    @. gx = α * k1 * k.c + β * gx
+    @. gy = -α * k1 * k.c + β * gy
+    return gx, gy
+end
+
 ################################################################################
 # for 1d inputs
 struct DerivativeKernel{T, K} <: AbstractKernel{T}
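The `IsotropicInput` method above uses the chain rule for k(x, y) = f(|x − y|²), namely ∇ₓ k = 2 f′(r²)(x − y) and ∇_y k = −∇ₓ k, with `α` and `β` acting as axpy-style scalings of the new contribution and the existing buffer contents. A self-contained check of that identity with throwaway functions (not the package's API):

```julia
# check of the identity used by the IsotropicInput specialization:
# for k(x, y) = f(|x - y|²):  ∇ₓ k = 2 f'(r²) (x - y)  and  ∇_y k = -2 f'(r²) (x - y)
using ForwardDiff, LinearAlgebra

f(r²) = exp(-r² / 2)                  # any smooth radial profile (EQ-like, for illustration)
k(x, y) = f(sum(abs2, x - y))

x, y = randn(3), randn(3)
r = x - y
g = 2 * ForwardDiff.derivative(f, dot(r, r)) * r

ForwardDiff.gradient(z -> k(z, y), x) ≈  g   # true
ForwardDiff.gradient(z -> k(x, z), y) ≈ -g   # true
```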
diff --git a/src/gradient_algebra.jl b/src/gradient_algebra.jl
index 1df7b93..f30862e 100644
--- a/src/gradient_algebra.jl
+++ b/src/gradient_algebra.jl
@@ -35,39 +35,33 @@ function gradient_kernel(k::Product, x, y, ::GenericInput)
     gradient_kernel!(W, k, x, y, GenericInput())
 end
 
-# IDEA: could have kernel element for heterogeneous product kernels, problem: would need to
-# pre-allocate storage for Jacobian matrix or be allocating
-# struct ProductGradientKernelElement{T, K, X, Y} <: Factorization{T}
-#     k::K
-#     x::X
-#     y::Y
-# end
 function gradient_kernel!(W::Woodbury, k::Product, x::AbstractVector, y::AbstractVector, ::GenericInput = input_trait(k))
-    A = W.A # this is a LazyMatrixSum of LazyMatrixProducts
-    prod_k_j = k(x, y)
-
-    # k_vec(x, y) = [h(x, y) for h in k.args] # include in loop
-    # ForwardDiff.jacobian!(W.U', z->k_vec(z, y), x) # this is actually less allocating than the gradient! option
-    # ForwardDiff.jacobian!(W.V, z->k_vec(x, z), y)
-    # GradientConfig() # for generic version, this could be pre-computed for efficiency gains
     r = length(k.args)
-    for i in 1:r # parallelize this?
-        h, H = k.args[i], A.args[i]
-        hxy = h(x, y)
-        D = H.args[1]
-        D.diag .= prod_k_j / hxy
-        # input_trait(h) could be pre-computed, or should not be passed here, because the factors might be composite kernels themselves
-        H.args[2] = gradient_kernel!(H.args[2], h, x, y, input_trait(h))
-
-        ui, vi = @views W.U[:, i], W.V[i, :]
-        ForwardDiff.gradient!(ui, z->h(z, y), x) # these are bottlenecks
-        ForwardDiff.gradient!(vi, z->h(x, z), y) # TODO: replace by value_gradient_covariance!
-        @. ui *= prod_k_j / hxy
-        @. vi /= hxy
+    A = W.A # this is a LazyMatrixSum of LazyMatrixProducts
+    prod_k_xy = k(x, y)
+    if !iszero(prod_k_xy) # main case
+        for i in 1:r # parallelize this?
+            h, H = k.args[i], A.args[i]
+            ui, vi = @views W.U[:, i], W.V[i, :]
+            product_gradient_kernel_helper!(h, H, prod_k_xy, ui, vi, x, y)
+        end
+    else # requires less efficient O(dr²) special case if prod_k_xy is zero, for now, throw error
+        throw(DomainError("GradientKernel of Product not supported for products that are zero, since it would require a O(dr²) special case."))
     end
     return W
 end
 
+@inline function product_gradient_kernel_helper!(ki, Ki, prod_k_xy, ui, vi, x, y, IT::InputTrait = input_trait(ki))
+    ki_xy = ki(x, y)
+    D = Ki.args[1]
+    @. D.diag = prod_k_xy / ki_xy
+    Ki.args[2] = gradient_kernel!(Ki.args[2], ki, x, y, IT)
+    value_gradient_covariance!(ui, vi, ki, x, y, IT)
+    @. ui *= prod_k_xy / ki_xy
+    @. vi /= ki_xy
+    return Ki
+end
+
 ############################# Separable Product ################################
 # for product kernel with generic input
 function allocate_gradient_kernel(k::SeparableProduct, x::AbstractVector{<:Number},
@@ -90,16 +84,16 @@ end
 function gradient_kernel!(W::Woodbury, k::SeparableProduct, x::AbstractVector, y::AbstractVector, ::GenericInput = input_trait(k))
     A = W.A # this is a LazyMatrixProducts of Diagonals
-    prod_k_j = k(x, y)
+    prod_k_xy = k(x, y)
     D, H = A.args # first is scaling matrix by leave_one_out_products, second is diagonal of derivative kernels
     for (i, ki) in enumerate(k.args)
         xi, yi = x[i], y[i]
         kixy = ki(xi, yi)
-        # D[i, i] = prod_k_j / kixy
+        # D[i, i] = prod_k_xy / kixy
         D[i, i] = ki(xi, yi)
         W.U[i, i] = ForwardDiff.derivative(z->ki(z, yi), xi)
         W.V[i, i] = ForwardDiff.derivative(z->ki(xi, z), yi)
-        W.U[i, i] *= prod_k_j / kixy
+        W.U[i, i] *= prod_k_xy / kixy
         W.V[i, i] /= kixy
         H[i, i] = DerivativeKernel(ki)(xi, yi)
     end
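The Woodbury matrix that `gradient_kernel!` fills for a `Product` encodes the product rule for the mixed second derivatives: diagonally scaled Hessian blocks for each factor plus a rank-r correction built from the cross gradients, which is what `product_gradient_kernel_helper!` writes into `D`, `Ki.args[2]`, `ui`, and `vi`. A self-contained check of the two-factor identity with throwaway functions (not the package's API):

```julia
# check of the two-factor product rule that the Woodbury representation encodes:
# ∂x∂y' (k1 k2) = k2 ∂x∂y'k1 + k1 ∂x∂y'k2 + ∇x k1 (∇y k2)' + ∇x k2 (∇y k1)'
using ForwardDiff, LinearAlgebra

k1(x, y) = exp(-sum(abs2, x - y) / 2)     # EQ-like factor
k2(x, y) = (dot(x, y) + 1)^2              # quadratic (dot-product) factor
k(x, y)  = k1(x, y) * k2(x, y)

x, y = randn(3), randn(3)
dxdy(f) = ForwardDiff.jacobian(yy -> ForwardDiff.gradient(xx -> f(xx, yy), x), y)
gx(f) = ForwardDiff.gradient(xx -> f(xx, y), x)
gy(f) = ForwardDiff.gradient(yy -> f(x, yy), y)

lhs = dxdy(k)
rhs = k2(x, y) * dxdy(k1) + k1(x, y) * dxdy(k2) + gx(k1) * gy(k2)' + gx(k2) * gy(k1)'
lhs ≈ rhs   # true
```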
diff --git a/src/properties.jl b/src/properties.jl
index 90fd9bf..4e05215 100644
--- a/src/properties.jl
+++ b/src/properties.jl
@@ -42,15 +42,16 @@ input_trait(::IsotropicKernel) = IsotropicInput()
 input_trait(::Union{Dot, ExponentialDot}) = DotProductInput()
 input_trait(P::Power) = input_trait(P.k)
+input_trait(S::Union{Product, Sum}) = S.input_trait
 # special treatment for constant kernel, since it can function for any input
-function input_trait(S::ProductsAndSums)
-    i = findfirst(x->!isa(x, Constant), S.args)
+function sum_and_product_input_trait(args)
+    i = findfirst(x->!isa(x, Constant), args)
     if isnothing(i) # all constant kernels
         return IsotropicInput()
     else
-        trait = input_trait(S.args[i]) # first non-constant kernel
-        for j in i+1:length(S.args)
-            k = S.args[j]
+        trait = input_trait(args[i]) # first non-constant kernel
+        for j in i+1:length(args)
+            k = args[j]
             if k isa Constant # ignore constants, since they can function as any input type
                 continue
             elseif input_trait(k) != trait # if the non-constant kernels don't have the same input type,
diff --git a/test/gradient_algebra.jl b/test/gradient_algebra.jl
index 13ba819..763bbca 100644
--- a/test/gradient_algebra.jl
+++ b/test/gradient_algebra.jl
@@ -31,7 +31,6 @@ const AbstractMatOrFac = Union{AbstractMatrix, Factorization}
 end
 
 @testset "gradient algebra" begin
-    # TODO: test for ValueGradientKernel
     # test data
     d, n = 3, 7
     X = randn(d, n)
@@ -48,34 +47,26 @@ end
     @test MK ≈ Mkk + Mkh
 
     # sum of heterogeneous kernel types
-    k = CovarianceFunctions.EQ() # EQ
-    h = CovarianceFunctions.Dot()^2 + 1 # quadratic
-    gk, gh = GradientKernel(k), GradientKernel(h)
-    G = GradientKernel(k + h)
-    K, Kk, Kh = gramian(G, X), gramian(gk, X), gramian(gh, X)
-
-    MK, Mkk, Mkh = Matrix(K), Matrix(Kk), Matrix(Kh)
-    @test K isa BlockFactorization
-    @test K.A[1, 1] isa LazyMatrixSum
-    @test MK ≈ Mkk + Mkh
-    a = randn(d*n)
-    @test K*a ≈ MK*a
-    @test K*a ≈ Mkk*a + Mkh*a
-    @test K*a ≈ Kk*a + Kh*a
-
+    eq = CovarianceFunctions.EQ() # EQ
+    line = CovarianceFunctions.Dot() # dot
+    quad = line^2 + 1
+    cosine = CovarianceFunctions.Cosine(randn(d))
     x, y = X[:, 1], X[:, 2]
-    # could add k+h
-    # TODO: Chained, VerticalScaling, Warped, SeparableSum, SeparableProduct
-    algebraic_combinations = [k * h, SeparableProduct(k, h, k)]
-    for kh in algebraic_combinations
-        # testing product gradient kernel
-        G = GradientKernel(kh)
-        kh_control = (x, y) -> (kh)(x, y)
-        G_control = GradientKernel(kh_control)
-        Gxy = G(x, y)
-        Gxy_control = G_control(x, y)
-        @test typeof(Gxy) != typeof(Gxy_control) # means special structure was discovered
-        @test Matrix(Gxy) ≈ Gxy_control
+    # TODO: Chained, VerticalScaling, Warped, SeparableSum
+    heterogeneous_combinations = [eq + line, eq + quad, eq * cosine, eq * quad,
+                                  SeparableProduct(eq, quad, eq), line * line]
+    gradient_kernels = [GradientKernel, ValueGradientKernel]
+    for gradient_kernel in gradient_kernels
+        for kh in heterogeneous_combinations
+            # compare the structured kernel element against a generic control
+            G = gradient_kernel(kh)
+            kh_control = (x, y) -> (kh)(x, y)
+            G_control = gradient_kernel(kh_control)
+            Gxy = G(x, y)
+            Gxy_control = G_control(x, y)
+            @test typeof(Gxy) != typeof(Gxy_control) # means special structure was discovered
+            @test Matrix(Gxy) ≈ Gxy_control
+        end
     end
 end
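To close, a hedged usage sketch that mirrors the updated test loop: build a structured gradient kernel for one heterogeneous product and compare it against a generic control that only sees an opaque function (names are taken from the test file above; their export status is an assumption):

```julia
using CovarianceFunctions
using CovarianceFunctions: EQ, Dot, GradientKernel

eq   = EQ()
quad = Dot()^2 + 1
G    = GradientKernel(eq * quad)   # structured d × d gradient-gradient covariance

x, y = randn(3), randn(3)
Gxy = G(x, y)                      # lazy element with Woodbury structure
# generic control, as in the test: wrapping the kernel in a plain function hides its structure
Matrix(Gxy) ≈ GradientKernel((a, b) -> eq(a, b) * quad(a, b))(x, y)   # expected to be true
```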