Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Covariance-related functions for general AbstractArray #599

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 37 additions & 21 deletions src/cov.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

# auxiliary functions

function _symmetrize!(a::DenseMatrix)
function _symmetrize!(a::AbstractMatrix)
m, n = size(a)
m == n || error("a must be a square matrix.")
for j = 1:n
Expand All @@ -15,20 +15,31 @@ function _symmetrize!(a::DenseMatrix)
return a
end

function _scalevars(x::DenseMatrix, s::AbstractWeights, dims::Int)
function _symmetrize!(a::AbstractSparseMatrix)
m, n = size(a)
m == n || error("a must be a square matrix.")
for (i,j,vl) in zip(findnz(a)...)
i > j || continue
vr = a[j,i]
a[i,j] = a[j,i] = middle(vl, vr)
end
return a
end

function _scalevars(x::AbstractMatrix, s::AbstractWeights, dims::Int)
dims == 1 ? Diagonal(s) * x :
dims == 2 ? x * Diagonal(s) :
error("dims should be either 1 or 2.")
end

## scatter matrix

_unscaled_covzm(x::DenseMatrix, dims::Colon) = unscaled_covzm(x)
_unscaled_covzm(x::DenseMatrix, dims::Integer) = unscaled_covzm(x, dims)
_unscaled_covzm(x::AbstractMatrix, dims::Colon) = unscaled_covzm(x)
_unscaled_covzm(x::AbstractMatrix, dims::Integer) = unscaled_covzm(x, dims)

_unscaled_covzm(x::DenseMatrix, wv::AbstractWeights, dims::Colon) =
_unscaled_covzm(x::AbstractMatrix, wv::AbstractWeights, dims::Colon) =
_symmetrize!(unscaled_covzm(x, _scalevars(x, wv)))
_unscaled_covzm(x::DenseMatrix, wv::AbstractWeights, dims::Integer) =
_unscaled_covzm(x::AbstractMatrix, wv::AbstractWeights, dims::Integer) =
_symmetrize!(unscaled_covzm(x, _scalevars(x, wv, dims), dims))

"""
Expand Down Expand Up @@ -75,30 +86,29 @@ Finally, bias correction is applied to the covariance calculation if
"""
function mean_and_cov end

scattermat(x::DenseMatrix; mean=nothing, dims::Int=1) =
scattermat(x::AbstractMatrix; mean=nothing, dims::Int=1) =
_scattermatm(x, mean, dims)
_scattermatm(x::DenseMatrix, ::Nothing, dims::Int) =
_scattermatm(x::AbstractMatrix, ::Nothing, dims::Int) =
_unscaled_covzm(x .- mean(x, dims=dims), dims)
_scattermatm(x::DenseMatrix, mean, dims::Int=1) =
_scattermatm(x::AbstractMatrix, mean, dims::Int=1) =
_unscaled_covzm(x .- mean, dims)

scattermat(x::DenseMatrix, wv::AbstractWeights; mean=nothing, dims::Int=1) =
scattermat(x::AbstractMatrix, wv::AbstractWeights; mean=nothing, dims::Int=1) =
_scattermatm(x, wv, mean, dims)
_scattermatm(x::DenseMatrix, wv::AbstractWeights, ::Nothing, dims::Int) =
_scattermatm(x::AbstractMatrix, wv::AbstractWeights, ::Nothing, dims::Int) =
_unscaled_covzm(x .- mean(x, wv, dims=dims), wv, dims)
_scattermatm(x::DenseMatrix, wv::AbstractWeights, mean, dims::Int) =
_scattermatm(x::AbstractMatrix, wv::AbstractWeights, mean, dims::Int) =
_unscaled_covzm(x .- mean, wv, dims)

## weighted cov
covm(x::DenseMatrix, mean, w::AbstractWeights, dims::Int=1;
covm(x::AbstractMatrix, mean, w::AbstractWeights, dims::Int=1;
corrected::DepBool=nothing) =
rmul!(scattermat(x, w, mean=mean, dims=dims), varcorrection(w, depcheck(:covm, corrected)))


cov(x::DenseMatrix, w::AbstractWeights, dims::Int=1; corrected::DepBool=nothing) =
cov(x::AbstractMatrix, w::AbstractWeights, dims::Int=1; corrected::DepBool=nothing) =
covm(x, mean(x, w, dims=dims), w, dims; corrected=depcheck(:cov, corrected))

function corm(x::DenseMatrix, mean, w::AbstractWeights, vardim::Int=1)
function corm(x::AbstractMatrix, mean, w::AbstractWeights, vardim::Int=1)
c = covm(x, mean, w, vardim; corrected=false)
s = stdm(x, w, mean, vardim; corrected=false)
cov2cor!(c, s)
Expand All @@ -110,14 +120,18 @@ end
Compute the Pearson correlation matrix of `X` along the dimension
`dims` with a weighting `w` .
"""
cor(x::DenseMatrix, w::AbstractWeights, dims::Int=1) =
cor(x::AbstractMatrix, w::AbstractWeights, dims::Int=1) =
corm(x, mean(x, w, dims=dims), w, dims)

function mean_and_cov(x::DenseMatrix, dims::Int=1; corrected::Bool=true)
function mean_and_cov(x::AbstractVector; corrected::Bool=true)
m = mean(x)
return m, covm(x, m, corrected=corrected)
end
function mean_and_cov(x::AbstractMatrix, dims::Int=1; corrected::Bool=true)
m = mean(x, dims=dims)
return m, covm(x, m, dims, corrected=corrected)
end
function mean_and_cov(x::DenseMatrix, wv::AbstractWeights, dims::Int=1;
function mean_and_cov(x::AbstractMatrix, wv::AbstractWeights, dims::Int=1;
corrected::DepBool=nothing)
m = mean(x, wv, dims=dims)
return m, cov(x, wv, dims; corrected=depcheck(:mean_and_cov, corrected))
Expand Down Expand Up @@ -148,8 +162,10 @@ standard deviations `s`.
function cor2cov!(C::AbstractMatrix, s::AbstractArray)
n = length(s)
size(C) == (n, n) || throw(DimensionMismatch("inconsistent dimensions"))
for i in CartesianIndices(size(C))
@inbounds C[i] *= s[i[1]] * s[i[2]]
@inbounds for i in CartesianIndices(size(C))
si = s[i[1]] * s[i[2]]
# the covariance is 0 when si==0, although C[i] is NaN in this case
C[i] = iszero(si) ? zero(eltype(C)) : C[i] * si
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In case eltype(C) isn't a concrete type:

Suggested change
C[i] = iszero(si) ? zero(eltype(C)) : C[i] * si
Ci = C[i]
C[i] = iszero(si) ? zero(Ci) : Ci * si

Can you explain a bit more what happens here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If one of the variables has zero variance, its covariance with any other variable is 0, but the correlation is undefined, and the result of cor would have NaN in that element. In that case Ci == NaN, si == 0, but the correct output is 0, not NaN.
For example,

X = [1:3 ones(3)]
cov(X) == [1 0; 0 0]
cor(X) == [1 NaN; NaN 1]
cor2cov(cor(X), std.(eachcol(X))) # previously [1 NaN; NaN 0], which is incorrect

end
return C
end
Expand Down
79 changes: 49 additions & 30 deletions test/cov.jl
Original file line number Diff line number Diff line change
@@ -1,13 +1,26 @@
using StatsBase
using LinearAlgebra, Random, Test
using SparseArrays

struct EmptyCovarianceEstimator <: CovarianceEstimator end

struct WrappedArray{T,N,A} <: AbstractArray{T,N}
a::A
WrappedArray(a::AbstractArray{T,N}) where {T,N} = new{T,N,typeof(a)}(a)
end
Base.size(w::WrappedArray) = size(w.a)
Base.getindex(w::WrappedArray{T,N}, I::Vararg{Int, N}) where {T,N} = getindex(w.a, I...)
Base.setindex!(w::WrappedArray{T,N}, v, I::Vararg{Int, N}) where {T,N} = setindex!(w.a, v, I...)

≈ₙ(x,y) = isapprox(x, y; nans=true)

@testset "StatsBase.Covariance" begin
weight_funcs = (weights, aweights, fweights, pweights)
array = randn(3, 8)
wrapped_array = WrappedArray(array)
sparse_array = sprandn(3, 8, 0.2)

@testset "$f" for f in weight_funcs
X = randn(3, 8)
@testset "$f,$(typeof(X))" for f in weight_funcs, X in (array, wrapped_array, sparse_array)

Z1 = X .- mean(X, dims = 1)
Z2 = X .- mean(X, dims = 2)
Expand Down Expand Up @@ -87,27 +100,32 @@ weight_funcs = (weights, aweights, fweights, pweights)
@testset "Mean and covariance" begin
(m, C) = mean_and_cov(X; corrected=false)
@test m == mean(X, dims=1)
@test C == cov(X, dims=1, corrected=false)
@test C cov(X, dims=1, corrected=false)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any reason to use rather than isequal?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Strict equality fails for sparse matrices because cov is specialized for SparseMatrixCSC in Statistics (https://github.com/JuliaLang/Statistics.jl/blob/b384104d35ff0e7cf311485607b177223ed72b9a/src/Statistics.jl#L1058), but mean_and_cov uses covm rather than cov.
I think if we want to achieve strict equality here, the fix should be in Statistics (specializing covm rather than cov)


(m, C) = mean_and_cov(X, 1; corrected=false)
@test m == mean(X, dims=1)
@test C == cov(X, dims=1, corrected = false)
@test C cov(X, dims=1, corrected = false)

(m, C) = mean_and_cov(X, 2; corrected=false)
@test m == mean(X, dims=2)
@test C == cov(X, dims=2, corrected = false)
@test C cov(X, dims=2, corrected = false)

(m, C) = mean_and_cov(X, wv1; corrected=false)
@test m == mean(X, wv1, dims=1)
@test C == cov(X, wv1, 1, corrected=false)
@test C cov(X, wv1, 1, corrected=false)

(m, C) = mean_and_cov(X, wv1, 1; corrected=false)
@test m == mean(X, wv1, dims=1)
@test C == cov(X, wv1, 1, corrected=false)
@test C cov(X, wv1, 1, corrected=false)

(m, C) = mean_and_cov(X, wv2, 2; corrected=false)
@test m == mean(X, wv2, dims=2)
@test C == cov(X, wv2, 2, corrected=false)
@test C ≈ cov(X, wv2, 2, corrected=false)

v = [X[:,i] for i in axes(X,2)]
(m, C) = mean_and_cov(v; corrected=false)
@test m == mean(v)
@test C ≈ cov(v, corrected=false)
end
@testset "Conversions" begin
std1 = std(X, wv1, 1; corrected=false)
Expand All @@ -120,10 +138,10 @@ weight_funcs = (weights, aweights, fweights, pweights)
cor2 = cor(X, wv2, 2)

@testset "cov2cor" begin
@test cov2cor(cov(X, dims = 1), std(X, dims = 1)) ≈ cor(X, dims = 1)
@test cov2cor(cov(X, dims = 2), std(X, dims = 2)) ≈ cor(X, dims = 2)
@test cov2cor(cov1, std1) ≈ cor1
@test cov2cor(cov2, std2) ≈ cor2
@test cov2cor(cov(X, dims = 1), std(X, dims = 1)) ≈ cor(X, dims = 1)
@test cov2cor(cov(X, dims = 2), std(X, dims = 2)) ≈ cor(X, dims = 2)
@test cov2cor(cov1, std1) ≈ cor1
@test cov2cor(cov2, std2) ≈ cor2
end
@testset "cor2cov" begin
@test cor2cov(cor(X, dims = 1), std(X, dims = 1)) ≈ cov(X, dims = 1)
Expand Down Expand Up @@ -158,30 +176,30 @@ weight_funcs = (weights, aweights, fweights, pweights)
@testset "Mean and covariance" begin
(m, C) = mean_and_cov(X; corrected=true)
@test m == mean(X, dims=1)
@test C == cov(X, dims=1, corrected = true)
@test C cov(X, dims=1, corrected = true)

(m, C) = mean_and_cov(X, 1; corrected=true)
@test m == mean(X, dims=1)
@test C == cov(X, dims=1, corrected = true)
@test C cov(X, dims=1, corrected = true)

(m, C) = mean_and_cov(X, 2; corrected=true)
@test m == mean(X, dims=2)
@test C == cov(X, dims=2, corrected = true)
@test C cov(X, dims=2, corrected = true)

if isa(wv1, Weights)
@test_throws ArgumentError mean_and_cov(X, wv1; corrected=true)
else
(m, C) = mean_and_cov(X, wv1; corrected=true)
@test m == mean(X, wv1, dims=1)
@test C == cov(X, wv1, 1; corrected=true)
@test C cov(X, wv1, 1; corrected=true)

(m, C) = mean_and_cov(X, wv1, 1; corrected=true)
@test m == mean(X, wv1, dims=1)
@test C == cov(X, wv1, 1; corrected=true)
@test C cov(X, wv1, 1; corrected=true)

(m, C) = mean_and_cov(X, wv2, 2; corrected=true)
@test m == mean(X, wv2, dims=2)
@test C == cov(X, wv2, 2; corrected=true)
@test C cov(X, wv2, 2; corrected=true)
end
end
@testset "Conversions" begin
Expand All @@ -196,25 +214,26 @@ weight_funcs = (weights, aweights, fweights, pweights)
cor2 = cor(X, wv2, 2)

@testset "cov2cor" begin
@test cov2cor(cov(X, dims = 1), std(X, dims = 1)) ≈ cor(X, dims = 1)
@test cov2cor(cov(X, dims = 2), std(X, dims = 2)) ≈ cor(X, dims = 2)
@test cov2cor(cov1, std1) ≈ cor1
@test cov2cor(cov2, std2) ≈ cor2
@test cov2cor(cov(X, dims = 1), std(X, dims = 1)) ≈ cor(X, dims = 1)
@test cov2cor(cov(X, dims = 2), std(X, dims = 2)) ≈ cor(X, dims = 2)
@test cov2cor(cov1, std1) ≈ cor1
@test cov2cor(cov2, std2) ≈ cor2
end

@testset "cov2cor!" begin
tmp_cov1 = copy(cov1)
@test !(tmp_cov1 ≈ cor1)
@test !(tmp_cov1 ≈ cor1)
StatsBase.cov2cor!(tmp_cov1, std1)
@test tmp_cov1 ≈ cor1
@test tmp_cov1 ≈ cor1

tmp_cov2 = copy(cov2)
@test !(tmp_cov2 ≈ cor2)
@test !(tmp_cov2 ≈ cor2)
StatsBase.cov2cor!(tmp_cov2, std2)
@test tmp_cov2 ≈ cor2
@test tmp_cov2 ≈ cor2
end

@testset "cor2cov" begin

@test cor2cov(cor(X, dims = 1), std(X, dims = 1)) ≈ cov(X, dims = 1)
@test cor2cov(cor(X, dims = 2), std(X, dims = 2)) ≈ cov(X, dims = 2)
@test cor2cov(cor1, std1) ≈ cov1
Expand All @@ -237,8 +256,8 @@ weight_funcs = (weights, aweights, fweights, pweights)
end

@testset "Correlation" begin
@test cor(X, f(ones(3)), 1) ≈ cor(X, dims = 1)
@test cor(X, f(ones(8)), 2) ≈ cor(X, dims = 2)
@test cor(X, f(ones(3)), 1) ≈ cor(X, dims = 1)
@test cor(X, f(ones(8)), 2) ≈ cor(X, dims = 2)

cov1 = cov(X, wv1, 1; corrected=false)
std1 = std(X, wv1, 1; corrected=false)
Expand All @@ -247,8 +266,8 @@ weight_funcs = (weights, aweights, fweights, pweights)
expected_cor1 = StatsBase.cov2cor!(cov1, std1)
expected_cor2 = StatsBase.cov2cor!(cov2, std2)

@test cor(X, wv1, 1) ≈ expected_cor1
@test cor(X, wv2, 2) ≈ expected_cor2
@test cor(X, wv1, 1) ≈ expected_cor1
@test cor(X, wv2, 2) ≈ expected_cor2
end

@testset "Abstract covariance estimation" begin
Expand Down