JuliaStats · yha · Sep 13, 2020 · Oct 15, 2020 · Oct 15, 2020 · nalimilan
diff --git a/src/cov.jl b/src/cov.jl
@@ -2,7 +2,7 @@
 
 # auxiliary functions
 
-function _symmetrize!(a::DenseMatrix)
+function _symmetrize!(a::AbstractMatrix)
     m, n = size(a)
     m == n || error("a must be a square matrix.")
     for j = 1:n
@@ -15,20 +15,31 @@ function _symmetrize!(a::DenseMatrix)
     return a
 end
 
-function _scalevars(x::DenseMatrix, s::AbstractWeights, dims::Int)
+function _symmetrize!(a::AbstractSparseMatrix)
+    m, n = size(a)
+    m == n || error("a must be a square matrix.")
+    for (i,j,vl) in zip(findnz(a)...)
+        i > j || continue
+        vr = a[j,i]
+        a[i,j] = a[j,i] = middle(vl, vr)
+    end
+    return a
+end
+
+function _scalevars(x::AbstractMatrix, s::AbstractWeights, dims::Int)
     dims == 1 ? Diagonal(s) * x :
     dims == 2 ? x * Diagonal(s) :
     error("dims should be either 1 or 2.")
 end
 
 ## scatter matrix
 
-_unscaled_covzm(x::DenseMatrix, dims::Colon)   = unscaled_covzm(x)
-_unscaled_covzm(x::DenseMatrix, dims::Integer) = unscaled_covzm(x, dims)
+_unscaled_covzm(x::AbstractMatrix, dims::Colon)   = unscaled_covzm(x)
+_unscaled_covzm(x::AbstractMatrix, dims::Integer) = unscaled_covzm(x, dims)
 
-_unscaled_covzm(x::DenseMatrix, wv::AbstractWeights, dims::Colon)   =
+_unscaled_covzm(x::AbstractMatrix, wv::AbstractWeights, dims::Colon)   =
     _symmetrize!(unscaled_covzm(x, _scalevars(x, wv)))
-_unscaled_covzm(x::DenseMatrix, wv::AbstractWeights, dims::Integer) =
+_unscaled_covzm(x::AbstractMatrix, wv::AbstractWeights, dims::Integer) =
     _symmetrize!(unscaled_covzm(x, _scalevars(x, wv, dims), dims))
 
 """
@@ -75,30 +86,29 @@ Finally, bias correction is applied to the covariance calculation if
 """
 function mean_and_cov end
 
-scattermat(x::DenseMatrix; mean=nothing, dims::Int=1) =
+scattermat(x::AbstractMatrix; mean=nothing, dims::Int=1) =
     _scattermatm(x, mean, dims)
-_scattermatm(x::DenseMatrix, ::Nothing, dims::Int) =
+_scattermatm(x::AbstractMatrix, ::Nothing, dims::Int) =
     _unscaled_covzm(x .- mean(x, dims=dims), dims)
-_scattermatm(x::DenseMatrix, mean, dims::Int=1) =
+_scattermatm(x::AbstractMatrix, mean, dims::Int=1) =
     _unscaled_covzm(x .- mean, dims)
 
-scattermat(x::DenseMatrix, wv::AbstractWeights; mean=nothing, dims::Int=1) =
+scattermat(x::AbstractMatrix, wv::AbstractWeights; mean=nothing, dims::Int=1) =
     _scattermatm(x, wv, mean, dims)
-_scattermatm(x::DenseMatrix, wv::AbstractWeights, ::Nothing, dims::Int) =
+_scattermatm(x::AbstractMatrix, wv::AbstractWeights, ::Nothing, dims::Int) =
     _unscaled_covzm(x .- mean(x, wv, dims=dims), wv, dims)
-_scattermatm(x::DenseMatrix, wv::AbstractWeights, mean, dims::Int) =
+_scattermatm(x::AbstractMatrix, wv::AbstractWeights, mean, dims::Int) =
     _unscaled_covzm(x .- mean, wv, dims)
 
 ## weighted cov
-covm(x::DenseMatrix, mean, w::AbstractWeights, dims::Int=1;
+covm(x::AbstractMatrix, mean, w::AbstractWeights, dims::Int=1;
      corrected::DepBool=nothing) =
     rmul!(scattermat(x, w, mean=mean, dims=dims), varcorrection(w, depcheck(:covm, corrected)))
 
-
-cov(x::DenseMatrix, w::AbstractWeights, dims::Int=1; corrected::DepBool=nothing) =
+cov(x::AbstractMatrix, w::AbstractWeights, dims::Int=1; corrected::DepBool=nothing) =
     covm(x, mean(x, w, dims=dims), w, dims; corrected=depcheck(:cov, corrected))
 
-function corm(x::DenseMatrix, mean, w::AbstractWeights, vardim::Int=1)
+function corm(x::AbstractMatrix, mean, w::AbstractWeights, vardim::Int=1)
     c = covm(x, mean, w, vardim; corrected=false)
     s = stdm(x, w, mean, vardim; corrected=false)
     cov2cor!(c, s)
@@ -110,14 +120,18 @@ end
 Compute the Pearson correlation matrix of `X` along the dimension
 `dims` with a weighting `w` .
 """
-cor(x::DenseMatrix, w::AbstractWeights, dims::Int=1) =
+cor(x::AbstractMatrix, w::AbstractWeights, dims::Int=1) =
     corm(x, mean(x, w, dims=dims), w, dims)
 
-function mean_and_cov(x::DenseMatrix, dims::Int=1; corrected::Bool=true)
+function mean_and_cov(x::AbstractVector; corrected::Bool=true)
+    m = mean(x)
+    return m, covm(x, m, corrected=corrected)
+end
+function mean_and_cov(x::AbstractMatrix, dims::Int=1; corrected::Bool=true)
     m = mean(x, dims=dims)
     return m, covm(x, m, dims, corrected=corrected)
 end
-function mean_and_cov(x::DenseMatrix, wv::AbstractWeights, dims::Int=1;
+function mean_and_cov(x::AbstractMatrix, wv::AbstractWeights, dims::Int=1;
                       corrected::DepBool=nothing)
     m = mean(x, wv, dims=dims)
     return m, cov(x, wv, dims; corrected=depcheck(:mean_and_cov, corrected))
@@ -148,8 +162,10 @@ standard deviations `s`.
 function cor2cov!(C::AbstractMatrix, s::AbstractArray)
     n = length(s)
     size(C) == (n, n) || throw(DimensionMismatch("inconsistent dimensions"))
-    for i in CartesianIndices(size(C))
-        @inbounds C[i] *= s[i[1]] * s[i[2]]
+    @inbounds for i in CartesianIndices(size(C))
+        si = s[i[1]] * s[i[2]]
+        # the covariance is 0 when si==0, although C[i] is NaN in this case
+        C[i] = iszero(si) ? zero(eltype(C)) : C[i] * si
-        C[i] = iszero(si) ? zero(eltype(C)) : C[i] * si
+        Ci = C[i]
+        C[i] = iszero(si) ? zero(Ci) : Ci * si
-        C[i] = iszero(si) ? zero(eltype(C)) : C[i] * si
+        Ci = C[i]
+        C[i] = iszero(si) ? zero(Ci) : Ci * si
     end
     return C
 end

diff --git a/test/cov.jl b/test/cov.jl
@@ -1,13 +1,26 @@
 using StatsBase
 using LinearAlgebra, Random, Test
+using SparseArrays
 
 struct EmptyCovarianceEstimator <: CovarianceEstimator end
 
+struct WrappedArray{T,N,A} <: AbstractArray{T,N}
+    a::A
+    WrappedArray(a::AbstractArray{T,N}) where {T,N} = new{T,N,typeof(a)}(a)
+end
+Base.size(w::WrappedArray) = size(w.a)
+Base.getindex(w::WrappedArray{T,N}, I::Vararg{Int, N}) where {T,N} = getindex(w.a, I...)
+Base.setindex!(w::WrappedArray{T,N}, v, I::Vararg{Int, N}) where {T,N} = setindex!(w.a, v, I...)
+
+≈ₙ(x,y) = isapprox(x, y; nans=true)
+
 @testset "StatsBase.Covariance" begin
 weight_funcs = (weights, aweights, fweights, pweights)
+array = randn(3, 8)
+wrapped_array = WrappedArray(array)
+sparse_array = sprandn(3, 8, 0.2)
 
-@testset "$f" for f in weight_funcs
-    X = randn(3, 8)
+@testset "$f,$(typeof(X))" for f in weight_funcs, X in (array, wrapped_array, sparse_array)
 
     Z1 = X .- mean(X, dims = 1)
     Z2 = X .- mean(X, dims = 2)
@@ -87,27 +100,32 @@ weight_funcs = (weights, aweights, fweights, pweights)
         @testset "Mean and covariance" begin
             (m, C) = mean_and_cov(X; corrected=false)
             @test m == mean(X, dims=1)
-            @test C == cov(X, dims=1, corrected=false)
+            @test C ≈ cov(X, dims=1, corrected=false)
 
             (m, C) = mean_and_cov(X, 1; corrected=false)
             @test m == mean(X, dims=1)
-            @test C == cov(X, dims=1, corrected = false)
+            @test C ≈ cov(X, dims=1, corrected = false)
 
             (m, C) = mean_and_cov(X, 2; corrected=false)
             @test m == mean(X, dims=2)
-            @test C == cov(X, dims=2, corrected = false)
+            @test C ≈ cov(X, dims=2, corrected = false)
 
             (m, C) = mean_and_cov(X, wv1; corrected=false)
             @test m == mean(X, wv1, dims=1)
-            @test C == cov(X, wv1, 1, corrected=false)
+            @test C ≈ cov(X, wv1, 1, corrected=false)
 
             (m, C) = mean_and_cov(X, wv1, 1; corrected=false)
             @test m == mean(X, wv1, dims=1)
-            @test C == cov(X, wv1, 1, corrected=false)
+            @test C ≈ cov(X, wv1, 1, corrected=false)
 
             (m, C) = mean_and_cov(X, wv2, 2; corrected=false)
             @test m == mean(X, wv2, dims=2)
-            @test C == cov(X, wv2, 2, corrected=false)
+            @test C ≈ cov(X, wv2, 2, corrected=false)
+
+            v = [X[:,i] for i in axes(X,2)]
+            (m, C) = mean_and_cov(v; corrected=false)
+            @test m == mean(v)
+            @test C ≈ cov(v, corrected=false)
         end
         @testset "Conversions" begin
             std1 = std(X, wv1, 1; corrected=false)
@@ -120,10 +138,10 @@ weight_funcs = (weights, aweights, fweights, pweights)
             cor2 = cor(X, wv2, 2)
 
             @testset "cov2cor" begin
-                @test cov2cor(cov(X, dims = 1), std(X, dims = 1)) ≈ cor(X, dims = 1)
-                @test cov2cor(cov(X, dims = 2), std(X, dims = 2)) ≈ cor(X, dims = 2)
-                @test cov2cor(cov1, std1) ≈ cor1
-                @test cov2cor(cov2, std2) ≈ cor2
+                @test cov2cor(cov(X, dims = 1), std(X, dims = 1)) ≈ₙ cor(X, dims = 1)
+                @test cov2cor(cov(X, dims = 2), std(X, dims = 2)) ≈ₙ cor(X, dims = 2)
+                @test cov2cor(cov1, std1) ≈ₙ cor1
+                @test cov2cor(cov2, std2) ≈ₙ cor2
             end
             @testset "cor2cov" begin
                 @test cor2cov(cor(X, dims = 1), std(X, dims = 1)) ≈ cov(X, dims = 1)
@@ -158,30 +176,30 @@ weight_funcs = (weights, aweights, fweights, pweights)
         @testset "Mean and covariance" begin
             (m, C) = mean_and_cov(X; corrected=true)
             @test m == mean(X, dims=1)
-            @test C == cov(X, dims=1, corrected = true)
+            @test C ≈ cov(X, dims=1, corrected = true)
 
             (m, C) = mean_and_cov(X, 1; corrected=true)
             @test m == mean(X, dims=1)
-            @test C == cov(X, dims=1, corrected = true)
+            @test C ≈ cov(X, dims=1, corrected = true)
 
             (m, C) = mean_and_cov(X, 2; corrected=true)
             @test m == mean(X, dims=2)
-            @test C == cov(X, dims=2, corrected = true)
+            @test C ≈ cov(X, dims=2, corrected = true)
 
             if isa(wv1, Weights)
                 @test_throws ArgumentError mean_and_cov(X, wv1; corrected=true)
             else
                 (m, C) = mean_and_cov(X, wv1; corrected=true)
                 @test m == mean(X, wv1, dims=1)
-                @test C == cov(X, wv1, 1; corrected=true)
+                @test C ≈ cov(X, wv1, 1; corrected=true)
 
                 (m, C) = mean_and_cov(X, wv1, 1; corrected=true)
                 @test m == mean(X, wv1, dims=1)
-                @test C == cov(X, wv1, 1; corrected=true)
+                @test C ≈ cov(X, wv1, 1; corrected=true)
 
                 (m, C) = mean_and_cov(X, wv2, 2; corrected=true)
                 @test m == mean(X, wv2, dims=2)
-                @test C == cov(X, wv2, 2; corrected=true)
+                @test C ≈ cov(X, wv2, 2; corrected=true)
             end
         end
         @testset "Conversions" begin
@@ -196,25 +214,26 @@ weight_funcs = (weights, aweights, fweights, pweights)
                 cor2 = cor(X, wv2, 2)
 
                 @testset "cov2cor" begin
-                    @test cov2cor(cov(X, dims = 1), std(X, dims = 1)) ≈ cor(X, dims = 1)
-                    @test cov2cor(cov(X, dims = 2), std(X, dims = 2)) ≈ cor(X, dims = 2)
-                    @test cov2cor(cov1, std1) ≈ cor1
-                    @test cov2cor(cov2, std2) ≈ cor2
+                    @test cov2cor(cov(X, dims = 1), std(X, dims = 1)) ≈ₙ cor(X, dims = 1)
+                    @test cov2cor(cov(X, dims = 2), std(X, dims = 2)) ≈ₙ cor(X, dims = 2)
+                    @test cov2cor(cov1, std1) ≈ₙ cor1
+                    @test cov2cor(cov2, std2) ≈ₙ cor2
                 end
 
                 @testset "cov2cor!" begin
                     tmp_cov1 = copy(cov1)
-                    @test !(tmp_cov1 ≈ cor1)
+                    @test !(tmp_cov1 ≈ₙ cor1)
                     StatsBase.cov2cor!(tmp_cov1, std1)
-                    @test tmp_cov1 ≈ cor1
+                    @test tmp_cov1 ≈ₙ cor1
 
                     tmp_cov2 = copy(cov2)
-                    @test !(tmp_cov2 ≈ cor2)
+                    @test !(tmp_cov2 ≈ₙ cor2)
                     StatsBase.cov2cor!(tmp_cov2, std2)
-                    @test tmp_cov2 ≈ cor2
+                    @test tmp_cov2 ≈ₙ cor2
                 end
 
                 @testset "cor2cov" begin
+
                     @test cor2cov(cor(X, dims = 1), std(X, dims = 1)) ≈ cov(X, dims = 1)
                     @test cor2cov(cor(X, dims = 2), std(X, dims = 2)) ≈ cov(X, dims = 2)
                     @test cor2cov(cor1, std1) ≈ cov1
@@ -237,8 +256,8 @@ weight_funcs = (weights, aweights, fweights, pweights)
     end
 
     @testset "Correlation" begin
-        @test cor(X, f(ones(3)), 1) ≈ cor(X, dims = 1)
-        @test cor(X, f(ones(8)), 2) ≈ cor(X, dims = 2)
+        @test cor(X, f(ones(3)), 1) ≈ₙ cor(X, dims = 1)
+        @test cor(X, f(ones(8)), 2) ≈ₙ cor(X, dims = 2)
 
         cov1 = cov(X, wv1, 1; corrected=false)
         std1 = std(X, wv1, 1; corrected=false)
@@ -247,8 +266,8 @@ weight_funcs = (weights, aweights, fweights, pweights)
         expected_cor1 = StatsBase.cov2cor!(cov1, std1)
         expected_cor2 = StatsBase.cov2cor!(cov2, std2)
 
-        @test cor(X, wv1, 1) ≈ expected_cor1
-        @test cor(X, wv2, 2) ≈ expected_cor2
+        @test cor(X, wv1, 1) ≈ₙ expected_cor1
+        @test cor(X, wv2, 2) ≈ₙ expected_cor2
     end
 
     @testset "Abstract covariance estimation" begin