Faster Sparse Covariance Matrix (#22735)

* optimize sparse covariance matrix * test for optimized sparse covariance matrix * make spcov a function for the base method cov * fix unit test * use inbounds on for loop * better comments and even faster implementation * test for NaN and Inf * robust testing for NaN, Inf * NaN and Inf testing * fix corrected keyword * documentation about Inf * style * less allocations * parens to prevent overflow * simplify comments
JuliaLang · Aug 6, 2017 · ae17198 · ae17198 · nanosoldier · Aug 6, 2017
1 parent f047602
commit ae17198
Show file tree

Hide file tree

Showing 2 changed files with 146 additions and 0 deletions.
diff --git a/base/sparse/linalg.jl b/base/sparse/linalg.jl
@@ -941,3 +941,22 @@ end
 chol(A::SparseMatrixCSC) = error("Use cholfact() instead of chol() for sparse matrices.")
 lu(A::SparseMatrixCSC) = error("Use lufact() instead of lu() for sparse matrices.")
 eig(A::SparseMatrixCSC) = error("Use eigs() instead of eig() for sparse matrices.")
+
+function Base.cov(X::SparseMatrixCSC, vardim::Int=1; corrected::Bool=true)
+    a, b = size(X)
+    n, p = vardim == 1 ? (a, b) : (b, a)
+
+    # Cov(X) = E[(X-μ)'(X-μ)]
+    # = X'X - X'μ
+
+    # Compute X'X using sparse matrix operations
+    out = Matrix(Base.unscaled_covzm(X, vardim))
+
+    # Compute X'μ
+    sums = sum(X, vardim)
+    @inbounds for j in 1:p, i in 1:p
+        part2 = sums[i] * (sums[j] / n)
+        out[i,j] -= part2
+    end
+    return scale!(out, inv(n-Int(corrected)))
+end
diff --git a/test/sparse/sparse.jl b/test/sparse/sparse.jl
@@ -1843,3 +1843,130 @@ end
     B = A[5:-1:1, 5:-1:1]
     @test issymmetric(B)
 end
+
+# Faster covariance function for sparse matrices
+# Prevents densifying the input matrix when subtracting the mean
+# Test against dense implementation
+# PR https://github.com/JuliaLang/julia/pull/22735
+# Part of this test needed to be hacked due to the treatment
+# of Inf in sparse matrix algebra
+# https://github.com/JuliaLang/julia/issues/22921
+# The issue will be resolved in
+# https://github.com/JuliaLang/julia/issues/22733
+@testset "optimizing sparse covariance" begin
+    n = 10
+    p = 5
+    srand(1)
+    x_sparse = sprand(n, p, .50)
+    x_dense = convert(Matrix{Float64}, x_sparse)
+    @test cov(x_sparse, 1, corrected=true)  ≈ cov(x_dense, 1, corrected=true)
+    @test cov(x_sparse, 1, corrected=false) ≈ cov(x_dense, 1, corrected=false)
+    @test cov(x_sparse, 2, corrected=true)  ≈ cov(x_dense, 2, corrected=true)
+    @test cov(x_sparse, 2, corrected=false) ≈ cov(x_dense, 2, corrected=false)
+
+    # Test with NaN
+    x_sparse[1,1] = NaN
+    x_dense[1,1]  = NaN
+
+    cov_sparse = cov(x_sparse, 1, corrected=true)
+    cov_dense = cov(x_dense, 1, corrected=true)
+    @test cov_sparse[2:end, 2:end] ≈ cov_dense[2:end, 2:end]
+    @test isequal(cov_sparse[1:end, 1], cov_dense[1:end, 1])
+    @test isequal(cov_sparse[1, 1:end], cov_dense[1, 1:end])
+
+    cov_sparse = cov(x_sparse, 2, corrected=true)
+    cov_dense = cov(x_dense, 2, corrected=true)
+    @test cov_sparse[2:end, 2:end] ≈ cov_dense[2:end, 2:end]
+    @test isequal(cov_sparse[1:end, 1], cov_dense[1:end, 1])
+    @test isequal(cov_sparse[1, 1:end], cov_dense[1, 1:end])
+
+    cov_sparse = cov(x_sparse, 1, corrected=false)
+    cov_dense = cov(x_dense, 1, corrected=false)
+    @test cov_sparse[2:end, 2:end] ≈ cov_dense[2:end, 2:end]
+    @test isequal(cov_sparse[1:end, 1], cov_dense[1:end, 1])
+    @test isequal(cov_sparse[1, 1:end], cov_dense[1, 1:end])
+
+    cov_sparse = cov(x_sparse, 2, corrected=false)
+    cov_dense = cov(x_dense, 2, corrected=false)
+    @test cov_sparse[2:end, 2:end] ≈ cov_dense[2:end, 2:end]
+    @test isequal(cov_sparse[1:end, 1], cov_dense[1:end, 1])
+    @test isequal(cov_sparse[1, 1:end], cov_dense[1, 1:end])
+
+    # Test with Inf
+    x_sparse[1,1] = Inf
+    x_dense[1,1]  = Inf
+
+    cov_sparse = cov(x_sparse, 1, corrected=true)
+
+    # Sparse matrix algebra generates Inf, but
+    # dense matrix algebra generates NaN
+    # NaN  NaN           -Inf           -Inf          NaN
+    # NaN    0.124035       0.00830252    -0.0430049    0.021373
+    # -Inf    0.00830252     0.111628      -0.0149783    0.00773125
+    # -Inf   -0.0430049     -0.0149783      0.099782    -0.0496011
+    # NaN    0.021373       0.00773125    -0.0496011    0.126186
+
+    cov_sparse[isinf.(cov_sparse)] = NaN
+    cov_dense = cov(x_dense, 1, corrected=true)
+
+    # NaN  NaN           NaN           NaN          NaN
+    # NaN    0.124035      0.00830252   -0.0430049    0.021373
+    # NaN    0.00830252    0.111628     -0.0149783    0.00773125
+    # NaN   -0.0430049    -0.0149783     0.099782    -0.0496011
+    # NaN    0.021373      0.00773125   -0.0496011    0.126186
+
+    @test cov_sparse[2:end, 2:end] ≈ cov_dense[2:end, 2:end]
+    @test isequal(cov_sparse[1:end, 1], cov_dense[1:end, 1])
+    @test isequal(cov_sparse[1, 1:end], cov_dense[1, 1:end])
+
+    cov_sparse = cov(x_sparse, 2, corrected=true)
+    cov_sparse[isinf.(cov_sparse)] = NaN
+    cov_dense = cov(x_dense, 2, corrected=true)
+    @test cov_sparse[2:end, 2:end] ≈ cov_dense[2:end, 2:end]
+    @test isequal(cov_sparse[1:end, 1], cov_dense[1:end, 1])
+    @test isequal(cov_sparse[1, 1:end], cov_dense[1, 1:end])
+
+    cov_sparse = cov(x_sparse, 1, corrected=false)
+    cov_sparse[isinf.(cov_sparse)] = NaN
+    cov_dense = cov(x_dense, 1, corrected=false)
+    @test cov_sparse[2:end, 2:end] ≈ cov_dense[2:end, 2:end]
+    @test isequal(cov_sparse[1:end, 1], cov_dense[1:end, 1])
+    @test isequal(cov_sparse[1, 1:end], cov_dense[1, 1:end])
+
+    cov_sparse = cov(x_sparse, 2, corrected=false)
+    cov_sparse[isinf.(cov_sparse)] = NaN
+    cov_dense = cov(x_dense, 2, corrected=false)
+    @test cov_sparse[2:end, 2:end] ≈ cov_dense[2:end, 2:end]
+    @test isequal(cov_sparse[1:end, 1], cov_dense[1:end, 1])
+    @test isequal(cov_sparse[1, 1:end], cov_dense[1, 1:end])
+
+    # Test with NaN and Inf
+    x_sparse[2,1] = NaN
+    x_dense[2,1]  = NaN
+
+    cov_sparse = cov(x_sparse, 1, corrected=true)
+    cov_dense = cov(x_dense, 1, corrected=true)
+    @test cov_sparse[2:end, 2:end] ≈ cov_dense[2:end, 2:end]
+    @test isequal(cov_sparse[1:end, 1], cov_dense[1:end, 1])
+    @test isequal(cov_sparse[1, 1:end], cov_dense[1, 1:end])
+
+    cov_sparse = cov(x_sparse, 2, corrected=true)
+    cov_sparse[isinf.(cov_sparse)] = NaN
+    cov_dense = cov(x_dense, 2, corrected=true)
+    @test cov_sparse[3:end, 3:end] ≈ cov_dense[3:end, 3:end]
+    @test isequal(cov_sparse[1:end, 1:2], cov_dense[1:end, 1:2])
+    @test isequal(cov_sparse[1:2, 1:end], cov_dense[1:2, 1:end])
+
+    cov_sparse = cov(x_sparse, 1, corrected=false)
+    cov_dense = cov(x_dense, 1, corrected=false)
+    @test cov_sparse[2:end, 2:end] ≈ cov_dense[2:end, 2:end]
+    @test isequal(cov_sparse[1:end, 1], cov_dense[1:end, 1])
+    @test isequal(cov_sparse[1, 1:end], cov_dense[1, 1:end])
+
+    cov_sparse = cov(x_sparse, 2, corrected=false)
+    cov_sparse[isinf.(cov_sparse)] = NaN
+    cov_dense = cov(x_dense, 2, corrected=false)
+    @test cov_sparse[3:end, 3:end] ≈ cov_dense[3:end, 3:end]
+    @test isequal(cov_sparse[1:end, 1:2], cov_dense[1:end, 1:2])
+    @test isequal(cov_sparse[1:2, 1:end], cov_dense[1:2, 1:end])
+end