Skip to content

Commit

Permalink
Faster Sparse Covariance Matrix (#22735)
Browse files Browse the repository at this point in the history
* optimize sparse covariance matrix

* test for optimized sparse covariance matrix

* make spcov a function for the base method cov

* fix unit test

* use inbounds on for loop

* better comments and even faster implementation

* test for NaN and Inf

* robust testing for NaN, Inf

* NaN and Inf testing

* fix corrected keyword

* documentation about Inf

* style

* less allocations

* parens to prevent overflow

* simplify comments
  • Loading branch information
jeffwong authored and andreasnoack committed Aug 6, 2017
1 parent f047602 commit ae17198
Show file tree
Hide file tree
Showing 2 changed files with 146 additions and 0 deletions.
19 changes: 19 additions & 0 deletions base/sparse/linalg.jl
Original file line number Diff line number Diff line change
Expand Up @@ -941,3 +941,22 @@ end
chol(A::SparseMatrixCSC) = error("Use cholfact() instead of chol() for sparse matrices.")
lu(A::SparseMatrixCSC) = error("Use lufact() instead of lu() for sparse matrices.")
eig(A::SparseMatrixCSC) = error("Use eigs() instead of eig() for sparse matrices.")

function Base.cov(X::SparseMatrixCSC, vardim::Int=1; corrected::Bool=true)
a, b = size(X)
n, p = vardim == 1 ? (a, b) : (b, a)

# Cov(X) = E[(X-μ)'(X-μ)]
# = X'X - X'μ

# Compute X'X using sparse matrix operations
out = Matrix(Base.unscaled_covzm(X, vardim))

# Compute X'μ
sums = sum(X, vardim)
@inbounds for j in 1:p, i in 1:p
part2 = sums[i] * (sums[j] / n)
out[i,j] -= part2
end
return scale!(out, inv(n-Int(corrected)))
end
127 changes: 127 additions & 0 deletions test/sparse/sparse.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1843,3 +1843,130 @@ end
B = A[5:-1:1, 5:-1:1]
@test issymmetric(B)
end

# Faster covariance function for sparse matrices
# Prevents densifying the input matrix when subtracting the mean
# Test against dense implementation
# PR https://github.com/JuliaLang/julia/pull/22735
# Part of this test needed to be hacked due to the treatment
# of Inf in sparse matrix algebra
# https://github.com/JuliaLang/julia/issues/22921
# The issue will be resolved in
# https://github.com/JuliaLang/julia/issues/22733
@testset "optimizing sparse covariance" begin
n = 10
p = 5
srand(1)
x_sparse = sprand(n, p, .50)
x_dense = convert(Matrix{Float64}, x_sparse)
@test cov(x_sparse, 1, corrected=true) cov(x_dense, 1, corrected=true)
@test cov(x_sparse, 1, corrected=false) cov(x_dense, 1, corrected=false)
@test cov(x_sparse, 2, corrected=true) cov(x_dense, 2, corrected=true)
@test cov(x_sparse, 2, corrected=false) cov(x_dense, 2, corrected=false)

# Test with NaN
x_sparse[1,1] = NaN
x_dense[1,1] = NaN

cov_sparse = cov(x_sparse, 1, corrected=true)
cov_dense = cov(x_dense, 1, corrected=true)
@test cov_sparse[2:end, 2:end] cov_dense[2:end, 2:end]
@test isequal(cov_sparse[1:end, 1], cov_dense[1:end, 1])
@test isequal(cov_sparse[1, 1:end], cov_dense[1, 1:end])

cov_sparse = cov(x_sparse, 2, corrected=true)
cov_dense = cov(x_dense, 2, corrected=true)
@test cov_sparse[2:end, 2:end] cov_dense[2:end, 2:end]
@test isequal(cov_sparse[1:end, 1], cov_dense[1:end, 1])
@test isequal(cov_sparse[1, 1:end], cov_dense[1, 1:end])

cov_sparse = cov(x_sparse, 1, corrected=false)
cov_dense = cov(x_dense, 1, corrected=false)
@test cov_sparse[2:end, 2:end] cov_dense[2:end, 2:end]
@test isequal(cov_sparse[1:end, 1], cov_dense[1:end, 1])
@test isequal(cov_sparse[1, 1:end], cov_dense[1, 1:end])

cov_sparse = cov(x_sparse, 2, corrected=false)
cov_dense = cov(x_dense, 2, corrected=false)
@test cov_sparse[2:end, 2:end] cov_dense[2:end, 2:end]
@test isequal(cov_sparse[1:end, 1], cov_dense[1:end, 1])
@test isequal(cov_sparse[1, 1:end], cov_dense[1, 1:end])

# Test with Inf
x_sparse[1,1] = Inf
x_dense[1,1] = Inf

cov_sparse = cov(x_sparse, 1, corrected=true)

# Sparse matrix algebra generates Inf, but
# dense matrix algebra generates NaN
# NaN NaN -Inf -Inf NaN
# NaN 0.124035 0.00830252 -0.0430049 0.021373
# -Inf 0.00830252 0.111628 -0.0149783 0.00773125
# -Inf -0.0430049 -0.0149783 0.099782 -0.0496011
# NaN 0.021373 0.00773125 -0.0496011 0.126186

cov_sparse[isinf.(cov_sparse)] = NaN
cov_dense = cov(x_dense, 1, corrected=true)

# NaN NaN NaN NaN NaN
# NaN 0.124035 0.00830252 -0.0430049 0.021373
# NaN 0.00830252 0.111628 -0.0149783 0.00773125
# NaN -0.0430049 -0.0149783 0.099782 -0.0496011
# NaN 0.021373 0.00773125 -0.0496011 0.126186

@test cov_sparse[2:end, 2:end] cov_dense[2:end, 2:end]
@test isequal(cov_sparse[1:end, 1], cov_dense[1:end, 1])
@test isequal(cov_sparse[1, 1:end], cov_dense[1, 1:end])

cov_sparse = cov(x_sparse, 2, corrected=true)
cov_sparse[isinf.(cov_sparse)] = NaN
cov_dense = cov(x_dense, 2, corrected=true)
@test cov_sparse[2:end, 2:end] cov_dense[2:end, 2:end]
@test isequal(cov_sparse[1:end, 1], cov_dense[1:end, 1])
@test isequal(cov_sparse[1, 1:end], cov_dense[1, 1:end])

cov_sparse = cov(x_sparse, 1, corrected=false)
cov_sparse[isinf.(cov_sparse)] = NaN
cov_dense = cov(x_dense, 1, corrected=false)
@test cov_sparse[2:end, 2:end] cov_dense[2:end, 2:end]
@test isequal(cov_sparse[1:end, 1], cov_dense[1:end, 1])
@test isequal(cov_sparse[1, 1:end], cov_dense[1, 1:end])

cov_sparse = cov(x_sparse, 2, corrected=false)
cov_sparse[isinf.(cov_sparse)] = NaN
cov_dense = cov(x_dense, 2, corrected=false)
@test cov_sparse[2:end, 2:end] cov_dense[2:end, 2:end]
@test isequal(cov_sparse[1:end, 1], cov_dense[1:end, 1])
@test isequal(cov_sparse[1, 1:end], cov_dense[1, 1:end])

# Test with NaN and Inf
x_sparse[2,1] = NaN
x_dense[2,1] = NaN

cov_sparse = cov(x_sparse, 1, corrected=true)
cov_dense = cov(x_dense, 1, corrected=true)
@test cov_sparse[2:end, 2:end] cov_dense[2:end, 2:end]
@test isequal(cov_sparse[1:end, 1], cov_dense[1:end, 1])
@test isequal(cov_sparse[1, 1:end], cov_dense[1, 1:end])

cov_sparse = cov(x_sparse, 2, corrected=true)
cov_sparse[isinf.(cov_sparse)] = NaN
cov_dense = cov(x_dense, 2, corrected=true)
@test cov_sparse[3:end, 3:end] cov_dense[3:end, 3:end]
@test isequal(cov_sparse[1:end, 1:2], cov_dense[1:end, 1:2])
@test isequal(cov_sparse[1:2, 1:end], cov_dense[1:2, 1:end])

cov_sparse = cov(x_sparse, 1, corrected=false)
cov_dense = cov(x_dense, 1, corrected=false)
@test cov_sparse[2:end, 2:end] cov_dense[2:end, 2:end]
@test isequal(cov_sparse[1:end, 1], cov_dense[1:end, 1])
@test isequal(cov_sparse[1, 1:end], cov_dense[1, 1:end])

cov_sparse = cov(x_sparse, 2, corrected=false)
cov_sparse[isinf.(cov_sparse)] = NaN
cov_dense = cov(x_dense, 2, corrected=false)
@test cov_sparse[3:end, 3:end] cov_dense[3:end, 3:end]
@test isequal(cov_sparse[1:end, 1:2], cov_dense[1:end, 1:2])
@test isequal(cov_sparse[1:2, 1:end], cov_dense[1:2, 1:end])
end

2 comments on commit ae17198

@nanosoldier
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Executing the daily benchmark build, I will reply here when finished:

@nanosoldier runbenchmarks(ALL, isdaily = true)

@nanosoldier
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Your benchmark job has completed - possible performance regressions were detected. A full report can be found here. cc @ararslan

Please sign in to comment.