JuliaStats · pdeffebach · Mar 13, 2020 · Mar 13, 2020 · Mar 13, 2020 · Apr 9, 2020
diff --git a/Project.toml b/Project.toml
@@ -1,5 +1,4 @@
 name = "Statistics"
-uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 
 [deps]
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"

diff --git a/src/Statistics.jl b/src/Statistics.jl
@@ -388,6 +388,8 @@ function sqrt!(A::AbstractArray)
     A
 end
 
+sqrt!(x::Number) = sqrt(x)
+
 stdm(A::AbstractArray, m; corrected::Bool=true) =
     sqrt.(varm(A, m; corrected=corrected))
 
@@ -479,13 +481,19 @@ end
 _vmean(x::AbstractVector, vardim::Int) = mean(x)
 _vmean(x::AbstractMatrix, vardim::Int) = mean(x, dims=vardim)
 
-# core functions
+_abs2(x::Number) = abs2(x)
+_abs2(x)       = x*x'
+
+_conjmul(x::Number, y::Number) = x * conj(y)
+_conjmul(x, y) = x * _conj(y)'  
 
-unscaled_covzm(x::AbstractVector{<:Number})    = sum(abs2, x)
-unscaled_covzm(x::AbstractVector)              = sum(t -> t*t', x)
+# core functions
+unscaled_covzm(itr) = sum(_abs2, itr)
+unscaled_covzm(x::AbstractVector)              = sum(_abs2, x)
 unscaled_covzm(x::AbstractMatrix, vardim::Int) = (vardim == 1 ? _conj(x'x) : x * x')
 
-unscaled_covzm(x::AbstractVector, y::AbstractVector) = sum(conj(y[i])*x[i] for i in eachindex(y, x))
+unscaled_covzm(x, y) = sum(t -> _conjmul(first(t), last(t)), zip(x, y))
+unscaled_covzm(x::AbstractVector, y::AbstractVector) = sum(_conjmul(x[i], y[i]) for i in eachindex(y, x))
 unscaled_covzm(x::AbstractVector, y::AbstractMatrix, vardim::Int) =
     (vardim == 1 ? *(transpose(x), _conj(y)) : *(transpose(x), transpose(_conj(y))))
 unscaled_covzm(x::AbstractMatrix, y::AbstractVector, vardim::Int) =
@@ -494,7 +502,7 @@ unscaled_covzm(x::AbstractMatrix, y::AbstractMatrix, vardim::Int) =
     (vardim == 1 ? *(transpose(x), _conj(y)) : *(x, adjoint(y)))
 
 # covzm (with centered data)
-
+covzm(itr::Any; corrected::Bool=true) = covzm(collect(itr); corrected = corrected)
 covzm(x::AbstractVector; corrected::Bool=true) = unscaled_covzm(x) / (length(x) - Int(corrected))
 function covzm(x::AbstractMatrix, vardim::Int=1; corrected::Bool=true)
     C = unscaled_covzm(x, vardim)
@@ -504,6 +512,7 @@ function covzm(x::AbstractMatrix, vardim::Int=1; corrected::Bool=true)
     A .= A .* b
     return A
 end
+covzm(x::Any, y::Any; corrected::Bool=true) = covzm(collect(x), collect(y); corrected = corrected)
 covzm(x::AbstractVector, y::AbstractVector; corrected::Bool=true) =
     unscaled_covzm(x, y) / (length(x) - Int(corrected))
 function covzm(x::AbstractVecOrMat, y::AbstractVecOrMat, vardim::Int=1; corrected::Bool=true)
@@ -518,20 +527,37 @@ end
 # covm (with provided mean)
 ## Use map(t -> t - xmean, x) instead of x .- xmean to allow for Vector{Vector}
 ## which can't be handled by broadcast
+covm(itr::Any, itrmean; corrected::Bool=true) =
+    covm(map(t -> t - itrmean, x); corrected = corrected)
 covm(x::AbstractVector, xmean; corrected::Bool=true) =
     covzm(map(t -> t - xmean, x); corrected=corrected)
 covm(x::AbstractMatrix, xmean, vardim::Int=1; corrected::Bool=true) =
     covzm(x .- xmean, vardim; corrected=corrected)
+covm(x::Any, xmean, y::Any, ymean; corrected::Bool=true) = 
+    covzm(map(t -> t - xmean, x), map(t -> t - ymean, y); corrected=corrected)
 covm(x::AbstractVector, xmean, y::AbstractVector, ymean; corrected::Bool=true) =
     covzm(map(t -> t - xmean, x), map(t -> t - ymean, y); corrected=corrected)
 covm(x::AbstractVecOrMat, xmean, y::AbstractVecOrMat, ymean, vardim::Int=1; corrected::Bool=true) =
     covzm(x .- xmean, y .- ymean, vardim; corrected=corrected)
 
 # cov (API)
+"""
+    cov(x::Any; corrected::Bool=true)
+
+Compute the variance of the iterator `x`. If `corrected` is `true` (the default) then the sum
+is scaled with `n-1`, whereas the sum is scaled with `n` if `corrected` is `false` where `n`
+is the number of elements in the iterator, which is not necessarily known. 
+"""
+function cov(x::Any, corrected::Bool=true)
+    cx = collect(x)
+    covm(cx, mean(cx); corrected=corrected)
+end
+
 """
     cov(x::AbstractVector; corrected::Bool=true)
 
-Compute the variance of the vector `x`. If `corrected` is `true` (the default) then the sum
+Compute the variance of the vector `x`. If `x` is a vector of vectors, returns the estimated
+variance-covariance matrix of elements in `x`. If `corrected` is `true` (the default) then the sum
 is scaled with `n-1`, whereas the sum is scaled with `n` if `corrected` is `false` where `n = length(x)`.
 """
 cov(x::AbstractVector; corrected::Bool=true) = covm(x, mean(x); corrected=corrected)
@@ -546,6 +572,22 @@ if `corrected` is `false` where `n = size(X, dims)`.
 cov(X::AbstractMatrix; dims::Int=1, corrected::Bool=true) =
     covm(X, _vmean(X, dims), dims; corrected=corrected)
 
+"""
+    cov(x::Any, y::Any; corrected::Bool=true)
+
+Compute the covariance between the iterators `x` and `y`. If `corrected` is `true` (the
+default), computes ``\\frac{1}{n-1}\\sum_{i=1}^n (x_i-\\bar x) (y_i-\\bar y)^*`` where
+``*`` denotes the complex conjugate and `n` is the number of elements in `x` which must equal 
+the number of elements in `y`. If `x` and `y` are both vectors of vectors, computes the analagous
+estimator for the covariance matrix for `xi` and `yi. If `corrected` is `false`, computes 
+``\\frac{1}{n}\\sum_{i=1}^n (x_i-\\bar x) (y_i-\\bar y)^*``.
+"""
+function cov(x::Any, y::Any; corrected::Bool=true)
+    cx = collect(x)
+    cy = collect(y)
+    covm(cx, mean(cx), cy, mean(cy); corrected=corrected)
+end
+
 """
     cov(x::AbstractVector, y::AbstractVector; corrected::Bool=true)
 
@@ -628,10 +670,19 @@ function cov2cor!(C::AbstractMatrix, xsd::AbstractArray, ysd::AbstractArray)
     end
     return C
 end
-
+function cov2cor!(xy::Number, xsd::Number, ysd::Number)
+    xx = abs2(xsd)
+    yy = abs2(ysd)
+    clampcor(xy / max(xx, yy) / sqrt(min(xx, yy) / max(xx, yy)))
+end
 # corzm (non-exported, with centered data)
 
-corzm(x::AbstractVector{T}) where {T} = one(real(T))
+corzm(itr) = corzm(collect(itr))
+corzm(x::AbstractVector{T}) where {T<:Number} = one(real(T))
+function corzm(x::AbstractVector{T}) where {T}
+    c = unscaled_covzm(x)
+    return cov2cor!(c, collect(sqrt(c[i,i]) for i in 1:min(size(c)...)))
+end
 function corzm(x::AbstractMatrix, vardim::Int=1)
     c = unscaled_covzm(x, vardim)
     return cov2cor!(c, collect(sqrt(c[i,i]) for i in 1:min(size(c)...)))
@@ -644,9 +695,16 @@ corzm(x::AbstractMatrix, y::AbstractMatrix, vardim::Int=1) =
     cov2cor!(unscaled_covzm(x, y, vardim), sqrt!(sum(abs2, x, dims=vardim)), sqrt!(sum(abs2, y, dims=vardim)))
 
 # corm
-
-corm(x::AbstractVector{T}, xmean) where {T} = one(real(T))
+corm(itr::Any, itrmean) = corm(collect(itr), itrmean)
+corm(x::AbstractVector{<:Number}, xmean) = one(real(eltype(x)))
+function corm(x::AbstractVector, xmean)
+    c = sum(t -> _abs2(t - xmean), x)
+    return cov2cor!(c, collect(sqrt(c[i,i]) for i in 1:min(size(c)...)))
+end
 corm(x::AbstractMatrix, xmean, vardim::Int=1) = corzm(x .- xmean, vardim)
+corm(x::Any, mx, y::Any, my) = corm(collect(x), mx, collect(y), my)
+_one(x) = one(x)
+_one(x::AbstractArray) = fill(one(x[1]), size(x))  
 function corm(x::AbstractVector, mx, y::AbstractVector, my)
     require_one_based_indexing(x, y)
     n = length(x)
@@ -655,31 +713,51 @@ function corm(x::AbstractVector, mx, y::AbstractVector, my)
 
     @inbounds begin
         # Initialize the accumulators
-        xx = zero(sqrt(abs2(one(x[1]))))
-        yy = zero(sqrt(abs2(one(y[1]))))
-        xy = zero(x[1] * y[1]')
+        xx = zero(sqrt!(_abs2(_one(x[1]))))
+        yy = zero(sqrt!(_abs2(_one(y[1]))))
+        xy = zero(_conjmul(x[1], y[1]))
 
         @simd for i in eachindex(x, y)
             xi = x[i] - mx
             yi = y[i] - my
-            xx += abs2(xi)
-            yy += abs2(yi)
-            xy += xi * yi'
+            xx += _abs2(xi)
+            yy += _abs2(yi)
+            xy += _conjmul(xi, yi)
         end
     end
-    return clampcor(xy / max(xx, yy) / sqrt(min(xx, yy) / max(xx, yy)))
+
+    @show xx
+    @show yy
+    @show xy
+
+    return cov2cor!(xy, xx, yy)
 end
 
 corm(x::AbstractVecOrMat, xmean, y::AbstractVecOrMat, ymean, vardim::Int=1) =
     corzm(x .- xmean, y .- ymean, vardim)
 
 # cor
 """
-    cor(x::AbstractVector)
+    cor(itr::Any)
+
+Return the number one if `itr` iterates through scalars. Returns the correlation between
+elements of the iterator otherwise. 
+"""
+cor(itr::Any) = cor(collect(itr))
+
+"""
+    cor(x::AbstractVector{<:Number})
 
 Return the number one.
 """
-cor(x::AbstractVector) = one(real(eltype(x)))
+cor(x::AbstractVector{<:Number}) = one(real(eltype(x)))
+
+"""
+    cor(x::AbstractVector)
+
+Return the Pearson correlation matrix between elements in `x`.
+"""
+cor(x::AbstractVector) = corm(x, mean(x))
 
 """
     cor(X::AbstractMatrix; dims::Int=1)
@@ -688,6 +766,18 @@ Compute the Pearson correlation matrix of the matrix `X` along the dimension `di
 """
 cor(X::AbstractMatrix; dims::Int=1) = corm(X, _vmean(X, dims), dims)
 
+"""
+    cor(x::Any, y::Any)
+
+Compute the Pearson correlation between the iterators `x` and `y`.
+"""
+function cor(x::Any, y::Any) 
+    cx = collect(x)
+    cy = collect(y)
+
+    corm(cx, mean(cx), cy, mean(cy))
+end
+
 """
     cor(x::AbstractVector, y::AbstractVector)
 

diff --git a/test/runtests.jl b/test/runtests.jl
@@ -314,6 +314,10 @@ Y = [6.0  2.0;
      5.0  8.0;
      3.0  4.0;
      2.0  3.0]
+X_vec = [[X[i,1], X[i,2]] for i in 1:size(X, 1)]
+X_gen = ([X[i,1], X[i,2]] for i in 1:size(X, 1))
+Y_vec = [[Y[i,1], Y[i,2]] for i in 1:size(Y, 1)]
+Y_gen = ([Y[i,1], Y[i,2]] for i in 1:size(Y, 1))
 
 @testset "covariance" begin
     for vd in [1, 2], zm in [true, false], cr in [true, false]
@@ -328,6 +332,8 @@ Y = [6.0  2.0;
             end
             x1 = vec(X[:,1])
             y1 = vec(Y[:,1])
+            x1_gen = (x for x in x1)
+            y1_gen = (y for y in y1)
         else
             k = size(X, 1)
             Cxx = zeros(k, k)
@@ -338,6 +344,8 @@ Y = [6.0  2.0;
             end
             x1 = vec(X[1,:])
             y1 = vec(Y[1,:])
+            x1_gen = (x for x in x1)
+            y1_gen = (y for y in y1)
         end
 
         c = zm ? Statistics.covm(x1, 0, corrected=cr) :
@@ -346,14 +354,14 @@ Y = [6.0  2.0;
         @test c ≈ Cxx[1,1]
         @inferred cov(x1, corrected=cr)
 
-        @test cov(X) == Statistics.covm(X, mean(X, dims=1))
+        @test cov(X) == cov(X_vec) == cov(X_gen) == Statistics.covm(X, mean(X, dims=1))
         C = zm ? Statistics.covm(X, 0, vd, corrected=cr) :
                  cov(X, dims=vd, corrected=cr)
         @test size(C) == (k, k)
         @test C ≈ Cxx
         @inferred cov(X, dims=vd, corrected=cr)
 
-        @test cov(x1, y1) == Statistics.covm(x1, mean(x1), y1, mean(y1))
+        @test cov(x1, y1) == cov(x1_gen, y1_gen) == Statistics.covm(x1, mean(x1), y1, mean(y1))
         c = zm ? Statistics.covm(x1, 0, y1, 0, corrected=cr) :
                  cov(x1, y1, corrected=cr)
         @test isa(c, Float64)
@@ -378,7 +386,10 @@ Y = [6.0  2.0;
         @test vec(C) ≈ Cxy[:,1]
         @inferred cov(X, y1, dims=vd, corrected=cr)
 
-        @test cov(X, Y) == Statistics.covm(X, mean(X, dims=1), Y, mean(Y, dims=1))
+        # Separate tests for equality and approximation
+        C = cov(X, Y)
+        @test C == Statistics.covm(X, mean(X, dims=1), Y, mean(Y, dims=1))
+        @test C ≈ cov(X_vec, Y_vec) ≈ cov(X_gen, Y_gen) 
         C = zm ? Statistics.covm(X, 0, Y, 0, vd, corrected=cr) :
                  cov(X, Y, dims=vd, corrected=cr)
         @test size(C) == (k, k)
@@ -428,17 +439,18 @@ end
         end
 
         c = zm ? Statistics.corm(x1, 0) : cor(x1)
+        c_gen  = zm ? Statistics.corm((xi for xi in x1), 0) : cor(xi for xi in x1)
         @test isa(c, Float64)
-        @test c ≈ Cxx[1,1]
+        @test c ≈ c_gen ≈ Cxx[1,1]
         @inferred cor(x1)
 
-        @test cor(X) == Statistics.corm(X, mean(X, dims=1))
+        @test cor(X) == Statistics.corm(X, mean(X, dims=1)) == cor(X_vec) == cor(X_gen) 
         C = zm ? Statistics.corm(X, 0, vd) : cor(X, dims=vd)
         @test size(C) == (k, k)
         @test C ≈ Cxx
         @inferred cor(X, dims=vd)
 
-        @test cor(x1, y1) == Statistics.corm(x1, mean(x1), y1, mean(y1))
+        @test cor(x1, y1) == cor((ix for ix in x1), (iy for iy in y1)) == Statistics.corm(x1, mean(x1), y1, mean(y1))
         c = zm ? Statistics.corm(x1, 0, y1, 0) : cor(x1, y1)
         @test isa(c, Float64)
         @test c ≈ Cxy[1,1]
@@ -460,7 +472,8 @@ end
         @test vec(C) ≈ Cxy[:,1]
         @inferred cor(X, y1, dims=vd)
 
-        @test cor(X, Y) == Statistics.corm(X, mean(X, dims=1), Y, mean(Y, dims=1))
+        @test_skip cor(X, Y) == cor(X_vec, Y_vec) == cor(X_gen, Y_gen) == 
+            Statistics.corm(X, mean(X, dims=1), Y, mean(Y, dims=1))
         C = zm ? Statistics.corm(X, 0, Y, 0, vd) : cor(X, Y, dims=vd)
         @test size(C) == (k, k)
         @test C ≈ Cxy
@@ -644,12 +657,15 @@ end
 @testset "cov and cor of complex arrays (issue #21093)" begin
     x = [2.7 - 3.3im, 0.9 + 5.4im, 0.1 + 0.2im, -1.7 - 5.8im, 1.1 + 1.9im]
     y = [-1.7 - 1.6im, -0.2 + 6.5im, 0.8 - 10.0im, 9.1 - 3.4im, 2.7 - 5.5im]
-    @test cov(x, y) ≈ 4.8365 - 12.119im
-    @test cov(y, x) ≈ 4.8365 + 12.119im
+    x_gen = (i for i in x)
+    y_gen = (i for i in y)
+    xy_vec = [[x[i], y[i]] for i in 1:length(x)]
+    @test cov(x, y) ≈ cov(x_gen, y_gen) ≈4.8365 - 12.119im
+    @test cov(y, x) ≈ cov(y_gen, x_gen) ≈ 4.8365 + 12.119im
     @test cov(x, reshape(y, :, 1)) ≈ reshape([4.8365 - 12.119im], 1, 1)
     @test cov(reshape(x, :, 1), y) ≈ reshape([4.8365 - 12.119im], 1, 1)
     @test cov(reshape(x, :, 1), reshape(y, :, 1)) ≈ reshape([4.8365 - 12.119im], 1, 1)
-    @test cov([x y]) ≈ [21.779 4.8365-12.119im;
+    @test cov([x y]) ≈ cov(xy_vec) ≈ cov((i for i in xy_vec)) ≈ [21.779 4.8365-12.119im;
                         4.8365+12.119im 54.548]
     @test cor(x, y) ≈ 0.14032104449218274 - 0.35160772008699703im
     @test cor(y, x) ≈ 0.14032104449218274 + 0.35160772008699703im