Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make covariance and correlation work for iterators, skipmissing in particular. #34

Open
wants to merge 20 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
name = "Statistics"
uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Intentional?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. It's a hack to make sure julia knows to load this folder, it's described here for Pkg.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Normally the Travis script does that automatically, so you can revert this: https://github.com/JuliaLang/Statistics.jl/blob/master/.travis.yml#L24

Though you need it to run tests locally.


[deps]
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Expand Down
59 changes: 56 additions & 3 deletions src/Statistics.jl
Original file line number Diff line number Diff line change
Expand Up @@ -494,7 +494,7 @@ unscaled_covzm(x::AbstractMatrix, y::AbstractMatrix, vardim::Int) =
(vardim == 1 ? *(transpose(x), _conj(y)) : *(x, adjoint(y)))

# covzm (with centered data)

nalimilan marked this conversation as resolved.
Show resolved Hide resolved
covzm(itr::Any; corrected::Bool = true) = covzm(collect(itr); corrected = corrected)
covzm(x::AbstractVector; corrected::Bool=true) = unscaled_covzm(x) / (length(x) - Int(corrected))
function covzm(x::AbstractMatrix, vardim::Int=1; corrected::Bool=true)
C = unscaled_covzm(x, vardim)
Expand All @@ -504,6 +504,7 @@ function covzm(x::AbstractMatrix, vardim::Int=1; corrected::Bool=true)
A .= A .* b
return A
end
covzm(x::Any, y::Any; corrected::Bool = true) = covzm(collect(x), collect(y); corrected = corrected)
pdeffebach marked this conversation as resolved.
Show resolved Hide resolved
covzm(x::AbstractVector, y::AbstractVector; corrected::Bool=true) =
unscaled_covzm(x, y) / (length(x) - Int(corrected))
function covzm(x::AbstractVecOrMat, y::AbstractVecOrMat, vardim::Int=1; corrected::Bool=true)
Expand All @@ -518,16 +519,31 @@ end
# covm (with provided mean)
## Use map(t -> t - xmean, x) instead of x .- xmean to allow for Vector{Vector}
## which can't be handled by broadcast
covm(itr::Any, itrmean; corrected::Bool = true) = covm(collect(itr), itrmean)
covm(x::AbstractVector, xmean; corrected::Bool=true) =
covzm(map(t -> t - xmean, x); corrected=corrected)
covm(x::AbstractMatrix, xmean, vardim::Int=1; corrected::Bool=true) =
covzm(x .- xmean, vardim; corrected=corrected)
covm(x::Any, xmean, y::Any, ymean; corrected::Bool=true) =
covzm(map(t -> t - xmean, x), map(t -> t - ymean, y); corrected=corrected)
covm(x::AbstractVector, xmean, y::AbstractVector, ymean; corrected::Bool=true) =
covzm(map(t -> t - xmean, x), map(t -> t - ymean, y); corrected=corrected)
covm(x::AbstractVecOrMat, xmean, y::AbstractVecOrMat, ymean, vardim::Int=1; corrected::Bool=true) =
covzm(x .- xmean, y .- ymean, vardim; corrected=corrected)

# cov (API)
"""
cov(itr::Any; corrected::Bool=true)
pdeffebach marked this conversation as resolved.
Show resolved Hide resolved

Compute the variance of the iterator `itr`. If `corrected` is `true` (the default) then the sum
is scaled with `n-1`, whereas the sum is scaled with `n` if `corrected` is `false` where
`n = length(collect(itr))`.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
`n = length(collect(itr))`.
``n`` is the number of elements.

"""
function cov(itr::Any; corrected::Bool = true)
x = collect(itr)
covm(x, mean(x); corrected = corrected)
end

"""
cov(x::AbstractVector; corrected::Bool=true)

Expand All @@ -546,6 +562,22 @@ if `corrected` is `false` where `n = size(X, dims)`.
cov(X::AbstractMatrix; dims::Int=1, corrected::Bool=true) =
covm(X, _vmean(X, dims), dims; corrected=corrected)


pdeffebach marked this conversation as resolved.
Show resolved Hide resolved
"""
cov(x::Any, y::Any; corrected::Bool=true)

Compute the covariance between the iterators `x` and `y`. If `corrected` is `true` (the
default), computes ``\\frac{1}{n-1}\\sum_{i=1}^n (x_i-\\bar x) (y_i-\\bar y)^*`` where
``*`` denotes the complex conjugate and `n = length(collect(x)) = length(collect(y))`. If `corrected` is
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
``*`` denotes the complex conjugate and `n = length(collect(x)) = length(collect(y))`. If `corrected` is
``*`` denotes the complex conjugate and ``n`` the number of elements. If `corrected` is

`false`, computes ``\\frac{1}{n}\\sum_{i=1}^n (x_i-\\bar x) (y_i-\\bar y)^*``.
"""
function cov(x::Any, y::Any; corrected::Bool = true)
cx = collect(x)
cy = collect(y)

covm(cx, mean(cx), cy, mean(cy); corrected = corrected)
end

"""
cov(x::AbstractVector, y::AbstractVector; corrected::Bool=true)

Expand Down Expand Up @@ -630,7 +662,7 @@ function cov2cor!(C::AbstractMatrix, xsd::AbstractArray, ysd::AbstractArray)
end

# corzm (non-exported, with centered data)

corzm(x::Any) = corzm(collect(x))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same remark here and for corm as for cor about using the eltype when it's known.

corzm(x::AbstractVector{T}) where {T} = one(real(T))
function corzm(x::AbstractMatrix, vardim::Int=1)
c = unscaled_covzm(x, vardim)
Expand All @@ -644,9 +676,10 @@ corzm(x::AbstractMatrix, y::AbstractMatrix, vardim::Int=1) =
cov2cor!(unscaled_covzm(x, y, vardim), sqrt!(sum(abs2, x, dims=vardim)), sqrt!(sum(abs2, y, dims=vardim)))

# corm

corm(x::Any, xmean) = corzm(collect(x), xmean)
corm(x::AbstractVector{T}, xmean) where {T} = one(real(T))
corm(x::AbstractMatrix, xmean, vardim::Int=1) = corzm(x .- xmean, vardim)
corm(x::Any, mx, y::Any, my) = corm(collect(x), mx, collect(y), my)
function corm(x::AbstractVector, mx, y::AbstractVector, my)
require_one_based_indexing(x, y)
n = length(x)
Expand Down Expand Up @@ -674,6 +707,14 @@ corm(x::AbstractVecOrMat, xmean, y::AbstractVecOrMat, ymean, vardim::Int=1) =
corzm(x .- xmean, y .- ymean, vardim)

# cor
"""
cor(itr::Any)

Return the number one.
"""
cor(itr::Any) = one(real(eltype(collect(x))))


"""
cor(x::AbstractVector)

Expand All @@ -688,6 +729,18 @@ Compute the Pearson correlation matrix of the matrix `X` along the dimension `di
"""
cor(X::AbstractMatrix; dims::Int=1) = corm(X, _vmean(X, dims), dims)

"""
cor(x::AbstractVector, y::AbstractVector)

Compute the Pearson correlation between the vectors `x` and `y`.
pdeffebach marked this conversation as resolved.
Show resolved Hide resolved
"""
function cor(x::Any, y::Any)
cx = collect(x)
cy = collect(y)

corm(cx, mean(cx), cy, mean(cy))
end

"""
cor(x::AbstractVector, y::AbstractVector)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove this docstring which is a special case of the previous one.


Expand Down