Merge branch 'master' into patch-1

JuliaStats · May 16, 2024 · 9913c61 · 9913c61
2 parents dafd32b + 4e25f43
commit 9913c61
Show file tree

Hide file tree

Showing 11 changed files with 123 additions and 87 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -35,9 +35,24 @@ jobs:
       - uses: julia-actions/julia-buildpkg@v1
       - uses: julia-actions/julia-runtest@v1
       - uses: julia-actions/julia-processcoverage@v1
-      - uses: codecov/codecov-action@v3
+      - uses: codecov/codecov-action@v4
         with:
           file: lcov.info
+          # This is a public repo.
+          # We are using v4 of the codecov-action.
+          # If the PR is from a fork, then Codecov allows us to use
+          # tokenless Codecov uploads.
+          # If the PR is not from a fork, then Codecov does not allow
+          # us to use tokenless Codecov uploads, and thus we must use
+          # the token.
+          # Tokenless Codecov uploads are prone to random failures,
+          # due to Codecov's servers hitting GitHub rate limits.
+          # Therefore:
+          # 1. If this CI run is not a PR run, we set `fail_ci_if_error` to `true`.
+          # 2. If this CI run is a PR run and the PR is not from a fork, we set `fail_ci_if_error` to `true`.
+          # 3. If this CI run is a PR run and the PR is from a fork, we set `fail_ci_if_error` to `false`.
+          fail_ci_if_error: ${{ (github.event_name != 'pull_request') || (github.repository == github.event.pull_request.head.repo.full_name) }}
+          token: ${{ secrets.CODECOV_TOKEN }}
   docs:
     permissions:
       contents: write

diff --git a/Project.toml b/Project.toml
@@ -1,9 +1,10 @@
 name = "StatsBase"
 uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 authors = ["JuliaStats"]
-version = "0.34.2"
+version = "0.34.3"
 
 [deps]
+AliasTables = "66dad0bd-aa9a-41b7-9441-69ab47430ed8"
 DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
@@ -17,11 +18,17 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 StatsAPI = "82ae8749-77ed-4fe6-ae5f-f523153014b0"
 
 [compat]
+AliasTables = "1"
 DataAPI = "1"
 DataStructures = "0.10, 0.11, 0.12, 0.13, 0.14, 0.17, 0.18"
+LinearAlgebra = "<0.0.1, 1"
 LogExpFunctions = "0.3"
 Missings = "0.3, 0.4, 1.0"
+Printf = "<0.0.1, 1"
+Random = "<0.0.1, 1"
 SortingAlgorithms = "0.3, 1.0"
+SparseArrays = "<0.0.1, 1"
+Statistics = "1"
 StatsAPI = "1.2"
 julia = "1"
 

diff --git a/src/deprecates.jl b/src/deprecates.jl
@@ -46,3 +46,65 @@ end
 @deprecate stdm(x::AbstractArray{<:Real}, w::AbstractWeights, m::AbstractArray{<:Real}, dim::Int; corrected::Union{Bool, Nothing}=nothing) std(x, w, dim, mean=m, corrected=corrected) false
 @deprecate varm(x::AbstractArray{<:Real}, w::AbstractWeights, m::AbstractArray{<:Real}, dim::Int; corrected::Union{Bool, Nothing}=nothing) var(x, w, dim, mean=m, corrected=corrected) false
 @deprecate varm!(R::AbstractArray, x::AbstractArray{<:Real}, w::AbstractWeights, m::AbstractArray{<:Real}, dim::Int; corrected::Union{Bool, Nothing}=nothing) var!(R, x, w, dim, mean=m, corrected=corrected) false
+
+### This was never part of the public API
+### Deprecated April 2024
+function make_alias_table!(w::AbstractVector, wsum,
+                           a::AbstractVector{Float64},
+                           alias::AbstractVector{Int})
+    Base.depwarn("make_alias_table! is both internal and deprecated, use AliasTables.jl instead", :make_alias_table!)
+    # Arguments:
+    #
+    #   w [in]:         input weights
+    #   wsum [in]:      pre-computed sum(w)
+    #
+    #   a [out]:        acceptance probabilities
+    #   alias [out]:    alias table
+    #
+    # Note: a and w can be the same array, then that array will be
+    #       overwritten inplace by acceptance probabilities
+    #
+    # Returns nothing
+    #
+
+    n = length(w)
+    length(a) == length(alias) == n ||
+        throw(DimensionMismatch("Inconsistent array lengths."))
+
+    ac = n / wsum
+    for i = 1:n
+        @inbounds a[i] = w[i] * ac
+    end
+
+    larges = Vector{Int}(undef, n)
+    smalls = Vector{Int}(undef, n)
+    kl = 0  # actual number of larges
+    ks = 0  # actual number of smalls
+
+    for i = 1:n
+        @inbounds ai = a[i]
+        if ai > 1.0
+            larges[kl+=1] = i  # push to larges
+        elseif ai < 1.0
+            smalls[ks+=1] = i  # push to smalls
+        end
+    end
+
+    while kl > 0 && ks > 0
+        s = smalls[ks]; ks -= 1  # pop from smalls
+        l = larges[kl]; kl -= 1  # pop from larges
+        @inbounds alias[s] = l
+        @inbounds al = a[l] = (a[l] - 1.0) + a[s]
+        if al > 1.0
+            larges[kl+=1] = l  # push to larges
+        else
+            smalls[ks+=1] = l  # push to smalls
+        end
+    end
+
+    # this loop should be redundant, except for rounding
+    for i = 1:ks
+        @inbounds a[smalls[i]] = 1.0
+    end
+    nothing
+end
diff --git a/src/misc.jl b/src/misc.jl
@@ -59,7 +59,9 @@ run lengths.
 """
 function inverse_rle(vals::AbstractVector{T}, lens::AbstractVector{<:Integer}) where T
     m = length(vals)
-    length(lens) == m || raise_dimerror()
+    mlens = length(lens)
+    mlens == m || throw(DimensionMismatch(
+                        "number of vals ($m) does not match the number of lens ($mlens)"))
     n = sum(lens)
     n >= 0 || throw(ArgumentError("lengths must be non-negative"))
 

diff --git a/src/ranking.jl b/src/ranking.jl
@@ -8,7 +8,10 @@
 
 function _check_randparams(rks, x, p)
     n = length(rks)
-    length(x) == length(p) == n || raise_dimerror()
+    nx = length(x)
+    np = length(p)
+    nx == np == n || throw(
+        DimensionMismatch("lengths of x $nx and p $np do not match that of ranks $n"))
     return n
 end
 

diff --git a/src/sampling.jl b/src/sampling.jl
@@ -5,6 +5,7 @@
 #
 ###########################################################
 
+using AliasTables
 using Random: Sampler
 
 if VERSION < v"1.3.0-DEV.565"
@@ -637,65 +638,6 @@ end
 direct_sample!(a::AbstractArray, wv::AbstractWeights, x::AbstractArray) =
     direct_sample!(default_rng(), a, wv, x)
 
-function make_alias_table!(w::AbstractVector, wsum,
-                           a::AbstractVector{Float64},
-                           alias::AbstractVector{Int})
-    # Arguments:
-    #
-    #   w [in]:         input weights
-    #   wsum [in]:      pre-computed sum(w)
-    #
-    #   a [out]:        acceptance probabilities
-    #   alias [out]:    alias table
-    #
-    # Note: a and w can be the same array, then that array will be
-    #       overwritten inplace by acceptance probabilities
-    #
-    # Returns nothing
-    #
-
-    n = length(w)
-    length(a) == length(alias) == n ||
-        throw(DimensionMismatch("Inconsistent array lengths."))
-
-    ac = n / wsum
-    for i = 1:n
-        @inbounds a[i] = w[i] * ac
-    end
-
-    larges = Vector{Int}(undef, n)
-    smalls = Vector{Int}(undef, n)
-    kl = 0  # actual number of larges
-    ks = 0  # actual number of smalls
-
-    for i = 1:n
-        @inbounds ai = a[i]
-        if ai > 1.0
-            larges[kl+=1] = i  # push to larges
-        elseif ai < 1.0
-            smalls[ks+=1] = i  # push to smalls
-        end
-    end
-
-    while kl > 0 && ks > 0
-        s = smalls[ks]; ks -= 1  # pop from smalls
-        l = larges[kl]; kl -= 1  # pop from larges
-        @inbounds alias[s] = l
-        @inbounds al = a[l] = (a[l] - 1.0) + a[s]
-        if al > 1.0
-            larges[kl+=1] = l  # push to larges
-        else
-            smalls[ks+=1] = l  # push to smalls
-        end
-    end
-
-    # this loop should be redundant, except for rounding
-    for i = 1:ks
-        @inbounds a[smalls[i]] = 1.0
-    end
-    nothing
-end
-
 """
     alias_sample!([rng], a::AbstractArray, wv::AbstractWeights, x::AbstractArray)
 
@@ -706,31 +648,24 @@ Build an alias table, and sample therefrom.
 Reference: Walker, A. J. "An Efficient Method for Generating Discrete Random Variables
 with General Distributions." *ACM Transactions on Mathematical Software* 3 (3): 253, 1977.
 
-Noting `k=length(x)` and `n=length(a)`, this algorithm takes ``O(n \\log n)`` time
-for building the alias table, and then ``O(1)`` to draw each sample. It consumes ``2 k`` random numbers.
+Noting `k=length(x)` and `n=length(a)`, this algorithm takes ``O(n)`` time
+for building the alias table, and then ``O(1)`` to draw each sample. It consumes ``k`` random numbers.
 """
 function alias_sample!(rng::AbstractRNG, a::AbstractArray, wv::AbstractWeights, x::AbstractArray)
     Base.mightalias(a, x) &&
         throw(ArgumentError("output array x must not share memory with input array a"))
-    Base.mightalias(x, wv) &&
-        throw(ArgumentError("output array x must not share memory with weights array wv"))
-    1 == firstindex(a) == firstindex(wv) == firstindex(x) ||
+    1 == firstindex(a) == firstindex(wv) ||
         throw(ArgumentError("non 1-based arrays are not supported"))
-    wsum = sum(wv)
-    isfinite(wsum) || throw(ArgumentError("only finite weights are supported"))
-    n = length(a)
-    length(wv) == n || throw(DimensionMismatch("Inconsistent lengths."))
+    isfinite(sum(wv)) || throw(ArgumentError("only finite weights are supported"))
+    length(wv) == length(a) || throw(DimensionMismatch("Inconsistent lengths."))
 
     # create alias table
-    ap = Vector{Float64}(undef, n)
-    alias = Vector{Int}(undef, n)
-    make_alias_table!(wv, wsum, ap, alias)
+    at = AliasTable(wv)
 
     # sampling
-    s = Sampler(rng, 1:n)
-    for i = 1:length(x)
-        j = rand(rng, s)
-        x[i] = rand(rng) < ap[j] ? a[j] : a[alias[j]]
+    for i in eachindex(x)
+        j = rand(rng, at)
+        x[i] = a[j]
     end
     return x
 end

diff --git a/src/scalarstats.jl b/src/scalarstats.jl
@@ -433,7 +433,7 @@ Return the standard error of the mean for a collection `x`.
 A pre-computed `mean` may be provided.
 
 When not using weights, this is the (sample) standard deviation
-divided by the sample size. If weights are used, the
+divided by the square root of the sample size. If weights are used, the
 variance of the sample mean is calculated as follows:
 
 * `AnalyticWeights`: Not implemented.
@@ -742,7 +742,13 @@ function entropy(p)
     return -sum(xlogx, p)
 end
 
-entropy(p, b::Real) = entropy(p) / log(b)
+function entropy(p, b::Real)
+    e = entropy(p)
+    # Promote explicitly before applying `log` to avoid undesired promotions
+    # with `log(b)::Float64` arising from `b::Int` (ref: #924)
+    _b = first(promote(b, e))
+    return e / log(_b)
+end
 
 """
     renyientropy(p, α)

diff --git a/test/misc.jl b/test/misc.jl
@@ -9,6 +9,7 @@ z = [1, 1, 2, 2, 2, 3, 1, 2, 2, 3, 3, 3, 3]
 @test lens == [2, 3, 1, 1, 2, 4]
 @test inverse_rle(vals, lens) == z
 @test_throws ArgumentError inverse_rle(vals, fill(-1, length(lens)))
+@test_throws DimensionMismatch inverse_rle(vals, [1])
 
 z = [true, true, false, false, true, false, true, true, true]
 vals, lens = rle(z)

diff --git a/test/ranking.jl b/test/ranking.jl
@@ -37,3 +37,6 @@ s = ["c", "a", "b", "d", "d", "b", "e", "d"] # s is a vector of strings ordered
 @test tiedrank(s) == tiedrank(x)
 @test tiedrank(x, rev = true) == tiedrank(-x)
 @test tiedrank(x, lt = (x, y) -> isless(y, x)) == tiedrank(-x)
+
+
+@test_throws DimensionMismatch StatsBase._check_randparams([1,2], [1,2], [1])
diff --git a/test/scalarstats.jl b/test/scalarstats.jl
@@ -222,12 +222,7 @@ x = sort!(vcat([5:-1:i for i in 1:5]...))
 @test mad(Iterators.repeated(4, 10)) == 0
 @test mad(Integer[1,2,3,4]) === mad(1:4)
 let itr = (i for i in 1:10000)
-    if VERSION >= v"1.10.0-"
-        # FIXME: Allocations are closer to 10x this on 1.10
-        @test_broken (@benchmark mad($itr)).allocs < 200
-    else
-        @test (@benchmark mad($itr)).allocs < 200
-    end
+    @test (@benchmark mad($itr)).allocs < 200
 end
 
 # Issue 197
@@ -265,6 +260,10 @@ it = (xᵢ for xᵢ in x)
 @test @inferred(entropy([1//2, 1//2], 2))    ≈ 1.0
 @test @inferred(entropy([0.2, 0.3, 0.5], 2)) ≈ 1.4854752972273344
 
+# issue #924
+@test @inferred(entropy([0.5f0, 0.5f0], 2)) isa Float32
+@test @inferred(entropy([0.5f0, 0.5f0], MathConstants.e)) isa Float32
+
 @test_throws ArgumentError @inferred(entropy(Float64[]))
 @test_throws ArgumentError @inferred(entropy(Int[]))
 

diff --git a/test/wsampling.jl b/test/wsampling.jl
@@ -55,6 +55,9 @@ for wv in (
     check_wsample_wrep(a, (4, 7), wv, 5.0e-3; ordered=false)
 end
 
+@test_throws ArgumentError alias_sample!(rand(10), weights(fill(0, 10)), rand(10))
+@test_throws ArgumentError alias_sample!(rand(100), weights(randn(100)), rand(10))
+
 for rev in (true, false), T in (Int, Int16, Float64, Float16, BigInt, ComplexF64, Rational{Int})
     r = rev ? reverse(4:7) : (4:7)
     r = T===Int ? r : T.(r)