Improvements in EVT confidence function (#25)

* create new confidence file * add computing a normalized NRMSE * correct docstring description for extreme dim * add docstring of significance * update tests for EVT significance * require at least 2.8 complexity measutres for allprobs * add cramer von mises as an option * changelog * finally fix the wrong r0 error message * acxtually fix the issue
JuliaDynamics · Jul 29, 2023 · 9aca116 · 9aca116
1 parent 3a3db84
commit 9aca116
Show file tree

Hide file tree

Showing 7 changed files with 196 additions and 88 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,10 @@
+# 1.7
+
+- Update `extremevaltheory_gpdfit_pvalues` to calculate further the NRMSE of the GPD fits.
+- Set default estimator to `:exp` for EVT.
+- Bugfixes in `extremevaltheory_gpdfit_pvalues`
+- Add Cramer Von Mises estimator for `extremevaltheory_gpdfit_pvalues`
+
 # 1.6
 
 - new function `extremevaltheory_gpdfit_pvalues` that can help quantify the "goodness" of the results of the EVT dimension

diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "FractalDimensions"
 uuid = "4665ce21-e117-4649-aed8-08bbe5ccbead"
 authors = ["George Datseris <[email protected]>"]
-version = "1.6.2"
+version = "1.7.0"
 
 [deps]
 ComplexityMeasures = "ab4b797d-85ee-42ba-b621-05d793b346a2"
@@ -19,7 +19,7 @@ StateSpaceSets = "40b095a5-5852-4c12-98c7-d43bf788e795"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 
 [compat]
-ComplexityMeasures = "2.5"
+ComplexityMeasures = "2.8"
 Distances = "0.10"
 Distributions = "0.25"
 HypothesisTests = "0.11"

diff --git a/src/FractalDimensions.jl b/src/FractalDimensions.jl
@@ -33,6 +33,7 @@ include("corrsum_based/correlationsum_fixedmass.jl")
 include("corrsum_based/takens_best_estimate.jl")
 
 include("extremes_based/extremesdim.jl")
+include("extremes_based/confidence.jl")
 
 include("timeseries_roughness/higuchi.jl")
 

diff --git a/src/corrsum_based/correlationsum_boxassisted.jl b/src/corrsum_based/correlationsum_boxassisted.jl
@@ -85,8 +85,12 @@ function boxed_correlationsum(
         X, εs, r0 = maximum(εs); q = 2, P = 2, kwargs...
     )
     P ≤ size(X, 2)   || error("Prism dimension has to be ≤ than `X` dimension.")
-    r0 ≥ maximum(εs) || error("Box size `r0` has to be ≥ than `maximum(εs)`.")
     issorted(εs)     || error("Sorted `εs` required for optimized version.")
+    if r0 < maximum(εs)
+        @warn("Box size `r0` has to be ≥ than `maximum(εs)`.")
+        r0 = maximum(εs)
+    end
+
     boxes, contents = data_boxing(X, r0, P)
     Cs = if q == 2
         boxed_correlationsum_2(boxes, contents, X, εs; kwargs...)

diff --git a/src/extremes_based/confidence.jl b/src/extremes_based/confidence.jl
@@ -0,0 +1,162 @@
+export extremevaltheory_gpdfit_pvalues
+export CramerVonMises
+
+# for confidence testing
+using HypothesisTests: OneSampleADTest, ApproximateOneSampleKSTest, pvalue
+using Distributions: GeneralizedPareto, pdf
+using ComplexityMeasures
+
+"""
+    extremevaltheory_gpdfit_pvalues(X, p; kw...)
+
+Return various computed quantities that may quantify the significance of the results of
+[`extremevaltheory_dims_persistences`](@ref)`(X, p; kw...)`, terms of quantifying
+how well a Generalized Pareto Distribution (GPD) describes exceedences
+in the input data.
+
+## Keyword arguments
+
+- `show_progress, estimator` as in [`extremevaltheory_dims_persistences`](@ref)
+- `TestType = ApproximateOneSampleKSTest`: the test type to use. It can be
+  `ApproximateOneSampleKSTest, ExactOneSampleKSTest, CramerVonMises`.
+  We noticed that `OneSampleADTest` sometimes yielded nonsensical results:
+  all p-values were equal and were very small ≈ 1e-6.
+- `nbins = round(Int, length(X)*(1-p)/20)`: number of bins to use when computing
+  the histogram of the exceedances for computing the NRMSE.
+  The default value will use equally spaced
+  bins that are equal to the length of the exceedances divided by 20.
+
+## Description
+
+The function computes the exceedances ``E_i`` for each point ``x_i \\in X`` as in
+[`extremevaltheory_dims_persistences`](@ref).
+It returns 5 quantities, all being vectors of length `length(X)`:
+
+- `Es`, all exceedences, as a vector of vectors.
+- `sigmas, xis` the fitted σ, ξ to the GPD fits for each exceedance
+- `nrmses` the normalized root mean square distance of the fitted GPD
+  to the histogram of the exceedances
+- `pvalues` the pvalues of a statistical test of the appropriateness of the GPD fit
+
+The output `nrmses` quantifies the distance between the fitted GPD and the empirical
+histogram of the exceedances. It is computed as
+```math
+NRMSE = \\sqrt{\\frac{\\sum{(P_j - G_j)^2}{\\sum{(P_j - U)^2}}
+```
+where ``P_j`` the empirical (observed) probability at bin ``j``, ``G_j`` the fitted GPD
+probability at the midpoint of bin ``j``, and ``U`` same as ``G_j`` but for the uniform
+distribution. The divisor of the equation normalizes the expression, so that the error
+of the empirical distribution is normalized to the error of the empirical distribution
+with fitting it with the uniform distribution. It is expected that NRMSE < 1.
+The smaller it is, the better the data are approximated by GPD versus uniform distribution.
+
+The output `pvalues` is a vector of p-values. `pvalues[i]` corresponds to the p-value
+of the hypothesis: _"The exceedences around point `X[i]` are sampled from a GPD"_ versus
+the alternative hypothesis that they are not.
+To extract the p-values, we perform a one-sample hypothesis via HypothesisTests.jl
+to the fitted GPD.
+Very small p-values then indicate
+that the hypothesis should be rejected and the data are not well described by a GPD.
+This can be an indication that we do not have enough data, or that we choose
+too high of a quantile probability `p`, or that the data are not suitable in general.
+This p-value based method for significance has been used in [^Faranda2017],
+but it is unclear precisely how it was used.
+
+For more details on how these quantities may quantify significance, see our review paper.
+
+[^Faranda2017]:
+    Faranda et al. (2017), Dynamical proxies of North Atlantic predictability and extremes,
+    [Scientific Reports, 7](https://doi.org/10.1038/srep41278)
+"""
+function extremevaltheory_gpdfit_pvalues(X::AbstractStateSpaceSet, p;
+        estimator = :mm, show_progress = envprog(), TestType = ApproximateOneSampleKSTest,
+        nbins = max(round(Int, length(X)*(1-p)/20), 10),
+    )
+    N = length(X)
+    progress = ProgressMeter.Progress(
+        N; desc = "Extreme value theory p-values: ", enabled = show_progress
+    )
+    logdists = [zeros(eltype(X), N) for _ in 1:Threads.nthreads()]
+    pvalues = zeros(N)
+    sigmas = zeros(N)
+    xis = zeros(N)
+    nrmses = zeros(N)
+    Es = [Float64[] for _ in 1:N]
+
+    Threads.@threads for j in eachindex(X)
+        logdist = logdists[Threads.threadid()]
+        @inbounds map!(x -> -log(euclidean(x, X[j])), logdist, vec(X))
+        σ, ξ, E = extremevaltheory_local_gpd_fit(logdist, p, estimator)
+        sigmas[j] = σ
+        xis[j] = ξ
+        Es[j] = E
+        # Note that exceedances are defined with 0 as their minimum
+        gpd = GeneralizedPareto(0, σ, ξ)
+        test = TestType(E, gpd)
+        pvalues[j] = pvalue(test)
+        nrmses[j] = gpd_nrmse(E, gpd, nbins)
+        ProgressMeter.next!(progress)
+    end
+    return Es, nrmses, pvalues, sigmas, xis
+end
+
+function gpd_nrmse(E, gpd, nbins)
+    # Compute histogram of E
+    bins = range(0, nextfloat(maximum(E), 2); length = nbins)
+    binning = FixedRectangularBinning(bins)
+    allprobs = allprobabilities(ValueHistogram(binning), E)
+    width = step(bins)
+
+    # We will calcuate the GPD pdf at the midpoint of each bin
+    midpoints = bins .+ width/2
+    rmse = zero(eltype(E))
+    mmse = zero(eltype(E))
+    # value of uniform density (equal probability at each bin)
+    meandens = (1/length(allprobs))/width
+
+    for (j, prob) in enumerate(allprobs)
+        dens = prob / width # density value, to compare with pdf
+        dens_gpd = pdf(gpd, midpoints[j])
+        rmse += (dens - dens_gpd)^2
+        mmse += (dens - meandens)^2
+    end
+    nrmse = sqrt(rmse/mmse)
+    if isinf(nrmse)
+        error("inf nrmse. Allprobs; $(allprobs). E; $(E)")
+    end
+    return nrmse
+end
+
+"""
+    CramerVonMises(X, dist)
+
+A crude implementation of the Cramer Von Mises test that yields a `p` value
+for the hypothesis that the data in `X` are sampled from the distribution `dist`.
+"""
+struct CramerVonMises{E, D}
+    e::E
+    d::D
+end
+
+import HypothesisTests
+using Distributions: cdf, Normal
+
+# I got this test from:
+# https://www.youtube.com/watch?v=pCz8WlKCJq8
+
+function HypothesisTests.pvalue(test::CramerVonMises)
+    X = test.e
+    gpd = test.d
+    n = length(X)
+    xs = sort!(X)
+
+    T = 1/(12n) + sum(i -> (cdf(gpd, xs[i]) - (2i - 1)/2n)^2, 1:n)
+
+    # An approximation of the T statistic is normally distributed
+    # with std of approximately sqrt(1/45)
+    # From there the z statistic
+    zstat = T/sqrt(1/45)
+
+    p = 2*(1 - cdf(Normal(0,1), zstat))
+    return p
+end
diff --git a/src/extremes_based/extremesdim.jl b/src/extremes_based/extremesdim.jl
@@ -34,11 +34,11 @@ function extremevaltheory_dims(X, p; kw...)
 end
 
 """
-    extremevaltheory_dims_persistences(x::AbstractStateSpaceSet, p::Real; kwargs)
+    extremevaltheory_dims_persistences(x::AbstractStateSpaceSet, p::Real; kwargs...)
 
 Return the local dimensions `Δloc` and the persistences `θloc` for each point in the
 given set for quantile probability `p`, according to the estimation done via extreme value
-theory [^Lucarini2016] [^Caby2018].
+theory [^Lucarini2016].
 The computation is parallelized to available threads (`Threads.nthreads()`).
 
 See also [`extremevaltheory_gpdfit_pvalues`](@ref) for obtaining confidence on the results.
@@ -60,28 +60,29 @@ See also [`extremevaltheory_gpdfit_pvalues`](@ref) for obtaining confidence on t
 ## Description
 
 For each state space point ``\\mathbf{x}_i`` in `X` we compute
-``g_j = -\\log(||\\mathbf{x}_i - \\mathbf{x}_j|| ) \\; \\forall j = 1, \\ldots, N`` with
+``g_i = -\\log(||\\mathbf{x}_i - \\mathbf{x}_j|| ) \\; \\forall j = 1, \\ldots, N`` with
 ``||\\cdot||`` the Euclidean distance. Next, we choose an extreme quantile probability
-``p`` (e.g., 0.99) for the distribution of ``g_j``. We compute ``g_p`` as the ``p``-th
-quantile of ``g_j``. Then, we collect the exceedances of ``g_j``, defined as
-``E = \\{ g_j - g_p: g_j \\ge g_p \\}``, i.e., all values of ``g_j`` larger or equal to
-``g_p``, also shifted by ``g_p``. There are in total ``n = N(1-q)`` values in ``E``.
-According to extreme value theory, in the limit ``N \\to \\infty`` the values ``E``
+``p`` (e.g., 0.99) for the distribution of ``g_i``. We compute ``g_p`` as the ``p``-th
+quantile of ``g_i``. Then, we collect the exceedances of ``g_i``, defined as
+``E_i = \\{ g_i - g_p: g_i \\ge g_p \\}``, i.e., all values of ``g_i`` larger or equal to
+``g_p``, also shifted by ``g_p``. There are in total ``n = N(1-q)`` values in ``E_i``.
+According to extreme value theory, in the limit ``N \\to \\infty`` the values ``E_i``
 follow a two-parameter Generalized Pareto Distribution (GPD) with parameters
 ``\\sigma,\\xi`` (the third parameter ``\\mu`` of the GPD is zero due to the
 positive-definite construction of ``E``). Within this extreme value theory approach,
 the local dimension ``\\Delta^{(E)}_i`` assigned to state space point ``\\textbf{x}_i``
 is given by the inverse of the ``\\sigma`` parameter of the
-GPD fit to the data[^Faranda2011], ``\\Delta^{(E)}_i = 1/\\sigma``.
+GPD fit to the data[^Lucarini2012], ``\\Delta^{(E)}_i = 1/\\sigma``.
 ``\\sigma`` is estimated according to the `estimator` keyword.
 
 [^Lucarini2016]:
     Lucarini et al., [Extremes and Recurrence in Dynamical Systems
     ](https://www.wiley.com/en-gb/Extremes+and+Recurrence+in+Dynamical+Systems-p-9781118632192)
 
-[^Caby2018]:
-    Caby et al., [Physica D 400 132143
-    ](https://doi.org/10.1016/j.physd.2019.06.009)
+[^Lucarini2012]:
+    Lucarini et al., Universal Behaviour of Extreme Value Statistics for Selected
+    Observables of Dynamical Systems, [Journal of Statistical Physics, 147(1), 63–73.](
+    https://doi.org/10.1007/s10955-012-0468-z) et al., [Physica D 400 132143
 """
 function extremevaltheory_dims_persistences(X::AbstractStateSpaceSet, p::Real;
         show_progress = envprog(), allocate_matrix = false, kw...
@@ -205,74 +206,3 @@ function extremevaltheory_local_dim_persistence(
     Δ, θ = extremevaltheory_local_dim_persistence(logdist, p; kw...)
     return Δ, θ
 end
-
-
-# for confidence testing
-using HypothesisTests: OneSampleADTest, ApproximateOneSampleKSTest, pvalue
-using Distributions: GeneralizedPareto
-
-"""
-    extremevaltheory_gpdfit_pvalues(X, p; kw...) → pvalues, sigmas, xis
-
-Quantify significance of the results of
-[`extremevaltheory_dims_persistences`](@ref)`(X, p; kw...)` by
-quantifying how well a Generalized Pareto Distribution (GPD) describes exceedences
-in the input data using the approach described in [^Faranda2017].
-
-Return the p-values of the statistical test, and the fitted `σ, ξ` values to each
-of the GPD fits of the exceedances for each point in `X`.
-
-## Description
-
-The output `pvalues` is a vector of p-values. `pvalues[i]` corresponds to the p-value
-of the hypothesis: _"The exceedences around point `X[i]` are sampled from a GPD"_ versus
-the alternative hypothesis that they are not. Very small p-values then indicate
-that the hypothesis should be rejected and the data are not well described by a GPD.
-This can be a good indication that we do not have enough data, or that we choose
-too high of a quantile probability `p`.
-
-Alternatively, if the majority of `pvalues` are sufficiently high, e.g., higher
-than 0.05, then we have some confidence that the probability `p` and/or amount of
-data follow well the theory of FD from EVT.
-
-To extract the p-values, we perform a one-sample hypothesis via HypothesisTests.jl
-to the fitted GPD.
-
-## Keyword arguments
-
-- `show_progress, estimator` as in [`extremevaltheory_dims_persistences`](@ref)
-- `TestType = ApproximateOneSampleKSTest`: the test type to use. It can be any
-  of the [one-sample non-parametric tests from HypothesisTests.jl](https://juliastats.org/HypothesisTests.jl/stable/nonparametric/#Nonparametric-tests)
-  however we noticed that `OneSampleADTest` sometimes yielded nonsensical results:
-  all p-values were equal and were very small ≈ 1e-6.
-
-[^Faranda2017]:
-    Faranda et al. (2017), Dynamical proxies of North Atlantic predictability and extremes,
-    [Scientific Reports, 7](https://doi.org/10.1038/srep41278)
-"""
-function extremevaltheory_gpdfit_pvalues(X::AbstractStateSpaceSet, p;
-        estimator = :mm, show_progress = envprog(), TestType = ApproximateOneSampleKSTest
-    )
-    N = length(X)
-    progress = ProgressMeter.Progress(
-        N; desc = "Extreme value theory p-values: ", enabled = show_progress
-    )
-    logdists = [zeros(eltype(X), N) for _ in 1:Threads.nthreads()]
-    pvalues = zeros(N)
-    sigmas = zeros(N)
-    xis = zeros(N)
-
-    Threads.@threads for j in eachindex(X)
-        logdist = logdists[Threads.threadid()]
-        @inbounds map!(x -> -log(euclidean(x, X[j])), logdist, vec(X))
-        σ, ξ, E = extremevaltheory_local_gpd_fit(logdist, p, estimator)
-        sigmas[j] = σ
-        xis[j] = ξ
-        # Note that exceedances are defined with 0 as their minimum
-        gpd = GeneralizedPareto(0, σ, ξ)
-        test = TestType(E, gpd)
-        pvalues[j] = pvalue(test)
-        ProgressMeter.next!(progress)
-    end
-    return pvalues, sigmas, xis
-end
diff --git a/test/extremevalue.jl b/test/extremevalue.jl
@@ -59,11 +59,15 @@ end
         end
     end
 
-    @testset "pvalues" begin
-        pvalues = extremevaltheory_gpdfit_pvalues(A, 0.99)[1]
+    @testset "significance" begin
+        Es, nrmses, pvalues, sigmas, xis = extremevaltheory_gpdfit_pvalues(A, 0.99)
         @test all(p -> 0 ≤ p ≤ 1, pvalues)
         badcount = count(<(0.05), pvalues)/length(pvalues)
         @test badcount < 0.1
+        @test all(p -> 0 ≤ p ≤ 1, nrmses)
+        @test mean(nrmses) < 0.5
+        @test all(E -> all(≥(0), E), Es)
+        @test all(≥(0), sigmas)
     end
 end