Skip to content

Commit

Permalink
Merge pull request #63 from stellema/master
Browse files Browse the repository at this point in the history
Add check_relative_fit, delete the non-stationary part of check_gev_fit and update EVA tests.
  • Loading branch information
stellema authored Mar 7, 2024
2 parents bda7e0c + f64cf1e commit d2c03f9
Show file tree
Hide file tree
Showing 2 changed files with 125 additions and 100 deletions.
184 changes: 110 additions & 74 deletions unseen/eva.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
"""Extreme value analysis functions."""

import warnings

from matplotlib.dates import date2num
import numpy as np
from scipy.optimize import minimize
from scipy.stats import genextreme, goodness_of_fit
import warnings
from xarray import apply_ufunc


Expand Down Expand Up @@ -55,8 +54,10 @@ def fit_gev(
core_dim="time",
stationary=True,
check_fit=True,
check_relative_fit=None,
alpha=0.05,
generate_estimates=False,
method="Nelder-Mead",
):
"""Estimate parameters for stationary or nonstationary data distributions.
Expand All @@ -68,18 +69,23 @@ def fit_gev(
Initial estimates of the shape, loc and scale parameters
loc1, scale1 : float, default 0
Initial estimates of the location and scale trend parameters
covariate : array-like or string, default "time" (required if 'stationary=False')
A non-stationary covariate array or coordinate name.
covariate : array-like or string, default "time"
A non-stationary covariate array or coordinate name (non-stationary only).
core_dim : str, default "time"
Name of time/sample dimension in 'data'
stationary : bool, default True
Fit as a stationary GEV using 'scipy.stats.genextremes.fit'
Fit as a stationary GEV using "scipy.stats.genextremes.fit"
check_fit : bool, default False
Test goodness of fit and attempt retry
check_relative_fit : (None, "aic", "bic", "lrt"), default "aic".
Method to test relative fit of stationary and non-stationary models.
The trend paramters are set to zero if the stationary fit is better (non-stationary only)
alpha : float, default 0.05
Goodness of fit p-value threshold
generate_estimates : bool, default False
Generate initial parameter guesses using a data subset
method : str, default "Nelder-Mead"
Optimisation method for non-stationary fit (non-stationary only)
Returns
-------
Expand All @@ -94,16 +100,12 @@ def fit_gev(
parameters are estimated using a penalised negative log-likelihood
function with initial estimates based on the stationary fit.
- The distribution fit is considered good if the p-value is above
the 99th percent level (i.e., accept the null hypothesis).
alpha (i.e., accept the null hypothesis).
- If the parameters fail the goodness of fit test, it will attempt
to fit the data again by generating an initial guess - if that
to fit the data again by generating an initial guess, if that
hasn't already been tried.
- If data is a stacked forecast ensemble, the covariate will need to be
stacked in the same way.
Todo
----
- Test if data is trended before nonstationary fit
"""
kwargs = locals() # Function inputs

Expand Down Expand Up @@ -137,7 +139,7 @@ def penalised_sum(x):

return total + penalty

def nllf(theta, data, covariate):
def nllf(theta, data, covariate=None):
"""Penalised negative log-likelihood function.
Parameters
Expand Down Expand Up @@ -210,55 +212,73 @@ def _fit(
core_dim,
stationary,
check_fit,
check_relative_fit,
alpha,
generate_estimates,
method,
):
"""Estimate distribution parameters."""
# Use genextremes to get stationary distribution parameters.
shape, loc, scale = fit_stationary_gev(data, user_estimates, generate_estimates)
# Use genextremes to get stationary distribution parameters
theta = fit_stationary_gev(data, user_estimates, generate_estimates)

if stationary:
theta = shape, loc, scale
else:
# Initial parameter guesses.
if not stationary:
# Use genextremes fit as initial guesses (scipy.stats reverses sign of shape)
shape, loc, scale = theta
theta_i = -shape, loc, loc1, scale, scale1

# Optimisation bounds (scale parameter must be non-negative).
# Optimisation bounds (scale parameter must be non-negative)
bounds = [(None, None)] * 5
bounds[3] = (0, None)

# Minimise the negative log-likelihood function to get optimal theta.
# Minimise the negative log-likelihood function to get optimal theta
res = minimize(
nllf,
theta_i,
args=(data, covariate),
method="Nelder-Mead",
method=method,
bounds=bounds,
)
theta = res.x

# Restore 'shape' sign for consistency with scipy.stats results.
if check_relative_fit is not None:
# Test relative fit of stationary and non-stationary models
# Negative log likelihood using genextreme parameters
ll_stationary = nllf([-shape, loc, scale], data)
ll_nonstationary = res.fun

result = check_gev_relative_fit(
data,
theta,
[ll_stationary, ll_nonstationary],
test=check_relative_fit,
)
if result is False:
warnings.warn(
f"{check_relative_fit} test failed. Returning stationary parameters."
)
# Return genextremes parameters with no trend
theta = [shape, loc, 0, scale, 0]

# Reverse shape sign for consistency with scipy.stats results
theta[0] *= -1

theta = np.array([i for i in theta], dtype="float64")

if check_fit:
pvalue = check_gev_fit(
data, theta, covariate=kwargs["covariate"], core_dim=kwargs["core_dim"]
)
if check_fit and stationary:
pvalue = check_gev_fit(data, theta, core_dim=kwargs["core_dim"])

# Accept the null distribution of the AD test.
success = True if np.all(pvalue >= alpha) else False
if not success:
# Accept null distribution of the Anderson-darling test (same distribution)
if np.all(pvalue < alpha):
if not kwargs["generate_estimates"]:
warnings.warn(
"Data fit failed. Retrying with 'generate_estimates=True'."
)
kwargs["generate_estimates"] = True # Also breaks loop
theta = fit(data, **kwargs)
else:
# Return NaNs
theta = theta * np.nan
warnings.warn("Data fit failed.")
shape, loc, scale
return theta

def fit(data, **kwargs):
Expand All @@ -284,13 +304,25 @@ def fit(data, **kwargs):

data = kwargs.pop("data")

if not stationary and isinstance(covariate, str):
covariate = data[covariate]
# Format or generate covariate
if not stationary:
if isinstance(covariate, str):
# Select coordinate in data
covariate = data[covariate]
elif covariate is None:
# Guess covariate
if core_dim in data:
covariate = data[core_dim]
else:
covariate = np.arange(data.shape[0])

if covariate.dtype.kind not in set("buifc"):
# Convert dates to numbers
covariate = date2num(covariate)
kwargs["covariate"] = covariate

kwargs["covariate"] = covariate # Update kw dict

# Fit data to distribution parameters
theta = fit(data, **kwargs)

# Return a tuple of scalars instead of a 1D data array
Expand All @@ -299,7 +331,7 @@ def fit(data, **kwargs):
return theta


def check_gev_fit(data, theta, covariate=None, core_dim="time"):
def check_gev_fit(data, theta, core_dim="time"):
"""Test stationary distribution goodness of fit.
Parameters
Expand All @@ -308,57 +340,23 @@ def check_gev_fit(data, theta, covariate=None, core_dim="time"):
Data and covariate to fit.
theta : tuple of floats
Shape, location and scale parameters.
covariate : array-like, optional
Non-stationary covariate.
core_dim : str, default is "time"
Data dimension to test over.
Returns
-------
pvalue : scipy.stats._fit.GoodnessOfFitResult.pvalue
Goodness of fit pvalue.
Notes
-----
- The nonstationary parameters vary with the data distribution as a function
of the covariate, so we test a subset of data at a specific point.
- For nonstationary distributions, the parameters vary as a function of the
covariate the stationary fit is only compared to a subset of data near
the middle of the timeseries.
Todo
----
- Add goodness of fit and trend significance test for nonstationary parameters
"""

def _goodness_of_fit(data, theta, covariate=None):
assert len(theta) in [3, 5]
if len(theta) == 3:
# Stationary parameters
shape, loc, scale = theta
test_data = data

elif len(theta) == 5:
# Non-stationary parameters (test middle 20% of data).

n = data.size
# Index of covariate (i.e., middle timestep in a timeseries)
if covariate is None:
t = n // 2
else:
t = int(np.median(np.unique(covariate)))
dt = n // 10 # Subset 2*10% of data.

# Subset data around midpoint.
test_data = data[t - dt : t + dt]

shape, loc, loc1, scale, scale1 = theta
loc = loc + loc1 * t
scale = scale + scale1 * t
def _goodness_of_fit(data, theta):
"""Test goodness of fit."""
# Stationary parameters
shape, loc, scale = theta

res = goodness_of_fit(
genextreme,
test_data,
data,
known_params=dict(c=shape, loc=loc, scale=scale),
)
return res.pvalue
Expand All @@ -370,12 +368,50 @@ def _goodness_of_fit(data, theta, covariate=None):
input_core_dims=[[core_dim], ["theta"]],
vectorize=True,
dask="parallelized",
kwargs=dict(covariate=covariate),
dask_gufunc_kwargs=dict(meta=(np.ndarray(1, float),)),
)
return pvalue


def check_gev_relative_fit(data, nll, test):
"""Test relative fit of stationary and non-stationary distribution parameters.
https://doi.org/10.1016/j.jhydrol.2017.02.005
Parameters
----------
data : array-like
Data to fit.
ll : list
Negative log-likelihood of the stationary and non-stationary models.
test : {'aic', 'bic', 'lrt'}
Method to test relative fit of stationary and non-stationary models
Returns
-------
result : bool
True if the non-stationary model is better
"""
result = False
if test == "lrt":
# Calculate the likelihood ratio test statistic (only valid for nested models)
d = -2 * (nll[1] - nll[0])
if d > 1:
result = True

elif test == "aic":
# Calculate the Alkaike Information Criterion (AIC) for each model
aic = [(2 * k) + (2 * ll) for ll, k in zip(nll, [3, 5])]
if aic[0] > aic[1]:
result = True

elif test == "bic":
# Calculate the Bayesian Information Criterion (BIC)
bic = [k * np.log(len(data)) + (2 * ll) for ll, k in zip(nll, [3, 5])]
if bic[0] > bic[1]:
result = True
return result


def return_period(data, event, params=None, **kwargs):
"""Get return period for given event by fitting a GEV."""

Expand Down
Loading

0 comments on commit d2c03f9

Please sign in to comment.