Skip to content

Commit

Permalink
Merge pull request #56 from AusClimateService/non_stationary_gev
Browse files Browse the repository at this point in the history
Add option to fit data to a time varying GEV (#52)
  • Loading branch information
DamienIrving authored Oct 16, 2023
2 parents 8ac42f6 + 28add95 commit 44f8a62
Show file tree
Hide file tree
Showing 4 changed files with 198 additions and 72 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,4 @@ repos:
# these are errors that will be ignored by flake8
# check out their meaning here
# https://flake8.pycqa.org/en/latest/user/error-codes.html
- "--ignore=E203,C901"
- "--ignore=E203,C901,W503"
197 changes: 159 additions & 38 deletions unseen/general_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,11 @@
import re

import numpy as np
from scipy.optimize import minimize
from scipy.stats import genextreme, rv_continuous
from scipy.stats._constants import _LOGXMAX
from scipy.stats._distn_infrastructure import _sum_finite
import xclim
from scipy.stats import genextreme as gev


class store_dict(argparse.Action):
Expand Down Expand Up @@ -138,45 +141,161 @@ def event_in_context(data, threshold, direction):
return n_events, n_population, return_period, percentile


def fit_gev(data, user_estimates=[], generate_estimates=False):
"""Fit a GEV by providing fit and scale estimates.
class ns_genextreme_gen(rv_continuous):
"""Extreme value distributions (stationary or non-stationary)."""

Parameters
----------
data : numpy ndarray
user_estimates : list, optional
Estimate of the location and scale parameters
generate_estimates : bool, default False
Fit GEV to data subset first to estimate parameters (useful for large datasets)
def fit_stationary(self, data, user_estimates, generate_estimates):
"""Return estimates of shape, location and scale parameters using genextreme."""
if user_estimates:
shape, loc, scale = user_estimates
shape, loc, scale = genextreme.fit(data, shape, loc=loc, scale=scale)

Returns
-------
shape : float
Shape parameter
loc : float
Location parameter
scale : float
Scale parameter
"""
elif generate_estimates:
# Generate initial estimates using a data subset (useful for large datasets).
shape, loc, scale = genextreme.fit(data[::2])
shape, loc, scale = genextreme.fit(data, shape, loc=loc, scale=scale)
else:
shape, loc, scale = genextreme.fit(data)
return shape, loc, scale

def nllf(self, theta, data, times):
"""Penalised negative log-likelihood of GEV probability density function.
A modified version of scipy.stats.genextremes.fit for fitting extreme value
distribution parameters, in which the location and scale parameters can vary
linearly with a covariate. The non-stationary parameters will be returned only
if the input 'theta' incudes the time-varying location and scale parameters.
A large, finite penalty (rather than infinite negative log-likelihood)
is applied for observations beyond the support of the distribution.
Parameters
----------
theta : tuple of floats
Shape, location and scale parameters (shape, loc0, loc1, scale0, scale1).
data, times : array_like
Data time series and indexes of covariates.
Returns
-------
total : float
The sum of the penalised negative likelihood function.
"""
if len(theta) == 5:
# Non-stationary GEV parameters.
shape, loc0, loc1, scale0, scale1 = theta
loc = loc0 + loc1 * times
scale = scale0 + scale1 * times

if user_estimates:
loc_estimate, scale_estimate = user_estimates
shape, loc, scale = gev.fit(data, loc=loc_estimate, scale=scale_estimate)
elif generate_estimates:
shape_estimate, loc_estimate, scale_estimate = gev.fit(data[::2])
shape, loc, scale = gev.fit(data, loc=loc_estimate, scale=scale_estimate)
else:
shape, loc, scale = gev.fit(data)
else:
# Stationary GEV parameters.
shape, loc, scale = theta

return shape, loc, scale
s = (data - loc) / scale

# Calculate the NLLF (type 1 or types 2-3 extreme value distributions).
if shape == 0:
f = np.log(scale) + s + np.exp(-s)

def return_period(data, event):
"""Get return period for given event by fitting a GEV"""
else:
Z = 1 + shape * s
# NLLF at points where the data is supported by the distribution parameters.
# (N.B. the NLLF is not finite when the shape is nonzero and Z is negative
# because the PDF is zero (log(0)=inf) outside of these bounds).
f = np.where(
Z > 0,
np.log(scale)
+ (1 + 1 / shape) * np.ma.log(Z)
+ np.ma.power(Z, -1 / shape),
np.inf,
)

f = np.where(scale > 0, f, np.inf) # Scale parameter must be positive.

# Sum function along all axes (where finite) & count infinite elements.
total, n_bad = _sum_finite(f)

# Add large finite penalty instead of infinity (log of the largest useable float).
total = total + n_bad * _LOGXMAX * 100
return total

def fit(
self,
data,
user_estimates=[],
loc1=0,
scale1=0,
generate_estimates=False,
stationary=True,
method="Nelder-Mead",
):
"""Return estimates of data distribution parameters and their trend (if applicable).
For stationary data, estimates the shape, location and scale parameters using
scipy.stats.genextremes.fit(). For non-stationary data, also estimates the linear
location and scale trend parameters using a penalised negative log-likelihood
function with initial estimates based on the stationary fit.
Parameters
----------
data : array_like
Data timeseries.
user estimates: list, optional
Initial estimates of the shape, loc and scale parameters.
loc1, scale1 : float, optional
Initial estimates of the location and scale trend parameters. Defaults to 0.
stationary : bool, optional
Fit as a stationary GEV using scipy.stats.genextremes.fit. Defaults to True.
method : str, optional
Method used for scipy.optimize.minimize. Defaults to 'Nelder-Mead'.
Returns
-------
theta : tuple of floats
Shape, location and scale parameters (and loc1 and scale1 if applicable)
Example
-------
'''
ns_genextreme = ns_genextreme_gen()
data = scipy.stats.genextreme.rvs(0.8, loc=3.2, scale=0.5, size=500, random_state=0)
shape, loc, scale = ns_genextreme.fit(data, stationary=True)
shape, loc, loc1, scale, scale1 = ns_genextreme.fit(data, stationary=False)
'''
"""

# Use genextremes to get stationary distribution parameters.
shape, loc, scale = self.fit_stationary(
data, user_estimates, generate_estimates
)

if stationary:
theta = shape, loc, scale
else:
times = np.arange(data.shape[-1], dtype=int)
theta_i = shape, loc, loc1, scale, scale1

# Optimisation bounds (scale parameter must be non-negative).
bounds = [(None, None), (None, None), (None, None), (0, None), (None, None)]

# Minimise the negative log-likelihood function to get optimal theta.
res = minimize(
self.nllf, theta_i, args=(data, times), method=method, bounds=bounds
)
theta = res.x

return theta

shape, loc, scale = fit_gev(data, generate_estimates=True)
probability = gev.sf(event, shape, loc=loc, scale=scale)
return_period = 1.0 / probability

fit_gev = ns_genextreme_gen().fit


def return_period(data, event, **kwargs):
"""Get return period for given event by fitting a GEV."""

shape, loc, scale = fit_gev(data, **kwargs)
return_period = genextreme.isf(
event, shape, loc=loc, scale=scale
) # 1.0 / probability

return return_period

Expand Down Expand Up @@ -206,25 +325,27 @@ def gev_return_curve(

curve_return_periods = np.logspace(0, max_return_period, num=10000)
curve_probabilities = 1.0 / curve_return_periods
curve_values = gev.isf(curve_probabilities, shape, loc, scale)
curve_values = genextreme.isf(curve_probabilities, shape, loc, scale)

event_probability = gev.sf(event_value, shape, loc=loc, scale=scale)
event_probability = genextreme.sf(event_value, shape, loc=loc, scale=scale)
event_return_period = 1.0 / event_probability

# Bootstrapping for confidence interval
boot_values = curve_values
boot_event_return_periods = []
for i in range(n_bootstraps):
if bootstrap_method == "parametric":
boot_data = gev.rvs(shape, loc=loc, scale=scale, size=len(data))
boot_data = genextreme.rvs(shape, loc=loc, scale=scale, size=len(data))
elif bootstrap_method == "non-parametric":
boot_data = np.random.choice(data, size=data.shape, replace=True)
boot_shape, boot_loc, boot_scale = fit_gev(boot_data, generate_estimates=True)

boot_value = gev.isf(curve_probabilities, boot_shape, boot_loc, boot_scale)
boot_value = genextreme.isf(
curve_probabilities, boot_shape, boot_loc, boot_scale
)
boot_values = np.vstack((boot_values, boot_value))

boot_event_probability = gev.sf(
boot_event_probability = genextreme.sf(
event_value, boot_shape, loc=boot_loc, scale=boot_scale
)
boot_event_return_period = 1.0 / boot_event_probability
Expand Down
6 changes: 2 additions & 4 deletions unseen/moments.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,17 +23,15 @@ def calc_ci(data):
return lower_ci, upper_ci


def calc_moments(sample_da, gev_estimates=[]):
def calc_moments(sample_da, **kwargs):
"""Calculate all the moments for a given sample."""

moments = {}
moments["mean"] = float(np.mean(sample_da))
moments["standard deviation"] = float(np.std(sample_da))
moments["skew"] = float(scipy.stats.skew(sample_da))
moments["kurtosis"] = float(scipy.stats.kurtosis(sample_da))
gev_shape, gev_loc, gev_scale = general_utils.fit_gev(
sample_da, user_estimates=gev_estimates
)
gev_shape, gev_loc, gev_scale = general_utils.fit_gev(sample_da, **kwargs)
moments["GEV shape"] = gev_shape
moments["GEV location"] = gev_loc
moments["GEV scale"] = gev_scale
Expand Down
Loading

0 comments on commit 44f8a62

Please sign in to comment.