Implement Generalized Pareto distribution #294

Open: wants to merge 7 commits into base: main
docs/api_reference.rst (1 addition, 0 deletions)
@@ -35,6 +35,7 @@ Distributions
GeneralizedPoisson
BetaNegativeBinomial
GenExtreme
GenPareto
R2D2M2CP
Skellam
histogram_approximation
pymc_experimental/distributions/__init__.py (7 additions, 1 deletion)
@@ -17,7 +17,12 @@
Experimental probability distributions for stochastic nodes in PyMC.
"""

from pymc_experimental.distributions.continuous import Chi, GenExtreme, Maxwell
from pymc_experimental.distributions.continuous import (
Chi,
GenExtreme,
GenPareto,
Maxwell,
)
from pymc_experimental.distributions.discrete import (
BetaNegativeBinomial,
GeneralizedPoisson,
@@ -32,6 +37,7 @@
"DiscreteMarkovChain",
"GeneralizedPoisson",
"GenExtreme",
"GenPareto",
"R2D2M2CP",
"Skellam",
"histogram_approximation",
pymc_experimental/distributions/continuous.py (155 additions, 1 deletion)
@@ -30,7 +30,7 @@
from pymc.distributions.shape_utils import rv_size_is_none
from pymc.logprob.utils import CheckParameterValue
from pymc.pytensorf import floatX
from pytensor.tensor.random.op import RandomVariable
from pytensor.tensor.random.op import RandomVariable, ScipyRandomVariable
from pytensor.tensor.variable import TensorVariable
from scipy import stats

@@ -221,6 +221,160 @@ def moment(rv, size, mu, sigma, xi):
return mode


# Generalized Pareto Distribution
class GenParetoRV(ScipyRandomVariable):
name: str = "Generalized Pareto"
ndim_supp: int = 0
ndims_params: List[int] = [0, 0, 0]
dtype: str = "floatX"
_print_name: Tuple[str, str] = ("Generalized Pareto Distribution", "\\operatorname{GenPareto}")

@classmethod
def rng_fn(
cls,
rng: Union[np.random.RandomState, np.random.Generator],
mu: np.ndarray,
sigma: np.ndarray,
xi: np.ndarray,
size: Tuple[int, ...],
) -> np.ndarray:
# using scipy's parameterization
return stats.genpareto.rvs(c=xi, loc=mu, scale=sigma, random_state=rng, size=size)


gen_pareto = GenParetoRV()
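For reference, a quick sanity check of the parameterization used in rng_fn above, where scipy's shape c corresponds to this distribution's xi (a standalone sketch, not part of the diff):

import numpy as np
from scipy import stats

rng = np.random.default_rng(42)
draws = stats.genpareto.rvs(c=0.2, loc=0.0, scale=1.0, random_state=rng, size=100_000)
print(draws.min() >= 0.0)  # True: xi >= 0 gives support [mu, inf)
print(draws.mean())        # approximately mu + sigma / (1 - xi) = 1.25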


class GenPareto(Continuous):
r"""
The Generalized Pareto Distribution

The pdf of this distribution is

.. math::

f(x \mid \mu, \sigma, \xi) = \frac{1}{\sigma} (1 + \xi z)^{-1/\xi-1}

where

.. math::

z = \frac{x - \mu}{\sigma}

and is defined on the set (when :math:`\xi \geq 0`):

.. math::

\left\{x: x \geq \mu \right\}

.. plot::

import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as st
import arviz as az
plt.style.use('arviz-darkgrid')
x = np.linspace(-2, 10, 200)
mus = [0., 0., 0., 0., 1.]
sigmas = [1., 1., 1., 2., 1.]
xis = [1., 0., 5., 1., 1.]
for mu, sigma, xi in zip(mus, sigmas, xis):
pdf = st.genpareto.pdf(x, c=xi, loc=mu, scale=sigma)
plt.plot(x, pdf, label=rf'$\mu$ = {mu}, $\sigma$ = {sigma}, $\xi$={xi}')
plt.xlabel('x', fontsize=12)
plt.ylabel('f(x)', fontsize=12)
plt.legend(loc=1)
plt.show()


======== =========================================================================
Support * :math:`x \geq \mu`, when :math:`\xi \geq 0`
Mean * :math:`\mu + \frac{\sigma}{1-\xi}`, when :math:`\xi < 1`
Variance * :math:`\frac{\sigma^2}{(1-\xi)^2 (1-2\xi)}`, when :math:`\xi < 0.5`
======== =========================================================================

Parameters
----------
mu : float
Location parameter.
sigma : float
Scale parameter (sigma > 0).
xi : float
Shape parameter (xi >= 0). Note that this is more restrictive than the usual definition of the Generalized Pareto distribution, which also allows :math:`\xi < 0`; only :math:`\xi \geq 0` is supported here, since that is the common case when modelling tails.
"""

rv_op = gen_pareto

@classmethod
def dist(cls, mu=0, sigma=1, xi=0, **kwargs):
mu = pt.as_tensor_variable(floatX(mu))
sigma = pt.as_tensor_variable(floatX(sigma))
xi = pt.as_tensor_variable(floatX(xi))

return super().dist([mu, sigma, xi], **kwargs)

def logp(value, mu, sigma, xi):
"""
Calculate log-probability of Generalized Pareto distribution
at specified value.

Parameters
----------
value: numeric
    Value(s) for which log-probability is calculated. If the log probabilities for multiple
    values are desired, the values must be provided in a numpy array or PyTensor tensor.

Returns
-------
TensorVariable
"""
Comment on lines +316 to +329

Member @ricardoV94 (Jan 4, 2024): These docstrings are incomplete, and the logp function is not really user facing, so it's better not to include anything at all.

Reply: I think the logp for the Generalized Pareto distribution should be

def logp(value, mu, sigma, xi):
    """
    Calculate log-probability of Generalized Pareto distribution
    at specified value.

    Parameters
    ----------
    value: numeric
        Value(s) for which log-probability is calculated. If the log probabilities for multiple
        values are desired the values must be provided in a numpy array or Pytensor tensor

    Returns
    -------
    TensorVariable
    """
    scaled = (value - mu) / sigma

    logp_expression = pt.switch(
        pt.isclose(xi, 0),
        -1 * scaled,
        -1 * pt.log(sigma) - ((xi + 1) / xi) * pt.log1p(xi * scaled),
    )
    logp = pt.switch(pt.gt(1 + xi * scaled, 0), logp_expression, -np.inf)
    return check_parameters(
        logp, sigma > 0, pt.and_(xi > -1, xi < 1), msg="sigma > 0 and -1 < xi < 1"
    )


scaled = (value - mu) / sigma

logp_expression = pt.switch(
    pt.eq(xi, 0),
    -pt.log(sigma) - scaled,  # xi = 0 reduces to a shifted exponential; keep the -log(sigma) Jacobian
    -pt.log(sigma) - ((xi + 1) / xi) * pt.log1p(xi * scaled),
)
logp = pt.switch(pt.ge(scaled, 0), logp_expression, -np.inf)
return check_parameters(logp, sigma > 0, xi >= 0, msg="sigma > 0 and xi >= 0")

def logcdf(value, mu, sigma, xi):
"""
Compute the log of the cumulative distribution function for Generalized Pareto
distribution at the specified value.

Parameters
----------
value: numeric or np.ndarray or `TensorVariable`
    Value(s) for which log CDF is calculated. If the log CDF for
    multiple values is desired, the values must be provided in a numpy
    array or `TensorVariable`.

Returns
-------
TensorVariable
"""
Comment on lines +342 to +356

Member: Same here.
scaled = (value - mu) / sigma
logc_expression = pt.switch(
    pt.eq(xi, 0),
    pt.log(1 - pt.exp(-1 * scaled)),
    pt.log(1 - pt.pow((1 + xi * scaled), (-1 / xi))),
)
logc = pt.switch(pt.ge(scaled, 0), logc_expression, -np.inf)

return check_parameters(logc, sigma > 0, xi >= 0, msg="sigma > 0 and xi >= 0")
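As a quick numerical cross-check of the closed-form expression above (a standalone sketch, not part of the diff):

import numpy as np
from scipy import stats

# log CDF by hand: log(1 - (1 + xi * z)^(-1/xi)) with z = (x - mu) / sigma
mu, sigma, xi, x = 0.0, 2.0, 0.5, 3.0
scaled = (x - mu) / sigma
manual = np.log(1 - (1 + xi * scaled) ** (-1 / xi))
print(np.isclose(manual, stats.genpareto.logcdf(x, c=xi, loc=mu, scale=sigma)))  # True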

def moment(rv, size, mu, sigma, xi):
    r"""
    Mean is defined when :math:`\xi < 1`
    """
    mean_expression = mu + sigma / (1 - xi)
    mean = pt.switch(xi < 1, mean_expression, np.inf)
    if not rv_size_is_none(size):
        mean = pt.full(size, mean)
    return check_parameters(mean, xi < 1, msg="xi < 1")

Member: We don't need to provide a real "moment", just anything that always has finite logp. So in this case moment = mu may be good enough?

Author: Are you suggesting that we only need to return mu instead of the true mean? Or shall I just leave it as is?

Member @ricardoV94 (Jan 4, 2024): Yes, you can just return mu and take away the check_parameters part. Just make sure you broadcast mu with the other parameters in case size is None.
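A sketch of what that suggestion could look like (an illustration of the reviewer's proposal, not code from this PR; it reuses the module's pt and rv_size_is_none imports and assumes pt.broadcast_arrays is available in pytensor):

def moment(rv, size, mu, sigma, xi):
    # Return mu broadcast against the other parameters so the moment has the
    # right shape when size is None; mu always has finite logp on this support.
    mean, *_ = pt.broadcast_arrays(mu, sigma, xi)
    if not rv_size_is_none(size):
        mean = pt.full(size, mean)
    return mean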


class Chi:
r"""
:math:`\chi` log-likelihood.
pymc_experimental/model/marginal_model.py (1 addition, 1 deletion)
@@ -84,7 +84,7 @@ class MarginalModel(Model):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.marginalized_rvs = []
self._marginalized_named_vars_to_dims = treedict()
self._marginalized_named_vars_to_dims = {}

def _delete_rv_mappings(self, rv: TensorVariable) -> None:
"""Remove all model mappings referring to rv
pymc_experimental/tests/distributions/test_continuous.py (59 additions, 1 deletion)
@@ -33,7 +33,7 @@
)

# the distributions to be tested
from pymc_experimental.distributions import Chi, GenExtreme, Maxwell
from pymc_experimental.distributions import Chi, GenExtreme, GenPareto, Maxwell


class TestGenExtremeClass:
@@ -138,6 +138,64 @@ class TestGenExtreme(BaseTestDistributionRandom):
]


class TestGenParetoClass:

Member: Missing the test for moment.

"""
Wrapper class so that tests of experimental additions can be dropped into
PyMC directly on adoption.

pm.logp(GenPareto.dist(mu=0., sigma=1., xi=5.), value=1.)
"""

def test_logp(self):
    def ref_logp(value, mu, sigma, xi):
        if xi == 0:
            return sp.expon.logpdf(value, loc=mu, scale=sigma)
        else:
            return sp.genpareto.logpdf(value, c=xi, loc=mu, scale=sigma)

Member: Scipy genpareto logpdf fails for xi = 0?

Author: Yes, I did notice a tiny bug in scipy's function when calculating the pdf of the Generalized Pareto distribution with xi == 0. Will double check and submit a PR to fix that as well.

    check_logp(
        GenPareto,
        R,
        {"mu": R, "sigma": Rplusbig, "xi": Rplusbig},
        ref_logp,
        decimal=select_by_precision(float64=6, float32=2),
        skip_paramdomain_outside_edge_test=True,
    )

Comment on lines +161 to +162 (the skip_paramdomain_outside_edge_test flag):

Member: Why are you skipping the outside edge test?

Author: Since I am bounding xi to be >= 0, I'd like to skip the outside edge test.

Member: The point of that test is to make sure the bounding is defined correctly, so you shouldn't skip it.

def test_logcdf(self):
    def ref_logc(value, mu, sigma, xi):
        if xi == 0:
            return sp.expon.logcdf((value - mu) / sigma)
        else:
            return sp.genpareto.logcdf(value, c=xi, loc=mu, scale=sigma)

    check_logcdf(
        GenPareto,
        R,
        {
            "mu": R,
            "sigma": Rplusbig,
            "xi": Rplusbig,
        },
        ref_logc,
        decimal=select_by_precision(float64=6, float32=2),
        skip_paramdomain_outside_edge_test=True,
    )
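As a brief aside on why the exponential works as the xi == 0 reference (a sketch; per the author's comment above, scipy's genpareto was reportedly buggy at exactly c = 0, so a tiny c is used here to probe the limit):

import scipy.stats as sp

# The Generalized Pareto log-pdf approaches the exponential log-pdf as xi -> 0
x = 2.0
print(sp.genpareto.logpdf(x, c=1e-8))  # approximately -2.0
print(sp.expon.logpdf(x))              # exactly -2.0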


class TestGenPareto(BaseTestDistributionRandom):
pymc_dist = GenPareto
pymc_dist_params = {"mu": 0, "sigma": 1, "xi": 1}
expected_rv_op_params = {"mu": 0, "sigma": 1, "xi": 1}
reference_dist_params = {"loc": 0, "scale": 1, "c": 1}  # c must match xi above so draws line up with the scipy reference
reference_dist = seeded_scipy_distribution_builder("genpareto")
tests_to_run = [
"check_pymc_params_match_rv_op",
"check_pymc_draws_match_reference",
"check_rv_size",
]
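For completeness, a minimal end-to-end usage sketch of the new distribution (illustrative only; the model and variable names are made up, and it assumes pymc is installed alongside pymc_experimental):

import pymc as pm
from pymc_experimental.distributions import GenPareto

with pm.Model():
    # Peaks-over-threshold style setup: exceedances above a known threshold mu
    exceedance = GenPareto("exceedance", mu=0.0, sigma=1.0, xi=0.3)
    idata = pm.sample_prior_predictive()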


class TestChiClass:
"""
Wrapper class so that tests of experimental additions can be dropped into
pymc_experimental/version.txt (1 addition, 1 deletion)
@@ -1 +1 @@
0.0.15
0.0.16