Merge pull request #50 from XENONnT/first_runner
First runner manipulating statistical model
dachengx authored Aug 4, 2023
2 parents 7587f3c + f35e0c7 commit 66538e5
Showing 17 changed files with 575 additions and 90 deletions.
2 changes: 2 additions & 0 deletions alea/__init__.py
@@ -4,6 +4,8 @@

from .models import *

from .runner import *

from .utils import *

from .parameters import *
17 changes: 9 additions & 8 deletions alea/examples/configs/unbinned_wimp_running.yaml
@@ -1,3 +1,4 @@
statistical_model: alea.models.BlueiceExtendedModel
statistical_model_config: unbinned_wimp_statistical_model.yaml

poi: wimp_rate_multiplier
@@ -12,8 +13,8 @@ computation:
}
parameters_in_common:
{
hypotheses: ["true", "null", "free"],
output_filename: "toymc_power_wimp_mass_{wimp_mass:d}_poi_expectation_{poi_expectation:.2f}.hdf5",
hypotheses: ["free", "null", "true"],
output_filename: "toymc_power_wimp_mass_{wimp_mass:d}_poi_expectation_{poi_expectation:.2f}.h5",
n_mc: 5000,
n_batch: 40,
}
@@ -24,12 +25,12 @@ computation:
parameters_to_vary: {wimp_mass: [10, 50, 200]}
parameters_in_common:
{
hypotheses: ["true", "null", "free"],
output_filename: "toymc_power_wimp_mass_{wimp_mass:d}_poi_expectation_{poi_expectation:.2f}.hdf5",
hypotheses: ["free", "null", "true"],
output_filename: "toymc_power_wimp_mass_{wimp_mass:d}_poi_expectation_{poi_expectation:.2f}.h5",
n_mc: 5000,
n_batch: 40,
}
limit_threshold: "thresholds.hdf5"
limit_threshold: "thresholds.h5"
toydata_mode: "generate"
parameters_as_wildcards: ["poi_expectation", "n_mc", "n_batch"]

@@ -38,12 +39,12 @@ computation:
parameters_to_vary: {poi_expectation: [0.], wimp_mass: [10, 50, 200]}
parameters_in_common:
{
hypotheses: ["true", "null", "free"],
output_filename: "toymc_power_wimp_mass_{wimp_mass:d}_poi_expectation_{poi_expectation:.2f}.hdf5",
hypotheses: ["free", "null", "true"],
output_filename: "toymc_power_wimp_mass_{wimp_mass:d}_poi_expectation_{poi_expectation:.2f}.h5",
n_mc: 5000,
n_batch: 40,
compute_confidence_interval: True,
limit_threshold: "thresholds.hdf5",
limit_threshold: "thresholds.h5",
}
toydata_mode: "generate"

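The new statistical_model and statistical_model_config keys let the runner build its model directly from this file. A minimal sketch of how such an entry can be resolved into a class, using only PyYAML and importlib (the actual logic lives in the new alea/runner.py and may differ):

    import importlib
    import yaml

    # Load the runner configuration (path is illustrative).
    with open("unbinned_wimp_running.yaml") as f:
        config = yaml.safe_load(f)

    # Split "alea.models.BlueiceExtendedModel" into module and class parts.
    module_name, class_name = config["statistical_model"].rsplit(".", 1)
    model_class = getattr(importlib.import_module(module_name), class_name)

    # The model itself is then built from the referenced model config file.
    print(model_class.__name__, config["statistical_model_config"])
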
5 changes: 4 additions & 1 deletion alea/examples/configs/unbinned_wimp_statistical_model.yaml
@@ -21,6 +21,9 @@ parameter_definition:
fit_limits:
- 0
- null
parameter_interval_bounds:
- 0
- null

er_rate_multiplier:
nominal_value: 1.0
@@ -48,7 +51,7 @@ parameter_definition:
er_band_shift:
nominal_value: 0
ptype: shape
uncertainty: null # scipy.stats.uniform(loc=-1, scale=2)
uncertainty: null # 'scipy.stats.uniform(loc=-1, scale=2)'
# relative_uncertainty: false
fittable: true
blueice_anchors:
82 changes: 59 additions & 23 deletions alea/model.py
@@ -1,5 +1,6 @@
import inspect
import warnings
from copy import deepcopy
from typing import Dict, List, Tuple, Callable, Optional

import numpy as np
@@ -10,6 +11,7 @@
from inference_interface import toydata_to_file

from alea.parameters import Parameters
from alea.utils import within_limits, clip_limits
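
within_limits and clip_limits are new helpers whose implementations are not part of this diff. A presumed sketch of their semantics, inferred from how they are used below (an assumption, not alea's actual code):

    def clip_limits(value):
        # Presumed behavior: map None (YAML null) to -/+ infinity so that
        # downstream comparisons always see numeric bounds.
        low, high = value
        low = float("-inf") if low is None else low
        high = float("inf") if high is None else high
        return low, high

    def within_limits(value, limits):
        # Presumed behavior: check that value lies inside the clipped limits.
        low, high = clip_limits(limits)
        return low <= value <= high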


class StatisticalModel:
@@ -71,6 +73,7 @@ def __init__(
confidence_level: float = 0.9,
confidence_interval_kind: str = "central", # one of central, upper, lower
confidence_interval_threshold: Callable[[float], float] = None,
**kwargs,
):
"""Initialize a statistical model"""
if type(self) == StatisticalModel:
@@ -183,7 +186,8 @@ def store_data(
data_name_list: Optional[List] = None,
metadata: Optional[Dict] = None):
"""
Store a list of datasets (each on the form of a list of one or more structured arrays)
Store a list of datasets.
(each on the form of a list of one or more structured arrays or dicts)
Using inference_interface, but included here to allow over-writing.
The structure would be: [[datasets1], [datasets2], ..., [datasetsn]],
where each of datasets is a list of structured arrays.
@@ -198,14 +202,26 @@ def store_data(
metadata (dict, optional (default=None)): metadata to store with the data.
If None, no metadata is stored.
"""
if all([isinstance(d, dict) for d in data_list]):
_data_list = [list(d.values()) for d in data_list]
elif all([isinstance(d, list) for d in data_list]):
_data_list = data_list
else:
raise ValueError(
'Unsupported mixed toydata format! '
'toydata should be a list of dict or a list of list',)

if data_name_list is None:
if hasattr(self, "likelihood_names"):
data_name_list = self.likelihood_names
else:
data_name_list = ["{:d}".format(i) for i in range(len(data_list[0]))]
data_name_list = ["{:d}".format(i) for i in range(len(_data_list[0]))]

kw = {'metadata': metadata} if metadata is not None else dict()
toydata_to_file(file_name, data_list, data_name_list, **kw)
if len(_data_list[0]) != len(data_name_list):
raise ValueError(
"The number of data sets and data names must be the same")
toydata_to_file(file_name, _data_list, data_name_list, **kw)
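
A self-contained sketch of the normalization above, with plain lists standing in for structured arrays (dataset names are hypothetical):

    # Two toy datasets, each given as a dict of likelihood-term name -> data.
    data_list = [
        {"science_run": [1, 2], "ancillary_likelihood": [3]},
        {"science_run": [4, 5], "ancillary_likelihood": [6]},
    ]
    if all(isinstance(d, dict) for d in data_list):
        # dicts are flattened to lists, keeping their insertion order
        _data_list = [list(d.values()) for d in data_list]
    elif all(isinstance(d, list) for d in data_list):
        _data_list = data_list
    else:
        raise ValueError("Unsupported mixed toydata format!")
    assert _data_list[0] == [[1, 2], [3]]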

def get_expectation_values(self, **parameter_values):
"""
@@ -257,6 +273,7 @@ def cost(args):
call_kwargs = {}
for i, k in enumerate(self.parameters.names):
call_kwargs[k] = args[i]
# for optimization, we want to minimize the negative log-likelihood
return self.ll(**call_kwargs) * -1

return cost
@@ -338,10 +355,10 @@ def _confidence_interval_checks(

if parameter_interval_bounds is None:
parameter_interval_bounds = parameter_of_interest.parameter_interval_bounds
if parameter_interval_bounds is None:
raise ValueError(
"You must set parameter_interval_bounds in the parameter config"
" or when calling confidence_interval")
else:
value = parameter_interval_bounds
parameter_of_interest._check_parameter_interval_bounds(value)
parameter_interval_bounds = clip_limits(value)

if parameter_of_interest.ptype == "rate":
try:
@@ -350,6 +367,7 @@
else:
source_name = poi_name
mu_parameter = self.get_expectation_values(**kwargs)[source_name]
# update parameter_interval_bounds because poi is rate_multiplier
parameter_interval_bounds = (
parameter_interval_bounds[0] / mu_parameter,
parameter_interval_bounds[1] / mu_parameter)
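
In other words, bounds specified in expected events are translated into multiplier space before the scan. A worked example with illustrative numbers:

    # Search from 0 to 50 expected signal events, with a nominal expectation
    # of 10 events at rate_multiplier = 1.
    parameter_interval_bounds = (0.0, 50.0)  # in events
    mu_parameter = 10.0                      # events per unit rate multiplier
    parameter_interval_bounds = (
        parameter_interval_bounds[0] / mu_parameter,
        parameter_interval_bounds[1] / mu_parameter)
    # -> (0.0, 5.0): the profile-likelihood scan runs over the multiplier
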
@@ -378,7 +396,9 @@ def confidence_interval(
parameter_interval_bounds: Tuple[float, float] = None,
confidence_level: float = None,
confidence_interval_kind: str = None,
**kwargs) -> Tuple[float, float]:
best_fit_args: dict = None,
confidence_interval_args: dict = None,
) -> Tuple[float, float]:
"""
Uses self.fit to compute confidence intervals for a certain named parameter.
If the parameter is a rate parameter, and the model has expectation values implemented,
@@ -401,44 +421,60 @@
If None, the default kind of the model is used.
Keyword Args:
kwargs: the parameters for get_expectation_values and fit
best_fit_args: the parameters to **only** get the global best-fit likelihood
confidence_interval_args: the parameters to get the profile-likelihood,
also the best-fit parameters of profile-likelihood,
parameter_interval_bounds, and confidence interval
"""
if best_fit_args is None:
best_fit_args = {}
if confidence_interval_args is None:
confidence_interval_args = {}
ci_objects = self._confidence_interval_checks(
poi_name,
parameter_interval_bounds,
confidence_level,
confidence_interval_kind,
**kwargs)
**confidence_interval_args)
confidence_interval_kind, confidence_interval_threshold, parameter_interval_bounds = ci_objects

# find best-fit:
best_result, best_ll = self.fit(**kwargs)
# best_fit_args only provides the best-fit likelihood
_, best_ll = self.fit(**best_fit_args)
# the optimization of profile-likelihood under
# confidence_interval_args provides the best_parameter
best_result, _ = self.fit(**confidence_interval_args)
best_parameter = best_result[poi_name]
mask = (parameter_interval_bounds[0] < best_parameter)
mask &= (best_parameter < parameter_interval_bounds[1])
assert mask, ("the best-fit is outside your confidence interval"
" search limits in parameter_interval_bounds")
# log-likelihood - critical value:
mask = within_limits(best_parameter, parameter_interval_bounds)
if not mask:
raise ValueError(
f"The best-fit {best_parameter} is outside your confidence interval "
f"search limits in parameter_interval_bounds {parameter_interval_bounds}.")

# define intersection between likelihood ratio curve and the critical curve:
def t(hypothesis):
def t(hypothesis_value):
# define the intersection
# between the profile-log-likelihood curve and the rejection threshold
kwargs[poi_name] = hypothesis
_, ll = self.fit(**kwargs) # ll is + log-likelihood here
_confidence_interval_args = deepcopy(confidence_interval_args)
_confidence_interval_args[poi_name] = hypothesis_value
_, ll = self.fit(**_confidence_interval_args) # ll is + log-likelihood here
ret = 2. * (best_ll - ll) # likelihood curve "right way up" (smiling)
# if positive, hypothesis is excluded
return ret - confidence_interval_threshold(hypothesis)
return ret - confidence_interval_threshold(hypothesis_value)

t_best_parameter = t(best_parameter)

if t_best_parameter > 0:
warnings.warn(f"CL calculation failed, given fixed parameters {confidence_interval_args}.")

if confidence_interval_kind in {"upper", "central"}:
if confidence_interval_kind in {"upper", "central"} and t_best_parameter < 0:
if t(parameter_interval_bounds[1]) > 0:
ul = brentq(t, best_parameter, parameter_interval_bounds[1])
else:
ul = np.inf
else:
ul = np.nan

if confidence_interval_kind in {"lower", "central"}:
if confidence_interval_kind in {"lower", "central"} and t_best_parameter < 0:
if t(parameter_interval_bounds[0]) > 0:
dl = brentq(t, parameter_interval_bounds[0], best_parameter)
else:
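
A hedged usage sketch of the new best_fit_args/confidence_interval_args split (model, parameter names, and values are illustrative, and the return order is assumed to be (lower, upper)):

    # model is an instance of a StatisticalModel subclass with data already set.
    lower, upper = model.confidence_interval(
        poi_name="wimp_rate_multiplier",
        best_fit_args={},                            # global best fit: all parameters free
        confidence_interval_args={"wimp_mass": 50},  # profile likelihood at fixed wimp_mass
    )
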
44 changes: 34 additions & 10 deletions alea/models/blueice_extended_model.py
@@ -38,9 +38,14 @@ class BlueiceExtendedModel(StatisticalModel):
(assert that all sources have the same analysis_space)
"""

def __init__(self, parameter_definition: dict, likelihood_config: dict):
"""Initializes the statistical model."""
super().__init__(parameter_definition=parameter_definition)
def __init__(self, parameter_definition: dict, likelihood_config: dict, **kwargs):
"""Initializes the statistical model.
Args:
parameter_definition (dict): A dictionary defining the model parameters.
likelihood_config (dict): A dictionary defining the likelihood.
"""
super().__init__(parameter_definition=parameter_definition, **kwargs)
self._likelihood = self._build_ll_from_config(likelihood_config)
self.likelihood_names = [t["name"] for t in likelihood_config["likelihood_terms"]]
self.likelihood_names.append("ancillary_likelihood")
@@ -66,14 +71,23 @@ def data(self) -> dict:
return super().data

@data.setter
def data(self, data: dict):
def data(self, data: dict or list):
"""
Overrides default setter. Will also set the data of the blueice ll.
Data-sets are expected to be in the form of a list of one
or more structured arrays representing the data-sets of one or more likelihood terms.
Args:
data (dict or list): Data of the statistical model.
If data is a list, it must be a list of length len(self.likelihood_names) + 1.
"""
# iterate through all likelihood terms and set the science data in the blueice ll
# except the generate_values
# last entry in data are the generate_values
if isinstance(data, list):
if len(data) != len(self.likelihood_names) + 1:
raise ValueError(
f"Data must be a list of length {len(self.likelihood_names) + 1}")
data = dict(zip(self.likelihood_names + ["generate_values"], data))
for i, (dataset_name, d) in enumerate(data.items()):
if dataset_name != "generate_values":
ll_term = self._likelihood.likelihood_list[i]
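
With this change the data property accepts either form; a hedged sketch (assuming a BlueiceExtendedModel instance named model):

    toydata = model.generate_data()  # dict keyed by likelihood term, plus "generate_values"
    model.data = toydata             # dict form

    # Equivalent list form: one entry per likelihood term with generate_values
    # last, i.e. a list of length len(model.likelihood_names) + 1.
    model.data = list(toydata.values())
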
@@ -159,7 +173,7 @@ def _build_ll_from_config(self, likelihood_config: dict) -> "LogLikelihoodSum":
for i, source in enumerate(config["sources"]):
parameters_to_ignore: List[str] = [
p.name for p in self.parameters if (
p.ptype == "shape") & (p.name not in source["parameters"])]
p.ptype == "shape") and (p.name not in source["parameters"])]
# no efficiency affects PDF:
parameters_to_ignore += [p.name for p in self.parameters if (p.ptype == "efficiency")]
parameters_to_ignore += source.get("extra_dont_hash_settings", [])
@@ -179,6 +193,7 @@ def _build_ll_from_config(self, likelihood_config: dict) -> "LogLikelihoodSum":
rate_parameter = rate_parameters[0]
if rate_parameter.endswith("_rate_multiplier"):
rate_parameter = rate_parameter.replace("_rate_multiplier", "")
# The ancillary term is handled in CustomAncillaryLikelihood
ll.add_rate_parameter(rate_parameter, log_prior=None)
else:
raise NotImplementedError(
@@ -196,6 +211,7 @@ def _build_ll_from_config(self, likelihood_config: dict) -> "LogLikelihoodSum":
anchors = self.parameters[p].blueice_anchors
if anchors is None:
raise ValueError(f"Shape parameter {p} does not have any anchors.")
# The ancillary term is handled in CustomAncillaryLikelihood
ll.add_shape_parameter(p, anchors=anchors, log_prior=None)

ll.prepare()
@@ -245,12 +261,20 @@ def _generate_data(self, **generate_values) -> dict:
data["generate_values"] = dict_to_structured_array(generate_values)
return data

def store_data(self, file_name, data_list, data_name_list=None, metadata=None):
"""
Store data in a file.
Append the generate_values to the data_name_list.
"""
if data_name_list is None:
data_name_list = self.likelihood_names + ["generate_values"]
super().store_data(file_name, data_list, data_name_list, metadata)
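
Together with the dict handling added to the base class, storing toydata becomes a one-liner; a hedged sketch (file name illustrative):

    # Store three toy datasets; the data names default to
    # likelihood_names + ["generate_values"].
    toydata_list = [model.generate_data() for _ in range(3)]
    model.store_data("toyfile.h5", toydata_list)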

def _generate_science_data(self, **generate_values) -> dict:
"""Generate data for all science data terms terms."""
"""Generate the science data for all likelihood terms except the ancillary likelihood."""
science_data = [
gen.simulate(**generate_values)
for gen in self.data_generators]
return dict(zip(self.likelihood_names, science_data))
gen.simulate(**generate_values) for gen in self.data_generators]
return dict(zip(self.likelihood_names[:-1], science_data))

def _generate_ancillary_measurements(self, **generate_values) -> dict:
"""
(The remaining 12 changed files are not shown.)
