diff --git a/alea/examples/gaussian_model.py b/alea/examples/gaussian_model.py index 90d81d0c..0c39de9d 100644 --- a/alea/examples/gaussian_model.py +++ b/alea/examples/gaussian_model.py @@ -1,8 +1,10 @@ from typing import Optional -from alea.statistical_model import StatisticalModel -import scipy.stats as stats + +from scipy import stats import numpy as np +from alea.statistical_model import StatisticalModel + class GaussianModel(StatisticalModel): def __init__(self, parameter_definition: Optional[dict or list] = None): @@ -13,7 +15,7 @@ def __init__(self, parameter_definition: Optional[dict or list] = None): sigma is fixed in this example. """ if parameter_definition is None: - parameter_definition = ["mu", "sigma"] + parameter_definition = ['mu', 'sigma'] super().__init__(parameter_definition=parameter_definition) def _ll(self, mu=None, sigma=None): diff --git a/alea/parameters.py b/alea/parameters.py index 8c52b365..e5430109 100644 --- a/alea/parameters.py +++ b/alea/parameters.py @@ -12,9 +12,11 @@ class Parameter: nominal_value (float, optional): The nominal value of the parameter. fittable (bool, optional): Indicates if the parameter is fittable or always fixed. ptype (str, optional): The type of the parameter. - uncertainty (float or str, optional): The uncertainty of the parameter. If a string, + uncertainty (float or str, optional): + The uncertainty of the parameter. If a string, it can be evaluated as a numpy or scipy function to define non-gaussian constraints. - relative_uncertainty (bool, optional): Indicates if the uncertainty is relative to the nominal_value. + relative_uncertainty (bool, optional): + Indicates if the uncertainty is relative to the nominal_value. blueice_anchors (list, optional): Anchors for blueice template morphing. fit_limits (tuple, optional): The limits for fitting the parameter. parameter_interval_bounds (tupe, optional): limits for computing confidence intervals @@ -49,11 +51,12 @@ def __init__( self.description = description def __repr__(self) -> str: - parameter_str = [f"{k}={v}" for k, - v in self.__dict__.items() if v is not None] + parameter_str = [ + f"{k}={v}" for k, v in self.__dict__.items() if v is not None] parameter_str = ", ".join(parameter_str) - return f'{self.__class__.__module__}.{self.__class__.__qualname__}'\ - f'({parameter_str})' + repr = f'{self.__class__.__module__}.{self.__class__.__qualname__}' + repr += f'({parameter_str})' + return repr @property def uncertainty(self) -> float or Any: @@ -67,7 +70,8 @@ def uncertainty(self) -> float or Any: return eval(self._uncertainty) else: raise ValueError( - f"Uncertainty string '{self._uncertainty}' must start with 'scipy.' or 'numpy.'") + f"Uncertainty string '{self._uncertainty}'" + " must start with 'scipy.' or 'numpy.'") else: return self._uncertainty @@ -155,8 +159,9 @@ def from_list(cls, names: List[str]): def __repr__(self) -> str: parameter_str = ", ".join(self.names) - return f'{self.__class__.__module__}.{self.__class__.__qualname__}'\ - f'({parameter_str})' + repr = f'{self.__class__.__module__}.{self.__class__.__qualname__}' + repr += f'({parameter_str})' + return repr def add_parameter(self, parameter: Parameter) -> None: """ @@ -181,14 +186,20 @@ def fit_guesses(self) -> Dict[str, float]: """ Returns a dictionary of fit guesses. """ - return {name: param.fit_guess for name, param in self.parameters.items() if param.fit_guess is not None} + return { + name: param.fit_guess + for name, param in self.parameters.items() + if param.fit_guess is not None} @property def fit_limits(self) -> Dict[str, float]: """ Returns a dictionary of fit limits. """ - return {name: param.fit_limits for name, param in self.parameters.items() if param.fit_limits is not None} + return { + name: param.fit_limits + for name, param in self.parameters.items() + if param.fit_limits is not None} @property def fittable(self) -> List[str]: @@ -209,16 +220,21 @@ def nominal_values(self) -> dict: """ return a dict of name:nominal value for all applicable parameters """ - return {k: i.nominal_value for k, i in self.parameters.items() if i.nominal_value is not None} - - def __call__(self, return_fittable: bool = False, - **kwargs: Any) -> Dict[str, float]: + return { + k: i.nominal_value + for k, i in self.parameters.items() + if i.nominal_value is not None} + + def __call__( + self, return_fittable: bool = False, + **kwargs: Any) -> Dict[str, float]: """ Returns a dictionary of parameter values, optionally filtered to return only fittable parameters. Args: - return_fittable (bool, optional): Indicates if only fittable parameters should be returned. + return_fittable (bool, optional): + Indicates if only fittable parameters should be returned. **kwargs: Additional keyword arguments to override parameter values. Returns: @@ -233,7 +249,7 @@ def __call__(self, return_fittable: bool = False, for name, param in self.parameters.items(): new_val = kwargs.get(name, None) - if (return_fittable and param.fittable) or not return_fittable: + if (return_fittable and param.fittable) or (not return_fittable): values[name] = new_val if new_val is not None else param.nominal_value return values @@ -285,5 +301,6 @@ def values_in_fit_limits(self, **kwargs: Any) -> bool: """ Returns True if all values are within the fit limits. """ - return all(self.parameters[name].value_in_fit_limits(value) - for name, value in kwargs.items()) + return all( + self.parameters[name].value_in_fit_limits(value) + for name, value in kwargs.items()) diff --git a/alea/statistical_model.py b/alea/statistical_model.py index c7e08b2f..d1bf4a7b 100644 --- a/alea/statistical_model.py +++ b/alea/statistical_model.py @@ -1,13 +1,16 @@ import inspect -from inference_interface import toydata_from_file, toydata_to_file -from typing import Tuple, Optional, Callable +import warnings +from typing import Tuple, Callable + import numpy as np from scipy.stats import chi2 from scipy.optimize import brentq from iminuit import Minuit from iminuit.util import make_func_code +from inference_interface import toydata_to_file + from alea.parameters import Parameters -import warnings + class StatisticalModel: """ @@ -22,213 +25,131 @@ class StatisticalModel: returns a full data set: Methods: - __init__ - required to implement: - - _ll - _generate_data + __init__ + required to implement: - optional to implement: - get_mus - get_likelihood_term_names + _ll + _generate_data - Implemented here: - store_data - fit - get_confidence_interval - get_expectations - get_parameter_list - print_config + optional to implement: + get_mus + get_likelihood_term_names + Implemented here: + store_data + fit + get_confidence_interval + get_expectations + get_parameter_list + print_config Other members: _data = None _config = {} _confidence_level = 0.9 - _confidence_interval_kind = "upper,lower,central" (if your threshold is the FC threshold, "central" gives you the unified interval) + _confidence_interval_kind = "upper, lower, central" + (if your threshold is the FC threshold, "central" gives you the unified interval) _confidence_interval_threshold: function that defines the Neyman threshold for limit calculations _fit_guess = {} _fixed_parameters = [] """ + + def __init__( + self, + data = None, + parameter_definition: dict or list = None, + confidence_level: float = 0.9, + confidence_interval_kind: str = "central", # one of central, upper, lower + confidence_interval_threshold: Callable[[float], float] = None, + **kwargs): + self._data = data + self._confidence_level = confidence_level + self._confidence_interval_kind = confidence_interval_kind + self.confidence_interval_threshold = confidence_interval_threshold + self._define_parameters(parameter_definition) + + self._check_ll_and_generate_data_signature() + + def _define_parameters(self, parameter_definition): + if parameter_definition is None: + self.parameters = Parameters() + elif isinstance(parameter_definition, dict): + self.parameters = Parameters.from_config(parameter_definition) + elif isinstance(parameter_definition, list): + self.parameters = Parameters.from_list(parameter_definition) + else: + raise RuntimeError("parameter_definition must be dict or list") + + def _check_ll_and_generate_data_signature(self): + ll_params = set(inspect.signature(self._ll).parameters) + generate_data_params = set(inspect.signature(self._generate_data).parameters) + if ll_params != generate_data_params: + raise AssertionError( + "ll and generate_data must have the same signature (parameters)") + + def _ll(self, **kwargs) -> float: + raise NotImplementedError( + "You must write a likelihood function (_ll) for your statistical model" + " or use a subclass where it is written for you") + + def _generate_data(self, **kwargs): + raise NotImplementedError( + "You must write a data-generation method (_generate_data) for your statistical model" + " or use a subclass where it is written for you") + def ll(self, **kwargs) -> float: - # CAUTION: This implementation won't allow you to call the likelihood by positional arguments. + # CAUTION: + # This implementation won't allow you to call the likelihood by positional arguments. parameters = self.parameters(**kwargs) return self._ll(**parameters) - def _ll(self, **kwargs) -> float: - raise NotImplementedError("You must write a likelihood function (_ll) for your statistical model or use a subclass where it is written for you") - def generate_data(self, **kwargs): - # CAUTION: This implementation won't allow you to call generate_data by positional arguments. + # CAUTION: + # This implementation won't allow you to call generate_data by positional arguments. if not self.parameters.values_in_fit_limits(**kwargs): raise ValueError("Values are not within fit limits") parameters = self.parameters(**kwargs) return self._generate_data(**parameters) - def _generate_data(self, **kwargs): - raise NotImplementedError("You must write a data-generation method (_generate_data) for your statistical model or use a subclass where it is written for you") - - def __init__(self, - data = None, - parameter_definition: dict or list = None, - confidence_level: float = 0.9, - confidence_interval_kind:str = "central", #one of central, upper, lower - confidence_interval_threshold: Callable[[float], float] = None, - **kwargs): - self._data = data - self._confidence_level = confidence_level - self._confidence_interval_kind = confidence_interval_kind - self.confidence_interval_threshold = confidence_interval_threshold - self._define_parameters(parameter_definition) - - self._check_ll_and_generate_data_signature() - @property def data(self): """ Simple getter for a data-set-- mainly here so it can be over-ridden for special needs. - Data-sets are expected to be in the form of a list of one or more structured arrays-- representing the data-sets of one or more likelihood terms. + Data-sets are expected to be in the form of a list of one or more structured arrays, + representing the data-sets of one or more likelihood terms. """ if self._data is None: - raise Exception("data has not been assigned this statistical model!") + raise RuntimeError("data has not been assigned this statistical model!") return self._data @data.setter def data(self, data): """ Simple setter for a data-set-- mainly here so it can be over-ridden for special needs. - Data-sets are expected to be in the form of a list of one or more structured arrays-- representing the data-sets of one or more likelihood terms. + Data-sets are expected to be in the form of a list of one or more structured arrays, + representing the data-sets of one or more likelihood terms. """ self._data = data - def store_data(self, file_name, data_list, data_name_list=None, metadata = None): + def store_data( + self, file_name, data_list, data_name_list=None, metadata = None): """ Store a list of datasets (each on the form of a list of one or more structured arrays) Using inference_interface, but included here to allow over-writing. - structure would be: [[datasets1],[datasets2]... [datasetsn]] + structure would be: [[datasets1], [datasets2], ..., [datasetsn]] where each of datasets is a list of structured arrays if you specify, it is set, if not it will read from self.get_likelihood_term_names - if not defined, it will be ["0","1"..."n-1"] + if not defined, it will be ["0", "1", ..., "n-1"] """ if data_name_list is None: try: data_name_list = self.get_likelihood_term_names() - except NotImplementedError as e: + except NotImplementedError: data_name_list = ["{:d}".format(i) for i in range(len(data_list[0]))] - kw = dict(metadata = metadata) if metadata is not None else dict() + kw = {'metadata': metadata} if metadata is not None else dict() toydata_to_file(file_name, data_list, data_name_list, **kw) - def _confidence_interval_checks(self, parameter: str, - parameter_interval_bounds: Tuple[float,float], - confidence_level: float, - confidence_interval_kind:str, - **kwargs): - """ - helper function for confidence_interval that does the input checks and returns bounds + - """ - if confidence_level is None: - confidence_level = self._confidence_level - if confidence_interval_kind is None: - confidence_interval_kind = self._confidence_interval_kind - - assert (0 Tuple[float, float]: - """ - Uses self.fit to compute confidence intervals for a certain named parameter - - parameter: string, name of fittable parameter of the model - parameter_interval_bounds: range in which to search for the confidence interval edges. May be specified as: - - setting the property "parameter_interval_bounds" for the parameter - - passing a list here - If the parameter is a rate parameter, and the model has expectation values implemented, - the bounds will be interpreted as bounds on the expectation value (so that the range in the fit is parameter_interval_bounds/mus) - otherwise the bound is taken as-is. - - - """ - - ci_objects = self._confidence_interval_checks(parameter, - parameter_interval_bounds, - confidence_level, - confidence_interval_kind, - **kwargs) - confidence_interval_kind, confidence_interval_threshold, parameter_interval_bounds = ci_objects - - - #find best-fit: - best_result, best_ll = self.fit(**kwargs) - best_parameter = best_result[parameter] - assert (parameter_interval_bounds[0] < best_parameter) & (best_parameter Tuple[dict,float]: + def fit(self, verbose=False, **kwargs) -> Tuple[dict, float]: """ Fit the model to the data by maximizing the likelihood - returns a dict containing best-fit values of each parameter, and the value of the likelihood evaluated there. - While the optimization is a minimization, the likelihood returned is the _maximum_ of the likelihood. + returns a dict containing best-fit values of each parameter, + and the value of the likelihood evaluated there. + While the optimization is a minimization, + the likelihood returned is the _maximum_ of the likelihood. """ fixed_parameters = list(kwargs.keys()) guesses = self.parameters.fit_guesses @@ -292,11 +216,12 @@ def fit(self, verbose=False, **kwargs)-> Tuple[dict,float]: cost = self.make_objective(minus=True, **kwargs) class MinuitWrap: - """Wrapper for functions to be called by Minuit - + """ + Wrapper for functions to be called by Minuit s_args must be a list of argument names of function f the names in this list must be the same as the keys of - the dictionary passed to the Minuit call.""" + the dictionary passed to the Minuit call. + """ def __init__(self, f, s_args): self.func = f @@ -308,8 +233,9 @@ def __call__(self, *args): # Make the Minuit object cost.errordef = Minuit.LIKELIHOOD - m = Minuit(MinuitWrap(cost, s_args=self.parameters.names), - **defaults) + m = Minuit( + MinuitWrap(cost, s_args=self.parameters.names), + **defaults) fixed_params = [] if fixed_parameters is None else fixed_parameters fixed_params += self.parameters.not_fittable for par in fixed_params: @@ -321,20 +247,123 @@ def __call__(self, *args): m.migrad() if verbose: print(m) - return m.values.to_dict(), -1 * m.fval # alert! This gives the _maximum_ likelihood + # alert! This gives the _maximum_ likelihood + return m.values.to_dict(), -1 * m.fval + + def _confidence_interval_checks( + self, parameter: str, + parameter_interval_bounds: Tuple[float, float], + confidence_level: float, + confidence_interval_kind: str, + **kwargs): + """ + helper function for confidence_interval that does the input checks and returns bounds + + """ + if confidence_level is None: + confidence_level = self._confidence_level + if confidence_interval_kind is None: + confidence_interval_kind = self._confidence_interval_kind - def _check_ll_and_generate_data_signature(self): - ll_params = set(inspect.signature(self._ll).parameters) - generate_data_params = set(inspect.signature(self._generate_data).parameters) - if ll_params != generate_data_params: - raise AssertionError("ll and generate_data must have the same signature (parameters)") + mask = (0 < confidence_level) and (confidence_level < 1) + assert mask, "the confidence level must lie between 0 and 1" + parameter_of_interest = self.parameters[parameter] + assert parameter_of_interest.fittable, "The parameter of interest must be fittable" + assert parameter not in kwargs, "you cannot set the parameter you're constraining" - def _define_parameters(self, parameter_definition): - if parameter_definition is None: - self.parameters = Parameters() - elif isinstance(parameter_definition, dict): - self.parameters = Parameters.from_config(parameter_definition) - elif isinstance(parameter_definition, list): - self.parameters = Parameters.from_list(parameter_definition) + if parameter_interval_bounds is None: + parameter_interval_bounds = parameter_of_interest.parameter_interval_bounds + if parameter_interval_bounds is None: + raise ValueError( + "You must set parameter_interval_bounds in the parameter config" + " or when calling confidence_interval") + + if parameter_of_interest.type == "rate": + try: + mu_parameter = self.get_expectations(**kwargs)[parameter_of_interest.name] + parameter_interval_bounds = ( + parameter_interval_bounds[0] / mu_parameter, + parameter_interval_bounds[1] / mu_parameter) + except NotImplementedError: + warnings.warn( + "The statistical model does not have a get_expectations model implemented," + " confidence interval bounds will be set directly.") + pass # no problem, continuing with bounds as set + + # define threshold if none is defined: + if self.confidence_interval_threshold is not None: + confidence_interval_threshold = self.confidence_interval_threshold + else: + # use asymptotic thresholds assuming the test statistic is Chi2 distributed + if confidence_interval_kind in {"lower", "upper"}: + critical_value = chi2(1).isf(2 * (1. - confidence_level)) + elif confidence_interval_kind == "central": + critical_value = chi2(1).isf(1. - confidence_level) + + confidence_interval_threshold = lambda x: critical_value + + return confidence_interval_kind, confidence_interval_threshold, parameter_interval_bounds + + def confidence_interval( + self, parameter: str, + parameter_interval_bounds: Tuple[float, float] = None, + confidence_level: float= None, + confidence_interval_kind: str = None, + **kwargs) -> Tuple[float, float]: + """ + Uses self.fit to compute confidence intervals for a certain named parameter + + parameter: string, name of fittable parameter of the model + parameter_interval_bounds: range in which to search for the confidence interval edges. + May be specified as: + - setting the property "parameter_interval_bounds" for the parameter + - passing a list here + If the parameter is a rate parameter, and the model has expectation values implemented, + the bounds will be interpreted as bounds on the expectation value + (so that the range in the fit is parameter_interval_bounds/mus) + otherwise the bound is taken as-is. + """ + + ci_objects = self._confidence_interval_checks( + parameter, + parameter_interval_bounds, + confidence_level, + confidence_interval_kind, + **kwargs) + confidence_interval_kind, confidence_interval_threshold, parameter_interval_bounds = ci_objects + + # find best-fit: + best_result, best_ll = self.fit(**kwargs) + best_parameter = best_result[parameter] + mask = (parameter_interval_bounds[0] < best_parameter) + mask &= (best_parameter < parameter_interval_bounds[1]) + assert mask, ("the best-fit is outside your confidence interval" + " search limits in parameter_interval_bounds") + # log-likelihood - critical value: + + # define intersection between likelihood ratio curve and the critical curve: + def t(hypothesis): + # define the intersection + # between the profile-log-likelihood curve and the rejection threshold + kwargs[parameter] = hypothesis + _, ll = self.fit(**kwargs) # ll is + log-likelihood here + ret = 2. * (best_ll - ll) # likelihood curve "right way up" (smiling) + # if positive, hypothesis is excluded + return ret - confidence_interval_threshold(hypothesis) + + if confidence_interval_kind in {"upper", "central"}: + if 0 < t(parameter_interval_bounds[1]): + ul = brentq(t, best_parameter, parameter_interval_bounds[1]) + else: + ul = np.inf else: - raise Exception("parameter_definition must be dict or list") + ul = np.nan + + if confidence_interval_kind in {"lower", "central"}: + if 0 < t(parameter_interval_bounds[0]): + dl = brentq(t, parameter_interval_bounds[0], best_parameter) + else: + dl = -1 * np.inf + else: + dl = np.nan + + return dl, ul