From 232c6de78773708dc6c212707eee8485158e0517 Mon Sep 17 00:00:00 2001 From: dachengx Date: Sat, 22 Jul 2023 01:40:29 +0800 Subject: [PATCH 01/23] Recover runner --- alea/runner.py | 400 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 400 insertions(+) create mode 100644 alea/runner.py diff --git a/alea/runner.py b/alea/runner.py new file mode 100644 index 00000000..f4a056a2 --- /dev/null +++ b/alea/runner.py @@ -0,0 +1,400 @@ +from copy import deepcopy +from typing import Callable, Optional +from datetime import datetime +from pydoc import locate +import inspect + +from tqdm import tqdm +import numpy as np + +from inference_interface import toydata_from_file, numpy_to_toyfile +from alea.statistical_model import StatisticalModel + + +class Runner: + """ + Runner manipulates statistical model and toydata. + - initialize statistical model + - generate or read toy data + - save toy data if needed + - fit parameters + - write output file + """ + + def __init__( + self, + statistical_model: str, + statistical_model_args: dict = None, + parameter_definition: Optional[dict or list] = None, + confidence_level: float = 0.9, + confidence_interval_kind: str = 'central', + confidence_interval_threshold: Callable[[float], float] = None, + poi: str = None, + hypotheses: list = None, + common_generate_values: dict = None, + true_generate_values: dict = None, + n_mc: int = 1, + toydata_file: str = None, + toydata_mode: str = None, + metadata: dict = None, + output_file: str = 'test_toymc.hdf5', + ): + """ + Initialize statistical model, + parameters list, and generate values list + """ + statistical_model_class = locate(statistical_model) + if statistical_model_class is None: + raise ValueError(f'Could not find {statistical_model}!') + if not inspect.isclass(statistical_model_class): + raise ValueError(f'{statistical_model_class} is not a class!') + if not issubclass(statistical_model_class, StatisticalModel): + raise ValueError(f'{statistical_model_class} is not a subclass of StatisticalModel!') + self.statistical_model = statistical_model_class( + parameter_definition=parameter_definition, + confidence_level=confidence_level, + confidence_interval_kind=confidence_interval_kind, + confidence_interval_threshold=confidence_interval_threshold, + **(statistical_model_args if statistical_model_args else {}), + ) + + self.poi = poi + self.hypotheses = hypotheses if hypotheses else [] + self.common_generate_values = common_generate_values + self.true_generate_values = true_generate_values + self.n_mc = n_mc + self.toydata_file = toydata_file + self.toydata_mode = toydata_mode + self.metadata = metadata if metadata else {} + self.output_file = output_file + + self.parameter_list, self.result_list, self.result_dtype = self._get_parameter_list() + + self.generate_values = self._get_generate_values() + + def _get_parameter_list(self): + """Get parameter list and result list from statistical model""" + parameter_list = sorted(self.statistical_model.get_parameter_list()) + result_list = parameter_list + ['ll', 'dl', 'ul'] + result_dtype = [(n, float) for n in parameter_list] + result_dtype += [(n, float) for n in ['ll', 'dl', 'ul']] + # try: + # parameter_list += self.statistical_model.additional_parameters + # result_dtype += [(n, float) for n in self.statistical_model.additional_parameters] + # except: + # pass + return parameter_list, result_list, result_dtype + + def _get_generate_values(self): + """Get generate values list from hypotheses""" + generate_values = [] + hypotheses = deepcopy(self.hypotheses) + + for hypothesis in hypotheses: + if hypothesis == 'null': + # there is no signal component + hypothesis = {self.poi: 0.} + elif hypothesis == 'true': + # the true signal component is used + if self.poi not in self.true_generate_values: + raise ValueError( + f'{self.poi} should be provided in true_generate_values', + ) + hypothesis = { + self.poi: self.true_generate_values.get(self.poi), + } + elif hypothesis == 'free': + hypothesis = {} + + array = deepcopy(self.common_generate_values) + array.update(hypothesis) + generate_values.append(array) + return generate_values + + def write_output(self, results): + """Write output file with metadata""" + metadata = deepcopy(self.metadata) + + result_names = [f'{i:d}' for i in range(len(self.generate_values))] + for i, ea in enumerate(self.hypotheses): + if ea in ['null', 'free', 'true']: + result_names[i] = ea + + metadata['date'] = datetime.now().strftime('%Y%m%d_%H:%M:%S') + metadata['poi'] = self.poi + metadata['common_generate_values'] = self.common_generate_values + metadata['true_generate_values'] = self.true_generate_values + + array_metadatas = [{'generate_values': ea} for ea in self.generate_values] + + numpy_arrays_and_names = [(r, rn) for r, rn in zip(results, result_names)] + + print(f'Saving {self.output_file}') + numpy_to_toyfile( + self.output_file, + numpy_arrays_and_names=numpy_arrays_and_names, + metadata=metadata, + array_metadatas=array_metadatas) + + def read_toydata(self): + """Read toydata from file""" + toydata, toydata_names = toydata_from_file(self.toydata_file) + return toydata, toydata_names + + def toy_simulation(self): + """ + Run toy simulation a specified different toydata mode + and loop over generate values + """ + flag_read_toydata = False + flag_generate_toydata = False + flag_write_toydata = False + + if self.toydata_mode == 'read': + flag_read_toydata = True + elif self.toydata_mode == 'generate': + flag_generate_toydata = True + elif self.toydata_mode == 'generate_and_write': + flag_generate_toydata = True + flag_write_toydata = True + elif self.toydata_mode == 'no_toydata': + pass + + if flag_read_toydata and flag_generate_toydata: + raise ValueError('Cannot both read and generate toydata') + + toydata = [] + toydata_names = None + if flag_read_toydata: + toydata, toydata_names = self.read_toydata() + + results = [np.zeros(self.n_mc, dtype=self.result_dtype) for _ in self.generate_values] + for i_mc in tqdm(range(self.n_mc)): + fit_results = [] + for generate_values in self.generate_values: + if flag_read_toydata: + self.statistical_model.data = toydata[i_mc] + if flag_generate_toydata: + self.statistical_model.data = self.statistical_model.generate_data( + **generate_values) + + fit_result, max_llh = self.statistical_model.fit(**generate_values) + fit_result['ll'] = max_llh + fit_result['dl'] = -1. + fit_result['ul'] = -1. + + fit_results.append(fit_result) + toydata.append(self.statistical_model.data) + for fit_result, result_array in zip(fit_results, results): + result_array[i_mc] = tuple(fit_result[pn] for pn in self.result_list) + + if flag_write_toydata: + self.statistical_model.store_data(self.toydata_file, toydata, toydata_names) + + return results + + def run(self): + """Run toy simulation""" + results = self.toy_simulation() + + self.write_output(results) +from copy import deepcopy +from typing import Callable, Optional +from datetime import datetime +from pydoc import locate +import inspect + +from tqdm import tqdm +import numpy as np + +from inference_interface import toydata_from_file, numpy_to_toyfile +from alea.statistical_model import StatisticalModel + + +class Runner: + """ + Runner manipulates statistical model and toydata. + - initialize statistical model + - generate or read toy data + - save toy data if needed + - fit parameters + - write output file + """ + + def __init__( + self, + statistical_model: str, + statistical_model_args: dict = None, + parameter_definition: Optional[dict or list] = None, + confidence_level: float = 0.9, + confidence_interval_kind: str = 'central', + confidence_interval_threshold: Callable[[float], float] = None, + poi: str = None, + hypotheses: list = None, + common_generate_values: dict = None, + true_generate_values: dict = None, + n_mc: int = 1, + toydata_file: str = None, + toydata_mode: str = None, + metadata: dict = None, + output_file: str = 'test_toymc.hdf5', + ): + """ + Initialize statistical model, + parameters list, and generate values list + """ + statistical_model_class = locate(statistical_model) + if statistical_model_class is None: + raise ValueError(f'Could not find {statistical_model}!') + if not inspect.isclass(statistical_model_class): + raise ValueError(f'{statistical_model_class} is not a class!') + if not issubclass(statistical_model_class, StatisticalModel): + raise ValueError(f'{statistical_model_class} is not a subclass of StatisticalModel!') + self.statistical_model = statistical_model_class( + parameter_definition=parameter_definition, + confidence_level=confidence_level, + confidence_interval_kind=confidence_interval_kind, + confidence_interval_threshold=confidence_interval_threshold, + **(statistical_model_args if statistical_model_args else {}), + ) + + self.poi = poi + self.hypotheses = hypotheses if hypotheses else [] + self.common_generate_values = common_generate_values + self.true_generate_values = true_generate_values + self.n_mc = n_mc + self.toydata_file = toydata_file + self.toydata_mode = toydata_mode + self.metadata = metadata if metadata else {} + self.output_file = output_file + + self.parameter_list, self.result_list, self.result_dtype = self._get_parameter_list() + + self.generate_values = self._get_generate_values() + + def _get_parameter_list(self): + """Get parameter list and result list from statistical model""" + parameter_list = sorted(self.statistical_model.get_parameter_list()) + result_list = parameter_list + ['ll', 'dl', 'ul'] + result_dtype = [(n, float) for n in parameter_list] + result_dtype += [(n, float) for n in ['ll', 'dl', 'ul']] + # try: + # parameter_list += self.statistical_model.additional_parameters + # result_dtype += [(n, float) for n in self.statistical_model.additional_parameters] + # except: + # pass + return parameter_list, result_list, result_dtype + + def _get_generate_values(self): + """Get generate values list from hypotheses""" + generate_values = [] + hypotheses = deepcopy(self.hypotheses) + + for hypothesis in hypotheses: + if hypothesis == 'null': + # there is no signal component + hypothesis = {self.poi: 0.} + elif hypothesis == 'true': + # the true signal component is used + if self.poi not in self.true_generate_values: + raise ValueError( + f'{self.poi} should be provided in true_generate_values', + ) + hypothesis = { + self.poi: self.true_generate_values.get(self.poi), + } + elif hypothesis == 'free': + hypothesis = {} + + array = deepcopy(self.common_generate_values) + array.update(hypothesis) + generate_values.append(array) + return generate_values + + def write_output(self, results): + """Write output file with metadata""" + metadata = deepcopy(self.metadata) + + result_names = [f'{i:d}' for i in range(len(self.generate_values))] + for i, ea in enumerate(self.hypotheses): + if ea in ['null', 'free', 'true']: + result_names[i] = ea + + metadata['date'] = datetime.now().strftime('%Y%m%d_%H:%M:%S') + metadata['poi'] = self.poi + metadata['common_generate_values'] = self.common_generate_values + metadata['true_generate_values'] = self.true_generate_values + + array_metadatas = [{'generate_values': ea} for ea in self.generate_values] + + numpy_arrays_and_names = [(r, rn) for r, rn in zip(results, result_names)] + + print(f'Saving {self.output_file}') + numpy_to_toyfile( + self.output_file, + numpy_arrays_and_names=numpy_arrays_and_names, + metadata=metadata, + array_metadatas=array_metadatas) + + def read_toydata(self): + """Read toydata from file""" + toydata, toydata_names = toydata_from_file(self.toydata_file) + return toydata, toydata_names + + def toy_simulation(self): + """ + Run toy simulation a specified different toydata mode + and loop over generate values + """ + flag_read_toydata = False + flag_generate_toydata = False + flag_write_toydata = False + + if self.toydata_mode == 'read': + flag_read_toydata = True + elif self.toydata_mode == 'generate': + flag_generate_toydata = True + elif self.toydata_mode == 'generate_and_write': + flag_generate_toydata = True + flag_write_toydata = True + elif self.toydata_mode == 'no_toydata': + pass + + if flag_read_toydata and flag_generate_toydata: + raise ValueError('Cannot both read and generate toydata') + + toydata = [] + toydata_names = None + if flag_read_toydata: + toydata, toydata_names = self.read_toydata() + + results = [np.zeros(self.n_mc, dtype=self.result_dtype) for _ in self.generate_values] + for i_mc in tqdm(range(self.n_mc)): + fit_results = [] + for generate_values in self.generate_values: + if flag_read_toydata: + self.statistical_model.data = toydata[i_mc] + if flag_generate_toydata: + self.statistical_model.data = self.statistical_model.generate_data( + **generate_values) + + fit_result, max_llh = self.statistical_model.fit(**generate_values) + fit_result['ll'] = max_llh + fit_result['dl'] = -1. + fit_result['ul'] = -1. + + fit_results.append(fit_result) + toydata.append(self.statistical_model.data) + for fit_result, result_array in zip(fit_results, results): + result_array[i_mc] = tuple(fit_result[pn] for pn in self.result_list) + + if flag_write_toydata: + self.statistical_model.store_data(self.toydata_file, toydata, toydata_names) + + return results + + def run(self): + """Run toy simulation""" + results = self.toy_simulation() + + self.write_output(results) From 628d4524356f99bf394d855b339b04b3d8be8210 Mon Sep 17 00:00:00 2001 From: dachengx Date: Sat, 22 Jul 2023 01:45:10 +0800 Subject: [PATCH 02/23] Improve code style --- alea/blueice_extended_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/alea/blueice_extended_model.py b/alea/blueice_extended_model.py index fa9036a4..f61f3f70 100644 --- a/alea/blueice_extended_model.py +++ b/alea/blueice_extended_model.py @@ -123,7 +123,7 @@ def _build_ll_from_config(self, likelihood_config: dict) -> "LogLikelihoodSum": for i, source in enumerate(config["sources"]): parameters_to_ignore: List[str] = [ p.name for p in self.parameters if ( - p.ptype == "shape") & (p.name not in source["parameters"])] + p.ptype == "shape") and (p.name not in source["parameters"])] # no efficiency affects PDF: parameters_to_ignore += [p.name for p in self.parameters if (p.ptype == "efficiency")] parameters_to_ignore += source.get("extra_dont_hash_settings", []) From cc0d06e35eb5cee0f384476f8d5794f843f7575e Mon Sep 17 00:00:00 2001 From: dachengx Date: Sat, 22 Jul 2023 01:45:42 +0800 Subject: [PATCH 03/23] Minor change --- alea/blueice_extended_model.py | 1 - 1 file changed, 1 deletion(-) diff --git a/alea/blueice_extended_model.py b/alea/blueice_extended_model.py index f61f3f70..944ee013 100644 --- a/alea/blueice_extended_model.py +++ b/alea/blueice_extended_model.py @@ -134,7 +134,6 @@ def _build_ll_from_config(self, likelihood_config: dict) -> "LogLikelihoodSum": ll = likelihood_object(blueice_config) for source in config["sources"]: - # Set rate parameters rate_parameters = [ p for p in source["parameters"] if self.parameters[p].ptype == "rate"] From 0e334dd403be09db48a19400ce8f379e80d61132 Mon Sep 17 00:00:00 2001 From: dachengx Date: Sun, 23 Jul 2023 04:23:16 +0800 Subject: [PATCH 04/23] Remove duplicated codes --- alea/__init__.py | 1 + alea/runner.py | 200 ----------------------------------------------- 2 files changed, 1 insertion(+), 200 deletions(-) diff --git a/alea/__init__.py b/alea/__init__.py index 66dd17c0..99491ee8 100644 --- a/alea/__init__.py +++ b/alea/__init__.py @@ -5,3 +5,4 @@ from alea import template_source from alea import toymc_running from alea import utils +from alea import runner diff --git a/alea/runner.py b/alea/runner.py index f4a056a2..cbf520be 100644 --- a/alea/runner.py +++ b/alea/runner.py @@ -198,203 +198,3 @@ def run(self): results = self.toy_simulation() self.write_output(results) -from copy import deepcopy -from typing import Callable, Optional -from datetime import datetime -from pydoc import locate -import inspect - -from tqdm import tqdm -import numpy as np - -from inference_interface import toydata_from_file, numpy_to_toyfile -from alea.statistical_model import StatisticalModel - - -class Runner: - """ - Runner manipulates statistical model and toydata. - - initialize statistical model - - generate or read toy data - - save toy data if needed - - fit parameters - - write output file - """ - - def __init__( - self, - statistical_model: str, - statistical_model_args: dict = None, - parameter_definition: Optional[dict or list] = None, - confidence_level: float = 0.9, - confidence_interval_kind: str = 'central', - confidence_interval_threshold: Callable[[float], float] = None, - poi: str = None, - hypotheses: list = None, - common_generate_values: dict = None, - true_generate_values: dict = None, - n_mc: int = 1, - toydata_file: str = None, - toydata_mode: str = None, - metadata: dict = None, - output_file: str = 'test_toymc.hdf5', - ): - """ - Initialize statistical model, - parameters list, and generate values list - """ - statistical_model_class = locate(statistical_model) - if statistical_model_class is None: - raise ValueError(f'Could not find {statistical_model}!') - if not inspect.isclass(statistical_model_class): - raise ValueError(f'{statistical_model_class} is not a class!') - if not issubclass(statistical_model_class, StatisticalModel): - raise ValueError(f'{statistical_model_class} is not a subclass of StatisticalModel!') - self.statistical_model = statistical_model_class( - parameter_definition=parameter_definition, - confidence_level=confidence_level, - confidence_interval_kind=confidence_interval_kind, - confidence_interval_threshold=confidence_interval_threshold, - **(statistical_model_args if statistical_model_args else {}), - ) - - self.poi = poi - self.hypotheses = hypotheses if hypotheses else [] - self.common_generate_values = common_generate_values - self.true_generate_values = true_generate_values - self.n_mc = n_mc - self.toydata_file = toydata_file - self.toydata_mode = toydata_mode - self.metadata = metadata if metadata else {} - self.output_file = output_file - - self.parameter_list, self.result_list, self.result_dtype = self._get_parameter_list() - - self.generate_values = self._get_generate_values() - - def _get_parameter_list(self): - """Get parameter list and result list from statistical model""" - parameter_list = sorted(self.statistical_model.get_parameter_list()) - result_list = parameter_list + ['ll', 'dl', 'ul'] - result_dtype = [(n, float) for n in parameter_list] - result_dtype += [(n, float) for n in ['ll', 'dl', 'ul']] - # try: - # parameter_list += self.statistical_model.additional_parameters - # result_dtype += [(n, float) for n in self.statistical_model.additional_parameters] - # except: - # pass - return parameter_list, result_list, result_dtype - - def _get_generate_values(self): - """Get generate values list from hypotheses""" - generate_values = [] - hypotheses = deepcopy(self.hypotheses) - - for hypothesis in hypotheses: - if hypothesis == 'null': - # there is no signal component - hypothesis = {self.poi: 0.} - elif hypothesis == 'true': - # the true signal component is used - if self.poi not in self.true_generate_values: - raise ValueError( - f'{self.poi} should be provided in true_generate_values', - ) - hypothesis = { - self.poi: self.true_generate_values.get(self.poi), - } - elif hypothesis == 'free': - hypothesis = {} - - array = deepcopy(self.common_generate_values) - array.update(hypothesis) - generate_values.append(array) - return generate_values - - def write_output(self, results): - """Write output file with metadata""" - metadata = deepcopy(self.metadata) - - result_names = [f'{i:d}' for i in range(len(self.generate_values))] - for i, ea in enumerate(self.hypotheses): - if ea in ['null', 'free', 'true']: - result_names[i] = ea - - metadata['date'] = datetime.now().strftime('%Y%m%d_%H:%M:%S') - metadata['poi'] = self.poi - metadata['common_generate_values'] = self.common_generate_values - metadata['true_generate_values'] = self.true_generate_values - - array_metadatas = [{'generate_values': ea} for ea in self.generate_values] - - numpy_arrays_and_names = [(r, rn) for r, rn in zip(results, result_names)] - - print(f'Saving {self.output_file}') - numpy_to_toyfile( - self.output_file, - numpy_arrays_and_names=numpy_arrays_and_names, - metadata=metadata, - array_metadatas=array_metadatas) - - def read_toydata(self): - """Read toydata from file""" - toydata, toydata_names = toydata_from_file(self.toydata_file) - return toydata, toydata_names - - def toy_simulation(self): - """ - Run toy simulation a specified different toydata mode - and loop over generate values - """ - flag_read_toydata = False - flag_generate_toydata = False - flag_write_toydata = False - - if self.toydata_mode == 'read': - flag_read_toydata = True - elif self.toydata_mode == 'generate': - flag_generate_toydata = True - elif self.toydata_mode == 'generate_and_write': - flag_generate_toydata = True - flag_write_toydata = True - elif self.toydata_mode == 'no_toydata': - pass - - if flag_read_toydata and flag_generate_toydata: - raise ValueError('Cannot both read and generate toydata') - - toydata = [] - toydata_names = None - if flag_read_toydata: - toydata, toydata_names = self.read_toydata() - - results = [np.zeros(self.n_mc, dtype=self.result_dtype) for _ in self.generate_values] - for i_mc in tqdm(range(self.n_mc)): - fit_results = [] - for generate_values in self.generate_values: - if flag_read_toydata: - self.statistical_model.data = toydata[i_mc] - if flag_generate_toydata: - self.statistical_model.data = self.statistical_model.generate_data( - **generate_values) - - fit_result, max_llh = self.statistical_model.fit(**generate_values) - fit_result['ll'] = max_llh - fit_result['dl'] = -1. - fit_result['ul'] = -1. - - fit_results.append(fit_result) - toydata.append(self.statistical_model.data) - for fit_result, result_array in zip(fit_results, results): - result_array[i_mc] = tuple(fit_result[pn] for pn in self.result_list) - - if flag_write_toydata: - self.statistical_model.store_data(self.toydata_file, toydata, toydata_names) - - return results - - def run(self): - """Run toy simulation""" - results = self.toy_simulation() - - self.write_output(results) From 4fb911364a7e86a4608ae5425864b02ad0fc3f46 Mon Sep 17 00:00:00 2001 From: dachengx Date: Mon, 31 Jul 2023 15:01:35 -0500 Subject: [PATCH 05/23] Simplify toydata mode --- alea/runner.py | 141 +++++++++++++++++++++++++++++-------------------- 1 file changed, 84 insertions(+), 57 deletions(-) diff --git a/alea/runner.py b/alea/runner.py index cbf520be..6c3d9360 100644 --- a/alea/runner.py +++ b/alea/runner.py @@ -19,23 +19,64 @@ class Runner: - save toy data if needed - fit parameters - write output file + + One toyfile can contain multiple toydata, but all of them are from the same generate_values + + Attributes: + statistical_model: statistical model + poi: parameter of interest + hypotheses (list): list of hypotheses + common_hypothesis (dict): common hypothesis, the values are copied to each hypothesis + generate_values (dict): generate values for toydata + n_mc (int): number of Monte Carlo + toydata_file (str): toydata filename + toydata_mode (str): toydata mode, 'read', 'generate', 'generate_and_write', 'no_toydata' + metadata (dict): metadata, if None, it is set to {} + output_file (str): output filename + result_names (list): list of result names + result_dtype (list): list of result dtypes + hypotheses_values (list): list of values for hypotheses + + Args: + statistical_model (str): statistical model class name + poi (str): parameter of interest + hypotheses (list): list of hypotheses + n_mc (int): number of Monte Carlo + statistical_model_args (dict, optional (default=None)): arguments for statistical model + parameter_definition (dict or list, optional (default=None)): parameter definition + confidence_level (float, optional (default=0.9)): confidence level + confidence_interval_kind (str, optional (default='central')): + kind of confidence interval, choice from 'central', 'upper' or 'lower' + confidence_interval_threshold (Callable[[float], float], optional (default=None)): + confidence interval threshold of likelihood ratio + common_hypothesis (dict, optional (default=None)): common hypothesis, the values are copied to each hypothesis + generate_values (dict, optional (default=None)): + generate values of toydata. If None, toydata depend on statistical model. + toydata_mode (str, optional (default='generate_and_write')): + toydata mode, choice from 'read', 'generate', 'generate_and_write', 'no_toydata' + toydata_file (str, optional (default=None)): toydata filename + metadata (dict, optional (default=None)): metadata + output_file (str, optional (default='test_toymc.hdf5')): output filename + + Todo: + Implement confidence interval calculation """ def __init__( self, statistical_model: str, + poi: str, + hypotheses: list, + n_mc: int, + generate_values: dict = None, statistical_model_args: dict = None, parameter_definition: Optional[dict or list] = None, confidence_level: float = 0.9, confidence_interval_kind: str = 'central', confidence_interval_threshold: Callable[[float], float] = None, - poi: str = None, - hypotheses: list = None, - common_generate_values: dict = None, - true_generate_values: dict = None, - n_mc: int = 1, + common_hypothesis: dict = None, + toydata_mode: str = 'generate_and_write', toydata_file: str = None, - toydata_mode: str = None, metadata: dict = None, output_file: str = 'test_toymc.hdf5', ): @@ -60,34 +101,30 @@ def __init__( self.poi = poi self.hypotheses = hypotheses if hypotheses else [] - self.common_generate_values = common_generate_values - self.true_generate_values = true_generate_values + self.common_hypothesis = common_hypothesis + self.generate_values = generate_values self.n_mc = n_mc self.toydata_file = toydata_file self.toydata_mode = toydata_mode self.metadata = metadata if metadata else {} self.output_file = output_file - self.parameter_list, self.result_list, self.result_dtype = self._get_parameter_list() + self.result_names, self.result_dtype = self._get_parameter_list() - self.generate_values = self._get_generate_values() + self.hypotheses_values = self._get_hypotheses() def _get_parameter_list(self): """Get parameter list and result list from statistical model""" parameter_list = sorted(self.statistical_model.get_parameter_list()) - result_list = parameter_list + ['ll', 'dl', 'ul'] + # add likelihood, lower limit, and upper limit + result_names = parameter_list + ['ll', 'dl', 'ul'] result_dtype = [(n, float) for n in parameter_list] result_dtype += [(n, float) for n in ['ll', 'dl', 'ul']] - # try: - # parameter_list += self.statistical_model.additional_parameters - # result_dtype += [(n, float) for n in self.statistical_model.additional_parameters] - # except: - # pass - return parameter_list, result_list, result_dtype - - def _get_generate_values(self): + return result_names, result_dtype + + def _get_hypotheses(self): """Get generate values list from hypotheses""" - generate_values = [] + hypotheses_values = [] hypotheses = deepcopy(self.hypotheses) for hypothesis in hypotheses: @@ -96,36 +133,36 @@ def _get_generate_values(self): hypothesis = {self.poi: 0.} elif hypothesis == 'true': # the true signal component is used - if self.poi not in self.true_generate_values: + if self.poi not in self.generate_values: raise ValueError( - f'{self.poi} should be provided in true_generate_values', + f'{self.poi} should be provided in generate_values', ) hypothesis = { - self.poi: self.true_generate_values.get(self.poi), + self.poi: self.generate_values.get(self.poi), } elif hypothesis == 'free': hypothesis = {} - array = deepcopy(self.common_generate_values) + array = deepcopy(self.common_hypothesis) array.update(hypothesis) - generate_values.append(array) - return generate_values + hypotheses_values.append(array) + return hypotheses_values def write_output(self, results): """Write output file with metadata""" metadata = deepcopy(self.metadata) - result_names = [f'{i:d}' for i in range(len(self.generate_values))] + result_names = [f'{i:d}' for i in range(len(self.hypotheses_values))] for i, ea in enumerate(self.hypotheses): if ea in ['null', 'free', 'true']: result_names[i] = ea metadata['date'] = datetime.now().strftime('%Y%m%d_%H:%M:%S') metadata['poi'] = self.poi - metadata['common_generate_values'] = self.common_generate_values - metadata['true_generate_values'] = self.true_generate_values + metadata['common_hypothesis'] = self.common_hypothesis + metadata['generate_values'] = self.generate_values - array_metadatas = [{'generate_values': ea} for ea in self.generate_values] + array_metadatas = [{'hypotheses_values': ea} for ea in self.hypotheses_values] numpy_arrays_and_names = [(r, rn) for r, rn in zip(results, result_names)] @@ -146,49 +183,39 @@ def toy_simulation(self): Run toy simulation a specified different toydata mode and loop over generate values """ - flag_read_toydata = False - flag_generate_toydata = False - flag_write_toydata = False - - if self.toydata_mode == 'read': - flag_read_toydata = True - elif self.toydata_mode == 'generate': - flag_generate_toydata = True - elif self.toydata_mode == 'generate_and_write': - flag_generate_toydata = True - flag_write_toydata = True - elif self.toydata_mode == 'no_toydata': - pass - - if flag_read_toydata and flag_generate_toydata: - raise ValueError('Cannot both read and generate toydata') + if self.toydata_mode not in { + 'read', 'generate', 'generate_and_write', 'no_toydata', + }: + raise ValueError(f'Unknown toydata mode: {self.toydata_mode}') toydata = [] toydata_names = None - if flag_read_toydata: + if self.toydata_mode == 'read': toydata, toydata_names = self.read_toydata() - results = [np.zeros(self.n_mc, dtype=self.result_dtype) for _ in self.generate_values] + results = [np.zeros(self.n_mc, dtype=self.result_dtype) for _ in self.hypotheses_values] for i_mc in tqdm(range(self.n_mc)): + if self.toydata_mode == 'generate' or self.toydata_mode == 'generate_and_write': + self.statistical_model.data = self.statistical_model.generate_data( + **self.generate_values) fit_results = [] - for generate_values in self.generate_values: - if flag_read_toydata: + for hypothesis_values in self.hypotheses_values: + if self.toydata_mode == 'read': self.statistical_model.data = toydata[i_mc] - if flag_generate_toydata: - self.statistical_model.data = self.statistical_model.generate_data( - **generate_values) - fit_result, max_llh = self.statistical_model.fit(**generate_values) + fit_result, max_llh = self.statistical_model.fit(**hypothesis_values) fit_result['ll'] = max_llh fit_result['dl'] = -1. fit_result['ul'] = -1. fit_results.append(fit_result) - toydata.append(self.statistical_model.data) + # appemd toydata + toydata.append(self.statistical_model.data) + # assign fitting results for fit_result, result_array in zip(fit_results, results): - result_array[i_mc] = tuple(fit_result[pn] for pn in self.result_list) + result_array[i_mc] = tuple(fit_result[pn] for pn in self.result_names) - if flag_write_toydata: + if self.toydata_mode == 'generate_and_write': self.statistical_model.store_data(self.toydata_file, toydata, toydata_names) return results From d31d4bc49001788d9ef3a748b4e84872f472135a Mon Sep 17 00:00:00 2001 From: dachengx Date: Mon, 31 Jul 2023 15:26:28 -0500 Subject: [PATCH 06/23] Add test_runner.py Encourage people to use h5 instead of hdf5 --- alea/__init__.py | 2 + .../configs/unbinned_wimp_running.yaml | 11 ++--- alea/models/blueice_extended_model.py | 7 +++- alea/runner.py | 14 ++++--- alea/template_source.py | 8 ++-- tests/test_gaussian_model.py | 2 +- tests/test_runner.py | 40 +++++++++++++++++++ 7 files changed, 66 insertions(+), 18 deletions(-) create mode 100644 tests/test_runner.py diff --git a/alea/__init__.py b/alea/__init__.py index 9ee77721..3a59b33b 100644 --- a/alea/__init__.py +++ b/alea/__init__.py @@ -4,6 +4,8 @@ from .models import * +from .runner import * + from .utils import * from .parameters import * diff --git a/alea/examples/configs/unbinned_wimp_running.yaml b/alea/examples/configs/unbinned_wimp_running.yaml index d8115d52..6a7a9e30 100644 --- a/alea/examples/configs/unbinned_wimp_running.yaml +++ b/alea/examples/configs/unbinned_wimp_running.yaml @@ -1,3 +1,4 @@ +statistical_model: alea.models.BlueiceExtendedModel statistical_model_config: unbinned_wimp_statistical_model.yaml poi: wimp_rate_multiplier @@ -13,7 +14,7 @@ computation: parameters_in_common: { hypotheses: ["true", "null", "free"], - output_filename: "toymc_power_wimp_mass_{wimp_mass:d}_poi_expectation_{poi_expectation:.2f}.hdf5", + output_filename: "toymc_power_wimp_mass_{wimp_mass:d}_poi_expectation_{poi_expectation:.2f}.h5", n_mc: 5000, n_batch: 40, } @@ -25,11 +26,11 @@ computation: parameters_in_common: { hypotheses: ["true", "null", "free"], - output_filename: "toymc_power_wimp_mass_{wimp_mass:d}_poi_expectation_{poi_expectation:.2f}.hdf5", + output_filename: "toymc_power_wimp_mass_{wimp_mass:d}_poi_expectation_{poi_expectation:.2f}.h5", n_mc: 5000, n_batch: 40, } - limit_threshold: "thresholds.hdf5" + limit_threshold: "thresholds.h5" toydata_mode: "generate" parameters_as_wildcards: ["poi_expectation", "n_mc", "n_batch"] @@ -39,11 +40,11 @@ computation: parameters_in_common: { hypotheses: ["true", "null", "free"], - output_filename: "toymc_power_wimp_mass_{wimp_mass:d}_poi_expectation_{poi_expectation:.2f}.hdf5", + output_filename: "toymc_power_wimp_mass_{wimp_mass:d}_poi_expectation_{poi_expectation:.2f}.h5", n_mc: 5000, n_batch: 40, compute_confidence_interval: True, - limit_threshold: "thresholds.hdf5", + limit_threshold: "thresholds.h5", } toydata_mode: "generate" diff --git a/alea/models/blueice_extended_model.py b/alea/models/blueice_extended_model.py index 6ffe20c5..90389d68 100644 --- a/alea/models/blueice_extended_model.py +++ b/alea/models/blueice_extended_model.py @@ -26,14 +26,17 @@ class BlueiceExtendedModel(StatisticalModel): likelihood_config (dict): A dictionary defining the likelihood. """ - def __init__(self, parameter_definition: dict, likelihood_config: dict): + def __init__( + self, + parameter_definition: dict, likelihood_config: dict, + *args, **kwargs): """Initializes the statistical model. Args: parameter_definition (dict): A dictionary defining the model parameters. likelihood_config (dict): A dictionary defining the likelihood. """ - super().__init__(parameter_definition=parameter_definition) + super().__init__(parameter_definition=parameter_definition, *args, **kwargs) self._likelihood = self._build_ll_from_config(likelihood_config) self.likelihood_names = [t["name"] for t in likelihood_config["likelihood_terms"]] self.likelihood_names.append("ancillary_likelihood") diff --git a/alea/runner.py b/alea/runner.py index 6c3d9360..0e4e9b61 100644 --- a/alea/runner.py +++ b/alea/runner.py @@ -8,7 +8,7 @@ import numpy as np from inference_interface import toydata_from_file, numpy_to_toyfile -from alea.statistical_model import StatisticalModel +from alea.model import StatisticalModel class Runner: @@ -56,7 +56,7 @@ class Runner: toydata mode, choice from 'read', 'generate', 'generate_and_write', 'no_toydata' toydata_file (str, optional (default=None)): toydata filename metadata (dict, optional (default=None)): metadata - output_file (str, optional (default='test_toymc.hdf5')): output filename + output_file (str, optional (default='test_toymc.h5')): output filename Todo: Implement confidence interval calculation @@ -68,17 +68,18 @@ def __init__( poi: str, hypotheses: list, n_mc: int, + common_hypothesis: dict = None, generate_values: dict = None, statistical_model_args: dict = None, parameter_definition: Optional[dict or list] = None, + likelihood_config: dict = None, confidence_level: float = 0.9, confidence_interval_kind: str = 'central', confidence_interval_threshold: Callable[[float], float] = None, - common_hypothesis: dict = None, toydata_mode: str = 'generate_and_write', toydata_file: str = None, metadata: dict = None, - output_file: str = 'test_toymc.hdf5', + output_file: str = 'test_toymc.h5', ): """ Initialize statistical model, @@ -93,6 +94,7 @@ def __init__( raise ValueError(f'{statistical_model_class} is not a subclass of StatisticalModel!') self.statistical_model = statistical_model_class( parameter_definition=parameter_definition, + likelihood_config=likelihood_config, confidence_level=confidence_level, confidence_interval_kind=confidence_interval_kind, confidence_interval_threshold=confidence_interval_threshold, @@ -101,9 +103,9 @@ def __init__( self.poi = poi self.hypotheses = hypotheses if hypotheses else [] - self.common_hypothesis = common_hypothesis - self.generate_values = generate_values self.n_mc = n_mc + self.common_hypothesis = common_hypothesis if common_hypothesis else {} + self.generate_values = generate_values if generate_values else {} self.toydata_file = toydata_file self.toydata_mode = toydata_mode self.metadata = metadata if metadata else {} diff --git a/alea/template_source.py b/alea/template_source.py index f7dac572..8e60d825 100644 --- a/alea/template_source.py +++ b/alea/template_source.py @@ -169,10 +169,10 @@ class CombinedSource(blueice.HistogramPdfSource): but takes in lists of histogram names. :param weights: weights :param histnames: list of filenames containing the histograms - :param templatenames: list of names of histograms within the hdf5 files + :param templatenames: list of names of histograms within the h5 files :param named_parameters : list of names of weights to be applied to histograms. Must be 1 shorter than histnames, templatenames - :param histogram_parameters: names of parameters that should be put in the hdf5/histogram names, + :param histogram_parameters: names of parameters that should be put in the h5/histogram names, """ def build_histogram(self): @@ -305,7 +305,7 @@ def build_histogram(self): logging.debug("h.bin_edges type", type(h.bin_edges)) if len(seen_bin_edges) != len(expected_bin_edges): raise ValueError( - "Axis %d of histogram %s in hdf5 file %s has %d bin edges, but expected %d" + "Axis %d of histogram %s in h5 file %s has %d bin edges, but expected %d" % ( axis_i, histname, templatename, len(seen_bin_edges), len(expected_bin_edges))) @@ -315,7 +315,7 @@ def build_histogram(self): expected_bin_edges, decimal=4) except AssertionError: warnings.warn( - "Axis %d of histogram %s in hdf5 file %s has bin edges %s, " + "Axis %d of histogram %s in h5 file %s has bin edges %s, " "but expected %s. Since length matches, setting it expected values..." % ( axis_i, histname, templatename, seen_bin_edges, diff --git a/tests/test_gaussian_model.py b/tests/test_gaussian_model.py index ea21309e..1d5a8f5f 100644 --- a/tests/test_gaussian_model.py +++ b/tests/test_gaussian_model.py @@ -38,7 +38,7 @@ def test_data_generation(self): def test_data_storage(self): """Test storage of data to file and retrieval of data from file""" - toydata_file = 'simple_data.hdf5' + toydata_file = 'simple_data.h5' self.model.data = self.model.generate_data(mu=0, sigma=2) self.model.store_data(toydata_file, [self.model.data]) stored_data = inference_interface.toydata_from_file(toydata_file) diff --git a/tests/test_runner.py b/tests/test_runner.py new file mode 100644 index 00000000..3d4f8014 --- /dev/null +++ b/tests/test_runner.py @@ -0,0 +1,40 @@ +from os import remove +import pytest +from unittest import TestCase + +from alea.utils import load_yaml +from alea.runner import Runner + + +@pytest.mark.usefixtures('rm_cache') +class TestRunner(TestCase): + """Test of the Runner class""" + + @classmethod + def setUp(cls): + """Initialise the Runner instance""" + cls.runner_config = load_yaml('unbinned_wimp_running.yaml') + cls.model_config = load_yaml(cls.runner_config['statistical_model_config']) + cls.set_new_runner(cls) + + def set_new_runner(self): + """Set a new BlueiceExtendedModel instance""" + self.runner = Runner( + statistical_model = self.runner_config['statistical_model'], + poi = self.runner_config['poi'], + hypotheses = self.runner_config['computation']['discovery_power']['parameters_in_common']['hypotheses'], + n_mc = 10, + generate_values = {'wimp_rate_multiplier': 1.0}, + parameter_definition = self.model_config['parameter_definition'], + likelihood_config = self.model_config['likelihood_config'], + toydata_mode = 'generate_and_write', + toydata_file = 'simple_data.h5', + output_file = 'test_toymc.h5', + ) + + def test_run(self): + """Test of the toy_simulation and write_output method""" + self.runner.run() + remove('simple_data.h5') + remove('test_toymc.h5') + From 80e0dd613a1b4a537d564842db2fa0d75e9a25d1 Mon Sep 17 00:00:00 2001 From: dachengx Date: Mon, 31 Jul 2023 15:35:05 -0500 Subject: [PATCH 07/23] Happier code style --- alea/runner.py | 52 ++++++++++++++++++++++---------------------- tests/test_runner.py | 5 +++-- 2 files changed, 29 insertions(+), 28 deletions(-) diff --git a/alea/runner.py b/alea/runner.py index 0e4e9b61..032b9b1b 100644 --- a/alea/runner.py +++ b/alea/runner.py @@ -19,7 +19,6 @@ class Runner: - save toy data if needed - fit parameters - write output file - One toyfile can contain multiple toydata, but all of them are from the same generate_values Attributes: @@ -49,7 +48,8 @@ class Runner: kind of confidence interval, choice from 'central', 'upper' or 'lower' confidence_interval_threshold (Callable[[float], float], optional (default=None)): confidence interval threshold of likelihood ratio - common_hypothesis (dict, optional (default=None)): common hypothesis, the values are copied to each hypothesis + common_hypothesis (dict, optional (default=None)): + common hypothesis, the values are copied to each hypothesis generate_values (dict, optional (default=None)): generate values of toydata. If None, toydata depend on statistical model. toydata_mode (str, optional (default='generate_and_write')): @@ -103,17 +103,17 @@ def __init__( self.poi = poi self.hypotheses = hypotheses if hypotheses else [] - self.n_mc = n_mc + self._n_mc = n_mc self.common_hypothesis = common_hypothesis if common_hypothesis else {} self.generate_values = generate_values if generate_values else {} - self.toydata_file = toydata_file - self.toydata_mode = toydata_mode - self.metadata = metadata if metadata else {} - self.output_file = output_file + self._toydata_file = toydata_file + self._toydata_mode = toydata_mode + self._output_file = output_file + self._metadata = metadata if metadata else {} - self.result_names, self.result_dtype = self._get_parameter_list() + self._result_names, self._result_dtype = self._get_parameter_list() - self.hypotheses_values = self._get_hypotheses() + self._hypotheses_values = self._get_hypotheses() def _get_parameter_list(self): """Get parameter list and result list from statistical model""" @@ -152,9 +152,9 @@ def _get_hypotheses(self): def write_output(self, results): """Write output file with metadata""" - metadata = deepcopy(self.metadata) + metadata = deepcopy(self._metadata) - result_names = [f'{i:d}' for i in range(len(self.hypotheses_values))] + result_names = [f'{i:d}' for i in range(len(self._hypotheses_values))] for i, ea in enumerate(self.hypotheses): if ea in ['null', 'free', 'true']: result_names[i] = ea @@ -164,20 +164,20 @@ def write_output(self, results): metadata['common_hypothesis'] = self.common_hypothesis metadata['generate_values'] = self.generate_values - array_metadatas = [{'hypotheses_values': ea} for ea in self.hypotheses_values] + array_metadatas = [{'hypotheses_values': ea} for ea in self._hypotheses_values] numpy_arrays_and_names = [(r, rn) for r, rn in zip(results, result_names)] - print(f'Saving {self.output_file}') + print(f'Saving {self._output_file}') numpy_to_toyfile( - self.output_file, + self._output_file, numpy_arrays_and_names=numpy_arrays_and_names, metadata=metadata, array_metadatas=array_metadatas) def read_toydata(self): """Read toydata from file""" - toydata, toydata_names = toydata_from_file(self.toydata_file) + toydata, toydata_names = toydata_from_file(self._toydata_file) return toydata, toydata_names def toy_simulation(self): @@ -185,24 +185,24 @@ def toy_simulation(self): Run toy simulation a specified different toydata mode and loop over generate values """ - if self.toydata_mode not in { + if self._toydata_mode not in { 'read', 'generate', 'generate_and_write', 'no_toydata', }: - raise ValueError(f'Unknown toydata mode: {self.toydata_mode}') + raise ValueError(f'Unknown toydata mode: {self._toydata_mode}') toydata = [] toydata_names = None - if self.toydata_mode == 'read': + if self._toydata_mode == 'read': toydata, toydata_names = self.read_toydata() - results = [np.zeros(self.n_mc, dtype=self.result_dtype) for _ in self.hypotheses_values] - for i_mc in tqdm(range(self.n_mc)): - if self.toydata_mode == 'generate' or self.toydata_mode == 'generate_and_write': + results = [np.zeros(self._n_mc, dtype=self._result_dtype) for _ in self._hypotheses_values] + for i_mc in tqdm(range(self._n_mc)): + if self._toydata_mode == 'generate' or self._toydata_mode == 'generate_and_write': self.statistical_model.data = self.statistical_model.generate_data( **self.generate_values) fit_results = [] - for hypothesis_values in self.hypotheses_values: - if self.toydata_mode == 'read': + for hypothesis_values in self._hypotheses_values: + if self._toydata_mode == 'read': self.statistical_model.data = toydata[i_mc] fit_result, max_llh = self.statistical_model.fit(**hypothesis_values) @@ -215,10 +215,10 @@ def toy_simulation(self): toydata.append(self.statistical_model.data) # assign fitting results for fit_result, result_array in zip(fit_results, results): - result_array[i_mc] = tuple(fit_result[pn] for pn in self.result_names) + result_array[i_mc] = tuple(fit_result[pn] for pn in self._result_names) - if self.toydata_mode == 'generate_and_write': - self.statistical_model.store_data(self.toydata_file, toydata, toydata_names) + if self._toydata_mode == 'generate_and_write': + self.statistical_model.store_data(self._toydata_file, toydata, toydata_names) return results diff --git a/tests/test_runner.py b/tests/test_runner.py index 3d4f8014..b4531559 100644 --- a/tests/test_runner.py +++ b/tests/test_runner.py @@ -19,10 +19,12 @@ def setUp(cls): def set_new_runner(self): """Set a new BlueiceExtendedModel instance""" + # TODO: interpret the config file after submitter class is implemented + parameter_zvc = self.runner_config['computation']['discovery_power'] self.runner = Runner( statistical_model = self.runner_config['statistical_model'], poi = self.runner_config['poi'], - hypotheses = self.runner_config['computation']['discovery_power']['parameters_in_common']['hypotheses'], + hypotheses = parameter_zvc['parameters_in_common']['hypotheses'], n_mc = 10, generate_values = {'wimp_rate_multiplier': 1.0}, parameter_definition = self.model_config['parameter_definition'], @@ -37,4 +39,3 @@ def test_run(self): self.runner.run() remove('simple_data.h5') remove('test_toymc.h5') - From 2c624dcd0b580fb00ef99a920b2970a2217b77b9 Mon Sep 17 00:00:00 2001 From: dachengx Date: Mon, 31 Jul 2023 15:38:19 -0500 Subject: [PATCH 08/23] Minor change --- alea/runner.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/alea/runner.py b/alea/runner.py index 032b9b1b..beab8108 100644 --- a/alea/runner.py +++ b/alea/runner.py @@ -34,13 +34,13 @@ class Runner: output_file (str): output filename result_names (list): list of result names result_dtype (list): list of result dtypes - hypotheses_values (list): list of values for hypotheses + _hypotheses_values (list): list of values for hypotheses Args: statistical_model (str): statistical model class name poi (str): parameter of interest hypotheses (list): list of hypotheses - n_mc (int): number of Monte Carlo + _n_mc (int): number of Monte Carlo statistical_model_args (dict, optional (default=None)): arguments for statistical model parameter_definition (dict or list, optional (default=None)): parameter definition confidence_level (float, optional (default=0.9)): confidence level @@ -52,11 +52,11 @@ class Runner: common hypothesis, the values are copied to each hypothesis generate_values (dict, optional (default=None)): generate values of toydata. If None, toydata depend on statistical model. - toydata_mode (str, optional (default='generate_and_write')): + _toydata_mode (str, optional (default='generate_and_write')): toydata mode, choice from 'read', 'generate', 'generate_and_write', 'no_toydata' - toydata_file (str, optional (default=None)): toydata filename - metadata (dict, optional (default=None)): metadata - output_file (str, optional (default='test_toymc.h5')): output filename + _toydata_file (str, optional (default=None)): toydata filename + _metadata (dict, optional (default=None)): metadata + _output_file (str, optional (default='test_toymc.h5')): output filename Todo: Implement confidence interval calculation @@ -103,9 +103,9 @@ def __init__( self.poi = poi self.hypotheses = hypotheses if hypotheses else [] - self._n_mc = n_mc self.common_hypothesis = common_hypothesis if common_hypothesis else {} self.generate_values = generate_values if generate_values else {} + self._n_mc = n_mc self._toydata_file = toydata_file self._toydata_mode = toydata_mode self._output_file = output_file From 06ad8f3df145afec3c9b205f90ad6f6757089bf7 Mon Sep 17 00:00:00 2001 From: dachengx Date: Mon, 31 Jul 2023 17:57:09 -0500 Subject: [PATCH 09/23] Help model to save dict data and list data in a unified function --- alea/examples/gaussian_model.py | 4 +- alea/model.py | 24 ++++++++--- alea/models/blueice_extended_model.py | 28 +++++++++++-- alea/runner.py | 31 +++++++++----- tests/__init__.py | 0 tests/test_gaussian_model.py | 36 ++++++++-------- tests/test_runner.py | 60 +++++++++++++++++++-------- 7 files changed, 127 insertions(+), 56 deletions(-) create mode 100644 tests/__init__.py diff --git a/alea/examples/gaussian_model.py b/alea/examples/gaussian_model.py index 86f19b0e..2ae4684e 100644 --- a/alea/examples/gaussian_model.py +++ b/alea/examples/gaussian_model.py @@ -10,7 +10,7 @@ class GaussianModel(StatisticalModel): def __init__( self, parameter_definition: Optional[dict or list] = None, - **kwargs, + *args, **kwargs, ): """ Initialise a model of a gaussian measurement (hatmu), @@ -20,7 +20,7 @@ def __init__( """ if parameter_definition is None: parameter_definition = ["mu", "sigma"] - super().__init__(parameter_definition=parameter_definition, **kwargs) + super().__init__(parameter_definition=parameter_definition, *args, **kwargs) def _ll(self, mu=None, sigma=None): hat_mu = self.data[0]['hat_mu'][0] diff --git a/alea/model.py b/alea/model.py index 51c57510..dfd4eb60 100644 --- a/alea/model.py +++ b/alea/model.py @@ -58,6 +58,7 @@ def __init__( confidence_level: float = 0.9, confidence_interval_kind: str = "central", # one of central, upper, lower confidence_interval_threshold: Callable[[float], float] = None, + **kwargs, ): """Initialize a statistical model""" if type(self) == StatisticalModel: @@ -159,23 +160,36 @@ def data(self, data): self.is_data_set = True def store_data( - self, file_name, data_list, data_name_list=None, metadata = None): + self, file_name, data_list, data_name_list=None, metadata=None): """ - Store a list of datasets (each on the form of a list of one or more structured arrays) + Store a list of datasets. + (each on the form of a list of one or more structured arrays or dicts) Using inference_interface, but included here to allow over-writing. structure would be: [[datasets1], [datasets2], ..., [datasetsn]] where each of datasets is a list of structured arrays if you specify, it is set, if not it will read from self.get_likelihood_term_names if not defined, it will be ["0", "1", ..., "n-1"] """ + if all([isinstance(d, dict) for d in data_list]): + _data_list = [list(d.values()) for d in data_list] + elif all([isinstance(d, list) for d in data_list]): + _data_list = data_list + else: + raise ValueError( + 'Unsupported mixed toydata format! ' + 'toydata should be a list of dict or a list of list',) + if data_name_list is None: if hasattr(self, "likelihood_names"): data_name_list = self.likelihood_names else: - data_name_list = ["{:d}".format(i) for i in range(len(data_list[0]))] + data_name_list = ["{:d}".format(i) for i in range(len(_data_list[0]))] kw = {'metadata': metadata} if metadata is not None else dict() - toydata_to_file(file_name, data_list, data_name_list, **kw) + if len(_data_list[0]) != len(data_name_list): + raise ValueError( + "The number of data sets and data names must be the same") + toydata_to_file(file_name, _data_list, data_name_list, **kw) def get_expectation_values(self, **parameter_values): return NotImplementedError("get_expectation_values is optional to implement") @@ -195,7 +209,7 @@ def get_likelihood_term_from_name(self, likelihood_name): """ if hasattr(self, "likelihood_names"): likelihood_names = self.likelihood_names - return {n:i for i,n in enumerate(likelihood_names)}[likelihood_name] + return {n:i for i, n in enumerate(likelihood_names)}[likelihood_name] else: raise NotImplementedError("The attribute likelihood_names is not defined.") diff --git a/alea/models/blueice_extended_model.py b/alea/models/blueice_extended_model.py index 90389d68..131f4630 100644 --- a/alea/models/blueice_extended_model.py +++ b/alea/models/blueice_extended_model.py @@ -65,14 +65,23 @@ def data(self) -> dict: return super().data @data.setter - def data(self, data: dict): + def data(self, data: dict or list): """ Overrides default setter. Will also set the data of the blueice ll. Data-sets are expected to be in the form of a list of one or more structured arrays-- representing the data-sets of one or more likelihood terms. + + Args: + data (dict or list): Data of the statistical model. + If data is a list, it must be a list of length len(self.likelihood_names) + 1. """ # iterate through all likelihood terms and set the science data in the blueice ll # last entry in data are the generate_values + if isinstance(data, list): + if len(data) != len(self.likelihood_names) + 1: + raise ValueError( + f"Data must be a list of length {len(self.likelihood_names) + 1}") + data = dict(zip(self.likelihood_names + ["generate_values"], data)) for i, (dataset_name, d) in enumerate(data.items()): if dataset_name != "generate_values": ll_term = self._likelihood.likelihood_list[i] @@ -210,12 +219,23 @@ def _generate_data(self, **generate_values) -> dict: data["generate_values"] = dict_to_structured_array(generate_values) return data + def store_data(self, file_name, data_list, data_name_list=None, metadata=None): + """ + Store data in a file. + Append the generate_values to the data_name_list. + """ + if data_name_list is None: + data_name_list = self.likelihood_names + ["generate_values"] + super().store_data(file_name, data_list, data_name_list, metadata) + def _generate_science_data(self, **generate_values) -> dict: - science_data = [gen.simulate(**generate_values) - for gen in self.data_generators] - return dict(zip(self.likelihood_names, science_data)) + """Generate the science data for all likelihood terms except the ancillary likelihood.""" + science_data = [ + gen.simulate(**generate_values) for gen in self.data_generators] + return dict(zip(self.likelihood_names[:-1], science_data)) def _generate_ancillary_measurements(self, **generate_values) -> dict: + """Generate the ancillary measurements.""" ancillary_measurements = {} anc_ll = self._likelihood.likelihood_list[-1] ancillary_generators = anc_ll._get_constraint_functions(**generate_values) diff --git a/alea/runner.py b/alea/runner.py index beab8108..2dd686f8 100644 --- a/alea/runner.py +++ b/alea/runner.py @@ -19,7 +19,7 @@ class Runner: - save toy data if needed - fit parameters - write output file - One toyfile can contain multiple toydata, but all of them are from the same generate_values + One toyfile can contain multiple toydata, but all of them are from the same generate_values. Attributes: statistical_model: statistical model @@ -41,7 +41,7 @@ class Runner: poi (str): parameter of interest hypotheses (list): list of hypotheses _n_mc (int): number of Monte Carlo - statistical_model_args (dict, optional (default=None)): arguments for statistical model + statistical_model_args (dict, optional (default={})): arguments for statistical model parameter_definition (dict or list, optional (default=None)): parameter definition confidence_level (float, optional (default=0.9)): confidence level confidence_interval_kind (str, optional (default='central')): @@ -70,7 +70,7 @@ def __init__( n_mc: int, common_hypothesis: dict = None, generate_values: dict = None, - statistical_model_args: dict = None, + statistical_model_args: dict = {}, parameter_definition: Optional[dict or list] = None, likelihood_config: dict = None, confidence_level: float = 0.9, @@ -92,9 +92,11 @@ def __init__( raise ValueError(f'{statistical_model_class} is not a class!') if not issubclass(statistical_model_class, StatisticalModel): raise ValueError(f'{statistical_model_class} is not a subclass of StatisticalModel!') + + # likelihood_config is keyword argument, because not all statistical model needs it + statistical_model_args['likelihood_config'] = likelihood_config self.statistical_model = statistical_model_class( parameter_definition=parameter_definition, - likelihood_config=likelihood_config, confidence_level=confidence_level, confidence_interval_kind=confidence_interval_kind, confidence_interval_threshold=confidence_interval_threshold, @@ -156,7 +158,7 @@ def write_output(self, results): result_names = [f'{i:d}' for i in range(len(self._hypotheses_values))] for i, ea in enumerate(self.hypotheses): - if ea in ['null', 'free', 'true']: + if ea in ['free', 'null', 'true']: result_names[i] = ea metadata['date'] = datetime.now().strftime('%Y%m%d_%H:%M:%S') @@ -165,7 +167,6 @@ def write_output(self, results): metadata['generate_values'] = self.generate_values array_metadatas = [{'hypotheses_values': ea} for ea in self._hypotheses_values] - numpy_arrays_and_names = [(r, rn) for r, rn in zip(results, result_names)] print(f'Saving {self._output_file}') @@ -180,10 +181,18 @@ def read_toydata(self): toydata, toydata_names = toydata_from_file(self._toydata_file) return toydata, toydata_names + def write_toydata(self, toydata, toydata_names): + """ + Write toydata to file. + If toydata is a list of dict, convert it to a list of list. + """ + self.statistical_model.store_data(self._toydata_file, toydata, toydata_names) + def toy_simulation(self): """ - Run toy simulation a specified different toydata mode - and loop over generate values + For each Monte Carlo: + - run toy simulation a specified toydata mode and generate values. + - loop over hypotheses. """ if self._toydata_mode not in { 'read', 'generate', 'generate_and_write', 'no_toydata', @@ -218,12 +227,12 @@ def toy_simulation(self): result_array[i_mc] = tuple(fit_result[pn] for pn in self._result_names) if self._toydata_mode == 'generate_and_write': - self.statistical_model.store_data(self._toydata_file, toydata, toydata_names) + self.write_toydata(toydata, toydata_names) - return results + return toydata, results def run(self): """Run toy simulation""" - results = self.toy_simulation() + toydata, results = self.toy_simulation() self.write_output(results) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/test_gaussian_model.py b/tests/test_gaussian_model.py index 1d5a8f5f..84187166 100644 --- a/tests/test_gaussian_model.py +++ b/tests/test_gaussian_model.py @@ -7,30 +7,32 @@ from alea.examples import GaussianModel +gaussian_model_parameter_definition = { + 'mu': { + 'fit_guess': 0., + 'fittable': True, + 'nominal_value': 0., + }, + 'sigma': { + 'fit_guess': 1., + 'fit_limits': [ + 0., + None, + ], + 'fittable': True, + 'nominal_value': 1., + }, +} + + class TestGaussianModel(TestCase): """Test of the GaussianModel class""" @classmethod def setUp(cls): """Initialise the GaussianModel instance""" - parameter_definition = { - 'mu': { - 'fit_guess': 0., - 'fittable': True, - 'nominal_value': 0., - }, - 'sigma': { - 'fit_guess': 1., - 'fit_limits': [ - 0., - None, - ], - 'fittable': True, - 'nominal_value': 1., - }, - } cls.model = GaussianModel( - parameter_definition=parameter_definition) + parameter_definition=gaussian_model_parameter_definition) def test_data_generation(self): """Test generation of data""" diff --git a/tests/test_runner.py b/tests/test_runner.py index b4531559..34c9a2d4 100644 --- a/tests/test_runner.py +++ b/tests/test_runner.py @@ -4,6 +4,7 @@ from alea.utils import load_yaml from alea.runner import Runner +from .test_gaussian_model import gaussian_model_parameter_definition @pytest.mark.usefixtures('rm_cache') @@ -15,27 +16,52 @@ def setUp(cls): """Initialise the Runner instance""" cls.runner_config = load_yaml('unbinned_wimp_running.yaml') cls.model_config = load_yaml(cls.runner_config['statistical_model_config']) - cls.set_new_runner(cls) + cls.toydata_file = 'simple_data.h5' + cls.output_file = 'test_toymc.h5' - def set_new_runner(self): - """Set a new BlueiceExtendedModel instance""" + def set_gaussian_runner(self, toydata_mode='generate_and_write'): + """Set a new runner instance with GaussianModel""" + self.runner = Runner( + statistical_model='alea.examples.gaussian_model.GaussianModel', + poi='mu', + hypotheses=['free', 'null', 'true'], + n_mc=10, + generate_values={'mu': 1., 'sigma': 1.}, + parameter_definition=gaussian_model_parameter_definition, + toydata_mode=toydata_mode, + toydata_file=self.toydata_file, + output_file=self.output_file, + ) + + def set_blueice_runner(self, toydata_mode='generate_and_write'): + """Set a new runner instance with BlueiceExtendedModel""" # TODO: interpret the config file after submitter class is implemented parameter_zvc = self.runner_config['computation']['discovery_power'] self.runner = Runner( - statistical_model = self.runner_config['statistical_model'], - poi = self.runner_config['poi'], - hypotheses = parameter_zvc['parameters_in_common']['hypotheses'], - n_mc = 10, - generate_values = {'wimp_rate_multiplier': 1.0}, - parameter_definition = self.model_config['parameter_definition'], - likelihood_config = self.model_config['likelihood_config'], - toydata_mode = 'generate_and_write', - toydata_file = 'simple_data.h5', - output_file = 'test_toymc.h5', + statistical_model=self.runner_config['statistical_model'], + poi=self.runner_config['poi'], + hypotheses=parameter_zvc['parameters_in_common']['hypotheses'], + n_mc=10, + generate_values={'wimp_rate_multiplier': 1.0}, + parameter_definition=self.model_config['parameter_definition'], + likelihood_config=self.model_config['likelihood_config'], + toydata_mode=toydata_mode, + toydata_file=self.toydata_file, + output_file=self.output_file, ) - def test_run(self): + def test_runners(self): """Test of the toy_simulation and write_output method""" - self.runner.run() - remove('simple_data.h5') - remove('test_toymc.h5') + set_runners = [self.set_gaussian_runner] + # set_runners = [self.set_blueice_runner, self.set_gaussian_runner] + for set_runner in set_runners: + # test toydata_mode generate_and_write + set_runner() + self.runner.run() + remove(self.output_file) + + # test toydata_mode read + set_runner(toydata_mode='read') + self.runner.run() + remove(self.toydata_file) + remove(self.output_file) From 5104568588025ef01f4a1b40c3ea2eaf6344365a Mon Sep 17 00:00:00 2001 From: dachengx Date: Mon, 31 Jul 2023 18:07:36 -0500 Subject: [PATCH 10/23] Happier code style --- alea/model.py | 2 +- alea/runner.py | 6 ++++-- tests/test_runner.py | 3 +-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/alea/model.py b/alea/model.py index dfd4eb60..2cdcb1f5 100644 --- a/alea/model.py +++ b/alea/model.py @@ -209,7 +209,7 @@ def get_likelihood_term_from_name(self, likelihood_name): """ if hasattr(self, "likelihood_names"): likelihood_names = self.likelihood_names - return {n:i for i, n in enumerate(likelihood_names)}[likelihood_name] + return {n: i for i, n in enumerate(likelihood_names)}[likelihood_name] else: raise NotImplementedError("The attribute likelihood_names is not defined.") diff --git a/alea/runner.py b/alea/runner.py index 2dd686f8..69ce8d2c 100644 --- a/alea/runner.py +++ b/alea/runner.py @@ -70,7 +70,7 @@ def __init__( n_mc: int, common_hypothesis: dict = None, generate_values: dict = None, - statistical_model_args: dict = {}, + statistical_model_args: dict = None, parameter_definition: Optional[dict or list] = None, likelihood_config: dict = None, confidence_level: float = 0.9, @@ -94,6 +94,8 @@ def __init__( raise ValueError(f'{statistical_model_class} is not a subclass of StatisticalModel!') # likelihood_config is keyword argument, because not all statistical model needs it + if statistical_model_args is None: + statistical_model_args = {} statistical_model_args['likelihood_config'] = likelihood_config self.statistical_model = statistical_model_class( parameter_definition=parameter_definition, @@ -158,7 +160,7 @@ def write_output(self, results): result_names = [f'{i:d}' for i in range(len(self._hypotheses_values))] for i, ea in enumerate(self.hypotheses): - if ea in ['free', 'null', 'true']: + if ea in {'free', 'null', 'true'}: result_names[i] = ea metadata['date'] = datetime.now().strftime('%Y%m%d_%H:%M:%S') diff --git a/tests/test_runner.py b/tests/test_runner.py index 34c9a2d4..dc7c4d13 100644 --- a/tests/test_runner.py +++ b/tests/test_runner.py @@ -52,8 +52,7 @@ def set_blueice_runner(self, toydata_mode='generate_and_write'): def test_runners(self): """Test of the toy_simulation and write_output method""" - set_runners = [self.set_gaussian_runner] - # set_runners = [self.set_blueice_runner, self.set_gaussian_runner] + set_runners = [self.set_blueice_runner, self.set_gaussian_runner] for set_runner in set_runners: # test toydata_mode generate_and_write set_runner() From dec5ca0204a9eaf77e79d5f887bd06ed16f56e83 Mon Sep 17 00:00:00 2001 From: dachengx Date: Mon, 31 Jul 2023 23:29:41 -0500 Subject: [PATCH 11/23] Minor change at docstring --- alea/models/blueice_extended_model.py | 2 ++ alea/parameters.py | 14 +++++++++----- alea/runner.py | 8 ++++---- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/alea/models/blueice_extended_model.py b/alea/models/blueice_extended_model.py index 131f4630..ae347915 100644 --- a/alea/models/blueice_extended_model.py +++ b/alea/models/blueice_extended_model.py @@ -170,6 +170,7 @@ def _build_ll_from_config(self, likelihood_config: dict) -> "LogLikelihoodSum": rate_parameter = rate_parameters[0] if rate_parameter.endswith("_rate_multiplier"): rate_parameter = rate_parameter.replace("_rate_multiplier", "") + # The ancillary term is handled in CustomAncillaryLikelihood ll.add_rate_parameter(rate_parameter, log_prior=None) else: raise NotImplementedError( @@ -187,6 +188,7 @@ def _build_ll_from_config(self, likelihood_config: dict) -> "LogLikelihoodSum": anchors = self.parameters[p].blueice_anchors if anchors is None: raise ValueError(f"Shape parameter {p} does not have any anchors.") + # The ancillary term is handled in CustomAncillaryLikelihood ll.add_shape_parameter(p, anchors=anchors, log_prior=None) ll.prepare() diff --git a/alea/parameters.py b/alea/parameters.py index e7f26da0..09326d07 100644 --- a/alea/parameters.py +++ b/alea/parameters.py @@ -1,5 +1,9 @@ from typing import Any, Dict, List, Optional, Tuple +# These imports are needed to evaluate the uncertainty string +import numpy +import scipy + class Parameter: """ @@ -40,12 +44,12 @@ def __init__( self.nominal_value = nominal_value self.fittable = fittable self.ptype = ptype - self._uncertainty = uncertainty + self.uncertainty = uncertainty self.relative_uncertainty = relative_uncertainty self.blueice_anchors = blueice_anchors self.fit_limits = fit_limits self.parameter_interval_bounds = parameter_interval_bounds - self._fit_guess = fit_guess + self.fit_guess = fit_guess self.description = description def __repr__(self) -> str: @@ -63,7 +67,7 @@ def uncertainty(self) -> float or Any: If the uncertainty is a string, it can be evaluated as a numpy or scipy function. """ if isinstance(self._uncertainty, str): - # Evaluate the uncertainty if it's a string + # Evaluate the uncertainty if it's a string starting with "scipy." or "numpy." if self._uncertainty.startswith("scipy.") or self._uncertainty.startswith("numpy."): return eval(self._uncertainty) else: @@ -275,8 +279,8 @@ def __call__( if any(i is None for k, i in values.items()): emptypars = ", ".join([k for k, i in values.items() if i is None]) raise AssertionError( - "All parameters must be set explicitly, or have a nominal value," - " encountered for: " + emptypars) + "All parameters must be set explicitly, or have a nominal value, " + "not satisfied for: " + emptypars) return values def __getattr__(self, name: str) -> Parameter: diff --git a/alea/runner.py b/alea/runner.py index 69ce8d2c..e2cee508 100644 --- a/alea/runner.py +++ b/alea/runner.py @@ -14,11 +14,11 @@ class Runner: """ Runner manipulates statistical model and toydata. - - initialize statistical model - - generate or read toy data + - initialize the statistical model + - generate or reads toy data - save toy data if needed - - fit parameters - - write output file + - fit fittable parameters + - write the output file One toyfile can contain multiple toydata, but all of them are from the same generate_values. Attributes: From 38234586dbddf44e915256f2caa45a907f3bfe7d Mon Sep 17 00:00:00 2001 From: dachengx Date: Mon, 31 Jul 2023 23:31:13 -0500 Subject: [PATCH 12/23] Minor change --- alea/examples/configs/unbinned_wimp_statistical_model.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/alea/examples/configs/unbinned_wimp_statistical_model.yaml b/alea/examples/configs/unbinned_wimp_statistical_model.yaml index cd4f3f37..8a7af095 100644 --- a/alea/examples/configs/unbinned_wimp_statistical_model.yaml +++ b/alea/examples/configs/unbinned_wimp_statistical_model.yaml @@ -48,7 +48,7 @@ parameter_definition: er_band_shift: nominal_value: 0 ptype: shape - uncertainty: null # scipy.stats.uniform(loc=-1, scale=2) + uncertainty: null # 'scipy.stats.uniform(loc=-1, scale=2)' # relative_uncertainty: false fittable: true blueice_anchors: From c0e69ffddd20007d4cd4a59e4c1396a950b0578e Mon Sep 17 00:00:00 2001 From: dachengx Date: Tue, 1 Aug 2023 03:29:22 -0500 Subject: [PATCH 13/23] Confidential interval calculation --- .../configs/unbinned_wimp_running.yaml | 6 +-- .../unbinned_wimp_statistical_model.yaml | 3 ++ alea/model.py | 40 +++++++++++------ alea/parameters.py | 32 ++++++++++---- alea/runner.py | 43 +++++++++++++------ alea/utils.py | 15 +++++++ tests/test_gaussian_model.py | 19 ++++---- tests/test_runner.py | 15 ++++++- 8 files changed, 123 insertions(+), 50 deletions(-) diff --git a/alea/examples/configs/unbinned_wimp_running.yaml b/alea/examples/configs/unbinned_wimp_running.yaml index 6a7a9e30..31b29678 100644 --- a/alea/examples/configs/unbinned_wimp_running.yaml +++ b/alea/examples/configs/unbinned_wimp_running.yaml @@ -13,7 +13,7 @@ computation: } parameters_in_common: { - hypotheses: ["true", "null", "free"], + hypotheses: ["free", "null", "true"], output_filename: "toymc_power_wimp_mass_{wimp_mass:d}_poi_expectation_{poi_expectation:.2f}.h5", n_mc: 5000, n_batch: 40, @@ -25,7 +25,7 @@ computation: parameters_to_vary: {wimp_mass: [10, 50, 200]} parameters_in_common: { - hypotheses: ["true", "null", "free"], + hypotheses: ["free", "null", "true"], output_filename: "toymc_power_wimp_mass_{wimp_mass:d}_poi_expectation_{poi_expectation:.2f}.h5", n_mc: 5000, n_batch: 40, @@ -39,7 +39,7 @@ computation: parameters_to_vary: {poi_expectation: [0.], wimp_mass: [10, 50, 200]} parameters_in_common: { - hypotheses: ["true", "null", "free"], + hypotheses: ["free", "null", "true"], output_filename: "toymc_power_wimp_mass_{wimp_mass:d}_poi_expectation_{poi_expectation:.2f}.h5", n_mc: 5000, n_batch: 40, diff --git a/alea/examples/configs/unbinned_wimp_statistical_model.yaml b/alea/examples/configs/unbinned_wimp_statistical_model.yaml index 8a7af095..5176345a 100644 --- a/alea/examples/configs/unbinned_wimp_statistical_model.yaml +++ b/alea/examples/configs/unbinned_wimp_statistical_model.yaml @@ -21,6 +21,9 @@ parameter_definition: fit_limits: - 0 - null + parameter_interval_bounds: + - 0 + - null er_rate_multiplier: nominal_value: 1.0 diff --git a/alea/model.py b/alea/model.py index 2cdcb1f5..d586637d 100644 --- a/alea/model.py +++ b/alea/model.py @@ -1,5 +1,6 @@ import inspect import warnings +from copy import deepcopy from typing import Tuple, Callable, Optional import numpy as np @@ -11,6 +12,7 @@ from inference_interface import toydata_to_file from alea.parameters import Parameters +from alea.utils import within_limits class StatisticalModel: @@ -302,6 +304,7 @@ def _confidence_interval_checks( else: source_name = poi_name mu_parameter = self.get_expectation_values(**kwargs)[source_name] + # update parameter_interval_bounds because poi is rate_multiplier parameter_interval_bounds = ( parameter_interval_bounds[0] / mu_parameter, parameter_interval_bounds[1] / mu_parameter) @@ -330,7 +333,9 @@ def confidence_interval( parameter_interval_bounds: Tuple[float, float] = None, confidence_level: float = None, confidence_interval_kind: str = None, - **kwargs) -> Tuple[float, float]: + best_fit_args: dict = None, + confidence_interval_args: dict = None, + ) -> Tuple[float, float]: """ Uses self.fit to compute confidence intervals for a certain named parameter @@ -344,35 +349,42 @@ def confidence_interval( (so that the range in the fit is parameter_interval_bounds/mus) otherwise the bound is taken as-is. """ - + if best_fit_args is None: + best_fit_args = {} + if confidence_interval_args is None: + confidence_interval_args = {} ci_objects = self._confidence_interval_checks( poi_name, parameter_interval_bounds, confidence_level, confidence_interval_kind, - **kwargs) + **confidence_interval_args) confidence_interval_kind, confidence_interval_threshold, parameter_interval_bounds = ci_objects # find best-fit: - best_result, best_ll = self.fit(**kwargs) + best_result, best_ll = self.fit(**best_fit_args) best_parameter = best_result[poi_name] - mask = (parameter_interval_bounds[0] < best_parameter) - mask &= (best_parameter < parameter_interval_bounds[1]) - assert mask, ("the best-fit is outside your confidence interval" - " search limits in parameter_interval_bounds") + mask = within_limits(best_parameter, parameter_interval_bounds) + if not mask: + raise ValueError( + f"The best-fit {best_parameter} is outside your confidence interval " + f"search limits in parameter_interval_bounds {parameter_interval_bounds}.") # log-likelihood - critical value: # define intersection between likelihood ratio curve and the critical curve: - def t(hypothesis): + def t(hypothesis_value): # define the intersection # between the profile-log-likelihood curve and the rejection threshold - kwargs[poi_name] = hypothesis - _, ll = self.fit(**kwargs) # ll is + log-likelihood here + _confidence_interval_args = deepcopy(confidence_interval_args) + _confidence_interval_args[poi_name] = hypothesis_value + _, ll = self.fit(**_confidence_interval_args) # ll is + log-likelihood here ret = 2. * (best_ll - ll) # likelihood curve "right way up" (smiling) # if positive, hypothesis is excluded - return ret - confidence_interval_threshold(hypothesis) + return ret - confidence_interval_threshold(hypothesis_value) + + t_best_parameter = t(best_parameter) - if confidence_interval_kind in {"upper", "central"}: + if confidence_interval_kind in {"upper", "central"} and t_best_parameter < 0: if t(parameter_interval_bounds[1]) > 0: ul = brentq(t, best_parameter, parameter_interval_bounds[1]) else: @@ -380,7 +392,7 @@ def t(hypothesis): else: ul = np.nan - if confidence_interval_kind in {"lower", "central"}: + if confidence_interval_kind in {"lower", "central"} and t_best_parameter < 0: if t(parameter_interval_bounds[0]) > 0: dl = brentq(t, parameter_interval_bounds[0], best_parameter) else: diff --git a/alea/parameters.py b/alea/parameters.py index 09326d07..c8c9498b 100644 --- a/alea/parameters.py +++ b/alea/parameters.py @@ -3,6 +3,9 @@ # These imports are needed to evaluate the uncertainty string import numpy import scipy +import numpy as np + +from alea.utils import MAX_FLOAT, within_limits class Parameter: @@ -94,6 +97,26 @@ def fit_guess(self) -> float: def fit_guess(self, value: float) -> None: self._fit_guess = value + @property + def parameter_interval_bounds(self) -> float: + # make sure to only return parameter_interval_bounds if fittable + if self._parameter_interval_bounds is not None and not self.fittable: + raise ValueError( + f"Parameter {self.name} is not fittable, but has a parameter_interval_bounds.") + else: + return self._parameter_interval_bounds + + @parameter_interval_bounds.setter + def parameter_interval_bounds(self, value: Optional[List]) -> None: + if value is None: + value = [-MAX_FLOAT, MAX_FLOAT] + else: + if value[0] is None: + value[0] = -MAX_FLOAT + if value[1] is None: + value[1] = MAX_FLOAT + self._parameter_interval_bounds = value + def __eq__(self, other: object) -> bool: """Returns True if all attributes are equal""" if isinstance(other, Parameter): @@ -103,14 +126,7 @@ def __eq__(self, other: object) -> bool: def value_in_fit_limits(self, value: float) -> bool: """Returns True if value is within fit_limits""" - if self.fit_limits is None: - return True - elif self.fit_limits[0] is None: - return value <= self.fit_limits[1] - elif self.fit_limits[1] is None: - return value >= self.fit_limits[0] - else: - return self.fit_limits[0] <= value <= self.fit_limits[1] + return within_limits(value, self.fit_limits) class Parameters: diff --git a/alea/runner.py b/alea/runner.py index e2cee508..dfd955f5 100644 --- a/alea/runner.py +++ b/alea/runner.py @@ -27,13 +27,14 @@ class Runner: hypotheses (list): list of hypotheses common_hypothesis (dict): common hypothesis, the values are copied to each hypothesis generate_values (dict): generate values for toydata - n_mc (int): number of Monte Carlo - toydata_file (str): toydata filename - toydata_mode (str): toydata mode, 'read', 'generate', 'generate_and_write', 'no_toydata' - metadata (dict): metadata, if None, it is set to {} - output_file (str): output filename - result_names (list): list of result names - result_dtype (list): list of result dtypes + _compute_confidence_interval (bool): whether compute confidence interval + _n_mc (int): number of Monte Carlo + _toydata_file (str): toydata filename + _toydata_mode (str): toydata mode, 'read', 'generate', 'generate_and_write', 'no_toydata' + _metadata (dict): metadata, if None, it is set to {} + _output_file (str): output filename + _result_names (list): list of result names + _result_dtype (list): list of result dtypes _hypotheses_values (list): list of values for hypotheses Args: @@ -43,6 +44,9 @@ class Runner: _n_mc (int): number of Monte Carlo statistical_model_args (dict, optional (default={})): arguments for statistical model parameter_definition (dict or list, optional (default=None)): parameter definition + likelihood_config (dict, optional (default=None)): likelihood configuration + compute_confidence_interval (bool, optional (default=False)): + whether compute confidence interval confidence_level (float, optional (default=0.9)): confidence level confidence_interval_kind (str, optional (default='central')): kind of confidence interval, choice from 'central', 'upper' or 'lower' @@ -52,11 +56,11 @@ class Runner: common hypothesis, the values are copied to each hypothesis generate_values (dict, optional (default=None)): generate values of toydata. If None, toydata depend on statistical model. - _toydata_mode (str, optional (default='generate_and_write')): + toydata_mode (str, optional (default='generate_and_write')): toydata mode, choice from 'read', 'generate', 'generate_and_write', 'no_toydata' - _toydata_file (str, optional (default=None)): toydata filename - _metadata (dict, optional (default=None)): metadata - _output_file (str, optional (default='test_toymc.h5')): output filename + toydata_file (str, optional (default=None)): toydata filename + metadata (dict, optional (default=None)): metadata + output_file (str, optional (default='test_toymc.h5')): output filename Todo: Implement confidence interval calculation @@ -73,6 +77,7 @@ def __init__( statistical_model_args: dict = None, parameter_definition: Optional[dict or list] = None, likelihood_config: dict = None, + compute_confidence_interval: bool = False, confidence_level: float = 0.9, confidence_interval_kind: str = 'central', confidence_interval_threshold: Callable[[float], float] = None, @@ -109,6 +114,7 @@ def __init__( self.hypotheses = hypotheses if hypotheses else [] self.common_hypothesis = common_hypothesis if common_hypothesis else {} self.generate_values = generate_values if generate_values else {} + self._compute_confidence_interval = compute_confidence_interval self._n_mc = n_mc self._toydata_file = toydata_file self._toydata_mode = toydata_mode @@ -132,6 +138,10 @@ def _get_hypotheses(self): """Get generate values list from hypotheses""" hypotheses_values = [] hypotheses = deepcopy(self.hypotheses) + if 'free' not in hypotheses and self._compute_confidence_interval: + raise ValueError('free hypothesis is needed for confidence interval calculation!') + if 'free' in hypotheses and hypotheses.index('free') != 0: + raise ValueError('free hypothesis should be the first hypothesis!') for hypothesis in hypotheses: if hypothesis == 'null': @@ -218,8 +228,15 @@ def toy_simulation(self): fit_result, max_llh = self.statistical_model.fit(**hypothesis_values) fit_result['ll'] = max_llh - fit_result['dl'] = -1. - fit_result['ul'] = -1. + if self._compute_confidence_interval and (self.poi not in hypothesis_values): + dl, ul = self.statistical_model.confidence_interval( + poi_name=self.poi, + best_fit_args=self._hypotheses_values[0], + confidence_interval_args=hypothesis_values) + else: + dl, ul = np.nan, np.nan + fit_result['dl'] = dl + fit_result['ul'] = ul fit_results.append(fit_result) # appemd toydata diff --git a/alea/utils.py b/alea/utils.py index 3d24fe6b..4ce5b618 100644 --- a/alea/utils.py +++ b/alea/utils.py @@ -12,6 +12,9 @@ logging.basicConfig(level=logging.INFO) +MAX_FLOAT = np.sqrt(np.finfo(np.float32).max) + + def get_analysis_space(analysis_space: dict) -> list: eval_analysis_space = [] @@ -147,3 +150,15 @@ def get_template_folder_list(likelihood_config): raise ValueError( "template_folder must be either a string or a list of strings.") return template_folder_list + + +def within_limits(value, limits): + """Returns True if value is within limits""" + if limits is None: + return True + elif limits[0] is None: + return value <= limits[1] + elif limits[1] is None: + return value >= limits[0] + else: + return limits[0] <= value <= limits[1] diff --git a/tests/test_gaussian_model.py b/tests/test_gaussian_model.py index 84187166..9838fdc7 100644 --- a/tests/test_gaussian_model.py +++ b/tests/test_gaussian_model.py @@ -9,19 +9,18 @@ gaussian_model_parameter_definition = { 'mu': { - 'fit_guess': 0., + 'fit_guess': 0.0, 'fittable': True, - 'nominal_value': 0., - }, - 'sigma': { - 'fit_guess': 1., - 'fit_limits': [ - 0., - None, + 'nominal_value': 0.0, + 'parameter_interval_bounds': [ + -10, + 10 ], - 'fittable': True, - 'nominal_value': 1., }, + 'sigma': { + 'fittable': False, + 'nominal_value': 1.0, + } } diff --git a/tests/test_runner.py b/tests/test_runner.py index dc7c4d13..2fedc777 100644 --- a/tests/test_runner.py +++ b/tests/test_runner.py @@ -2,8 +2,11 @@ import pytest from unittest import TestCase +import numpy as np + from alea.utils import load_yaml from alea.runner import Runner +from inference_interface import toyfiles_to_numpy from .test_gaussian_model import gaussian_model_parameter_definition @@ -18,6 +21,7 @@ def setUp(cls): cls.model_config = load_yaml(cls.runner_config['statistical_model_config']) cls.toydata_file = 'simple_data.h5' cls.output_file = 'test_toymc.h5' + cls.n_mc = 3 def set_gaussian_runner(self, toydata_mode='generate_and_write'): """Set a new runner instance with GaussianModel""" @@ -25,9 +29,10 @@ def set_gaussian_runner(self, toydata_mode='generate_and_write'): statistical_model='alea.examples.gaussian_model.GaussianModel', poi='mu', hypotheses=['free', 'null', 'true'], - n_mc=10, + n_mc=self.n_mc, generate_values={'mu': 1., 'sigma': 1.}, parameter_definition=gaussian_model_parameter_definition, + compute_confidence_interval=True, toydata_mode=toydata_mode, toydata_file=self.toydata_file, output_file=self.output_file, @@ -41,10 +46,11 @@ def set_blueice_runner(self, toydata_mode='generate_and_write'): statistical_model=self.runner_config['statistical_model'], poi=self.runner_config['poi'], hypotheses=parameter_zvc['parameters_in_common']['hypotheses'], - n_mc=10, + n_mc=self.n_mc, generate_values={'wimp_rate_multiplier': 1.0}, parameter_definition=self.model_config['parameter_definition'], likelihood_config=self.model_config['likelihood_config'], + compute_confidence_interval=True, toydata_mode=toydata_mode, toydata_file=self.toydata_file, output_file=self.output_file, @@ -63,4 +69,9 @@ def test_runners(self): set_runner(toydata_mode='read') self.runner.run() remove(self.toydata_file) + + # check confidence interval computation + results = toyfiles_to_numpy(self.runner._output_file) + if np.any(np.isnan(results['free']['dl'])) or np.any(np.isnan(results['free']['ul'])): + raise ValueError('Confidence interval computation failed!') remove(self.output_file) From 8f22c0c63438b7738f0f9b30c3f0a64771ed0f40 Mon Sep 17 00:00:00 2001 From: dachengx Date: Tue, 1 Aug 2023 03:53:46 -0500 Subject: [PATCH 14/23] Happier code style --- alea/parameters.py | 1 - tests/test_gaussian_model.py | 10 +++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/alea/parameters.py b/alea/parameters.py index c8c9498b..5eeff18c 100644 --- a/alea/parameters.py +++ b/alea/parameters.py @@ -3,7 +3,6 @@ # These imports are needed to evaluate the uncertainty string import numpy import scipy -import numpy as np from alea.utils import MAX_FLOAT, within_limits diff --git a/tests/test_gaussian_model.py b/tests/test_gaussian_model.py index 9838fdc7..9d007cd9 100644 --- a/tests/test_gaussian_model.py +++ b/tests/test_gaussian_model.py @@ -9,18 +9,18 @@ gaussian_model_parameter_definition = { 'mu': { - 'fit_guess': 0.0, + 'fit_guess': 0., 'fittable': True, - 'nominal_value': 0.0, + 'nominal_value': 0., 'parameter_interval_bounds': [ -10, - 10 + 10, ], }, 'sigma': { 'fittable': False, - 'nominal_value': 1.0, - } + 'nominal_value': 1., + }, } From 9e5c79f628028836bade7f68f04befb0ce39634a Mon Sep 17 00:00:00 2001 From: dachengx Date: Tue, 1 Aug 2023 14:22:09 -0500 Subject: [PATCH 15/23] Set sigma as not fittable otherwise horrible things when CL calculation happens --- notebooks/placeholder.ipynb | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/notebooks/placeholder.ipynb b/notebooks/placeholder.ipynb index fe4e9a0d..55cd6574 100644 --- a/notebooks/placeholder.ipynb +++ b/notebooks/placeholder.ipynb @@ -33,17 +33,17 @@ " 'fit_guess': 0.,\n", " 'fittable': True,\n", " 'nominal_value': 0.,\n", + " 'parameter_interval_bounds': [\n", + " -10,\n", + " 10,\n", + " ],\n", " },\n", " 'sigma': {\n", - " 'fit_guess': 1.,\n", - " 'fit_limits': [\n", - " 0.,\n", - " None,\n", - " ],\n", - " 'fittable': True,\n", + " 'fittable': False,\n", " 'nominal_value': 1.,\n", " },\n", "}\n", + "\n", "model = GaussianModel(parameter_definition=parameter_definition)" ] }, From 89c0bd393cdeb74243de2e8933baeeae2439df49 Mon Sep 17 00:00:00 2001 From: dachengx Date: Tue, 1 Aug 2023 14:24:45 -0500 Subject: [PATCH 16/23] Remove todo of runner because it is done --- alea/runner.py | 3 --- docs/source/reference/alea.rst | 8 ++++++++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/alea/runner.py b/alea/runner.py index dfd955f5..24dac271 100644 --- a/alea/runner.py +++ b/alea/runner.py @@ -61,9 +61,6 @@ class Runner: toydata_file (str, optional (default=None)): toydata filename metadata (dict, optional (default=None)): metadata output_file (str, optional (default='test_toymc.h5')): output filename - - Todo: - Implement confidence interval calculation """ def __init__( diff --git a/docs/source/reference/alea.rst b/docs/source/reference/alea.rst index 82980323..9f487fbc 100644 --- a/docs/source/reference/alea.rst +++ b/docs/source/reference/alea.rst @@ -29,6 +29,14 @@ alea.parameters module :undoc-members: :show-inheritance: +alea.runner module +------------------ + +.. automodule:: alea.runner + :members: + :undoc-members: + :show-inheritance: + alea.simulators module ---------------------- From b710149e7bf2c833ea7d035dcd7416dafeeb2b34 Mon Sep 17 00:00:00 2001 From: dachengx Date: Tue, 1 Aug 2023 14:40:53 -0500 Subject: [PATCH 17/23] Make the placeholder is already an OK gaussian model example --- docs/source/index.rst | 2 +- docs/source/notebooks/gaussian_model.ipynb | 1 + docs/source/notebooks/placeholder.ipynb | 1 - notebooks/{placeholder.ipynb => gaussian_model.ipynb} | 4 ++-- 4 files changed, 4 insertions(+), 4 deletions(-) create mode 120000 docs/source/notebooks/gaussian_model.ipynb delete mode 120000 docs/source/notebooks/placeholder.ipynb rename notebooks/{placeholder.ipynb => gaussian_model.ipynb} (99%) diff --git a/docs/source/index.rst b/docs/source/index.rst index 8f5ebe83..7ca4be5f 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -38,7 +38,7 @@ Alea is a tool to perform toyMC-based inference constructions. :maxdepth: 1 :caption: Example notebooks - notebooks/placeholders.ipynb + notebooks/gaussian_model.ipynb .. toctree:: :maxdepth: 2 diff --git a/docs/source/notebooks/gaussian_model.ipynb b/docs/source/notebooks/gaussian_model.ipynb new file mode 120000 index 00000000..3de578be --- /dev/null +++ b/docs/source/notebooks/gaussian_model.ipynb @@ -0,0 +1 @@ +../../../notebooks/gaussian_model.ipynb \ No newline at end of file diff --git a/docs/source/notebooks/placeholder.ipynb b/docs/source/notebooks/placeholder.ipynb deleted file mode 120000 index c5208c92..00000000 --- a/docs/source/notebooks/placeholder.ipynb +++ /dev/null @@ -1 +0,0 @@ -../../../notebooks/placeholder.ipynb \ No newline at end of file diff --git a/notebooks/placeholder.ipynb b/notebooks/gaussian_model.ipynb similarity index 99% rename from notebooks/placeholder.ipynb rename to notebooks/gaussian_model.ipynb index 55cd6574..d636c3e0 100644 --- a/notebooks/placeholder.ipynb +++ b/notebooks/gaussian_model.ipynb @@ -2,10 +2,10 @@ "cells": [ { "cell_type": "markdown", - "id": "76edc00b-f220-47e2-914b-0781c288594d", + "id": "766ddb65-2b9c-4c08-85db-96fdae472503", "metadata": {}, "source": [ - "# Please delete this when new meaningful notebooks are added." + "# Example of GaussianModel" ] }, { From 9215c9c3e563eeada83110524da86688a2c1803e Mon Sep 17 00:00:00 2001 From: dachengx Date: Wed, 2 Aug 2023 17:20:05 -0500 Subject: [PATCH 18/23] Few change, add warnings and improve performance --- alea/examples/gaussian_model.py | 4 ++-- alea/model.py | 3 +++ alea/models/blueice_extended_model.py | 7 ++----- alea/runner.py | 10 +++++----- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/alea/examples/gaussian_model.py b/alea/examples/gaussian_model.py index b2addc15..6061bff9 100644 --- a/alea/examples/gaussian_model.py +++ b/alea/examples/gaussian_model.py @@ -24,12 +24,12 @@ class GaussianModel(StatisticalModel): def __init__( self, parameter_definition: Optional[Dict or List] = None, - *args, **kwargs, + **kwargs, ): """Initialise a gaussian model.""" if parameter_definition is None: parameter_definition = ["mu", "sigma"] - super().__init__(parameter_definition=parameter_definition, *args, **kwargs) + super().__init__(parameter_definition=parameter_definition, **kwargs) def _ll(self, mu=None, sigma=None): """ diff --git a/alea/model.py b/alea/model.py index 3d879c66..4f3c1725 100644 --- a/alea/model.py +++ b/alea/model.py @@ -457,6 +457,9 @@ def t(hypothesis_value): t_best_parameter = t(best_parameter) + if t_best_parameter < 0: + warnings.warn(f"CL calculation failed, given fixed parameters {confidence_interval_args}.") + if confidence_interval_kind in {"upper", "central"} and t_best_parameter < 0: if t(parameter_interval_bounds[1]) > 0: ul = brentq(t, best_parameter, parameter_interval_bounds[1]) diff --git a/alea/models/blueice_extended_model.py b/alea/models/blueice_extended_model.py index e1685ace..69d84581 100644 --- a/alea/models/blueice_extended_model.py +++ b/alea/models/blueice_extended_model.py @@ -38,17 +38,14 @@ class BlueiceExtendedModel(StatisticalModel): (assert that all sources have the same analysis_space) """ - def __init__( - self, - parameter_definition: dict, likelihood_config: dict, - *args, **kwargs): + def __init__(self, parameter_definition: dict, likelihood_config: dict, **kwargs): """Initializes the statistical model. Args: parameter_definition (dict): A dictionary defining the model parameters. likelihood_config (dict): A dictionary defining the likelihood. """ - super().__init__(parameter_definition=parameter_definition, *args, **kwargs) + super().__init__(parameter_definition=parameter_definition, **kwargs) self._likelihood = self._build_ll_from_config(likelihood_config) self.likelihood_names = [t["name"] for t in likelihood_config["likelihood_terms"]] self.likelihood_names.append("ancillary_likelihood") diff --git a/alea/runner.py b/alea/runner.py index 24dac271..052585a1 100644 --- a/alea/runner.py +++ b/alea/runner.py @@ -218,11 +218,13 @@ def toy_simulation(self): if self._toydata_mode == 'generate' or self._toydata_mode == 'generate_and_write': self.statistical_model.data = self.statistical_model.generate_data( **self.generate_values) + if self._toydata_mode == 'generate_and_write': + # append toydata + toydata.append(self.statistical_model.data) + if self._toydata_mode == 'read': + self.statistical_model.data = toydata[i_mc] fit_results = [] for hypothesis_values in self._hypotheses_values: - if self._toydata_mode == 'read': - self.statistical_model.data = toydata[i_mc] - fit_result, max_llh = self.statistical_model.fit(**hypothesis_values) fit_result['ll'] = max_llh if self._compute_confidence_interval and (self.poi not in hypothesis_values): @@ -236,8 +238,6 @@ def toy_simulation(self): fit_result['ul'] = ul fit_results.append(fit_result) - # appemd toydata - toydata.append(self.statistical_model.data) # assign fitting results for fit_result, result_array in zip(fit_results, results): result_array[i_mc] = tuple(fit_result[pn] for pn in self._result_names) From 9a577e32a1b21ed75398e7000a0817f606685fe0 Mon Sep 17 00:00:00 2001 From: dachengx Date: Wed, 2 Aug 2023 17:35:59 -0500 Subject: [PATCH 19/23] Convert the data generation of runner into a generator --- alea/runner.py | 49 +++++++++++++++++++++++++++++--------------- tests/test_runner.py | 2 +- 2 files changed, 33 insertions(+), 18 deletions(-) diff --git a/alea/runner.py b/alea/runner.py index 052585a1..58a853d5 100644 --- a/alea/runner.py +++ b/alea/runner.py @@ -3,6 +3,7 @@ from datetime import datetime from pydoc import locate import inspect +import warnings from tqdm import tqdm import numpy as np @@ -197,6 +198,32 @@ def write_toydata(self, toydata, toydata_names): """ self.statistical_model.store_data(self._toydata_file, toydata, toydata_names) + def data_generator(self): + """Generate or read toydata, and store them to attribute if needed""" + if self._toydata_mode == 'read': + self.toydata, self.toydata_names = self.read_toydata() + if len(self.toydata) < self._n_mc: + raise ValueError( + f'Number of stored toydata {len(self.toydata)} is ' + f'less than number of Monte Carlo {self._n_mc}!') + elif len(self.toydata) > self._n_mc: + warnings.warn( + f'Number of stored toydata {len(self.toydata)} is ' + f'larger than number of Monte Carlo {self._n_mc}.') + else: + self.toydata = [] + self.toydata_names = None + for i_mc in tqdm(range(self._n_mc)): + if self._toydata_mode == 'generate' or self._toydata_mode == 'generate_and_write': + data = self.statistical_model.generate_data( + **self.generate_values) + if self._toydata_mode == 'generate_and_write': + # append toydata + self.toydata.append(data) + if self._toydata_mode == 'read': + data = self.toydata[i_mc] + yield data + def toy_simulation(self): """ For each Monte Carlo: @@ -208,21 +235,9 @@ def toy_simulation(self): }: raise ValueError(f'Unknown toydata mode: {self._toydata_mode}') - toydata = [] - toydata_names = None - if self._toydata_mode == 'read': - toydata, toydata_names = self.read_toydata() - results = [np.zeros(self._n_mc, dtype=self._result_dtype) for _ in self._hypotheses_values] - for i_mc in tqdm(range(self._n_mc)): - if self._toydata_mode == 'generate' or self._toydata_mode == 'generate_and_write': - self.statistical_model.data = self.statistical_model.generate_data( - **self.generate_values) - if self._toydata_mode == 'generate_and_write': - # append toydata - toydata.append(self.statistical_model.data) - if self._toydata_mode == 'read': - self.statistical_model.data = toydata[i_mc] + for i_mc, data in enumerate(self.data_generator()): + self.statistical_model.data = data fit_results = [] for hypothesis_values in self._hypotheses_values: fit_result, max_llh = self.statistical_model.fit(**hypothesis_values) @@ -243,12 +258,12 @@ def toy_simulation(self): result_array[i_mc] = tuple(fit_result[pn] for pn in self._result_names) if self._toydata_mode == 'generate_and_write': - self.write_toydata(toydata, toydata_names) + self.write_toydata(self.toydata, self.toydata_names) - return toydata, results + return results def run(self): """Run toy simulation""" - toydata, results = self.toy_simulation() + results = self.toy_simulation() self.write_output(results) diff --git a/tests/test_runner.py b/tests/test_runner.py index 2fedc777..2e4c78f8 100644 --- a/tests/test_runner.py +++ b/tests/test_runner.py @@ -58,7 +58,7 @@ def set_blueice_runner(self, toydata_mode='generate_and_write'): def test_runners(self): """Test of the toy_simulation and write_output method""" - set_runners = [self.set_blueice_runner, self.set_gaussian_runner] + set_runners = [self.set_gaussian_runner, self.set_blueice_runner] for set_runner in set_runners: # test toydata_mode generate_and_write set_runner() From 7958d71399908cbeef34ea16d0b6daaef0b88ce9 Mon Sep 17 00:00:00 2001 From: dachengx Date: Wed, 2 Aug 2023 17:55:57 -0500 Subject: [PATCH 20/23] More compact data generator --- alea/runner.py | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/alea/runner.py b/alea/runner.py index 58a853d5..62565763 100644 --- a/alea/runner.py +++ b/alea/runner.py @@ -199,30 +199,40 @@ def write_toydata(self, toydata, toydata_names): self.statistical_model.store_data(self._toydata_file, toydata, toydata_names) def data_generator(self): - """Generate or read toydata, and store them to attribute if needed""" + """Generate, save or read toydata""" + # check toydata mode + if self._toydata_mode not in { + 'read', 'generate', 'generate_and_write', 'no_toydata', + }: + raise ValueError(f'Unknown toydata mode: {self._toydata_mode}') + # check toydata file size if self._toydata_mode == 'read': - self.toydata, self.toydata_names = self.read_toydata() - if len(self.toydata) < self._n_mc: + toydata, toydata_names = self.read_toydata() + if len(toydata) < self._n_mc: raise ValueError( - f'Number of stored toydata {len(self.toydata)} is ' + f'Number of stored toydata {len(toydata)} is ' f'less than number of Monte Carlo {self._n_mc}!') - elif len(self.toydata) > self._n_mc: + elif len(toydata) > self._n_mc: warnings.warn( - f'Number of stored toydata {len(self.toydata)} is ' + f'Number of stored toydata {len(toydata)} is ' f'larger than number of Monte Carlo {self._n_mc}.') else: - self.toydata = [] - self.toydata_names = None - for i_mc in tqdm(range(self._n_mc)): + toydata = [] + toydata_names = None + # generate toydata + for i_mc in range(self._n_mc): if self._toydata_mode == 'generate' or self._toydata_mode == 'generate_and_write': data = self.statistical_model.generate_data( **self.generate_values) if self._toydata_mode == 'generate_and_write': # append toydata - self.toydata.append(data) + toydata.append(data) if self._toydata_mode == 'read': - data = self.toydata[i_mc] + data = toydata[i_mc] yield data + # save toydata + if self._toydata_mode == 'generate_and_write': + self.write_toydata(toydata, toydata_names) def toy_simulation(self): """ @@ -230,13 +240,8 @@ def toy_simulation(self): - run toy simulation a specified toydata mode and generate values. - loop over hypotheses. """ - if self._toydata_mode not in { - 'read', 'generate', 'generate_and_write', 'no_toydata', - }: - raise ValueError(f'Unknown toydata mode: {self._toydata_mode}') - results = [np.zeros(self._n_mc, dtype=self._result_dtype) for _ in self._hypotheses_values] - for i_mc, data in enumerate(self.data_generator()): + for i_mc, data in tqdm(enumerate(self.data_generator()), total=self._n_mc): self.statistical_model.data = data fit_results = [] for hypothesis_values in self._hypotheses_values: @@ -256,10 +261,6 @@ def toy_simulation(self): # assign fitting results for fit_result, result_array in zip(fit_results, results): result_array[i_mc] = tuple(fit_result[pn] for pn in self._result_names) - - if self._toydata_mode == 'generate_and_write': - self.write_toydata(self.toydata, self.toydata_names) - return results def run(self): From 6bab21b9f5b772eadbe6c061118f7c304374b96f Mon Sep 17 00:00:00 2001 From: dachengx Date: Wed, 2 Aug 2023 23:54:15 -0500 Subject: [PATCH 21/23] Also specify what if toydata_mode is no_toydata --- alea/runner.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/alea/runner.py b/alea/runner.py index 62565763..73527a55 100644 --- a/alea/runner.py +++ b/alea/runner.py @@ -227,8 +227,10 @@ def data_generator(self): if self._toydata_mode == 'generate_and_write': # append toydata toydata.append(data) - if self._toydata_mode == 'read': + elif self._toydata_mode == 'read': data = toydata[i_mc] + elif self._toydata_mode == 'no_toydata': + data = None yield data # save toydata if self._toydata_mode == 'generate_and_write': From 3dbaa98a9d60ad4c1695483818d67141d576d255 Mon Sep 17 00:00:00 2001 From: dachengx Date: Fri, 4 Aug 2023 09:58:17 -0500 Subject: [PATCH 22/23] Warning when parameter_interval_bounds contains None --- alea/model.py | 21 ++++++++++++--------- alea/parameters.py | 31 ++++++++++++++++++++----------- alea/utils.py | 19 +++++++++++++++++-- 3 files changed, 49 insertions(+), 22 deletions(-) diff --git a/alea/model.py b/alea/model.py index 4f3c1725..83c14fb6 100644 --- a/alea/model.py +++ b/alea/model.py @@ -11,7 +11,7 @@ from inference_interface import toydata_to_file from alea.parameters import Parameters -from alea.utils import within_limits +from alea.utils import within_limits, clip_limits class StatisticalModel: @@ -273,6 +273,7 @@ def cost(args): call_kwargs = {} for i, k in enumerate(self.parameters.names): call_kwargs[k] = args[i] + # for optimization, we want to minimize the negative log-likelihood return self.ll(**call_kwargs) * -1 return cost @@ -354,10 +355,10 @@ def _confidence_interval_checks( if parameter_interval_bounds is None: parameter_interval_bounds = parameter_of_interest.parameter_interval_bounds - if parameter_interval_bounds is None: - raise ValueError( - "You must set parameter_interval_bounds in the parameter config" - " or when calling confidence_interval") + else: + value = parameter_interval_bounds + parameter_of_interest._check_parameter_interval_bounds(value) + parameter_interval_bounds = clip_limits(value) if parameter_of_interest.ptype == "rate": try: @@ -434,15 +435,17 @@ def confidence_interval( **confidence_interval_args) confidence_interval_kind, confidence_interval_threshold, parameter_interval_bounds = ci_objects - # find best-fit: - best_result, best_ll = self.fit(**best_fit_args) + # best_fit_args only provides the best-fit likelihood + _, best_ll = self.fit(**best_fit_args) + # the optimization of profile-likelihood under + # confidence_interval_args provides the best_parameter + best_result, _ = self.fit(**confidence_interval_args) best_parameter = best_result[poi_name] mask = within_limits(best_parameter, parameter_interval_bounds) if not mask: raise ValueError( f"The best-fit {best_parameter} is outside your confidence interval " f"search limits in parameter_interval_bounds {parameter_interval_bounds}.") - # log-likelihood - critical value: # define intersection between likelihood ratio curve and the critical curve: def t(hypothesis_value): @@ -457,7 +460,7 @@ def t(hypothesis_value): t_best_parameter = t(best_parameter) - if t_best_parameter < 0: + if t_best_parameter > 0: warnings.warn(f"CL calculation failed, given fixed parameters {confidence_interval_args}.") if confidence_interval_kind in {"upper", "central"} and t_best_parameter < 0: diff --git a/alea/parameters.py b/alea/parameters.py index ee812551..5de3ef10 100644 --- a/alea/parameters.py +++ b/alea/parameters.py @@ -1,10 +1,11 @@ +import warnings from typing import Any, Dict, List, Optional, Tuple # These imports are needed to evaluate the uncertainty string -import numpy -import scipy +import numpy # noqa: F401 +import scipy # noqa: F401 -from alea.utils import MAX_FLOAT, within_limits +from alea.utils import within_limits, clip_limits class Parameter: @@ -107,17 +108,13 @@ def parameter_interval_bounds(self) -> float: raise ValueError( f"Parameter {self.name} is not fittable, but has a parameter_interval_bounds.") else: - return self._parameter_interval_bounds + # print warning when value contains None + value = self._parameter_interval_bounds + self._check_parameter_interval_bounds(value) + return clip_limits(value) @parameter_interval_bounds.setter def parameter_interval_bounds(self, value: Optional[List]) -> None: - if value is None: - value = [-MAX_FLOAT, MAX_FLOAT] - else: - if value[0] is None: - value[0] = -MAX_FLOAT - if value[1] is None: - value[1] = MAX_FLOAT self._parameter_interval_bounds = value def __eq__(self, other: object) -> bool: @@ -131,6 +128,18 @@ def value_in_fit_limits(self, value: float) -> bool: """Returns True if value is within fit_limits""" return within_limits(value, self.fit_limits) + def _check_parameter_interval_bounds(self, value): + """Check if parameter_interval_bounds is within fit_limits and is not None.""" + if (value is None) or (value[0] is None) or (value[1] is None): + warnings.warn( + f"parameter_interval_bounds not defined for parameter {self.name}. " + "This may cause numerical overflow when calculating confidential interval.") + value = clip_limits(value) + if not (self.value_in_fit_limits(value[0]) and self.value_in_fit_limits(value[1])): + raise ValueError( + f"parameter_interval_bounds {value} not within " + f"fit_limits {self.fit_limits} for parameter {self.name}.") + class Parameters: """ diff --git a/alea/utils.py b/alea/utils.py index 95c73773..f468a617 100644 --- a/alea/utils.py +++ b/alea/utils.py @@ -1,7 +1,7 @@ import os import re import yaml -import pkg_resources +import importlib_resources from glob import glob from copy import deepcopy from pydoc import locate @@ -82,7 +82,7 @@ def _get_abspath(file_name): def _package_path(sub_directory): """Get the abs path of the requested sub folder""" - return pkg_resources.resource_filename('alea', f'{sub_directory}') + return importlib_resources.files('alea') / sub_directory def formatted_to_asterisked(formatted): @@ -176,3 +176,18 @@ def within_limits(value, limits): return value >= limits[0] else: return limits[0] <= value <= limits[1] + + +def clip_limits(value): + """ + Clip limits to be within [-MAX_FLOAT, MAX_FLOAT] + by converting None to -MAX_FLOAT and MAX_FLOAT. + """ + if value is None: + value = [-MAX_FLOAT, MAX_FLOAT] + else: + if value[0] is None: + value[0] = -MAX_FLOAT + if value[1] is None: + value[1] = MAX_FLOAT + return value From f35e0c763e0f06eadf06ad3ac4ea5a6bb751d0c5 Mon Sep 17 00:00:00 2001 From: dachengx Date: Fri, 4 Aug 2023 10:16:40 -0500 Subject: [PATCH 23/23] Update docstring a bit --- alea/model.py | 5 ++++- alea/runner.py | 3 +++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/alea/model.py b/alea/model.py index 83c14fb6..e33a9c47 100644 --- a/alea/model.py +++ b/alea/model.py @@ -421,7 +421,10 @@ def confidence_interval( If None, the default kind of the model is used. Keyword Args: - kwargs: the parameters for get_expectation_values and fit + best_fit_args: the parameters to **only** get the global best-fit likelihood + confidence_interval_args: the parameters to get the profile-likelihood, + also the best-fit parameters of profile-likelihood, + parameter_interval_bounds, and confidence interval """ if best_fit_args is None: best_fit_args = {} diff --git a/alea/runner.py b/alea/runner.py index 73527a55..7e4a600e 100644 --- a/alea/runner.py +++ b/alea/runner.py @@ -241,6 +241,9 @@ def toy_simulation(self): For each Monte Carlo: - run toy simulation a specified toydata mode and generate values. - loop over hypotheses. + + Todo: + Implement per-hypothesis switching on whether to compute confidence intervals """ results = [np.zeros(self._n_mc, dtype=self._result_dtype) for _ in self._hypotheses_values] for i_mc, data in tqdm(enumerate(self.data_generator()), total=self._n_mc):