From e40efb73cb262c59b2bb484b4d6dea2082e01979 Mon Sep 17 00:00:00 2001 From: Vanya Belyaev Date: Fri, 18 Oct 2024 15:34:16 +0200 Subject: [PATCH] 1. add more methods to `Ostap::Math::ECDF` got get the ranking 1. optimise `Ostap::Math::ECDF` and skip sorting when possible 1. add `ostap.stats.twosamples.py` to make Two Sampel Tests 1. add the test for Two Sample Test 1. reduce code duplication between GoF-1D and Two-Sample Tests --- ReleaseNotes/release_notes.md | 8 +- ostap/core/ostap_types.py | 12 +- ostap/fitting/fithelpers.py | 9 +- ostap/stats/gof1d.py | 369 ++++--------- ostap/stats/gof_utils.py | 221 +++++++- ostap/stats/tests/test_stats_2samples.py | 108 ++++ ostap/stats/twosamples.py | 625 +++++++++++++++++++++++ source/include/Ostap/ECDF.h | 100 ++-- source/src/ECDF.cpp | 48 +- 9 files changed, 1144 insertions(+), 356 deletions(-) create mode 100644 ostap/stats/tests/test_stats_2samples.py create mode 100644 ostap/stats/twosamples.py diff --git a/ReleaseNotes/release_notes.md b/ReleaseNotes/release_notes.md index f4462fe1..00c685ff 100644 --- a/ReleaseNotes/release_notes.md +++ b/ReleaseNotes/release_notes.md @@ -1,5 +1,11 @@ ## New features - + + 1. add more methods to `Ostap::Math::ECDF` got get the ranking + 1. optimise `Ostap::Math::ECDF` and skip sorting when possible + 1. add `ostap.stats.twosamples.py` to make Two Sampel Tests + 1. add the test for Two Sample Test + 1. reduce code duplication between GoF-1D and Two-Sample Tests + ## Backward incompatible ## Bug fixes diff --git a/ostap/core/ostap_types.py b/ostap/core/ostap_types.py index 87c511b8..50ddbd7b 100644 --- a/ostap/core/ostap_types.py +++ b/ostap/core/ostap_types.py @@ -95,10 +95,13 @@ if hasattr ( std , 'string' ) : string_types += ( std.string , ) if hasattr ( std , 'string_view' ) : string_types += ( std.string_view , ) # ============================================================================= -try : +try : # ======================================================================= + # ========================================================================= import numpy as np - listlike_types = listlike_types + ( np.ndarray , ) -except ImportError : + listlike_types = listlike_types + ( np.ndarray , ) + # ========================================================================= +except ImportError : # ======================================================== + # ========================================================================= pass # ============================================================================= dict_types = dict , @@ -106,11 +109,12 @@ sequence_types = listlike_types + ( Sequence , Collection , Iterable , Generator ) sized_types = Sized , path_types = string_types +# ============================================================================= if ( 3 , 6 ) <= python_version : path_types = string_types + ( os.PathLike , ) # ============================================================================= ## sometimes we need to ensure that dictionary is ordered -ordered_dict = dict +ordered_dict = dict if python_version < ( 3 , 7 ) : from collections import OrderedDict as ordered_dict dictlike_types += ( ordered_dict, ) diff --git a/ostap/fitting/fithelpers.py b/ostap/fitting/fithelpers.py index fe0532b1..8597afc6 100644 --- a/ostap/fitting/fithelpers.py +++ b/ostap/fitting/fithelpers.py @@ -617,7 +617,7 @@ def binning ( self , axis , name = '' ) : args_timer = 'timer' , 'timing' args_warnings = 'warnings' , 'warning' args_sumw2 = 'sumw2' , 'sumw2err' , 'sumw2errs' , 'sumw2error' , 'sumw2errors' -args_asymptotic = ( 'asymptotic' , 'asymptorocerr' , 'asymptoticerrs' , +args_asymptotic = ( 'asymptotic' , 'asymptoticcerr' , 'asymptoticerrs' , 'asymptoticerror , asymptoticerrors' ) args_batch = 'batch' , 'batchmode' , 'modebatch' args_backend = 'evalbackend' , 'backendeval' , 'backend' @@ -777,12 +777,7 @@ def parse_args ( self , dataset = None , *args , **kwargs ) : _args.append ( ROOT.RooFit.Extended ( a ) ) - elif key in ( 'cpu' , - 'cpus' , - 'ncpu' , - 'ncpus' , - 'numcpu' , - 'numcpus' ) and isinstance ( a , int ) and 1<= a : + elif key in args_numcpu and isinstance ( a , int ) and 1<= a : _args.append ( ROOT.RooFit.NumCPU( a ) ) diff --git a/ostap/stats/gof1d.py b/ostap/stats/gof1d.py index 1d4b2b0c..841420e2 100644 --- a/ostap/stats/gof1d.py +++ b/ostap/stats/gof1d.py @@ -33,14 +33,17 @@ from ostap.core.core import SE, VE, Ostap, cidict_fun from ostap.math.base import doubles, axis_range from ostap.math.models import f1_draw -from ostap.math.math_ve import significance +from ostap.stats.gof_utils import Estimators,Summary import ostap.fitting.ds2numpy import ostap.fitting.roofit import ROOT, math # ============================================================================= -try : +try : # ======================================================================= + # ========================================================================= import numpy as np -except ImportError : + # ========================================================================= +except ImportError : # ======================================================== + # ========================================================================= np = None # ============================================================================= # logging @@ -68,7 +71,7 @@ def kolmogorov_smirnov ( cdf_data ) : >>> ks2 = kolmogorov_smirnov ( cdf_data ) """ n = len ( cdf_data ) - result = max ( max ( ( i + 1.0 ) / n - Fi , Fi - float ( i ) / n ) for ( i, Fi ) in enumerate ( cdf_data ) ) + result = max ( max ( ( i + 1.0 ) / n - Fi , Fi - float ( i ) / n ) for ( i, Fi ) in enumerate ( cdf_data ) ) ** 2 return math.sqrt ( result ) # ============================================================================= ## Get Anderson-Darling statistiscs AD^2 @@ -187,13 +190,11 @@ def ZC ( cdf_data ) : result = sum ( ( flog ( ( 1.0 / Fi - 1 ) / ( ( n - 0.5 ) / ( i + 0.25 ) - 1 ) ) ) ** 2 for ( i , Fi ) in enumerate ( cdf_data ) ) return result # ============================================================================== - -# ============================================================================= ## @class GoF1D # Goodness of 1D-fits # @see https://doi.org/10.1111/1467-9868.00337 -class GoF1D(object) : - """ Goodness of 1D-fits +class GoF1D(Estimators) : + """ Goodness-of-fit 1D-case - see https://doi.org/10.1111/1467-9868.00337 """ def __init__ ( self , @@ -237,15 +238,22 @@ def cdf ( x ) : return fun.cdf ( x ) ## evalute CDF for sorted data self.__cdf_data = self.__vct_cdf ( self.__data ) - - self.__KS_val = kolmogorov_smirnov ( self.__cdf_data ) - self.__AD_val = anderson_darling ( self.__cdf_data ) - self.__CM_val = cramer_von_mises ( self.__cdf_data ) - self.__ZK_val = ZK ( self.__cdf_data ) - self.__ZA_val = ZA ( self.__cdf_data ) - self.__ZC_val = ZC ( self.__cdf_data ) - self.__K_val = kuiper ( self.__cdf_data ) - + + self.__estimators = { + 'KS' : kolmogorov_smirnov ( self.__cdf_data ) , + 'K' : kuiper ( self.__cdf_data ) , + 'AD' : anderson_darling ( self.__cdf_data ) , + 'CM' : cramer_von_mises ( self.__cdf_data ) , + 'ZK' : ZK ( self.__cdf_data ) , + 'ZA' : ZA ( self.__cdf_data ) , + 'ZC' : ZC ( self.__cdf_data ) , + } + # ========================================================================= + @property + def estimators ( self ) : + """`estimators` : get a dictionary of all estimators""" + return self.__estimators + # ========================================================================= ## size of dataset @property @@ -289,15 +297,23 @@ def cdf_data ( self ) : def kolmogorov_smirnov_estimator ( self ) : """ Get Kolmogorov-Smirnov statistiscs KS """ - return self.__KS_val + return self.__estimators.get('KS',None) - # =============================================== + # ========================================================================= + ## Get Kuiper statististics + @property + def kuiper_estimator ( self ) : + """ Get Kuiperstatistics + """ + return self.__estimators.get('K', None ) + + # ========================================================================= ## Get Anderson-Darling statistiscs @property def anderson_darling_estimator ( self ) : """ Get Anderson-Darling statistiscs """ - return self.__AD_val + return self.__estimators.get ( 'AD' , None ) # ========================================================================= ## Get Cramer-von Mises statistics @@ -305,7 +321,7 @@ def anderson_darling_estimator ( self ) : def cramer_von_mises_estimator ( self ) : """ Get Cramer-von Mises statistics """ - return self.__CM_val + return self.estimators.get ( 'CM' , None ) # ========================================================================= ## Get ZK statististics @@ -313,7 +329,7 @@ def cramer_von_mises_estimator ( self ) : def ZK_estimator ( self ) : """ Get ZK statistics """ - return self.__ZK_val + return self.estimators.get ( 'ZK' , None ) # ========================================================================= ## Get ZA statististics @@ -321,7 +337,7 @@ def ZK_estimator ( self ) : def ZA_estimator ( self ) : """ Get ZA statistics """ - return self.__ZA_val + return self.estimators['ZK'] # ========================================================================= ## Get ZC statististics @@ -329,65 +345,10 @@ def ZA_estimator ( self ) : def ZC_estimator ( self ) : """ Get ZC statistics """ - return self.__ZC_val + return self.__estimators['ZK'] - # ========================================================================= - ## Get Kuiper statististics - @property - def kuiper_estimator ( self ) : - """ Get Kuiperstatistics - """ - return self.__K_val - - # ========================================================================== - ## Print the summary as Table - def table ( self , title = '' , prefix = '' , width = 5 , precision = 3 ) : - """ Print the summary as Table """ - rows = [ ( 'Statistics' , 'Value' ) ] - from ostap.logger.pretty import pretty_float - import ostap.logger.table as T - - ks , expo = pretty_float ( self.__KS_val , width = width , precision = precision ) - if expo : row = 'Kolmogorov-Smirnov' , ks , '10^%+d' % expo - else : row = 'Kolmogorov-Smirnov' , ks - rows.append ( row ) - - ad , expo = pretty_float ( self.__AD_val , width = width , precision = precision ) - if expo : row = 'Anderson-Darling' , ad , '10^%+d' % expo - else : row = 'Anderson-Darling' , ad - rows.append ( row ) - - cm , expo = pretty_float ( self.__CM_val , width = width , precision = precision ) - if expo : row = 'Cramer-von Mises' , cm , '10^%+d' % expo - else : row = 'Cramer-von Mises' , cm - rows.append ( row ) - - k , expo = pretty_float ( self.__K_val , width = width , precision = precision ) - if expo : row = 'Kuiper' , k , '10^%+d' % expo - else : row = 'Kuiper' , k - rows.append ( row ) - - zk , expo = pretty_float ( self.__ZK_val , width = width , precision = precision ) - if expo : row = 'ZK' , zk, '10^%+d' % expo - else : row = 'ZK' , zk - rows.append ( row ) - - za , expo = pretty_float ( self.__ZA_val , width = width , precision = precision ) - if expo : row = 'ZA' , za, '10^%+d' % expo - else : row = 'ZA' , za - rows.append ( row ) - - zc , expo = pretty_float ( self.__ZC_val , width = width , precision = precision ) - if expo : row = 'ZC' , zc, '10^%+d' % expo - else : row = 'ZC' , zc - rows.append ( row ) - - - title = title if title else 'Goodness of 1D-fit' - return T.table ( rows , title = title , prefix = prefix , alignment = 'lcl' ) - - __repr__ = table - __str__ = table + __repr__ = Estimators.table + __str__ = Estimators.table # ========================================================================= ## Draw fit CDF & empirical ECDF @@ -407,7 +368,7 @@ def draw ( self , opts = '' , *args , **kwargs ) : if hasattr ( cdf , 'xmin' ) : xmin = min ( xmin , cdf.xmin () ) if hasattr ( cdf , 'xmax' ) : xmax = max ( xmax , cdf.xmax () ) - xmin , xmax = axis_range ( xmin , xmax , delta = 0.05 ) + xmin , xmax = axis_range ( xmin , xmax , delta = 0.10 ) xmin = kwargs.pop ( 'xmin' , xmin ) xmax = kwargs.pop ( 'xmax' , xmax ) @@ -421,21 +382,13 @@ def draw ( self , opts = '' , *args , **kwargs ) : return ecdf.draw ( '%s %s' % ( 'same' , opts ) , color = 2 , linewidth = 3 , xmin = xmin , xmax = xmax , **kwargs ) -# ============================================================================= -KS_keys = 'ks' , 'kolmogorov' , 'kolmogorovsmirnov' -AD_keys = 'ad' , 'andersen' , 'andersendarling' -CM_keys = 'cm' , 'cramer' , 'cramervonmises' -ZK_keys = 'zk' , -ZA_keys = 'za' , -ZC_keys = 'zc' , -K_keys = 'k' , 'kuiper' # ============================================================================= ## @class GoF1DToys # Check Goodness of 1D-fits using toys -class GoF1DToys(GoF1D) : - """ Check Goodness of 1D-fits ysing toys +class GoF1DToys(GoF1D,Summary) : + """ Check Goodness-of-fit with toys (1D-case) """ - ## result of GoGptoys + ## result of GoF-toys Result = namedtuple ( 'Result' , 'statistics counter pvalue nsigma' ) # ========================================================================= def __init__ ( self , @@ -446,21 +399,14 @@ def __init__ ( self , assert isinstance ( nToys , int ) and 0 < nToys , "Invalid `nToys` argument!" - ## initializat the base + ## initialize the base GoF1D.__init__ ( self , pdf , dataset ) - self.__pdf = pdf + self.__pdf = pdf + + self.__counters = defaultdict(SE) + self.__ecdfs = {} - self.__KS_cnt = SE () - self.__AD_cnt = SE () - self.__CM_cnt = SE () - self.__ZK_cnt = SE () - self.__ZA_cnt = SE () - self.__ZC_cnt = SE () - self.__K_cnt = SE () - - self.__ecdfs = {} - self.__nToys = 0 if 0 < nToys : self.run ( nToys , silent = silent ) @@ -473,8 +419,9 @@ def run ( self , nToys = 1000 , silent = False ) : varname = self.__pdf.xvar.name - results = defaultdict(list) - vct_cdf = self.vcdf + results = defaultdict(list) + counters = self.counters + vct_cdf = self.vcdf from ostap.utils.progress_bar import progress_bar for i in progress_bar ( nToys , silent = silent , description = 'Toys:') : @@ -485,29 +432,29 @@ def run ( self , nToys = 1000 , silent = False ) : cdf_data = vct_cdf ( data ) ks = kolmogorov_smirnov ( cdf_data ) + k = kuiper ( cdf_data ) ad = anderson_darling ( cdf_data ) cm = cramer_von_mises ( cdf_data ) zk = ZK ( cdf_data ) za = ZA ( cdf_data ) zc = ZC ( cdf_data ) - k = kuiper ( cdf_data ) - - self.__KS_cnt += ks - self.__AD_cnt += ad - self.__CM_cnt += cm - self.__ZK_cnt += zk - self.__ZA_cnt += za - self.__ZC_cnt += zc - self.__K_cnt += k - results [ 'KS' ].append ( ks ) - results [ 'AD' ].append ( ad ) - results [ 'CM' ].append ( cm ) - results [ 'ZK' ].append ( zk ) - results [ 'ZA' ].append ( za ) - results [ 'ZC' ].append ( zc ) - results [ 'K' ].append ( k ) + counters [ 'KS' ] += ks + counters [ 'K' ] += k + counters [ 'AD' ] += ad + counters [ 'CM' ] += cm + counters [ 'ZK' ] += zk + counters [ 'ZA' ] += za + counters [ 'ZC' ] += zc + results [ 'KS' ].append ( ks ) + results [ 'K' ].append ( k ) + results [ 'AD' ].append ( ad ) + results [ 'CM' ].append ( cm ) + results [ 'ZK' ].append ( zk ) + results [ 'ZA' ].append ( za ) + results [ 'ZC' ].append ( zc ) + ## delete data if isinstance ( dset , ROOT.RooDataSet ) : dset = Ostap.MoreRooFit.delete_data ( dset ) @@ -541,31 +488,20 @@ def nToys ( self ) : def ecdfs ( self ) : """`ecdfs` : toy results as empirical cumulative distribution functions""" return self.__ecdfs - # ========================================================================= - ## Helper methof to get result - def result ( self , value , counter , label ) : - """ Helper method to get result - """ - if self.__ecdfs and label in self.__ecdfs : - pvalue = self.__ecdfs [ label ] . estimate ( value ) - nsigma = significance ( pvalue ) - else : - pvalue = VE ( -1 , 0 ) - nsigma = VE ( -1 , 0 ) - ## - return self.Result ( value , - counter , - pvalue , - nsigma ) - + ## Counters + @property + def counters ( self ) : + """`counters` : toy results as counters""" + return self.__counters + # ========================================================================= ## Get Kolmogorov-Smirnov statistiscs @property def kolmogorov_smirnov ( self ) : """ Get Kolmogorov-Smirnov statistiscs KS """ - return self.result ( self.kolmogorov_smirnov_estimator , self.__KS_cnt , 'KS' ) + return self.result ( 'KS' ) # =============================================== ## Get Anderson-Darling statistiscs @@ -573,7 +509,7 @@ def kolmogorov_smirnov ( self ) : def anderson_darling ( self ) : """ Get Anderson-Darling statistiscs """ - return self.result ( self.anderson_darling_estimator , self.__AD_cnt , 'AD' ) + return self.result ( 'AD' ) # ========================================================================= ## Get Cramer-von Mises statistics @@ -581,7 +517,7 @@ def anderson_darling ( self ) : def cramer_von_mises ( self ) : """ Get Cramer-von Mises statistics """ - return self.result ( self.cramer_von_mises_estimator , self.__CM_cnt , 'CM' ) + return self.result ( 'CM' ) # ========================================================================= ## Get ZK statististics @@ -589,7 +525,7 @@ def cramer_von_mises ( self ) : def ZK ( self ) : """ Get ZK statistics """ - return self.result ( self.ZK_estimator , self.__ZK_cnt , 'ZK' ) + return self.result ( 'ZK' ) # ========================================================================= ## Get ZA statististics @@ -597,7 +533,7 @@ def ZK ( self ) : def ZA ( self ) : """ Get ZA statistics """ - return self.result ( self.ZA_estimator , self.__ZA_cnt , 'ZA' ) + return self.result ( 'ZA' ) # ========================================================================= ## Get ZC statististics @@ -605,7 +541,7 @@ def ZA ( self ) : def ZC ( self ) : """ Get ZC statistics """ - return self.result ( self.ZC_estimator , self.__ZC_cnt , 'ZC' ) + return self.result ( 'ZC' ) # ========================================================================= ## Get Kuiper statististics @@ -613,144 +549,11 @@ def ZC ( self ) : def kuiper ( self ) : """ Get Kuiper statistics """ - return self.result ( self.kuiper_estimator , self.__K_cnt , 'K' ) - - # ========================================================================= - ## format a row in the table - def _row ( self , what , result , width = 5 , precision = 3 ) : - """ Format a row in the table - """ - - value = result.statistics - counter = result.counter - pvalue = result.pvalue - nsigma = result.nsigma - - mean = counter.mean () - rms = counter.rms () - vmin, vmax = counter.minmax () - - mxv = max ( value , mean.value() , mean.error() , rms , vmin , vmax ) - - from ostap.logger.pretty import fmt_pretty_ve - - fmt, fmtv , fmte , expo = fmt_pretty_ve ( VE ( mxv , mean.cov2() ) , - width = width , - precision = precision , - parentheses = False ) - - if expo : scale = 10**expo - else : scale = 1 - fmt2 = '%s/%s' % ( fmtv , fmtv ) - - return ( what , - fmtv % ( value / scale ) , - ( mean / scale ).toString ( fmt ) , - fmtv % ( rms / scale ) , - fmt2 % ( vmin / scale , vmax / scale ) , - ( '10^%+d' % expo if expo else '' ) , - ( 100 * pvalue ) .toString ( '%.2f +/- %-.2f' ) , - ( nsigma ) .toString ( '%.1f +/- %-.1f' ) ) - - # ========================================================================= - ## Make a summary table - def table ( self , title = '' , prefix = '' , width = 5 , precision = 3 ) : - """ Make a summary table - """ - - import ostap.logger.table as T - - rows = [ ( 'Statistics' , 'value' , 'mean' , 'rms' , 'min/max' , 'factor' , 'p-value [%]' , '#sigma' ) ] - - rows.append ( self._row ( 'Kolmogorov-Smirnov' , self.kolmogorov_smirnov , width = width , precision = precision ) ) - rows.append ( self._row ( 'Anderson-Darling' , self.anderson_darling , width = width , precision = precision ) ) - rows.append ( self._row ( 'Cramer-von Mises' , self.cramer_von_mises , width = width , precision = precision ) ) - rows.append ( self._row ( 'Kuiper' , self.kuiper , width = width , precision = precision ) ) - rows.append ( self._row ( 'ZK' , self.ZK , width = width , precision = precision ) ) - rows.append ( self._row ( 'ZA' , self.ZA , width = width , precision = precision ) ) - rows.append ( self._row ( 'ZC' , self.ZC , width = width , precision = precision ) ) - - ## skip empty columns - has_expo = False - for row in rows[1:] : - r = list ( row ) - if r[-3] : - has_expo = True - break - - if not has_expo : - new_rows = [] - for row in rows : - r = list ( row ) - del r [ -3 ] - new_rows.append ( r ) - rows = new_rows - - - if not title and self.nToys : - title = 'Goodness of 1D-fit with #%d toys' % self.nToys - elif not title : - title = 'Goodness of 1D-fit' - - return T.table ( rows , title = title , prefix = prefix , alignment = 'lccccccccccc' ) - - __repr__ = table - __str__ = table - - # ========================================================================= - ## Draw fit CDF & empirical ECDF - def draw ( self , what , opts = '' , *args , **kwargs ) : - """ Draw fit CDF & empirical CDF - """ - key = cidict_fun ( what ) - if key in KS_keys : - result = self.kolmogorov_smirnov - ecdf = self.__ecdfs [ 'KS' ] - logger.info ( 'Toy resuls for Kolmogorov-Smirnov estimate' ) - elif key in AD_keys : - result = self.anderson_darling - ecdf = self.__ecdfs [ 'AD' ] - logger.info ( 'Toy resuls for Anderson-Darling estimate' ) - elif key in CM_keys : - result = self.cramer_von_mises - ecdf = self.__ecdfs [ 'CM' ] - logger.info ( 'Toy resuls for Cramer-von Mises estimate' ) - elif key in ZK_keys : - result = self.ZK - ecdf = self.__ecdfs [ 'ZK' ] - logger.info ( 'Toy resuls for ZK estimate' ) - elif key in ZA_keys : - result = self.ZA - ecdf = self.__ecdfs [ 'ZA' ] - logger.info ( 'Toy resuls for ZK estimate' ) - elif key in ZC_keys : - result = self.ZC - ecdf = self.__ecdfs [ 'ZC' ] - logger.info ( 'Toy resuls for ZK estimate' ) - elif key in K_keys : - result = self.kuiper - ecdf = self.__ecdfs [ 'K' ] - logger.info ( 'Toy resuls for Kuiper estimate' ) - else : - raise KeyError ( "draw: Invalid `what`:%s" % what ) - - xmin,xmax = ecdf.xmin () , ecdf.xmax () - value = result.statistics - xmin = min ( xmin , value ) - xmax = max ( xmax , value ) - xmin , xmax = axis_range ( xmin , xmax , delta = 0.10 ) - xmin = xmin if 0 < xmin else 0 - kwargs [ 'xmin' ] = kwargs.get ( 'xmin' , xmin ) - kwargs [ 'xmax' ] = kwargs.get ( 'xmax' , xmax ) - result = ecdf.draw ( opts , *args , **kwargs ) - line = ROOT.TLine ( value , 0 , value , 1 ) - ## - line.SetLineWidth ( 4 ) - line.SetLineColor ( 8 ) - line.draw() - ## - return result, line + return self.result ( 'K' ) + table = Summary.table + __repr__ = Summary.table + __str__ = Summary.table # ============================================================================= if '__main__' == __name__ : diff --git a/ostap/stats/gof_utils.py b/ostap/stats/gof_utils.py index bb65a603..e8023f16 100644 --- a/ostap/stats/gof_utils.py +++ b/ostap/stats/gof_utils.py @@ -13,17 +13,22 @@ __author__ = "Vanya BELYAEV Ivan.Belyaev@cern.ch" __date__ = "2023-12-06" __all__ = ( - 'mean_var' , ## mean and variance for (weighted) arrays - 'nEff' , ## get number of effective entries - 'normalize' , ## "normalize" variables in dataset/structured array - ) + 'mean_var' , ## mean and variance for (weighted) arrays + 'nEff' , ## get number of effective entries + 'normalize' , ## "normalize" variables in dataset/structured array + 'Estimators' , ## helper mixin class to print statistical estimators + 'Summary' , ## helper mixin class to print statistical estimators +) # ============================================================================= +from collections import namedtuple from ostap.core.meta_info import python_info from ostap.core.core import VE, Ostap from ostap.core.ostap_types import string_types +from ostap.math.base import axis_range +from ostap.math.math_ve import significance from ostap.stats.counters import EffCounter -from ostap.utils.basic import numcpu -from ostap.utils.utils import splitter +from ostap.utils.basic import numcpu, loop_items +from ostap.utils.utils import splitter from ostap.utils.progress_bar import progress_bar import ROOT, sys, warnings # ============================================================================= @@ -374,8 +379,210 @@ def run ( self , NN , silent = False ) : counter += result # return counter + +# ============================================================================= +## Short labels for various statitical estimators +Labels = { + 'KS' : 'Kolmogorov-Smirnov' , + 'K' : 'Kuiper' , + 'AD' : 'Anderson-Darling' , + 'CM' : 'Cramers-von Mises' , + 'ZK' : 'Zhang-ZK' , + 'ZA' : 'Zhang-ZA' , + 'ZC' : 'Zhang-ZC' , +} +# ============================================================================ +## @class Estimators +# Helper mixin class to format the table of estimators +class Estimators(object) : + """ Helper mixin class to format the table of estimators + """ + # ========================================================================== + ## Print the summary as Table + def table ( self , title = '' , prefix = '' , width = 5 , precision = 3 ) : + """ Print the summary as Table + """ + import ostap.logger.table as T + from ostap.logger.pretty import pretty_float + ## + rows = [ ( 'Statistics' , 'Value' ) ] + for label , value in loop_items ( self.estimators ) : + + the_label = Labels.get ( label , label ) + + result , expo = pretty_float ( value , width = width , precision = precision ) + + if expo : row = the_label , result , '10^%+d' % expo + else : row = the_label , result + + title = title if title else 'Goodness of 1D-fit' + return T.table ( rows , title = title , prefix = prefix , alignment = 'lcl' ) + +# ============================================================================ +## @class Summary +# Helper mixin class to keep (and format) the statistics +class Summary(object) : + """ Helper mixin class to keep (and format) the statistics + """ + # ============================================================================= + KS_keys = 'ks' , 'kolmogorov' , 'kolmogorovsmirnov' + K_keys = 'k' , 'kuiper' + AD_keys = 'ad' , 'andersen' , 'andersendarling' + CM_keys = 'cm' , 'cramer' , 'cramervonmises' + ZK_keys = 'zk' , 'zhangk' , 'zhangzk' + ZA_keys = 'za' , 'zhanga' , 'zhangza' + ZC_keys = 'zc' , 'zhangc' , 'zhangzc' + # ========================================================================= + ## result of toys/permutations + Result = namedtuple ( 'Result' , 'statistics counter pvalue nsigma' ) + # ========================================================================= + ## Helper method to get result + def result ( self , label ) : + """ Helper method to get result + """ + if not label in self.estimators : return None + if not label in self.ecdfs : return None + if not label in self.counters : return None + ## + value = self.estimators [ label ] + ecdfs = self.ecdfs [ label ] + counter = self.counters [ label ] + ## + pvalue = ecdfs. estimate ( value ) + nsigma = significance ( pvalue ) + return self.Result ( value , + counter , + pvalue , + nsigma ) + # ========================================================================= + ## format a row in the summary table + def row ( self , what , result , width = 5 , precision = 3 ) : + """ Format a row in the sumamry table + """ + value = result.statistics + counter = result.counter + pvalue = result.pvalue + nsigma = result.nsigma + + mean = counter.mean () + rms = counter.rms () + vmin, vmax = counter.minmax () + + mxv = max ( abs ( value ) , abs ( mean.value() ) , mean.error() , rms , abs ( vmin ) , abs ( vmax ) ) + + from ostap.logger.pretty import fmt_pretty_ve + fmt, fmtv , fmte , expo = fmt_pretty_ve ( VE ( mxv , mean.cov2() ) , + width = width , + precision = precision , + parentheses = False ) + + if expo : scale = 10**expo + else : scale = 1 + fmt2 = '%s/%s' % ( fmtv , fmtv ) + + return ( what , + fmtv % ( value / scale ) , + ( mean / scale ).toString ( fmt ) , + fmtv % ( rms / scale ) , + fmt2 % ( vmin / scale , vmax / scale ) , + ( '10^%+d' % expo if expo else '' ) , + ( 100 * pvalue ) .toString ( '%.2f +/- %-.2f' ) , + ( nsigma ) .toString ( '%.1f +/- %-.1f' ) ) - + # ========================================================================= + ## Make a summary table + def table ( self , title = '' , prefix = '' , width = 5 , precision = 3 ) : + """ Make a summary table + """ + import ostap.logger.table as T + rows = [ ( 'Statistics' , 'value' , 'mean' , 'rms' , 'min/max' , 'factor' , 'p-value [%]' , '#sigma' ) ] + + for label in self.ecdfs : + + result = self.result ( label ) + if not result : continue + + the_label = Labels.get ( label , label ) + row = self.row ( the_label , result , width = width , precision = precision ) + rows.append ( row ) + + ## skip empty columns + has_expo = False + for row in rows[1:] : + r = list ( row ) + if r[-3] : + has_expo = True + break + + if not has_expo : + new_rows = [] + for row in rows : + r = list ( row ) + del r [ -3 ] + new_rows.append ( r ) + rows = new_rows + + if not title and self.nToys : + title = 'Goodness of 1D-fit with #%d toys' % self.nToys + elif not title : + title = 'Goodness of 1D-fit' + + return T.table ( rows , title = title , prefix = prefix , alignment = 'lccccccccccc' ) + + # ========================================================================= + ## Draw fit CDF & empirical ECDF + def draw ( self , what , opts = '' , *args , **kwargs ) : + """ Draw fit CDF & empirical CDF + """ + key = cidict_fun ( what ) + if key in self.KS_keys and 'KS' in self.ecdfs : + result = self.result ( 'KS' ) + ecdf = self.ecdfs [ 'KS' ] + logger.info ( 'Toy resuls for Kolmogorov-Smirnov estimate' ) + elif key in self.K_keys and 'K' in self.ecdfs : + result = self.result ( 'K' ) + ecdf = self.ecdfs [ 'K' ] + logger.info ( 'Toy resuls for Kuiper estimate' ) + elif key in self.AD_keys and 'AD' in self.ecdfs : + result = self.result ( 'AD' ) + ecdf = self.ecdfs [ 'AD' ] + logger.info ( 'Toy resuls for Anderson-Darling estimate' ) + elif key in self.CM_keys and 'CM' in self.ecdfs : + result = self.result ( 'CM' ) + ecdf = self.ecdfs [ 'CM' ] + logger.info ( 'Toy resuls for Cramer-von Mises estimate' ) + elif key in self.ZK_keys and 'ZK' in self.ecdfs : + result = self.result ( 'ZK' ) + ecdf = self.ecdfs [ 'ZK' ] + logger.info ( 'Toy resuls for ZK estimate' ) + elif key in self.ZA_keys and 'ZA' in self.ecdfs : + result = self.result ( 'ZA' ) + ecdf = self.ecdfs [ 'ZA' ] + logger.info ( 'Toy resuls for ZK estimate' ) + elif key in self.ZC_keys and 'ZC' in self.ecdfs : + result = self.result ( 'ZC' ) + ecdf = self.ecdfs [ 'ZC' ] + logger.info ( 'Toy resuls for ZK estimate' ) + else : + raise KeyError ( "draw: Invalid `what`:%s" % what ) + + xmin , xmax = ecdf.xmin () , ecdf.xmax () + value = result.statistics + xmin = min ( xmin , value ) + xmax = max ( xmax , value ) + xmin , xmax = axis_range ( xmin , xmax , delta = 0.10 ) + + kwargs [ 'xmin' ] = kwargs.get ( 'xmin' , xmin ) + kwargs [ 'xmax' ] = kwargs.get ( 'xmax' , xmax ) + result = ecdf.draw ( opts , *args , **kwargs ) + line = ROOT.TLine ( value , 0 , value , 1 ) + ## + line.SetLineWidth ( 4 ) + line.SetLineColor ( 8 ) + line.draw() + ## + return result, line + # ============================================================================= if '__main__' == __name__ : diff --git a/ostap/stats/tests/test_stats_2samples.py b/ostap/stats/tests/test_stats_2samples.py new file mode 100644 index 00000000..4decac03 --- /dev/null +++ b/ostap/stats/tests/test_stats_2samples.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# ============================================================================== +# @file ostap/stats/tests/test_stats_gof1d.py +# Test Goddness-of-fits for 1D fits +# Copyright (c) Ostap developpers. +# ============================================================================== +""" Test Goddness-of-fits for 1D fits +""" +# ============================================================================== +from ostap.stats.twosamples import TSTest, TSToys +from ostap.core.meta_info import python_info +from ostap.plotting.canvas import use_canvas +from ostap.logger.pretty import pretty_float +from ostap.utils.utils import vrange +from ostap.math.math_ve import significance +import ostap.logger.table as T +import ROOT, random, array +# ============================================================================== +from ostap.logger.logger import getLogger +if '__main__' == __name__ : logger = getLogger ( 'tests_stats_2samples' ) +else : logger = getLogger ( __name__ ) +# ============================================================================== +N1 = 200 +N2 = 200 + +ds1 = array.array ( 'd' , ( random.gauss ( 0 , 1 ) for i in range ( N1 ) ) ) ## Gaussian +ds2 = array.array ( 'd' , ( random.gauss ( 0 , 1 ) for i in range ( N2 ) ) ) ## the same Gaussian +ds3 = array.array ( 'd' , ( random.gauss ( 0.5 , 1 ) for i in range ( N2 ) ) ) ## different Gaussian +ds4 = array.array ( 'd' , ( random.gauss ( 0 , 1.3 ) for i in range ( N2 ) ) ) ## different Gaussian + +# ============================================================================= +def test_2samples_good () : + + logger = getLogger ( 'test_2samples_good' ) + + test = TSTest ( ds1 , ds2 ) + title = 'Test good samples' + logger.info ( '%s:\n%s' % ( title , test.table ( title = title , prefix = '# ' ) ) ) + + toys = TSToys ( ds1 , ds2 ) + title = 'Toys good samples' + logger.info ( '%s:\n%s' % ( title , toys.table ( title = title , prefix = '# ' ) ) ) + + with use_canvas ( 'test_2samples_good/test' , wait = 3 ) : test.draw() + with use_canvas ( 'test_2samples_good/toys' , wait = 3 ) : toys.draw() + + +# ============================================================================= +def test_2samples_same () : + + logger = getLogger ( 'test_2samples_same' ) + + test = TSTest ( ds1 , ds1 ) + title = 'Test same samples' + logger.info ( '%s:\n%s' % ( title , test.table ( title = title , prefix = '# ' ) ) ) + + toys = TSToys ( ds1 , ds1 ) + title = 'Toys same samples' + logger.info ( '%s:\n%s' % ( title , toys.table ( title = title , prefix = '# ' ) ) ) + + with use_canvas ( 'test_2samples_same/test' , wait = 3 ) : test.draw() + with use_canvas ( 'test_2samples_same/toys' , wait = 3 ) : toys.draw() + +# ============================================================================= +def test_2samples_bad1 () : + + logger = getLogger ( 'test_2samples_bad1' ) + + test = TSTest ( ds1 , ds3 ) + title = 'Test bad/1 samples' + logger.info ( '%s:\n%s' % ( title , test.table ( title = title , prefix = '# ' ) ) ) + + toys = TSToys ( ds1 , ds3 ) + title = 'Toys bad/1 samples' + logger.info ( '%s:\n%s' % ( title , toys.table ( title = title , prefix = '# ' ) ) ) + + with use_canvas ( 'test_2samples_bad1/test' , wait = 3 ) : test.draw() + with use_canvas ( 'test_2samples_bad1/toys' , wait = 3 ) : toys.draw() + +# ============================================================================= +def test_2samples_bad2 () : + + logger = getLogger ( 'test_2samples_bad2' ) + + test = TSTest ( ds1 , ds4 ) + title = 'Test bad/2 samples' + logger.info ( '%s:\n%s' % ( title , test.table ( title = title , prefix = '# ' ) ) ) + + toys = TSToys ( ds1 , ds4 ) + title = 'Toys bad/2 samples' + logger.info ( '%s:\n%s' % ( title , toys.table ( title = title , prefix = '# ' ) ) ) + + with use_canvas ( 'test_2samples_bad2/test' , wait = 3 ) : test.draw() + with use_canvas ( 'test_2samples_bad2/toys' , wait = 3 ) : toys.draw() + +# =============================================================================== +if '__main__' == __name__ : + + test_2samples_good () + test_2samples_same () + test_2samples_bad1 () + test_2samples_bad2 () + +# =============================================================================== +## The END +# =============================================================================== + diff --git a/ostap/stats/twosamples.py b/ostap/stats/twosamples.py new file mode 100644 index 00000000..11c543b5 --- /dev/null +++ b/ostap/stats/twosamples.py @@ -0,0 +1,625 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# ============================================================================= +## @file ostap/stats/twosamples.py +# Two-Sample Tests +# @see https://www.jstor.org/stable/25471118 +# @author Vanya BELYAEV Ivan.Belyaev@cern.ch +# @date 2024-10-16 +# ============================================================================= +""" Two-Sample Tests + - see https://www.jstor.org/stable/25471118 +""" +# ============================================================================= +__version__ = "$Revision$" +__author__ = "Vanya BELYAEV Ivan.Belyaev@cern.ch" +__date__ = "2024-09-16" +__all__ = ( +) +# ============================================================================= +from ostap.core.meta_info import root_info +from ostap.core.ostap_types import listlike_types +from collections import defaultdict +from ostap.core.core import SE, VE, Ostap +from ostap.math.base import doubles, axis_range +from ostap.math.models import f1_draw +from ostap.stats.gof_utils import Estimators,Summary +import ostap.fitting.ds2numpy +import ostap.fitting.roofit +import ROOT, math, array +# ============================================================================= +# logging +# ============================================================================= +from ostap.logger.logger import getLogger +if '__main__' == __name__ : logger = getLogger( 'ostap.stats.twosamples' ) +else : logger = getLogger( __name__ ) +# ============================================================================= +logger.debug ( 'Two-sample test' ) +# ============================================================================= +try : # ======================================================================= + # ========================================================================= + import numpy as np + # ========================================================================= +except ImportError : # ======================================================== + # ========================================================================= + np = None +# ============================================================================= +if ( 6 , 32 ) <= root_info : data2vct = lambda s : s +else : data2vct = lambda s : doubles ( s ) +# ============================================================================= +## transform data into empirical CDF +# @code +# data = ... +# cdf = ecdf_from_data ( data ) +# @endcode +# @param data input dataset +# @return empirical cumulative distribution function +def ecdf_from_data ( data ) : + """ Transform data into empirical CDF + - data : input dataset + >>> data = ... + >>> cdf = ecdf_from_data ( data ) + """ + if isinstance ( data , Ostap.Math.ECDF ) : return data + elif np and isinstance ( data , np.ndarray ) : return Ostap.Math.ECDF ( data2vct ( data ) ) + elif isinstance ( data , array.array ) : return Ostap.Math.ECDF ( data2vct ( data ) ) + elif isinstance ( data1 , listlike_types ) : return Ostap.Math.ECDF ( doubles ( data ) ) + ## + raise TypeError ( "ecdf_from_data: Unsupported `data' type: %s" % type ( data1 ).__name__ ) +# =============================================================================== +## Prepare data +# @code +# data1 = ... +# data2 = ... +# ecdf1 , ecdf2 , ecdf = prepare_data1 ( data1 , data2 ) +# @endcode +# @param data1 the 1st dataset +# @param data2 the second dataset +# @param pooled (optional) the CDF for pooled data +# @return triplet of empiricla CDFs for the 1st, 2nd and pooled datasets +def prepare_data1 ( data1 , data2 , pooled = None ) : + """ Prepare data + >>> data1 = ... + >>> data2 = ... + >>> ecdf1 , ecdf2 , ecdf = prepare_data1 ( data1 , data2 ) + - data1 : the 1st dataset + - data2 : the second dataset + - pooled : (optiomal) the CDF for pooled data + - return a triplet of empirical CDFs for the 1st, 2nd and pooled datasets + """ + ecdf1 = ecdf_from_data ( data1 ) + ecdf2 = ecdf_from_data ( data2 ) + ## + n1 = len ( ecdf1 ) + n2 = len ( ecdf2 ) + ## + ecdf = pooled if ( isinstance ( pooled , Ostap.Math.ECDF ) and n1 + n2 == len ( pooled ) ) else ecdf1 + ecdf2 + ## + return ecdf1 , ecdf2 , ecdf +# ============================================================================= +## Prepare data +# @code +# data1 = ... +# data2 = ... +# ecdf1 , ecdf2 , ecdf , ranks1 , ranks2 = prepare_data2 ( data1 , data2 ) +# @endcode +# @param data1 the 1st dataset +# @param data2 the second dataset +# @param pooled (optional) the CDF for pooled data +# @param ranks1 (optional) array of ranks of elements from the pooled sample in the 1st dataset +# @param ranks2 (optional) array of ranks of elements from the pooled sample in the 2nd dataset +# @return quintet of empirical CDFs for the 1st, 2nd and pooled datasets and two ransk arrays +def prepare_data2 ( data1 , data2 , pooled = None , ranks1 = None , ranks2 = None ) : + """ Prepare data + >>> data1 = ... + >>> data2 = ... + >>> ecdf1 , ecdf2 , ecdf , ranks1 , ranks2 = prepare_data2 ( data1 , data2 ) + - data1 : the 1st dataset + - data2 : the second dataset + - pooled : (optional) the CDF for pooled data + - ranks1 : (optional) array of ranks of elements from the pooled sample in the 1st dataset + - ranks2 : (optional) array of ranks of elements from the pooled sample in the 2nd dataset + - return quintet of empirical CDFs for the 1st, 2nd and pooled datasets and two ransk arrays + """ + ## + ecdf1 , ecdf2 , ecdf = prepare_data1 ( data1 , data2 , pooled ) + ## + n1 = len ( ecdf1 ) + n2 = len ( ecdf2 ) + ## + ok1 = isinstance ( ranks1 , listlike_types ) and n1 + n2 == len ( ranks1 ) + if not ok1 : + r1 = ecdf1.ranks ( ecdf ) + ranks1 = np.asarray ( r1 , dtype = np.dtype ( 'uint16' ) ) if np else array.array ( 'I' , r1 ) + ## + ok2 = isinstance ( ranks2 , listlike_types ) and n1 + n2 == len ( ranks2 ) + if not ok2 : + r2 = ecdf2.ranks ( ecdf ) + ranks2 = np.asarray ( r2 , dtype = np.dtype ( 'uint16' ) ) if np else array.array ( 'I' , r2 ) + ## + return ecdf1, ecdf2, ecdf, ranks1 , ranks2 + +# ============================================================================= +## Get Kolmogorov-Smirnov statistics KS +# @code +# ecdf1 =... +# ecdf2 =... +# ks = kolmogorov_smirnov ( ecdf1, ecdf2 ) +# @endcode +# @param data1 the 1st dataset or empirical CDF +# @param data2 the 2nd dataset or empirical CDF +# @param pooled (optional) pooled dataset or empirical CDF +# @return Kolmogorov-Smirnov statistics KS +def kolmogorov_smirnov ( data1 , data2 , pooled = None ) : + """ Get Kolmogorov-Smirnov statistics KS + - data1 : the 1st dataset or empirical CDF + - data2 : the 2nd dataset or empirical CDF + - pooled : (optional) pooled dataset or CDF + """ + ecdf1, ecdf2, ecdf = prepare_data1 ( data1 , data2 , pooled ) + return max ( abs ( ecdf1 ( x ) - ecdf2 ( x ) ) for x in ecdf ) +# ================================================================================ +## Get Kuiper statistics K +# @code +# ecdf1 =... +# ecdf2 =... +# k = kuiper ( ecdf1, ecdf2 ) +# @endcode +# @param data1 the 1st dataset or empirical CDF +# @param data2 the 2nd dataset or empirical CDF +# @param pooled (optional) pooled dataset or empirical CDF +# @return Kuiper statistics K +def kuiper ( data1 , data2 , pooled = None ) : + """ Get kuiper statistics K + - data1 : the 1st dataset or empirical CDF + - data2 : the 2nd dataset or empirical CDF + - pooled : (optional) pooled dataset or CDF + """ + ecdf1, ecdf2, ecdf = prepare_data1 ( data1 , data2 , pooled ) + return max ( ecdf1 ( x ) - ecdf2 ( x ) for x in ecdf ) - min ( ecdf1 ( x ) - ecdf2 ( x ) for x in ecdf ) +# ============================================================================= +## Get ZK statistics ZK +# @see https://www.jstor.org/stable/25471118 +# @code +# ecdf1 =... +# ecdf2 =... +# zk = ZK ( ecdf1, ecdf2 ) +# @endcode +# @param data1 the 1st dataset or empirical CDF +# @param data2 the 2nd dataset or empirical CDF +# @param pooled (optional) pooled dataset or empirical CDF +# @param ranks1 (optional) ranks of the elements from the pooled data in the 1st dataset +# @param ranks2 (optional) ranks of the elements from the pooled data in the 2nd dataset +# @return ZK statistics ZK +def ZK ( data1 , data2 , pooled = None , ranks1 = None , ranks2 = None ) : + """ Get ZK statistics + - data1 : the 1st dataset or empirical CDF + - data2 : the 2nd dataset or empirical CDF + - pooled : (optional) pooled dataset or CDF + - ranks1 : (optional) ranks of the elements from the pooled data in the 1st dataset + - ranks2 : (optional) ranks of the elements from the pooled data in the 2nd dataset + - see https://www.jstor.org/stable/25471118 + """ + ## transform input (if needed) + ecdf1, ecdf2, ecdf, ranks1, ranks2 = prepare_data2 ( data1 , data2 , pooled , ranks1 , ranks2 ) + + n1 = len ( ecdf1 ) + n2 = len ( ecdf2 ) + n = n1 + n2 + + flog = math.log + def term ( k ) : + + r1 = ranks1 [ k ] + r2 = ranks2 [ k ] + + r1 = max ( 0.5 , min ( n1 - 0.5 , r1 ) ) ## r1 + 0.5 ?? + r2 = max ( 0.5 , min ( n2 - 0.5 , r2 ) ) ## r2 + 0.5 ?? + + fk = 0.5 / n if not k else float ( k ) / n + f1k = float ( r1 ) / n1 + f2k = float ( r2 ) / n2 + + rr = n1 * ( f1k * flog ( f1k / fk ) + ( 1 - f1k ) * flog ( ( 1. - f1k ) / ( 1. - fk ) ) ) + rr += n2 * ( f2k * flog ( f2k / fk ) + ( 1 - f2k ) * flog ( ( 1. - f2k ) / ( 1. - fk ) ) ) + + return rr + + return max ( term ( k ) for k in range ( n ) ) + +# ============================================================================= +## Get ZA statistics ZA +# @see https://www.jstor.org/stable/25471118 +# @code +# ecdf1 =... +# ecdf2 =... +# za = ZA ( ecdf1, ecdf2 ) +# @endcode +# @param data1 the 1st dataset or empirical CDF +# @param data2 the 2nd dataset or empirical CDF +# @param pooled (optional) pooled dataset or empirical CDF +# @param ranks1 (optional) ranks of the elements from the pooled data in the 1st dataset +# @param ranks2 (optional) ranks of the elements from the pooled data in the 2nd dataset +# @return ZA statistics ZA +def ZA ( data1 , data2 , pooled = None , ranks1 = None , ranks2 = None ) : + """ Get ZA statistics + - data1 : the 1st dataset or empirical CDF + - data2 : the 2nd dataset or empirical CDF + - pooled : (optional) pooled dataset or CDF + - ranks1 : (optional) ranks of the elements from the pooled data in the 1st dataset + - ranks2 : (optional) ranks of the elements from the pooled data in the 2nd dataset + - see https://www.jstor.org/stable/25471118 + """ + ## transform input (if needed) + ecdf1, ecdf2, ecdf, ranks1, ranks2 = prepare_data2 ( data1 , data2 , pooled , ranks1 , ranks2 ) + + n1 = len ( ecdf1 ) + n2 = len ( ecdf2 ) + n = n1 + n2 + + flog = math.log + def term ( k ) : + + r1 = ranks1 [ k ] + r2 = ranks2 [ k ] + + r1 = max ( 0.5 , min ( n1 - 0.5 , r1 ) ) ## r1 + 0.5 ?? + r2 = max ( 0.5 , min ( n2 - 0.5 , r2 ) ) ## r2 + 0.5 ?? + + fk = 0.5 / n if not k else float ( k ) / n + f1k = float ( r1 ) / n1 + f2k = float ( r2 ) / n2 + + rr = n1 * ( f1k * flog ( f1k ) + ( 1 - f1k ) * flog ( 1. - f1k ) ) / ( ( k + 0.5 ) * ( n - k - 0.5 ) ) + rr += n2 * ( f2k * flog ( f2k ) + ( 1 - f2k ) * flog ( 1. - f2k ) ) / ( ( k + 0.5 ) * ( n - k - 0.5 ) ) + + return rr + + return sum ( term ( k ) for k in range ( n ) ) + +# ============================================================================= +## Get ZC statistics ZC +# @see https://www.jstor.org/stable/25471118 +# @code +# ecdf1 =... +# ecdf2 =... +# zc = ZC ( ecdf1, ecdf2 ) +# @endcode +# @param data1 the 1st dataset or empirical CDF +# @param data2 the 2nd dataset or empirical CDF +# @param pooled (optional) pooled dataset or empirical CDF +# @param ranks1 (optional) ranks of the elements from the pooled data in the 1st dataset +# @param ranks2 (optional) ranks of the elements from the pooled data in the 2nd dataset +# @return ZA statistics ZA +def ZC ( data1 , data2 , pooled = None , ranks1 = None , ranks2 = None ) : + """ Get ZC statistics + - data1 : the 1st dataset or empirical CDF + - data2 : the 2nd dataset or empirical CDF + - pooled : (optional) pooled dataset or CDF + - ranks1 : (optional) ranks of the elements from the pooled data in the 1st dataset + - ranks2 : (optional) ranks of the elements from the pooled data in the 2nd dataset + - see https://www.jstor.org/stable/25471118 + """ + ## transform input (if needed) + ecdf1, ecdf2, ecdf, ranks1, ranks2 = prepare_data2 ( data1 , data2 , pooled , ranks1 , ranks2 ) + + n1 = len ( ecdf1 ) + n2 = len ( ecdf2 ) + n = n1 + n2 + + flog = math.log + def term1 ( j ) : + r1 = ranks1 [ j ] + return flog ( n1 / ( j + 0.5 ) - 1 ) * flog ( n / ( r1 + 0.5 ) - 1 ) + + def term2 ( j ) : + r2 = ranks2 [ j ] + return flog ( n2 / ( j + 0.5 ) - 1 ) * flog ( n / ( r2 + 0.5 ) - 1 ) + + t1 = sum ( term1 ( j ) for j in range ( n1 ) ) + t2 = sum ( term2 ( j ) for j in range ( n2 ) ) + + return ( t1 + t2 ) / n + +# ============================================================================= +## @class TSTest +# Two-Sample Test +class TSTest(Estimators): + """ Two-sample test + """ + def __init__ ( self , + data1 , + data2 ) : + + ## transform input data + ecdf1, ecdf2, ecdf, ranks1, ranks2 = prepare_data2 ( data1 , data2 ) + + self.__ecdf1 = ecdf1 + self.__ecdf2 = ecdf2 + self.__ecdf = ecdf + self.__ranks1 = ranks1 + self.__ranks2 = ranks2 + + self.__estimators = { + 'KS' : kolmogorov_smirnov ( self.ecdf1 , self.ecdf2 , self.ecdf ) , + 'K' : kuiper ( self.ecdf1 , self.ecdf2 , self.ecdf ) , + 'ZK' : ZK ( self.ecdf1 , self.ecdf2 , self.ecdf , self.ranks1 , self.ranks2 ) , + 'ZA' : ZA ( self.ecdf1 , self.ecdf2 , self.ecdf , self.ranks1 , self.ranks2 ) , + 'ZC' : ZC ( self.ecdf1 , self.ecdf2 , self.ecdf , self.ranks1 , self.ranks2 ) , + } + + @property + def ecdf1 ( self ) : + """`ecdf1` : empirical CDF for the 1st dataset + """ + return self.__ecdf1 + @property + def ecdf2 ( self ) : + """`ecdf2` : empirical CDF for the 2nd dataset + """ + return self.__ecdf2 + @property + def ecdf ( self ) : + """`ecdf` : empirical CDF for combined dataset + """ + return self.__ecdf + @property + def ranks1 ( self ) : + """ ranks of elements from pooled sample in the 1st dataset""" + return self.__ranks1 + @property + def ranks2 ( self ) : + """ ranks of elements from pooled sample in the 2nd dataset""" + return self.__ranks2 + + # ========================================================================= + @property + def estimators ( self ) : + """`estimators` : get a dictionary of all estimators""" + return self.__estimators + + # ========================================================================= + ## Get Kolmogorov-Smirnov statistiscs + @property + def kolmogorov_smirnov_estimator ( self ) : + """ Get Kolmogorov-Smirnov statistiscs KS + """ + return self.estimators.get ( 'KS' , None ) + + # ========================================================================= + ## Get Kuiper statististics + @property + def kuiper_estimator ( self ) : + """ Get Kuiperstatistics + """ + return self.estimators.get ( 'K' , None ) + + # ========================================================================= + ## Get ZK statististics + @property + def ZK_estimator ( self ) : + """ Get ZK statistics + """ + return self.estimators.get ( 'ZK' , None ) + + # ========================================================================= + ## Get ZA statististics + @property + def ZA_estimator ( self ) : + """ Get ZA statistics + """ + return self.estimators.get ( 'ZA' , None ) + + # ========================================================================= + ## Get ZC statististics + @property + def ZC_estimator ( self ) : + """ Get ZC statistics + """ + return self.estimators.get ( 'ZC' , None ) + + __repr__ = Estimators.table + __str__ = Estimators.table + + # ========================================================================= + def ks_acccept ( self , alpha ) : + + assert 0 < alpha < 1 , 'Invalid "alpha"%s' % alpha + c_alpha = math.sqrt ( -1 * math.log ( alpha / 2. ) / 2. ) + n = len ( self.ecdf1 ) + m = len ( self.ecdf2 ) + d = self.__KS_val + return d > c_alpha * math.sqrt ( ( n + m ) / ( n * m ) ) + + # ========================================================================= + def ks_alpha ( self ) : + + n = len ( self.ecdf1 ) + m = len ( self.ecdf2 ) + d = self.__KS_val + c = d / math.sqrt ( ( n + m ) / ( n * m ) ) + return 2 * math.exp ( -2 * c * c ) + + # ========================================================================= + ## draw all involved empirical CDFS + def draw ( self , opts = '' , *args , **kwargs ) : + """ Draw fit CDF & empirical CDF + """ + + ecdf1 = self.ecdf1 + ecdf2 = self.ecdf2 + ecdf = self.ecdf + + xmin , xmax = ecdf .xmin (), ecdf .xmax() + xmin1 , xmax1 = ecdf1.xmin (), ecdf1.xmax() + xmin2 , xmax2 = ecdf1.xmin (), ecdf1.xmax() + + xmin = min ( xmin , xmin1 , xmin2 ) + xmax = max ( xmax , xmax1 , xmax2 ) + + xmin , xmax = axis_range ( xmin , xmax , delta = 0.05 ) + + xmin = kwargs.pop ( 'xmin' , xmin ) + xmax = kwargs.pop ( 'xmax' , xmax ) + + r1 = ecdf.draw ( opts , color = 2 , linewidth = 3 , xmin = xmin , xmax = xmax , **kwargs ) + r2 = ecdf.draw ( '%s %s' % ( 'same' , opts ) , color = 4 , linewidth = 3 , xmin = xmin , xmax = xmax , **kwargs ) + r = ecdf.draw ( '%s %s' % ( 'same' , opts ) , color = 8 , linewidth = 3 , xmin = xmin , xmax = xmax , **kwargs ) + + return r + +# ============================================================================= +## @class TSToys +# Two-Sample Test +class TSToys(TSTest,Summary): + """ Two-sample test + """ + def __init__ ( self , + data1 , + data2 , + nToys = 100 , + silent = False ) : + + ## initialize the base + TSTest.__init__ ( self , data1 , data2 ) + + self.__counters = defaultdict(SE) + self.__ecdfs = {} + self.__silent = True if silent else False + + self.__nToys = 0 + if 0 < nToys : self.run ( nToys , silent = silent ) + + # ========================================================================= + ## number of toys/permutations + @property + def nToys ( self ) : + """`nToys` : number of toys/permutations""" + return self.__nToys + + # ========================================================================= + ## ECDFs + @property + def ecdfs ( self ) : + """`ecdfs` : toys/permutations results as empirical cumulative distribution functions""" + return self.__ecdfs + + # ========================================================================= + ## Counters + @property + def counters ( self ) : + """`counters` : toy results as counters""" + return self.__counters + + # =============================================================================== + ## run toys + def run ( self , nToys = 1000 , silent = False ) : + """ Run toys + """ + assert isinstance ( nToys , int ) and 0 < nToys , "Invalid `nToys` argument!" + + n1 = len ( self.ecdf1 ) + n2 = len ( self.ecdf1 ) + + data = np.asarray ( self.ecdf.data () , dtype = float ) + + results = defaultdict(list) + counters = self.counters + + from ostap.utils.progress_bar import progress_bar + for i in progress_bar ( nToys , silent = silent , description = 'Permutations:') : + + np.random.shuffle ( data ) + data1 = data [ : n1 ] + data2 = data [ n1 : ] + + ## transform input data + ecdf1, ecdf2, ecdf, ranks1, ranks2 = prepare_data2 ( data1 , data2 ) + + ks = kolmogorov_smirnov ( ecdf1 , ecdf2 , ecdf ) + k = kuiper ( ecdf1 , ecdf2 , ecdf ) + zk = ZK ( ecdf1 , ecdf2 , ecdf , ranks1 , ranks2 ) + za = ZA ( ecdf1 , ecdf2 , ecdf , ranks1 , ranks2 ) + zc = ZC ( ecdf1 , ecdf2 , ecdf , ranks1 , ranks2 ) + + counters [ 'KS' ] += ks + counters [ 'K' ] += k + counters [ 'ZK' ] += zk + counters [ 'ZA' ] += za + counters [ 'ZC' ] += zc + + results [ 'KS' ].append ( ks ) + results [ 'K' ].append ( k ) + results [ 'ZK' ].append ( zk ) + results [ 'ZA' ].append ( za ) + results [ 'ZC' ].append ( zc ) + + ## accumulate number of toys + self.__nToys += nToys + + ECDF = Ostap.Math.ECDF + for key in results : + data = results [ key ] + if not data : continue + if not key in self.__ecdfs : self.__ecdfs [ key ] = ECDF ( data , True ) ## complementary ECDF! + else : self.__ecdfs [ key ] .add ( data2vct ( data ) ) + + del results + + # ========================================================================= + ## Get Kolmogorov-Smirnov statistiscs + @property + def kolmogorov_smirnov ( self ) : + """ Get Kolmogorov-Smirnov statistiscs KS + """ + return self.result ( 'KS' ) + + # ========================================================================= + ## Get Kuiper statististics + @property + def kuiper ( self ) : + """ Get Kuiper statistics + """ + return self.result ( 'K' ) + + # ========================================================================= + ## Get ZK statististics + @property + def ZK ( self ) : + """ Get ZK statistics + """ + return self.result ( 'ZK' ) + + # ========================================================================= + ## Get ZA statististics + @property + def ZA ( self ) : + """ Get ZA statistics + """ + return self.result ( 'ZA' ) + + # ========================================================================= + ## Get ZC statististics + @property + def ZC ( self ) : + """ Get ZC statistics + """ + return self.result ( 'ZC' ) + + table = Summary.table + __repr__ = Summary.table + __str__ = Summary.table + +# ============================================================================= +if '__main__' == __name__ : + + from ostap.utils.docme import docme + docme ( __name__ , logger = logger ) + +# ============================================================================= +## The END +# ============================================================================= + + + + diff --git a/source/include/Ostap/ECDF.h b/source/include/Ostap/ECDF.h index 85abf306..6f12398d 100644 --- a/source/include/Ostap/ECDF.h +++ b/source/include/Ostap/ECDF.h @@ -29,8 +29,10 @@ namespace Ostap { public: // ====================================================================== - /// the actual type pof data - typedef std::vector Data ; + /// the actual type of data + typedef std::vector Data ; + /// the actual type of indices + typedef std::vector Indices ; // ====================================================================== public: // ====================================================================== @@ -47,13 +49,14 @@ namespace Ostap template ::value_type, typename = std::enable_if::value> > - ECDF ( ITERATOR begin , - ITERATOR end , - const bool complementary = false ) + ECDF + ( ITERATOR begin , + ITERATOR end , + const bool complementary = false ) : m_data ( begin , end ) , m_complementary ( complementary ) - { this->sort_me() ; } - /// constructoer to create complementary/oridnary ECDF + { if ( !std::is_sorted ( m_data.begin() , m_data.end() ) ) { this->sort_me() ; } } + /// constructoer to create complementary/ordinary ECDF ECDF ( const ECDF& right , const bool complementary ) ; @@ -85,34 +88,34 @@ namespace Ostap unsigned long add ( const ECDF& values ) ; /// add a bunch oif values template ::value_type, - typename = std::enable_if::value> > - unsigned long add - ( ITERATOR begin , - ITERATOR end ) - { - /// copy input data - Data tmp1 ( begin , end ) ; - /// sort input data - std::sort ( tmp1.begin () , tmp1.end () ) ; - /// prepare the output - Data tmp2 ( m_data.size () + tmp1.size () ) ; - /// merge two sorted continers - std::merge ( m_data.begin() , - m_data.end () , - tmp1.begin () , - tmp1.end () , - tmp2.begin () ) ; - /// swap the output with own data - std::swap ( m_data , tmp2 ) ; - return m_data.size () ; - } + typename value_type = typename std::iterator_traits::value_type, + typename = std::enable_if::value> > + unsigned long add + ( ITERATOR begin , + ITERATOR end ) + { + /// copy input data + Data tmp1 ( begin , end ) ; + /// sort input data + if ( !std::is_sorted ( tmp1.begin() , tmp1.end() ) ) { std::sort ( tmp1.begin () , tmp1.end () ) ; } + /// prepare the output + Data tmp2 ( m_data.size () + tmp1.size () ) ; + /// merge two sorted containers + std::merge ( m_data.begin() , + m_data.end () , + tmp1.begin () , + tmp1.end () , + tmp2.begin () ) ; + /// swap the result with own data + std::swap ( m_data , tmp2 ) ; + return m_data.size () ; + } // ====================================================================== public : // ====================================================================== ECDF& __iadd__ ( const double x ) { add ( x ) ; return *this ; } ECDF& __iadd__ ( const Data& x ) { add ( x ) ; return *this ; } - ECDF __add__ ( const ECDF& x ) ; + ECDF __add__ ( const ECDF& x ) const ; // ====================================================================== inline ECDF& operator+= ( const double x ) { add ( x ) ; return *this ; } inline ECDF& operator+= ( const Data& x ) { add ( x ) ; return *this ; } @@ -121,18 +124,33 @@ namespace Ostap public: // ====================================================================== /// data size - inline unsigned long N () const { return m_data.size () ; } + inline Data::size_type N () const { return m_data.size () ; } /// data size - inline unsigned long size () const { return m_data.size () ; } + inline Data::size_type size () const { return m_data.size () ; } /// access to data - inline const Data& data () const { return m_data ; } + inline const Data& data () const { return m_data ; } // ====================================================================== /// complementary? - inline bool complementary () const { return m_complementary ; } + inline bool complementary () const { return m_complementary ; } /// minimal x-value - inline double xmin () const { return m_data.front () ; } + inline double xmin () const { return m_data.front () ; } /// maximal x-value - inline double xmax () const { return m_data.back () ; } + inline double xmax () const { return m_data.back () ; } + // ====================================================================== + /// get the abscissa value with the given index + inline double operator[] ( const unsigned int index ) const + { return index < m_data.size() ? m_data[index] : m_data.back() ; } + // ====================================================================== + public: + // ====================================================================== + /** number of elements that are less or equal to x + * "rank of x" in the ordered sample + */ + inline Data::size_type rank ( const double x ) const + { return std::upper_bound ( m_data.begin () , m_data.end () , x ) - m_data.begin() ; } + // ====================================================================== + /// get ranks for all elements from another sample + Indices ranks ( const ECDF& sample ) const ; // ====================================================================== public: // ====================================================================== @@ -146,11 +164,11 @@ namespace Ostap private: // ====================================================================== /// container of sorted data - Data m_data {} ; // container of sorted data + Data m_data { } ; // container of sorted data /// complementary CDF? bool m_complementary { false } ; // complementary CDF ? // ====================================================================== - }; + }; // The end of the class Ostap::Math::ECDF // ======================================================================== /// swap two objects inline void swap ( ECDF& left , ECDF& right ) { left.swap ( right ) ; } @@ -158,10 +176,10 @@ namespace Ostap inline ECDF operator+( const ECDF& a , const ECDF& b ) { ECDF c { a } ; c += b ; return c ; } - // ======================================================================= - } // end of namespace Ostap::Math + // ======================================================================== + } // The end of namespace Ostap::Math // ========================================================================== -} // end of namespace Ostap +} // The end of namespace Ostap // ============================================================================ // The END // ============================================================================ diff --git a/source/src/ECDF.cpp b/source/src/ECDF.cpp index 874cb933..1aa079be 100644 --- a/source/src/ECDF.cpp +++ b/source/src/ECDF.cpp @@ -26,11 +26,8 @@ Ostap::Math::ECDF::ECDF ( const Ostap::Math::ECDF::Data& data , const bool complementary ) - : m_data ( data ) - , m_complementary ( complementary ) -{ - this->sort_me() ; -} + : Ostap::Math::ECDF::ECDF ( data.begin() , data.end() , complementary ) +{} // ============================================================================ // constructoer to create complementary/oridnary ECDF // ============================================================================ @@ -68,8 +65,9 @@ double Ostap::Math::ECDF::evaluate ( const double x ) const if ( x < m_data.front () ) { return m_complementary ? 1.0 : 0.0 ; } else if ( x > m_data.back () ) { return m_complementary ? 0.0 : 1.0 ; } // - const double result = double ( std::upper_bound ( m_data.begin () , - m_data.end () , x ) - m_data.begin() ) / m_data.size () ; + // const double result = double ( std::upper_bound ( m_data.begin () , + // m_data.end () , x ) - m_data.begin() ) / m_data.size () ; + const double result = double ( rank ( x ) ) / m_data.size () ; return m_complementary ? ( 1 - result ) : result ; } // ============================================================================ @@ -107,13 +105,15 @@ Ostap::Math::ECDF::add unsigned long Ostap::Math::ECDF::add ( const Ostap::Math::ECDF::Data& values ) -{ - return add ( values.begin() , values.end() ) ; -} +{ return add ( values.begin() , values.end() ) ; } // ============================================================================ Ostap::Math::ECDF -Ostap::Math::ECDF::__add__ ( const Ostap::Math::ECDF& x ) -{ return (*this) + x ; } +Ostap::Math::ECDF::__add__ ( const Ostap::Math::ECDF& x ) const +{ + ECDF c { *this } ; + c.add ( x ) ; + return c ; +} // ============================================================================ // add a value to data container // ============================================================================ @@ -122,7 +122,7 @@ Ostap::Math::ECDF::add ( const Ostap::Math::ECDF& values ) { Data tmp2 ( m_data.size() + values.size() ) ; - /// merge two sorted continers + /// merge two sorted containers std::merge ( m_data.begin () , m_data.end () , values.m_data.begin () , @@ -131,6 +131,28 @@ Ostap::Math::ECDF::add std::swap ( m_data , tmp2 ) ; return m_data.size () ; } +// ============================================================================ +// get ranks of the elements in the (pooled) sample +// ============================================================================ +Ostap::Math::ECDF::Indices +Ostap::Math::ECDF::ranks +( const Ostap::Math::ECDF& sample ) const +{ + const Data::size_type N = size() ; + // fill outptu array with N + Indices result ( sample.size () , N ) ; + Data::size_type NS= sample.size() ; + for ( Data::size_type i = 0 ; i < NS ; ++i ) + { + Data::size_type r = rank ( sample.m_data [ i ] ) ; + result [ i ] = r ; + // try to be a bit more efficient, the rest of array is filled with N + if ( N <= r ) { break ; } + } + return result ; +} +// ============================================================================ + // ============================================================================ // The END // ============================================================================