diff --git a/ostap/stats/gof.py b/ostap/stats/gof.py index 1b0da248..300e6d28 100644 --- a/ostap/stats/gof.py +++ b/ostap/stats/gof.py @@ -129,10 +129,135 @@ def nEff ( weights ) : # ds = ... # data set as structured array # dsn = normalize ( ds ) # @endcode +def normalize2 ( datasets , weight = () , first = True ) : + """ Get the `normalized' input datasets + All floating felds are calculated as + + x = (x - )/sigma + + - is a mean value + - is a standard deviation. + + - If several datasets are specified, all floating names must be the same + and the mean and sigma are either taken either from the first dataset, + if `first=True` or as combined through all datasets, otherwise + + - If `weight` is specified, this floating column is concidered + as the weight + + - attention Only the floating point columns are transformed! + - attention Input datasets are expected to be numpy structured arrays + """ + + nd = len ( datasets ) + if not weight : weight = nd * [ '' ] + elif isinstance ( weight , string_types ) : weight = nd * [ weight ] + + assert ( len ( weight ) == nd ) and \ + all ( ( not w ) or isinstance ( w , string_types ) for w in weight ) , \ + 'Invalid specification of weight!' + + weight = list ( weight ) + for i , w in enumerate ( weight ) : + if not w : weight [ i ] = '' + weight = tuple ( weight ) -if (3,0) <= sys.version_info : + ds = datasets [ 0 ] + others = datasets [ 1: ] + + ## collect the floating columns + columns = [] + w0 = weight [ 0 ] + for n,t in ds.dtype.fields.items () : + if t[0] in _np_floats and n != w0 : columns.append ( n ) + + vmeans = [] + for i , c in enumerate ( columns ) : + mean, var = mean_var ( ds [ c ] , None if not w0 else ds [ w0 ] ) + vmeans.append ( VE ( mean , var ) ) + + ## Number of events/effective entries + nevents = 1.0 * ds.shape [ 0 ] if not w0 else nEff ( ds [ w0 ] ) - def normalize ( ds , *others , weight = () , first = True ) : + if not first and others : + nevents = ds.shape[0] + for k , dd in enumerate ( others ) : + + wk = weight [ k + 1 ] + nn = 1.0 * dd.shape [ 0 ] if not wk else nEff ( dd [ wk ] ) + + for i , c in enumerate ( columns ) : + + mean, var = mean_var ( dd [ c ] , None if not wk else dd [ wk ] ) + vv = VE ( mean , var ) + vmean [ i ] = Ostap.Math.two_samples ( vmean [ i ] , nevents , vv , nn ) + + nevents += nn + + result = [] + for d in datasets : + + nds = d.copy () + for ic , c in enumerate ( columns ) : + vv = vmeans [ ic ] + mean = vv.value () + sigma = vv.error () + a = nds [ c ] + nds [ c ] = ( a - mean ) / sigma + + result.append ( nds ) + + return tuple ( result ) + + + +# ============================================================================= +if (3,0) <= sys.version_info : + # ========================================================================= + ## Get the "normalized" input datasets + # All floating felds are calculated as + # \f[ x = \frac{x - \left\langle x \right\rangle}{\sigma} \f] + # where \f$ \left\langle x \right\rangle\f$ is mena value + # and \f$ \sigma \f$ is a standard deviation. + # + # @code + # ds = ... # data set as structured array + # dsn = normalize ( ds ) + # @endcode + # + # - If several datasets are specified, all floating names must be the same + # and the mean and sigma are either taken either from the first dataset, + # if first=True or as combined through all datasets otherwise + # + # @code + # ds1 = ... # data set as structured array + # ds2 = ... # data set as structured array + # ds3 = ... # data set as structured array + # ds1n, ds2n, ds3n = normalize ( ds1 , ds2 , ds3 , first = True ) + # @endcode + # + # - If weight is specified, this floating column is considered + # as the weight + # + # @code + # ds = ... # data set as structured array with weight + # dsn = normalize ( ds , weight = 'weight' ) + # @endcode + # + # @code + # ds1 = ... # data set as structured array without weight + # ds2 = ... # data set as structured array with weight + # ds1n , ds2n = normalize ( ds1 , ds2 , weight = ( None , 'weight' ) ) + # @endcode + # + # @attention Only the floating point columns are transformed! + # @attention Input datasets are expected to be numpy structured arrays + # + # @code + # ds = ... # data set as structured array + # dsn = normalize ( ds ) + # @endcode + def normalize ( ds , *others , weight = () , first = True ) : """ Get the `normalized' input datasets All floating felds are calculated as @@ -152,68 +277,10 @@ def normalize ( ds , *others , weight = () , first = True ) : - attention Input datasets are expected to be numpy structured arrays """ - nd = len ( others ) + 1 - if not weight : weight = nd * [ '' ] - elif isinstance ( weight , string_types ) : weight = nd * [ weight ] - - assert ( len ( weight ) == nd ) and \ - all ( ( not w ) or isinstance ( w , string_types ) for w in weight ) , \ - 'Invalid specification of weight!' - - weight = list ( weight ) - for i , w in enumerate ( weight ) : - if not w : weight [ i ] = '' - weight = tuple ( weight ) - - data = ( ds , ) + others - result = [] - - ## collect the floating columns - columns = [] - w0 = weight [ 0 ] - for n,t in ds.dtype.fields.items () : - if t[0] in _np_floats and n != w0 : columns.append ( n ) - - - vmeans = [] - for i , c in enumerate ( columns ) : - mean, var = mean_var ( ds [ c ] , None if not w0 else ds [ w0 ] ) - vmeans.append ( VE ( mean , var ) ) - - ## Number of events/effective entries - nevents = 1.0 * ds.shape [ 0 ] if not w0 else nEff ( ds [ w0 ] ) - - if not first and others : - nevents = ds.shape[0] - for k , dd in enumerate ( others ) : - - wk = weight [ k + 1 ] - nn = 1.0 * dd.shape [ 0 ] if not wk else nEff ( dd [ wk ] ) - - for i , c in enumerate ( columns ) : - - mean, var = mean_var ( dd [ c ] , None if not wk else dd [ wk ] ) - vv = VE ( mean , var ) - vmean [ i ] = Ostap.Math.two_samples ( vmean [ i ] , nevents , vv , nn ) - - nevents += nn - - result = [] - for d in ( ds , *others ) : - - nds = d.copy () - for ic , c in enumerate ( columns ) : - vv = vmeans [ ic ] - mean = vv.value () - sigma = vv.error () - a = nds [ c ] - nds [ c ] = ( a - mean ) / sigma - - result.append ( nds ) - - return result [ 0 ] if not others else tuple ( result ) - + result = normalize2 ( ( ds , *others ) , weight = weight , first = first ) + return result [ 0 ] if not others else resut + # ============================================================================= if '__main__' == __name__ :