Skip to content

Commit

Permalink
fix?
Browse files Browse the repository at this point in the history
  • Loading branch information
VanyaBelyaev committed Dec 7, 2023
1 parent eeeef61 commit 37c8be2
Showing 1 changed file with 130 additions and 63 deletions.
193 changes: 130 additions & 63 deletions ostap/stats/gof.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,10 +129,135 @@ def nEff ( weights ) :
# ds = ... # data set as structured array
# dsn = normalize ( ds )
# @endcode
def normalize2 ( datasets , weight = () , first = True ) :
    """ Get the `normalized' input datasets

    All floating fields are calculated as
        x = (x - <x>)/sigma
    - <x>   is the mean value
    - sigma is the standard deviation

    - If several datasets are specified, all floating names must be the same
      and the mean and sigma are taken either from the first dataset,
      if `first=True`, or as combined through all datasets otherwise
    - If `weight` is specified, this floating column is considered
      as the weight
    - attention Only the floating point columns are transformed!
    - attention Input datasets are expected to be numpy structured arrays

    Arguments:
    - datasets : non-empty sequence of datasets (structured arrays)
    - weight   : nothing (no weights), a single column name (used for all
                 datasets) or a sequence of names, one per dataset;
                 an empty/None entry means "this dataset is unweighted"
    - first    : if True the mean and sigma are taken from the first dataset only,
                 otherwise they are combined through all datasets

    Returns a tuple with the normalized copies of the input datasets
    (same order, the inputs are not modified)
    """

    nd = len ( datasets )
    if   not weight                           : weight = nd * [ '' ]
    elif isinstance ( weight , string_types ) : weight = nd * [ weight ]

    assert ( len ( weight ) == nd ) and \
           all ( ( not w ) or isinstance ( w , string_types ) for w in weight ) , \
           'Invalid specification of weight!'

    ## replace all "empty" weight specifications (None, '', ...) by an empty string
    weight = tuple ( w if w else '' for w in weight )

    ds     = datasets [ 0  ]
    others = datasets [ 1: ]

    ## collect the floating columns (the weight column is never transformed)
    w0      = weight [ 0 ]
    columns = [ n for n , t in ds.dtype.fields.items () if t [ 0 ] in _np_floats and n != w0 ]

    ## mean & variance for each floating column of the first dataset
    wds    = ds [ w0 ] if w0 else None
    vmeans = []
    for c in columns :
        mean , var = mean_var ( ds [ c ] , wds )
        vmeans.append ( VE ( mean , var ) )

    ## Number of events/effective entries
    nevents = 1.0 * ds.shape [ 0 ] if not w0 else nEff ( ds [ w0 ] )

    if not first and others :
        ## update the statistics with all other datasets
        ## NOTE: `nevents` is deliberately NOT reset here: for the weighted first
        ##       dataset it must stay the number of effective entries
        for k , dd in enumerate ( others , start = 1 ) :

            wk  = weight [ k ]
            wdd = dd [ wk ] if wk else None
            nn  = 1.0 * dd.shape [ 0 ] if not wk else nEff ( wdd )

            for i , c in enumerate ( columns ) :
                mean , var   = mean_var ( dd [ c ] , wdd )
                vv           = VE ( mean , var )
                ## presumably combines the statistics of two samples - see Ostap.Math.two_samples
                vmeans [ i ] = Ostap.Math.two_samples ( vmeans [ i ] , nevents , vv , nn )

            nevents += nn

    result = []
    for d in datasets :

        ## work on the copy, keep the input untouched
        nds = d.copy ()
        for vv , c in zip ( vmeans , columns ) :
            mean    = vv.value ()
            sigma   = vv.error ()
            nds [ c ] = ( nds [ c ] - mean ) / sigma

        result.append ( nds )

    return tuple ( result )



# =============================================================================
if (3,0) <= sys.version_info :
# =========================================================================
## Get the "normalized" input datasets
# All floating fields are calculated as
# \f[ x = \frac{x - \left\langle x \right\rangle}{\sigma} \f]
# where \f$ \left\langle x \right\rangle\f$ is the mean value
# and \f$ \sigma \f$ is the standard deviation.
#
# @code
# ds = ... # data set as structured array
# dsn = normalize ( ds )
# @endcode
#
# - If several datasets are specified, all floating names must be the same
# and the mean and sigma are taken either from the first dataset,
# if <code>first=True</code>, or as combined through all datasets otherwise
#
# @code
# ds1 = ... # data set as structured array
# ds2 = ... # data set as structured array
# ds3 = ... # data set as structured array
# ds1n, ds2n, ds3n = normalize ( ds1 , ds2 , ds3 , first = True )
# @endcode
#
# - If <code>weight</code> is specified, this floating column is considered
# as the weight
#
# @code
# ds = ... # data set as structured array with weight
# dsn = normalize ( ds , weight = 'weight' )
# @endcode
#
# @code
# ds1 = ... # data set as structured array without weight
# ds2 = ... # data set as structured array with weight
# ds1n , ds2n = normalize ( ds1 , ds2 , weight = ( None , 'weight' ) )
# @endcode
#
# @attention Only the floating point columns are transformed!
# @attention Input datasets are expected to be numpy structured arrays
#
# @code
# ds = ... # data set as structured array
# dsn = normalize ( ds )
# @endcode
def normalize ( ds , *others , weight = () , first = True ) :
    """ Get the `normalized' input datasets

    All floating fields are calculated as
        x = (x - <x>)/sigma
    - <x>   is the mean value
    - sigma is the standard deviation

    >>> ds  = ... # data set as structured array
    >>> dsn = normalize ( ds )

    - If several datasets are specified, all floating names must be the same
      and the mean and sigma are taken either from the first dataset,
      if `first=True`, or as combined through all datasets otherwise

    >>> ds1n, ds2n, ds3n = normalize ( ds1 , ds2 , ds3 , first = True )

    - If `weight` is specified, this floating column is considered
      as the weight

    >>> dsn         = normalize ( ds , weight = 'weight' )
    >>> ds1n , ds2n = normalize ( ds1 , ds2 , weight = ( None , 'weight' ) )

    - attention Only the floating point columns are transformed!
    - attention Input datasets are expected to be numpy structured arrays

    Returns the normalized dataset if only one dataset is given,
    otherwise the tuple of normalized datasets (same order as the inputs)
    """

    ## all the actual work is delegated to `normalize2`
    result = normalize2 ( ( ds , *others ) , weight = weight , first = first )

    ## single input dataset: unpack the one-element tuple
    return result [ 0 ] if not others else result

# =============================================================================
if '__main__' == __name__ :

Expand Down

0 comments on commit 37c8be2

Please sign in to comment.