From d6a9cf68ac141dc6dc92b377882317ee075f08eb Mon Sep 17 00:00:00 2001
From: Ha Hong
Date: Wed, 18 Jul 2012 23:48:46 -0400
Subject: [PATCH 1/5] MISC: addressing most of the feedback in
 github.com/npinto/bangmetric/pull/8 (thanks @npinto!)

---
 bangmetric/dprime.py | 98 ++++++++++++++++++++++----------------------
 1 file changed, 50 insertions(+), 48 deletions(-)

diff --git a/bangmetric/dprime.py b/bangmetric/dprime.py
index 32d4799..ee3ff08 100644
--- a/bangmetric/dprime.py
+++ b/bangmetric/dprime.py
@@ -6,7 +6,7 @@
 #
 # License: BSD
 
-__all__ = ['dprime', 'dprime_from_confusion_ova']
+__all__ = ['dprime', 'dprime_from_samp', 'dprime_from_confusion_ova']
 
 import numpy as np
 from scipy.stats import norm
@@ -33,9 +33,8 @@ def dprime(y_pred, y_true, **kwargs):
 
     Returns
     -------
-    dp: float or None
-        d-prime, None if d-prime is undefined and raw d-prime value (``safedp=False``)
-        is not requested (default).
+    dp: float
+        d-prime
 
     References
     ----------
@@ -60,11 +59,11 @@ def dprime(y_pred, y_true, **kwargs):
     pos = y_pred[i_pos]
     neg = y_pred[i_neg]
 
-    dp = dprime_from_samp(pos, neg, bypass_nchk=True, **kwargs)
+    dp = dprime_from_samp(pos, neg, **kwargs)
     return dp
 
 
-def dprime_from_samp(pos, neg, maxv=None, minv=None, safedp=True, bypass_nchk=False):
+def dprime_from_samp(pos, neg, max_value=np.inf, min_value=-np.inf):
     """Computes the d-prime sensitivity index from positive and negative samples.
 
     Parameters
@@ -75,26 +74,16 @@ def dprime_from_samp(pos, neg, maxv=None, minv=None, safedp=True, bypass_nchk=Fa
     neg: array-like
         Negative sample values.
 
-    maxv: float, optional
-        Maximum possible d-prime value. If None (default), there's no limit on
-        the maximum value.
+    max_value: float, optional
+        Maximum possible d-prime value. Default is ``np.inf``.
 
-    minv: float, optional
-        Minimum possible d-prime value. If None (default), there's no limit.
-
-    safedp: bool, optional
-        If True (default), this function will return None if the resulting d-prime
-        value becomes non-finite.
-
-    bypass_nchk: bool, optional
-        If False (default), do not bypass the test to ensure that enough positive
-        and negatives samples are there for the variance estimation.
+    min_value: float, optional
+        Minimum possible d-prime value. Default is ``-np.inf``.
 
     Returns
     -------
-    dp: float or None
-        d-prime, None if d-prime is undefined and raw d-prime value (``safedp=False``)
-        is not requested (default).
+    dp: float
+        d-prime
 
     References
     ----------
@@ -104,9 +93,10 @@ def dprime_from_samp(pos, neg, maxv=None, minv=None, safedp=True, bypass_nchk=Fa
     pos = np.array(pos)
     neg = np.array(neg)
 
-    if not bypass_nchk:
-        assert pos.size > 1, 'Not enough positive samples to estimate the variance'
-        assert neg.size > 1, 'Not enough negative samples to estimate the variance'
+    if pos.size <= 1:
+        raise ValueError('Not enough positive samples to estimate the variance')
+    if neg.size <= 1:
+        raise ValueError('Not enough negative samples to estimate the variance')
 
     pos_mean = pos.mean()
     neg_mean = neg.mean()
@@ -117,22 +107,16 @@ def dprime_from_samp(pos, neg, maxv=None, minv=None, safedp=True, bypass_nchk=Fa
     div = np.sqrt((pos_var + neg_var) / 2.)
 
     # from Dan's suggestion about clipping d' values...
-    if maxv is None:
-        maxv = np.inf
-    if minv is None:
-        minv = -np.inf
-
-    dp = np.clip(num / div, minv, maxv)
-
-    if safedp and not np.isfinite(dp):
-        dp = None
+    dp = np.clip(num / div, min_value, max_value)
 
     return dp
 
 
 def dprime_from_confusion_ova(M, fudge_mode=DEFAULT_FUDGE_MODE, \
-        fudge_fac=DEFAULT_FUDGE_FACTOR):
+        fudge_factor=DEFAULT_FUDGE_FACTOR, max_value=np.inf, min_value=-np.inf):
     """Computes the one-vs-all d-prime sensitivity index of the confusion matrix.
+    This function is mostly useful when there is no access to internal
+    representations and/or decision making (as in human data).
 
     Parameters
     ----------
@@ -141,13 +125,21 @@ def dprime_from_confusion_ova(M, fudge_mode=DEFAULT_FUDGE_MODE, \
         times when the classifier guesses that a test sample in the r-th class
         belongs to the c-th class.
 
-    fudge_fac: float, optional
+    fudge_factor: float, optional
         A small factor to avoid non-finite numbers when TPR or FPR becomes 0 or 1.
 
     fudge_mode: str, optional
-        Determins how to apply the fudge factor
+        Determines how to apply the fudge factor. Can be one of:
             'always': always apply the fudge factor
             'correction': apply only when needed
+            'none': no fudging --- equivalent to ``fudge_factor=0``
+
+    max_value: float, optional
+        Maximum possible d-prime value. Default is ``np.inf``.
+
+    min_value: float, optional
+        Minimum possible d-prime value. Default is ``-np.inf``.
+
 
     Returns
     -------
@@ -170,26 +162,36 @@ def dprime_from_confusion_ova(M, fudge_mode=DEFAULT_FUDGE_MODE, \
     TP = np.diag(M)
     FP = np.sum(M, axis=0) - TP
 
-    if fudge_mode == 'always':  # always apply fudge factor
-        TPR = (TP.astype('float') + fudge_fac) / (P + 2.*fudge_fac)
-        FPR = (FP.astype('float') + fudge_fac) / (N + 2.*fudge_fac)
+
+    # -- application of fudge factor
+
+    if fudge_mode == 'none':    # no fudging
+        fudge_mode = 'always'
+        fudge_factor = 0
+
+    if fudge_mode == 'always':  # always apply fudge factor
+        TPR = (TP.astype('float64') + fudge_factor) / (P + 2.*fudge_factor)
+        FPR = (FP.astype('float64') + fudge_factor) / (N + 2.*fudge_factor)
 
     elif fudge_mode == 'correction':  # apply fudge factor only when needed
-        TP = TP.astype('float')
-        FP = FP.astype('float')
+        TP = TP.astype('float64')
+        FP = FP.astype('float64')
 
-        TP[TP == P] = P[TP == P] - fudge_fac    # 100% correct
-        TP[TP == 0] = fudge_fac                 # 0% correct
-        FP[FP == N] = N[FP == N] - fudge_fac    # always FAR
-        FP[FP == 0] = fudge_fac                 # no false alarm
+        TP[TP == P] = P[TP == P] - fudge_factor    # 100% correct
+        TP[TP == 0] = fudge_factor                 # 0% correct
+        FP[FP == N] = N[FP == N] - fudge_factor    # always FAR
+        FP[FP == 0] = fudge_factor                 # no false alarm
 
         TPR = TP / P
         FPR = FP / N
 
     else:
-        assert False, 'Not implemented'
+        raise ValueError('Invalid fudge_mode')
+
+
+    # -- done. compute the d'
 
-    dp = norm.ppf(TPR) - norm.ppf(FPR)
+    dp = np.clip(norm.ppf(TPR) - norm.ppf(FPR), min_value, max_value)
     # if there are only two dp's, then it must be an "A" vs. "~A" task. If so, just give one value
     if len(dp) == 2:
         dp = np.array([dp[0]])
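A usage sketch of the API as of this patch (illustrative only, not part of the
commit; the sample scores and clip bounds below are made up):

    import numpy as np
    from bangmetric.dprime import dprime_from_samp

    rng = np.random.RandomState(0)
    pos = rng.randn(100) + 1.    # decision values for positive samples
    neg = rng.randn(100)         # decision values for negative samples

    # d' between the two score distributions, clipped to [-5, 5]
    dp = dprime_from_samp(pos, neg, max_value=5., min_value=-5.)

    # fewer than two samples on either side leaves the variance undefined;
    # after this patch that raises ValueError instead of failing an assert:
    # dprime_from_samp([1.], neg)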
From 6f5cbac75972f0c06ad035f72dbdec67b9cee5f8 Mon Sep 17 00:00:00 2001
From: Ha Hong
Date: Wed, 18 Jul 2012 23:56:14 -0400
Subject: [PATCH 2/5] DOC: small retouches

---
 bangmetric/dprime.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/bangmetric/dprime.py b/bangmetric/dprime.py
index ee3ff08..147429a 100644
--- a/bangmetric/dprime.py
+++ b/bangmetric/dprime.py
@@ -127,11 +127,12 @@ def dprime_from_confusion_ova(M, fudge_mode=DEFAULT_FUDGE_MODE, \
 
     fudge_factor: float, optional
         A small factor to avoid non-finite numbers when TPR or FPR becomes 0 or 1.
+        Default is 0.5.
 
     fudge_mode: str, optional
         Determines how to apply the fudge factor. Can be one of:
-            'always': always apply the fudge factor
-            'correction': apply only when needed
+            'correction': apply only when needed (default)
+            'always': always apply the fudge factor
             'none': no fudging --- equivalent to ``fudge_factor=0``
 
     max_value: float, optional
@@ -144,7 +145,7 @@ def dprime_from_confusion_ova(M, fudge_mode=DEFAULT_FUDGE_MODE, \
     Returns
     -------
     dp: array, shape = [n_classes]
-        Array of d-primes, each element corresponding to each class
+        Array of d-primes, where each element corresponds to one class
 
     References
     ----------
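To make the documented defaults concrete, here is a sketch (not part of the
commit) using a made-up 3-class confusion matrix in which class 0 draws no
false alarms:

    import numpy as np
    from bangmetric.dprime import dprime_from_confusion_ova

    M = np.array([[8, 1, 1],     # rows: true classes
                  [0, 9, 1],     # columns: predicted classes
                  [0, 2, 8]])

    # defaults per the docstring: fudge_mode='correction', fudge_factor=0.5,
    # so class 0's zero false-positive count becomes 0.5 and d' stays finite
    dp = dprime_from_confusion_ova(M)

    # without fudging, class 0 has FPR == 0 and its d' diverges to +inf
    dp_raw = dprime_from_confusion_ova(M, fudge_mode='none')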
From afa86fe0dca8b842bb6c2328a5b05696849377c6 Mon Sep 17 00:00:00 2001
From: Ha Hong
Date: Thu, 19 Jul 2012 00:02:58 -0400
Subject: [PATCH 3/5] COSMIT

---
 bangmetric/dprime.py | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/bangmetric/dprime.py b/bangmetric/dprime.py
index 147429a..46fa009 100644
--- a/bangmetric/dprime.py
+++ b/bangmetric/dprime.py
@@ -162,37 +162,34 @@ def dprime_from_confusion_ova(M, fudge_mode=DEFAULT_FUDGE_MODE, \
 
     TP = np.diag(M)
     FP = np.sum(M, axis=0) - TP
-
+    TP = TP.astype('float64')
+    FP = FP.astype('float64')
 
     # -- application of fudge factor
-
     if fudge_mode == 'none':    # no fudging
         fudge_mode = 'always'
         fudge_factor = 0
 
     if fudge_mode == 'always':  # always apply fudge factor
-        TPR = (TP.astype('float64') + fudge_factor) / (P + 2.*fudge_factor)
-        FPR = (FP.astype('float64') + fudge_factor) / (N + 2.*fudge_factor)
+        TP += fudge_factor
+        FP += fudge_factor
+        P += 2.*fudge_factor
+        N += 2.*fudge_factor
 
     elif fudge_mode == 'correction':  # apply fudge factor only when needed
-        TP = TP.astype('float64')
-        FP = FP.astype('float64')
-
         TP[TP == P] = P[TP == P] - fudge_factor    # 100% correct
         TP[TP == 0] = fudge_factor                 # 0% correct
         FP[FP == N] = N[FP == N] - fudge_factor    # always FAR
         FP[FP == 0] = fudge_factor                 # no false alarm
 
-        TPR = TP / P
-        FPR = FP / N
-
     else:
         raise ValueError('Invalid fudge_mode')
 
-
     # -- done. compute the d'
-
+    TPR = TP / P
+    FPR = FP / N
     dp = np.clip(norm.ppf(TPR) - norm.ppf(FPR), min_value, max_value)
+
     # if there are only two dp's, then it must be an "A" vs. "~A" task. If so, just give one value
     if len(dp) == 2:
         dp = np.array([dp[0]])
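A hand-worked check of the 'correction' branch above, on one cell (not part
of the commit): with P = 10 positives that are all classified correctly,
TP == P, so TP is pulled off the boundary before the rate is taken:

    from scipy.stats import norm

    P, fudge_factor = 10., 0.5
    TP = P - fudge_factor     # the 100%-correct case becomes 9.5
    print(norm.ppf(TP / P))   # ppf(0.95) ~= 1.64, finite instead of ppf(1.0) = inf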
From 8babb28e085c1da481223581aba5610f54a307ae Mon Sep 17 00:00:00 2001
From: Ha Hong
Date: Thu, 19 Jul 2012 00:05:47 -0400
Subject: [PATCH 4/5] COSMIT

---
 bangmetric/dprime.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/bangmetric/dprime.py b/bangmetric/dprime.py
index 46fa009..9568d98 100644
--- a/bangmetric/dprime.py
+++ b/bangmetric/dprime.py
@@ -167,10 +167,9 @@ def dprime_from_confusion_ova(M, fudge_mode=DEFAULT_FUDGE_MODE, \
 
     # -- application of fudge factor
     if fudge_mode == 'none':    # no fudging
-        fudge_mode = 'always'
-        fudge_factor = 0
+        pass
 
-    if fudge_mode == 'always':  # always apply fudge factor
+    elif fudge_mode == 'always':  # always apply fudge factor
         TP += fudge_factor
         FP += fudge_factor
         P += 2.*fudge_factor
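The docstring's claim that 'none' is equivalent to a zero fudge factor can be
checked directly; a sketch with a made-up 2x2 matrix chosen without perfect
rows so both calls stay finite (note each call returns a single d' here,
because a 2x2 matrix is collapsed to one "A" vs. "~A" value, and the check
assumes the in-place float update of the integer counts P and N casts as it
did with contemporary NumPy):

    import numpy as np
    from bangmetric.dprime import dprime_from_confusion_ova

    M = np.array([[4, 1],
                  [2, 3]])
    a = dprime_from_confusion_ova(M, fudge_mode='none')
    b = dprime_from_confusion_ova(M, fudge_mode='always', fudge_factor=0.)
    assert np.allclose(a, b)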
From d6ab20be4e19fff4c8d6e5910de92964c2d5cfa1 Mon Sep 17 00:00:00 2001
From: Ha Hong
Date: Thu, 19 Jul 2012 01:54:31 -0400
Subject: [PATCH 5/5] ENH: more general dprime_from_confusion (thanks,
 @npinto!)

---
 bangmetric/dprime.py | 55 +++++++++++++++++++++++++++++++-------------
 1 file changed, 39 insertions(+), 16 deletions(-)

diff --git a/bangmetric/dprime.py b/bangmetric/dprime.py
index 9568d98..00f496e 100644
--- a/bangmetric/dprime.py
+++ b/bangmetric/dprime.py
@@ -6,7 +6,7 @@
 #
 # License: BSD
 
-__all__ = ['dprime', 'dprime_from_samp', 'dprime_from_confusion_ova']
+__all__ = ['dprime', 'dprime_from_samp', 'dprime_from_confusion']
 
 import numpy as np
 from scipy.stats import norm
@@ -112,7 +112,7 @@ def dprime_from_samp(pos, neg, max_value=np.inf, min_value=-np.inf):
     return dp
 
 
-def dprime_from_confusion_ova(M, fudge_mode=DEFAULT_FUDGE_MODE, \
+def dprime_from_confusion(M, collation=None, fudge_mode=DEFAULT_FUDGE_MODE, \
         fudge_factor=DEFAULT_FUDGE_FACTOR, max_value=np.inf, min_value=-np.inf):
     """Computes the one-vs-all d-prime sensitivity index of the confusion matrix.
     This function is mostly useful when there is no access to internal
     representations and/or decision making (as in human data).
 
     Parameters
     ----------
-    M: array, shape = [n_classes (true), n_classes (pred)]
+    M: array-like, shape = [n_classes (true), n_classes (pred)]
         Confusion matrix, where the element M_{rc} means the number of
         times when the classifier guesses that a test sample in the r-th class
         belongs to the c-th class.
 
+    collation: None (default) or array-like with shape = [n_grouping, n_classes]
+        Defines how to group entries in `M` to compute TPR and FPR.
+        Entries should be {+1, 0, -1}. A row defines one instance of grouping,
+        where +1, -1, and 0 designate the corresponding class as a
+        positive, negative, and ignored class, respectively. For example,
+        the following `collation` defines a 3-way one vs. rest grouping
+        (given that `M` is a 3x3 matrix):
+            [[+1, -1, -1],
+             [-1, +1, -1],
+             [-1, -1, +1]]
+        If `None` (default), one vs. rest grouping is assumed.
+
     fudge_factor: float, optional
         A small factor to avoid non-finite numbers when TPR or FPR becomes 0 or 1.
         Default is 0.5.
@@ -144,8 +156,9 @@ def dprime_from_confusion_ova(M, fudge_mode=DEFAULT_FUDGE_MODE, \
 
     Returns
     -------
-    dp: array, shape = [n_classes]
-        Array of d-primes, where each element corresponds to one class
+    dp: array, shape = [n_grouping]
+        Array of d-primes, where each element corresponds to one grouping
+        defined by `collation`.
 
     References
     ----------
     http://en.wikipedia.org/wiki/D'
     http://en.wikipedia.org/wiki/Confusion_matrix
     """
+    # M: confusion matrix, row means true classes, col means predicted classes
     M = np.array(M)
     assert M.ndim == 2
     assert M.shape[0] == M.shape[1]
-
-    P = np.sum(M, axis=1)   # number of positives, for each class
-    N = np.sum(P) - P
+    n_classes = M.shape[0]
 
-    TP = np.diag(M)
-    FP = np.sum(M, axis=0) - TP
-    TP = TP.astype('float64')
-    FP = FP.astype('float64')
+    if collation is None:
+        # make it one vs. rest
+        collation = -np.ones((n_classes, n_classes), dtype='int8')
+        collation += 2 * np.eye(n_classes, dtype='int8')
+    else:
+        collation = np.array(collation, dtype='int8')
+        assert collation.ndim == 2
+        assert collation.shape[1] == n_classes
+
+    # P0: number of positives, for each class
+    # P: number of positives, for each grouping
+    # N: number of negatives, for each grouping
+    # TP: number of true positives, for each grouping
+    # FP: number of false positives, for each grouping
+    P0 = np.sum(M, axis=1)
+    P = np.array([np.sum(P0[coll == +1]) for coll in collation], dtype='float64')
+    N = np.array([np.sum(P0[coll == -1]) for coll in collation], dtype='float64')
+    TP = np.array([np.sum(M[coll == +1][:, coll == +1]) for coll in collation], dtype='float64')
+    FP = np.array([np.sum(M[coll == -1][:, coll == +1]) for coll in collation], dtype='float64')
 
     # -- application of fudge factor
     if fudge_mode == 'none':    # no fudging
@@ -189,9 +216,5 @@ def dprime_from_confusion_ova(M, fudge_mode=DEFAULT_FUDGE_MODE, \
     FPR = FP / N
     dp = np.clip(norm.ppf(TPR) - norm.ppf(FPR), min_value, max_value)
 
-    # if there are only two dp's, then it must be an "A" vs. "~A" task. If so, just give one value
-    if len(dp) == 2:
-        dp = np.array([dp[0]])
-
     return dp
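With the final patch applied, a sketch of the generalized interface (the
matrix is made up; the explicit collation is the docstring's own one-vs-rest
example, so the first two calls should agree):

    import numpy as np
    from bangmetric.dprime import dprime_from_confusion

    M = np.array([[8, 1, 1],
                  [2, 7, 1],
                  [0, 2, 8]])

    dp_default = dprime_from_confusion(M)    # collation=None -> one vs. rest
    dp_explicit = dprime_from_confusion(M, collation=[[+1, -1, -1],
                                                      [-1, +1, -1],
                                                      [-1, -1, +1]])
    assert np.allclose(dp_default, dp_explicit)

    # a pairwise grouping: class 0 positive, class 2 negative, class 1 ignored
    dp_pair = dprime_from_confusion(M, collation=[[+1, 0, -1]])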