From 340f5d3229ea5fecc331abed86e5b3b1b9366247 Mon Sep 17 00:00:00 2001 From: Ha Hong Date: Mon, 16 Jul 2012 23:46:08 -0400 Subject: [PATCH 01/33] MISC: updating .gitignore --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index d7f1163..1caac9b 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,6 @@ __pycache__ .idea build +*.DS_Store +*~ +.*swp From fa1369250346b9eb3dc8950f302a0a0613503eb2 Mon Sep 17 00:00:00 2001 From: Ha Hong Date: Tue, 17 Jul 2012 12:20:48 -0400 Subject: [PATCH 02/33] ENH: added d-prime calculation from a confusion matrix --- bangmetric/dprime.py | 80 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 79 insertions(+), 1 deletion(-) diff --git a/bangmetric/dprime.py b/bangmetric/dprime.py index a80fcf2..2b757b9 100644 --- a/bangmetric/dprime.py +++ b/bangmetric/dprime.py @@ -2,13 +2,18 @@ # Authors: Nicolas Pinto # Nicolas Poilvert +# Ha Hong # # License: BSD -__all__ = ['dprime'] +__all__ = ['dprime', 'dprime_ova_from_confusion'] import numpy as np +from scipy.stats import norm +DEFAULT_FUDGE_FACTOR = 0.5 +DEFAULT_FUDGE_MODE = 'correction' +ATOL = 1e-7 def dprime(y_pred, y_true): """Computes the d-prime sensitivity index of the predictions. @@ -61,3 +66,76 @@ def dprime(y_pred, y_true): dp = num / div return dp + + +def dprime_ova_from_confusion(M, fudge_mode=DEFAULT_FUDGE_MODE, \ + fudge_fac=DEFAULT_FUDGE_FACTOR, atol=ATOL): + """Computes the one-vs-all d-prime sensitivity index of the confusion matrix. + + Parameters + ---------- + M: array, shape = [n_classes (true), n_classes (pred)] + Confusion matrix, where the element M_{rc} means the number of + times when the classifier guesses that a test sample in the r-th class + belongs to the c-th class. + + fudge_fac: float, optional + A small factor to avoid non-finite numbers when TPR or FPR becomes 0 or 1. + + fudge_mode: str, optional + Determins how to apply the fudge factor + 'always': always apply the fudge factor + 'correction': apply only when needed + + atol: float, optional + Tolerance to simplify the dp from a 2-way (i.e., 2x2) confusion matrix. + + Returns + ------- + dp: array, shape = [n_classes] + Array of d-primes, each element corresponding to each class + + References + ---------- + http://en.wikipedia.org/wiki/D' + http://en.wikipedia.org/wiki/Confusion_matrix + + XXX: no normalization for unbalanced data + """ + + M = np.array(M) + assert M.ndim == 2 + assert M.shape[0] == M.shape[1] + + P = np.sum(M, axis=1) # number of positives, for each class + N = np.sum(P) - P + + TP = np.diag(M) + FP = np.sum(M, axis=0) - TP + + if fudge_mode == 'always': # always apply fudge factor + TPR = (TP.astype('float') + fudge_fac) / (P + 2.*fudge_fac) + FPR = (FP.astype('float') + fudge_fac) / (N + 2.*fudge_fac) + + elif fudge_mode == 'correction': # apply fudge factor only when needed + TP = TP.astype('float') + FP = FP.astype('float') + + TP[TP == P] = P[TP == P] - fudge_fac # 100% correct + TP[TP == 0] = fudge_fac # 0% correct + FP[FP == N] = N[FP == N] - fudge_fac # always FAR + FP[FP == 0] = fudge_fac # no false alarm + + TPR = TP / P + FPR = FP / N + + else: + assert False, 'Not implemented' + + dp = norm.ppf(TPR) - norm.ppf(FPR) + # if there's only two dp's then, it's must be "A" vs. "~A" task. 
If so, just give one value + if len(dp) == 2 and np.abs(dp[0] - dp[1]) < atol: + dp = np.array([dp[0]]) + + return dp + From 0b00116c8c9d8f756912057b70515fe6f166a9aa Mon Sep 17 00:00:00 2001 From: Ha Hong Date: Tue, 17 Jul 2012 16:39:46 -0400 Subject: [PATCH 03/33] MISC: small cosmetics changes and assertions to check positives and negatives --- bangmetric/dprime.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/bangmetric/dprime.py b/bangmetric/dprime.py index 8400be0..bd15b87 100644 --- a/bangmetric/dprime.py +++ b/bangmetric/dprime.py @@ -15,6 +15,7 @@ DEFAULT_FUDGE_MODE = 'correction' ATOL = 1e-7 + def dprime(y_pred, y_true): """Computes the d-prime sensitivity index of the predictions. @@ -52,6 +53,10 @@ def dprime(y_pred, y_true): # -- actual computation pos = y_true > 0 neg = ~pos + + assert pos.sum() > 1, 'Not enough positives to estimate the variance' + assert neg.sum() > 1, 'Not enough negatives to estimate the variance' + pos_mean = y_pred[pos].mean() neg_mean = y_pred[neg].mean() pos_var = y_pred[pos].var(ddof=1) @@ -67,7 +72,7 @@ def dprime(y_pred, y_true): return dp -def dprime_ova_from_confusion(M, fudge_mode=DEFAULT_FUDGE_MODE, \ +def dprime_from_confusion_ova(M, fudge_mode=DEFAULT_FUDGE_MODE, \ fudge_fac=DEFAULT_FUDGE_FACTOR, atol=ATOL): """Computes the one-vs-all d-prime sensitivity index of the confusion matrix. From 69f09742023a77f717ad30b9ad48362b0422d494 Mon Sep 17 00:00:00 2001 From: Ha Hong Date: Wed, 18 Jul 2012 01:40:21 -0400 Subject: [PATCH 04/33] ENH: added d-prime calcualtion function that directly takes sample values (+ minor changes) --- tests should be added --- bangmetric/dprime.py | 89 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 74 insertions(+), 15 deletions(-) diff --git a/bangmetric/dprime.py b/bangmetric/dprime.py index bd15b87..3e5ac94 100644 --- a/bangmetric/dprime.py +++ b/bangmetric/dprime.py @@ -6,17 +6,17 @@ # # License: BSD -__all__ = ['dprime', 'dprime_ova_from_confusion'] +__all__ = ['dprime', 'dprime_from_confusion_ova'] import numpy as np from scipy.stats import norm DEFAULT_FUDGE_FACTOR = 0.5 DEFAULT_FUDGE_MODE = 'correction' -ATOL = 1e-7 +ATOL = 1e-6 -def dprime(y_pred, y_true): +def dprime(y_pred, y_true, **kwargs): """Computes the d-prime sensitivity index of the predictions. Parameters @@ -29,10 +29,14 @@ def dprime(y_pred, y_true): y_pred: array, shape = [n_samples] Predicted values (real). + kwargs: named arguments, optional + Passed to ``dprime_from_samp()``. + Returns ------- dp: float or None - d-prime, None if d-prime is undefined + d-prime, None if d-prime is undefined and raw d-prime value (``safedp=False``) + is not requested (default). References ---------- @@ -51,23 +55,78 @@ def dprime(y_pred, y_true): assert y_pred.ndim == 1 # -- actual computation - pos = y_true > 0 - neg = ~pos + i_pos = y_true > 0 + i_neg = ~i_pos + + pos = y_pred[i_pos] + neg = y_pred[i_neg] + + dp = dprime_from_samp(pos, neg, bypass_nchk=True, **kwargs) + return dp + + +def dprime_from_samp(pos, neg, maxv=None, minv=None, safedp=True, bypass_nchk=False): + """Computes the d-prime sensitivity index from positive and negative samples. + + Parameters + ---------- + pos: array-like + Positive sample values (e.g., raw projection values of the positive classifier). + + neg: array-like + Negative sample values. + + maxv: float, optional + Maximum possible d-prime value. If None (default), there's no limit on + the maximum value. + + minv: float, optional + Minimum possible d-prime value. 
If None (default), there's no limit. + + safedp: bool, optional + If True (default), this function will return None if the resulting d-prime + value becomes non-finite. + + bypass_nchk: bool, optional + If False (default), do not bypass the test to ensure that enough positive + and negatives samples are there for the variance estimation. - assert pos.sum() > 1, 'Not enough positives to estimate the variance' - assert neg.sum() > 1, 'Not enough negatives to estimate the variance' + Returns + ------- + dp: float or None + d-prime, None if d-prime is undefined and raw d-prime value (``safedp=False``) + is not requested (default). + + References + ---------- + http://en.wikipedia.org/wiki/D' + """ + + pos = np.array(pos) + neg = np.array(neg) - pos_mean = y_pred[pos].mean() - neg_mean = y_pred[neg].mean() - pos_var = y_pred[pos].var(ddof=1) - neg_var = y_pred[neg].var(ddof=1) + if not bypass_nchk: + assert pos.size > 1, 'Not enough positive samples to estimate the variance' + assert neg.size > 1, 'Not enough negative samples to estimate the variance' + + pos_mean = pos.mean() + neg_mean = neg.mean() + pos_var = pos.var(ddof=1) + neg_var = neg.var(ddof=1) num = pos_mean - neg_mean div = np.sqrt((pos_var + neg_var) / 2.) - if div == 0: + + # from Dan's suggestion about clipping d' values... + if maxv is None: + maxv = np.inf + if minv is None: + minv = -np.inf + + dp = np.clip(num / div, minv, maxv) + + if safedp and not np.isfinite(dp): dp = None - else: - dp = num / div return dp From 43cabf5ddeee7880f194cb75cd3f529eee3d3750 Mon Sep 17 00:00:00 2001 From: Ha Hong Date: Wed, 18 Jul 2012 12:57:43 -0400 Subject: [PATCH 05/33] MISC: small chanages for 2x2 confusion matrix d' calculation --- bangmetric/dprime.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/bangmetric/dprime.py b/bangmetric/dprime.py index 3e5ac94..eee5198 100644 --- a/bangmetric/dprime.py +++ b/bangmetric/dprime.py @@ -13,7 +13,6 @@ DEFAULT_FUDGE_FACTOR = 0.5 DEFAULT_FUDGE_MODE = 'correction' -ATOL = 1e-6 def dprime(y_pred, y_true, **kwargs): @@ -132,7 +131,7 @@ def dprime_from_samp(pos, neg, maxv=None, minv=None, safedp=True, bypass_nchk=Fa def dprime_from_confusion_ova(M, fudge_mode=DEFAULT_FUDGE_MODE, \ - fudge_fac=DEFAULT_FUDGE_FACTOR, atol=ATOL): + fudge_fac=DEFAULT_FUDGE_FACTOR): """Computes the one-vs-all d-prime sensitivity index of the confusion matrix. Parameters @@ -150,9 +149,6 @@ def dprime_from_confusion_ova(M, fudge_mode=DEFAULT_FUDGE_MODE, \ 'always': always apply the fudge factor 'correction': apply only when needed - atol: float, optional - Tolerance to simplify the dp from a 2-way (i.e., 2x2) confusion matrix. - Returns ------- dp: array, shape = [n_classes] @@ -197,7 +193,7 @@ def dprime_from_confusion_ova(M, fudge_mode=DEFAULT_FUDGE_MODE, \ dp = norm.ppf(TPR) - norm.ppf(FPR) # if there's only two dp's then, it's must be "A" vs. "~A" task. 
If so, just give one value - if len(dp) == 2 and np.abs(dp[0] - dp[1]) < atol: + if len(dp) == 2: dp = np.array([dp[0]]) return dp From 5f8f071451d85d7ca17e8000f67913f456501dd4 Mon Sep 17 00:00:00 2001 From: Ha Hong Date: Wed, 18 Jul 2012 12:59:39 -0400 Subject: [PATCH 06/33] MISC: no need to "balance" data for d' calculation --- bangmetric/dprime.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/bangmetric/dprime.py b/bangmetric/dprime.py index eee5198..32d4799 100644 --- a/bangmetric/dprime.py +++ b/bangmetric/dprime.py @@ -158,8 +158,6 @@ def dprime_from_confusion_ova(M, fudge_mode=DEFAULT_FUDGE_MODE, \ ---------- http://en.wikipedia.org/wiki/D' http://en.wikipedia.org/wiki/Confusion_matrix - - XXX: no normalization for unbalanced data """ M = np.array(M) From d6a9cf68ac141dc6dc92b377882317ee075f08eb Mon Sep 17 00:00:00 2001 From: Ha Hong Date: Wed, 18 Jul 2012 23:48:46 -0400 Subject: [PATCH 07/33] MISC: addressing most stuffs in github.com/npinto/bangmetric/pull/8 (thanks @npinto!) --- bangmetric/dprime.py | 98 ++++++++++++++++++++++---------------------- 1 file changed, 50 insertions(+), 48 deletions(-) diff --git a/bangmetric/dprime.py b/bangmetric/dprime.py index 32d4799..ee3ff08 100644 --- a/bangmetric/dprime.py +++ b/bangmetric/dprime.py @@ -6,7 +6,7 @@ # # License: BSD -__all__ = ['dprime', 'dprime_from_confusion_ova'] +__all__ = ['dprime', 'dprime_from_samp', 'dprime_from_confusion_ova'] import numpy as np from scipy.stats import norm @@ -33,9 +33,8 @@ def dprime(y_pred, y_true, **kwargs): Returns ------- - dp: float or None - d-prime, None if d-prime is undefined and raw d-prime value (``safedp=False``) - is not requested (default). + dp: float + d-prime References ---------- @@ -60,11 +59,11 @@ def dprime(y_pred, y_true, **kwargs): pos = y_pred[i_pos] neg = y_pred[i_neg] - dp = dprime_from_samp(pos, neg, bypass_nchk=True, **kwargs) + dp = dprime_from_samp(pos, neg, **kwargs) return dp -def dprime_from_samp(pos, neg, maxv=None, minv=None, safedp=True, bypass_nchk=False): +def dprime_from_samp(pos, neg, max_value=np.inf, min_value=-np.inf): """Computes the d-prime sensitivity index from positive and negative samples. Parameters @@ -75,26 +74,16 @@ def dprime_from_samp(pos, neg, maxv=None, minv=None, safedp=True, bypass_nchk=Fa neg: array-like Negative sample values. - maxv: float, optional - Maximum possible d-prime value. If None (default), there's no limit on - the maximum value. + max_value: float, optional + Maximum possible d-prime value. Default is ``np.inf``. - minv: float, optional - Minimum possible d-prime value. If None (default), there's no limit. - - safedp: bool, optional - If True (default), this function will return None if the resulting d-prime - value becomes non-finite. - - bypass_nchk: bool, optional - If False (default), do not bypass the test to ensure that enough positive - and negatives samples are there for the variance estimation. + min_value: float, optional + Minimum possible d-prime value. Default is ``-np.inf``. Returns ------- - dp: float or None - d-prime, None if d-prime is undefined and raw d-prime value (``safedp=False``) - is not requested (default). 
+ dp: float + d-prime References ---------- @@ -104,9 +93,10 @@ def dprime_from_samp(pos, neg, maxv=None, minv=None, safedp=True, bypass_nchk=Fa pos = np.array(pos) neg = np.array(neg) - if not bypass_nchk: - assert pos.size > 1, 'Not enough positive samples to estimate the variance' - assert neg.size > 1, 'Not enough negative samples to estimate the variance' + if pos.size <= 1: + raise ValueError('Not enough positive samples to estimate the variance') + if neg.size <= 1: + raise ValueError('Not enough negative samples to estimate the variance') pos_mean = pos.mean() neg_mean = neg.mean() @@ -117,22 +107,16 @@ def dprime_from_samp(pos, neg, maxv=None, minv=None, safedp=True, bypass_nchk=Fa div = np.sqrt((pos_var + neg_var) / 2.) # from Dan's suggestion about clipping d' values... - if maxv is None: - maxv = np.inf - if minv is None: - minv = -np.inf - - dp = np.clip(num / div, minv, maxv) - - if safedp and not np.isfinite(dp): - dp = None + dp = np.clip(num / div, min_value, max_value) return dp def dprime_from_confusion_ova(M, fudge_mode=DEFAULT_FUDGE_MODE, \ - fudge_fac=DEFAULT_FUDGE_FACTOR): + fudge_factor=DEFAULT_FUDGE_FACTOR, max_value=np.inf, min_value=-np.inf): """Computes the one-vs-all d-prime sensitivity index of the confusion matrix. + This function is mostly for when there is no access to internal representation + and/or decision making (like human data). Parameters ---------- @@ -141,13 +125,21 @@ def dprime_from_confusion_ova(M, fudge_mode=DEFAULT_FUDGE_MODE, \ times when the classifier guesses that a test sample in the r-th class belongs to the c-th class. - fudge_fac: float, optional + fudge_factor: float, optional A small factor to avoid non-finite numbers when TPR or FPR becomes 0 or 1. fudge_mode: str, optional - Determins how to apply the fudge factor + Determins how to apply the fudge factor. Can be one of: 'always': always apply the fudge factor 'correction': apply only when needed + 'none': no fudging --- equivalent to ``fudge_factor=0`` + + max_value: float, optional + Maximum possible d-prime value. Default is ``np.inf``. + + min_value: float, optional + Minimum possible d-prime value. Default is ``-np.inf``. + Returns ------- @@ -170,26 +162,36 @@ def dprime_from_confusion_ova(M, fudge_mode=DEFAULT_FUDGE_MODE, \ TP = np.diag(M) FP = np.sum(M, axis=0) - TP - if fudge_mode == 'always': # always apply fudge factor - TPR = (TP.astype('float') + fudge_fac) / (P + 2.*fudge_fac) - FPR = (FP.astype('float') + fudge_fac) / (N + 2.*fudge_fac) + + # -- application of fudge factor + + if fudge_mode == 'none': # no fudging + fudge_mode = 'always' + fudge_factor = 0 + + if fudge_mode == 'always': # always apply fudge factor + TPR = (TP.astype('float64') + fudge_factor) / (P + 2.*fudge_factor) + FPR = (FP.astype('float64') + fudge_factor) / (N + 2.*fudge_factor) elif fudge_mode == 'correction': # apply fudge factor only when needed - TP = TP.astype('float') - FP = FP.astype('float') + TP = TP.astype('float64') + FP = FP.astype('float64') - TP[TP == P] = P[TP == P] - fudge_fac # 100% correct - TP[TP == 0] = fudge_fac # 0% correct - FP[FP == N] = N[FP == N] - fudge_fac # always FAR - FP[FP == 0] = fudge_fac # no false alarm + TP[TP == P] = P[TP == P] - fudge_factor # 100% correct + TP[TP == 0] = fudge_factor # 0% correct + FP[FP == N] = N[FP == N] - fudge_factor # always FAR + FP[FP == 0] = fudge_factor # no false alarm TPR = TP / P FPR = FP / N else: - assert False, 'Not implemented' + raise ValueError('Invalid fudge_mode') + + + # -- done. 
compute the d' - dp = norm.ppf(TPR) - norm.ppf(FPR) + dp = np.clip(norm.ppf(TPR) - norm.ppf(FPR), min_value, max_value) # if there's only two dp's then, it's must be "A" vs. "~A" task. If so, just give one value if len(dp) == 2: dp = np.array([dp[0]]) From 6f5cbac75972f0c06ad035f72dbdec67b9cee5f8 Mon Sep 17 00:00:00 2001 From: Ha Hong Date: Wed, 18 Jul 2012 23:56:14 -0400 Subject: [PATCH 08/33] DOC: small retouches --- bangmetric/dprime.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/bangmetric/dprime.py b/bangmetric/dprime.py index ee3ff08..147429a 100644 --- a/bangmetric/dprime.py +++ b/bangmetric/dprime.py @@ -127,11 +127,12 @@ def dprime_from_confusion_ova(M, fudge_mode=DEFAULT_FUDGE_MODE, \ fudge_factor: float, optional A small factor to avoid non-finite numbers when TPR or FPR becomes 0 or 1. + Default is 0.5. fudge_mode: str, optional Determins how to apply the fudge factor. Can be one of: - 'always': always apply the fudge factor - 'correction': apply only when needed + 'correction': apply only when needed (default) + 'always': always apply the fudge factor 'none': no fudging --- equivalent to ``fudge_factor=0`` max_value: float, optional @@ -144,7 +145,7 @@ def dprime_from_confusion_ova(M, fudge_mode=DEFAULT_FUDGE_MODE, \ Returns ------- dp: array, shape = [n_classes] - Array of d-primes, each element corresponding to each class + Array of d-primes, where each element corresponds to each class References ---------- From afa86fe0dca8b842bb6c2328a5b05696849377c6 Mon Sep 17 00:00:00 2001 From: Ha Hong Date: Thu, 19 Jul 2012 00:02:58 -0400 Subject: [PATCH 09/33] COSMIT --- bangmetric/dprime.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/bangmetric/dprime.py b/bangmetric/dprime.py index 147429a..46fa009 100644 --- a/bangmetric/dprime.py +++ b/bangmetric/dprime.py @@ -162,37 +162,34 @@ def dprime_from_confusion_ova(M, fudge_mode=DEFAULT_FUDGE_MODE, \ TP = np.diag(M) FP = np.sum(M, axis=0) - TP - + TP = TP.astype('float64') + FP = FP.astype('float64') # -- application of fudge factor - if fudge_mode == 'none': # no fudging fudge_mode = 'always' fudge_factor = 0 if fudge_mode == 'always': # always apply fudge factor - TPR = (TP.astype('float64') + fudge_factor) / (P + 2.*fudge_factor) - FPR = (FP.astype('float64') + fudge_factor) / (N + 2.*fudge_factor) + TP += fudge_factor + FP += fudge_factor + P += 2.*fudge_factor + N += 2.*fudge_factor elif fudge_mode == 'correction': # apply fudge factor only when needed - TP = TP.astype('float64') - FP = FP.astype('float64') - TP[TP == P] = P[TP == P] - fudge_factor # 100% correct TP[TP == 0] = fudge_factor # 0% correct FP[FP == N] = N[FP == N] - fudge_factor # always FAR FP[FP == 0] = fudge_factor # no false alarm - TPR = TP / P - FPR = FP / N - else: raise ValueError('Invalid fudge_mode') - # -- done. compute the d' - + TPR = TP / P + FPR = FP / N dp = np.clip(norm.ppf(TPR) - norm.ppf(FPR), min_value, max_value) + # if there's only two dp's then, it's must be "A" vs. "~A" task. 
If so, just give one value if len(dp) == 2: dp = np.array([dp[0]]) From 8babb28e085c1da481223581aba5610f54a307ae Mon Sep 17 00:00:00 2001 From: Ha Hong Date: Thu, 19 Jul 2012 00:05:47 -0400 Subject: [PATCH 10/33] COSMIT --- bangmetric/dprime.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/bangmetric/dprime.py b/bangmetric/dprime.py index 46fa009..9568d98 100644 --- a/bangmetric/dprime.py +++ b/bangmetric/dprime.py @@ -167,10 +167,9 @@ def dprime_from_confusion_ova(M, fudge_mode=DEFAULT_FUDGE_MODE, \ # -- application of fudge factor if fudge_mode == 'none': # no fudging - fudge_mode = 'always' - fudge_factor = 0 + pass - if fudge_mode == 'always': # always apply fudge factor + elif fudge_mode == 'always': # always apply fudge factor TP += fudge_factor FP += fudge_factor P += 2.*fudge_factor From d6ab20be4e19fff4c8d6e5910de92964c2d5cfa1 Mon Sep 17 00:00:00 2001 From: Ha Hong Date: Thu, 19 Jul 2012 01:54:31 -0400 Subject: [PATCH 11/33] ENH: more general dprime_from_confusion (thanks, @npinto!) --- bangmetric/dprime.py | 55 +++++++++++++++++++++++++++++++------------- 1 file changed, 39 insertions(+), 16 deletions(-) diff --git a/bangmetric/dprime.py b/bangmetric/dprime.py index 9568d98..00f496e 100644 --- a/bangmetric/dprime.py +++ b/bangmetric/dprime.py @@ -6,7 +6,7 @@ # # License: BSD -__all__ = ['dprime', 'dprime_from_samp', 'dprime_from_confusion_ova'] +__all__ = ['dprime', 'dprime_from_samp', 'dprime_from_confusion'] import numpy as np from scipy.stats import norm @@ -112,7 +112,7 @@ def dprime_from_samp(pos, neg, max_value=np.inf, min_value=-np.inf): return dp -def dprime_from_confusion_ova(M, fudge_mode=DEFAULT_FUDGE_MODE, \ +def dprime_from_confusion(M, collation=None, fudge_mode=DEFAULT_FUDGE_MODE, \ fudge_factor=DEFAULT_FUDGE_FACTOR, max_value=np.inf, min_value=-np.inf): """Computes the one-vs-all d-prime sensitivity index of the confusion matrix. This function is mostly for when there is no access to internal representation @@ -120,11 +120,23 @@ def dprime_from_confusion_ova(M, fudge_mode=DEFAULT_FUDGE_MODE, \ Parameters ---------- - M: array, shape = [n_classes (true), n_classes (pred)] + M: array-like, shape = [n_classes (true), n_classes (pred)] Confusion matrix, where the element M_{rc} means the number of times when the classifier guesses that a test sample in the r-th class belongs to the c-th class. + collation: None (default) or array-like with shape = [n_grouping, n_classes] + Defines how to group entries in `M` to compute TPR and FPR. + Entries shoule be {+1, 0, -1}. A row defines one instance of grouping, + where +1, -1, and 0 designate the corresponding class as a + positive, negative, and ignored class, respectively. For example, + the following `collation` defines a 3-way one vs. rest grouping + (given that `M` is a 3x3 matrix): + [[+1, -1, -1], + [-1, +1, -1], + [-1, -1, +1]] + If `None` (default), one vs. rest grouping is assumed. + fudge_factor: float, optional A small factor to avoid non-finite numbers when TPR or FPR becomes 0 or 1. Default is 0.5. @@ -144,8 +156,9 @@ def dprime_from_confusion_ova(M, fudge_mode=DEFAULT_FUDGE_MODE, \ Returns ------- - dp: array, shape = [n_classes] - Array of d-primes, where each element corresponds to each class + dp: array, shape = [n_grouping] + Array of d-primes, where each element corresponds to each grouping + defined by `collation`. 
References ---------- @@ -153,17 +166,31 @@ def dprime_from_confusion_ova(M, fudge_mode=DEFAULT_FUDGE_MODE, \ http://en.wikipedia.org/wiki/Confusion_matrix """ + # M: confusion matrix, row means true classes, col means predicted classes M = np.array(M) assert M.ndim == 2 assert M.shape[0] == M.shape[1] - - P = np.sum(M, axis=1) # number of positives, for each class - N = np.sum(P) - P + n_classes = M.shape[0] - TP = np.diag(M) - FP = np.sum(M, axis=0) - TP - TP = TP.astype('float64') - FP = FP.astype('float64') + if collation is None: + # make it one vs. rest + collation = -np.ones((n_classes, n_classes), dtype='int8') + collation += 2 * np.eye(n_classes, dtype='int8') + else: + collation = np.array(collation, dtype='int8') + assert collation.ndim == 2 + assert collation.shape[1] == n_classes + + # P0: number of positives, for each class + # P: number of positives, for each grouping + # N: number of negatives, for each grouping + # TP: number of true positives, for each grouping + # FP: number of false positives, for each grouping + P0 = np.sum(M, axis=1) + P = np.array([np.sum(P0[coll == +1]) for coll in collation], dtype='float64') + N = np.array([np.sum(P0[coll == -1]) for coll in collation], dtype='float64') + TP = np.array([np.sum(M[coll == +1][:, coll == +1]) for coll in collation], dtype='float64') + FP = np.array([np.sum(M[coll == -1][:, coll == +1]) for coll in collation], dtype='float64') # -- application of fudge factor if fudge_mode == 'none': # no fudging @@ -189,9 +216,5 @@ def dprime_from_confusion_ova(M, fudge_mode=DEFAULT_FUDGE_MODE, \ FPR = FP / N dp = np.clip(norm.ppf(TPR) - norm.ppf(FPR), min_value, max_value) - # if there's only two dp's then, it's must be "A" vs. "~A" task. If so, just give one value - if len(dp) == 2: - dp = np.array([dp[0]]) - return dp From 60814d8f5c72884a18eab577d4360b06d65cdd58 Mon Sep 17 00:00:00 2001 From: Ha Hong Date: Thu, 19 Jul 2012 15:17:16 -0400 Subject: [PATCH 12/33] COSMIT --- bangmetric/dprime.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bangmetric/dprime.py b/bangmetric/dprime.py index 00f496e..8d8521c 100644 --- a/bangmetric/dprime.py +++ b/bangmetric/dprime.py @@ -114,7 +114,7 @@ def dprime_from_samp(pos, neg, max_value=np.inf, min_value=-np.inf): def dprime_from_confusion(M, collation=None, fudge_mode=DEFAULT_FUDGE_MODE, \ fudge_factor=DEFAULT_FUDGE_FACTOR, max_value=np.inf, min_value=-np.inf): - """Computes the one-vs-all d-prime sensitivity index of the confusion matrix. + """Computes the d-prime sensitivity index of the confusion matrix. This function is mostly for when there is no access to internal representation and/or decision making (like human data). @@ -125,7 +125,7 @@ def dprime_from_confusion(M, collation=None, fudge_mode=DEFAULT_FUDGE_MODE, \ times when the classifier guesses that a test sample in the r-th class belongs to the c-th class. - collation: None (default) or array-like with shape = [n_grouping, n_classes] + collation: None or array-like with shape = [n_groupings, n_classes], optional Defines how to group entries in `M` to compute TPR and FPR. Entries shoule be {+1, 0, -1}. A row defines one instance of grouping, where +1, -1, and 0 designate the corresponding class as a @@ -156,7 +156,7 @@ def dprime_from_confusion(M, collation=None, fudge_mode=DEFAULT_FUDGE_MODE, \ Returns ------- - dp: array, shape = [n_grouping] + dp: array, shape = [n_groupings] Array of d-primes, where each element corresponds to each grouping defined by `collation`. 
From 056aa5e3f3d8918f18890bcb55a667276a85442d Mon Sep 17 00:00:00 2001 From: Ha Hong Date: Thu, 19 Jul 2012 16:38:57 -0400 Subject: [PATCH 13/33] ENH: refactoring out a function that computes stats of a confu matrix. --- bangmetric/utils.py | 130 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 130 insertions(+) create mode 100644 bangmetric/utils.py diff --git a/bangmetric/utils.py b/bangmetric/utils.py new file mode 100644 index 0000000..2ae01c3 --- /dev/null +++ b/bangmetric/utils.py @@ -0,0 +1,130 @@ +"""Other utility functions""" + +# Authors: Ha Hong +# +# License: BSD + +__all__ = ['confusion_stats'] + +import numpy as np + +DEFAULT_FUDGE_FACTOR = 0.5 +DEFAULT_FUDGE_MODE = 'correction' + + +def confusion_stats(M, collation=None, \ + fudge_mode=DEFAULT_FUDGE_MODE, fudge_factor=DEFAULT_FUDGE_FACTOR): + """Computes classification statistics of sub-confusion matrices inside + the given original confusion matrix M. If no ``collation`` is given, + statistics for each one vs. rest sub-confusion matrix will be computed. + + Parameters + ---------- + M: array-like, shape = [n_classes (true), n_classes (pred)] + Confusion matrix, where the element M_{rc} means the number of + times when the classifier guesses that a test sample in the r-th class + belongs to the c-th class. + + collation: None or array-like with shape = [n_groupings, n_classes], optional + Defines how to group entries in `M` to compute TPR and FPR. + Entries shoule be {+1, 0, -1}. A row defines one instance of grouping, + where +1, -1, and 0 designate the corresponding class as a + positive, negative, and ignored class, respectively. For example, + the following `collation` defines a 3-way one vs. rest grouping + (given that `M` is a 3x3 matrix): + [[+1, -1, -1], + [-1, +1, -1], + [-1, -1, +1]] + If `None` (default), one vs. rest grouping is assumed. + + fudge_factor: float, optional + A small factor to avoid non-finite numbers when TPR or FPR becomes 0 or 1. + Default is 0.5. + + fudge_mode: str, optional + Determins how to apply the fudge factor. Can be one of: + 'correction': apply only when needed (default) + 'always': always apply the fudge factor + 'none': no fudging --- equivalent to ``fudge_factor=0`` + + + Returns + ------- + P: array, shape = [n_groupings] + Array of the number of positives, where each element corresponds to each + grouping defined by `collation`. + N: array, shape = [n_groupings] + Same as P, except that this is an array of the number of negatives. + TP: array, shape = [n_groupings] + Same as P, except that this is an array of the number of true positives. + TN: array, shape = [n_groupings] + Same as P, except that this is an array of the number of true negatives. + FP: array, shape = [n_groupings] + Same as P, except that this is an array of the number of false positives. + FN: array, shape = [n_groupings] + Same as P, except that this is an array of the number of false negatives. + + + References + ---------- + http://en.wikipedia.org/wiki/Confusion_matrix + http://en.wikipedia.org/wiki/Receiver_operating_characteristic + """ + + # M: confusion matrix, row means true classes, col means predicted classes + M = np.array(M) + assert M.ndim == 2 + assert M.shape[0] == M.shape[1] + n_classes = M.shape[0] + + if collation is None: + # make it one vs. 
rest + collation = -np.ones((n_classes, n_classes), dtype='int8') + collation += 2 * np.eye(n_classes, dtype='int8') + else: + collation = np.array(collation, dtype='int8') + assert collation.ndim == 2 + assert collation.shape[1] == n_classes + + # P0: number of positives, for each class + # P: number of positives, for each grouping + # N: number of negatives, for each grouping + # TP: number of true positives, for each grouping + # FP: number of false positives, for each grouping + P0 = np.sum(M, axis=1) + P = np.array([np.sum(P0[coll == +1]) for coll in collation], dtype='float64') + N = np.array([np.sum(P0[coll == -1]) for coll in collation], dtype='float64') + TP = np.array([np.sum(M[coll == +1][:, coll == +1]) for coll in collation], dtype='float64') + TN = np.array([np.sum(M[coll == -1][:, coll == -1]) for coll in collation], dtype='float64') + FP = np.array([np.sum(M[coll == -1][:, coll == +1]) for coll in collation], dtype='float64') + FN = np.array([np.sum(M[coll == +1][:, coll == -1]) for coll in collation], dtype='float64') + + # -- application of fudge factor + if fudge_mode == 'none': # no fudging + pass + + elif fudge_mode == 'always': # always apply fudge factor + TP += fudge_factor + FP += fudge_factor + TN += fudge_factor + FN += fudge_factor + P += 2.*fudge_factor + N += 2.*fudge_factor + + elif fudge_mode == 'correction': # apply fudge factor only when needed + TP[TP == P] = P[TP == P] - fudge_factor # 100% correct + TP[TP == 0] = fudge_factor # 0% correct + FP[FP == N] = N[FP == N] - fudge_factor # always FAR + FP[FP == 0] = fudge_factor # no false alarm + + TN[TN == N] = N[TN == N] - fudge_factor + TN[TN == 0] = fudge_factor + FN[FN == P] = P[FN == P] - fudge_factor + FN[FN == 0] = fudge_factor + + else: + raise ValueError('Invalid fudge_mode') + + # -- done + return P, N, TP, TN, FP, FN + From 396224ba48336b15e7f8cd6800c86049a040a927 Mon Sep 17 00:00:00 2001 From: Ha Hong Date: Thu, 19 Jul 2012 16:41:25 -0400 Subject: [PATCH 14/33] COSMIT: refactoring confusion matrix handling part --- bangmetric/dprime.py | 89 +++++++------------------------------------- 1 file changed, 14 insertions(+), 75 deletions(-) diff --git a/bangmetric/dprime.py b/bangmetric/dprime.py index 8d8521c..6b2f44d 100644 --- a/bangmetric/dprime.py +++ b/bangmetric/dprime.py @@ -10,9 +10,7 @@ import numpy as np from scipy.stats import norm - -DEFAULT_FUDGE_FACTOR = 0.5 -DEFAULT_FUDGE_MODE = 'correction' +from .utils import confusion_stats def dprime(y_pred, y_true, **kwargs): @@ -112,11 +110,12 @@ def dprime_from_samp(pos, neg, max_value=np.inf, min_value=-np.inf): return dp -def dprime_from_confusion(M, collation=None, fudge_mode=DEFAULT_FUDGE_MODE, \ - fudge_factor=DEFAULT_FUDGE_FACTOR, max_value=np.inf, min_value=-np.inf): - """Computes the d-prime sensitivity index of the confusion matrix. - This function is mostly for when there is no access to internal representation - and/or decision making (like human data). +def dprime_from_confusion(M, max_value=np.inf, min_value=-np.inf, **kwargs): + """Computes the d-prime sensitivity index of the given confusion matrix. + This function is designed mostly for when there is no access to internal + representations and/or decision making mechanisms (like human data). + If no ``collation`` is defined in ``kwargs`` this function computes + one vs. rest d-prime for each class. 
Parameters ---------- @@ -125,34 +124,17 @@ def dprime_from_confusion(M, collation=None, fudge_mode=DEFAULT_FUDGE_MODE, \ times when the classifier guesses that a test sample in the r-th class belongs to the c-th class. - collation: None or array-like with shape = [n_groupings, n_classes], optional - Defines how to group entries in `M` to compute TPR and FPR. - Entries shoule be {+1, 0, -1}. A row defines one instance of grouping, - where +1, -1, and 0 designate the corresponding class as a - positive, negative, and ignored class, respectively. For example, - the following `collation` defines a 3-way one vs. rest grouping - (given that `M` is a 3x3 matrix): - [[+1, -1, -1], - [-1, +1, -1], - [-1, -1, +1]] - If `None` (default), one vs. rest grouping is assumed. - - fudge_factor: float, optional - A small factor to avoid non-finite numbers when TPR or FPR becomes 0 or 1. - Default is 0.5. - - fudge_mode: str, optional - Determins how to apply the fudge factor. Can be one of: - 'correction': apply only when needed (default) - 'always': always apply the fudge factor - 'none': no fudging --- equivalent to ``fudge_factor=0`` - max_value: float, optional Maximum possible d-prime value. Default is ``np.inf``. min_value: float, optional Minimum possible d-prime value. Default is ``-np.inf``. + kwargs: named arguments, optional + Passed to ``confusion_stats()``. By passing ``collation``, ``fudge_mode``, + ``fudge_factor``, etc. one can change the behavior of d-prime computation + (see ``confusion_stats()`` for details). + Returns ------- @@ -167,51 +149,8 @@ def dprime_from_confusion(M, collation=None, fudge_mode=DEFAULT_FUDGE_MODE, \ """ # M: confusion matrix, row means true classes, col means predicted classes - M = np.array(M) - assert M.ndim == 2 - assert M.shape[0] == M.shape[1] - n_classes = M.shape[0] - - if collation is None: - # make it one vs. rest - collation = -np.ones((n_classes, n_classes), dtype='int8') - collation += 2 * np.eye(n_classes, dtype='int8') - else: - collation = np.array(collation, dtype='int8') - assert collation.ndim == 2 - assert collation.shape[1] == n_classes - - # P0: number of positives, for each class - # P: number of positives, for each grouping - # N: number of negatives, for each grouping - # TP: number of true positives, for each grouping - # FP: number of false positives, for each grouping - P0 = np.sum(M, axis=1) - P = np.array([np.sum(P0[coll == +1]) for coll in collation], dtype='float64') - N = np.array([np.sum(P0[coll == -1]) for coll in collation], dtype='float64') - TP = np.array([np.sum(M[coll == +1][:, coll == +1]) for coll in collation], dtype='float64') - FP = np.array([np.sum(M[coll == -1][:, coll == +1]) for coll in collation], dtype='float64') - - # -- application of fudge factor - if fudge_mode == 'none': # no fudging - pass - - elif fudge_mode == 'always': # always apply fudge factor - TP += fudge_factor - FP += fudge_factor - P += 2.*fudge_factor - N += 2.*fudge_factor - - elif fudge_mode == 'correction': # apply fudge factor only when needed - TP[TP == P] = P[TP == P] - fudge_factor # 100% correct - TP[TP == 0] = fudge_factor # 0% correct - FP[FP == N] = N[FP == N] - fudge_factor # always FAR - FP[FP == 0] = fudge_factor # no false alarm - - else: - raise ValueError('Invalid fudge_mode') - - # -- done. 
compute the d' + P, N, TP, _, FP, _ = confusion_stats(M, **kwargs) + TPR = TP / P FPR = FP / N dp = np.clip(norm.ppf(TPR) - norm.ppf(FPR), min_value, max_value) From ad8e3afae5bf28d07f4fbd1353fc69cd380d6218 Mon Sep 17 00:00:00 2001 From: Ha Hong Date: Thu, 19 Jul 2012 17:00:08 -0400 Subject: [PATCH 15/33] COSMIT --- bangmetric/utils.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/bangmetric/utils.py b/bangmetric/utils.py index 2ae01c3..a687aa6 100644 --- a/bangmetric/utils.py +++ b/bangmetric/utils.py @@ -26,7 +26,7 @@ def confusion_stats(M, collation=None, \ belongs to the c-th class. collation: None or array-like with shape = [n_groupings, n_classes], optional - Defines how to group entries in `M` to compute TPR and FPR. + Defines how to group entries in `M` to make sub-confusion matrices. Entries shoule be {+1, 0, -1}. A row defines one instance of grouping, where +1, -1, and 0 designate the corresponding class as a positive, negative, and ignored class, respectively. For example, @@ -78,7 +78,10 @@ def confusion_stats(M, collation=None, \ n_classes = M.shape[0] if collation is None: - # make it one vs. rest + # make it one vs. rest. E.g., for a 3-classes case: + # [[+1, -1, -1], + # [-1, +1, -1], + # [-1, -1, +1]] collation = -np.ones((n_classes, n_classes), dtype='int8') collation += 2 * np.eye(n_classes, dtype='int8') else: From b1d8b77de7456c837650c91df92793e5caa12b1f Mon Sep 17 00:00:00 2001 From: Ha Hong Date: Thu, 19 Jul 2012 17:05:50 -0400 Subject: [PATCH 16/33] DOC: small changes --- bangmetric/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bangmetric/utils.py b/bangmetric/utils.py index a687aa6..482dbf5 100644 --- a/bangmetric/utils.py +++ b/bangmetric/utils.py @@ -38,8 +38,8 @@ def confusion_stats(M, collation=None, \ If `None` (default), one vs. rest grouping is assumed. fudge_factor: float, optional - A small factor to avoid non-finite numbers when TPR or FPR becomes 0 or 1. - Default is 0.5. + A small factor to avoid TPR, FPR, TNR, or FNR becoming 0 or 1. + Mostly intended for d-prime calculation. Default is 0.5. fudge_mode: str, optional Determins how to apply the fudge factor. Can be one of: From b0d58c17871e646199b8397942a89c84c3e6fc9d Mon Sep 17 00:00:00 2001 From: Ha Hong Date: Thu, 19 Jul 2012 17:09:21 -0400 Subject: [PATCH 17/33] DOC: small changes --- bangmetric/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bangmetric/utils.py b/bangmetric/utils.py index 482dbf5..fc75db3 100644 --- a/bangmetric/utils.py +++ b/bangmetric/utils.py @@ -52,7 +52,7 @@ def confusion_stats(M, collation=None, \ ------- P: array, shape = [n_groupings] Array of the number of positives, where each element corresponds to each - grouping defined by `collation`. + grouping (row) defined by `collation`. N: array, shape = [n_groupings] Same as P, except that this is an array of the number of negatives. 
TP: array, shape = [n_groupings] From 15295c56001921730501f6d01f23d3aecf4afffe Mon Sep 17 00:00:00 2001 From: Ha Hong Date: Fri, 20 Jul 2012 23:00:17 -0400 Subject: [PATCH 18/33] COSMIT: combined dprime() and dprime_from_samp() --- bangmetric/dprime.py | 127 ++++++++++++++++++++++++------------------- 1 file changed, 71 insertions(+), 56 deletions(-) diff --git a/bangmetric/dprime.py b/bangmetric/dprime.py index 6b2f44d..59c8e94 100644 --- a/bangmetric/dprime.py +++ b/bangmetric/dprime.py @@ -6,70 +6,37 @@ # # License: BSD -__all__ = ['dprime', 'dprime_from_samp', 'dprime_from_confusion'] +__all__ = ['dprime', 'dprime_from_confusion_matrix'] import numpy as np from scipy.stats import norm from .utils import confusion_stats -def dprime(y_pred, y_true, **kwargs): - """Computes the d-prime sensitivity index of the predictions. +def dprime(y_pred=None, y_true=None, pos=None, neg=None, max_value=np.inf, min_value=-np.inf): + """Computes the d-prime sensitivity index. + One must provide either y_pred and y_true or pos and neg. + This function computes the d-prime of predictions given by + y_pred and y_true by default. If pos and neg are provided + both y_pred and y_true are ignored and this function + computes the d-prime from positive and negative samples + given by pos and neg. Parameters ---------- - y_true: array, shape = [n_samples] + y_true: array, shape = [n_samples], optional True values, interpreted as strictly positive or not (i.e. converted to binary). Could be in {-1, +1} or {0, 1} or {False, True}. - y_pred: array, shape = [n_samples] + y_pred: array, shape = [n_samples], optional Predicted values (real). - kwargs: named arguments, optional - Passed to ``dprime_from_samp()``. - - Returns - ------- - dp: float - d-prime - - References - ---------- - http://en.wikipedia.org/wiki/D' - """ - - # -- basic checks and conversion - assert len(y_true) == len(y_pred) - assert np.isfinite(y_true).all() - assert np.isfinite(y_pred).all() + pos: array-like, optional + Positive sample values (e.g., raw projection values of + the positive classifier). - y_true = np.array(y_true) - assert y_true.ndim == 1 - - y_pred = np.array(y_pred) - assert y_pred.ndim == 1 - - # -- actual computation - i_pos = y_true > 0 - i_neg = ~i_pos - - pos = y_pred[i_pos] - neg = y_pred[i_neg] - - dp = dprime_from_samp(pos, neg, **kwargs) - return dp - - -def dprime_from_samp(pos, neg, max_value=np.inf, min_value=-np.inf): - """Computes the d-prime sensitivity index from positive and negative samples. - - Parameters - ---------- - pos: array-like - Positive sample values (e.g., raw projection values of the positive classifier). - - neg: array-like + neg: array-like, optional Negative sample values. 
max_value: float, optional @@ -88,14 +55,37 @@ def dprime_from_samp(pos, neg, max_value=np.inf, min_value=-np.inf): http://en.wikipedia.org/wiki/D' """ - pos = np.array(pos) - neg = np.array(neg) + # -- basic checks and conversion + if pos is not None and neg is not None: + pos = np.array(pos) + neg = np.array(neg) + + else: + assert len(y_true) == len(y_pred) + assert np.isfinite(y_true).all() + + y_true = np.array(y_true) + assert y_true.ndim == 1 + + y_pred = np.array(y_pred) + assert y_pred.ndim == 1 + + # -- actual computation + i_pos = y_true > 0 + i_neg = ~i_pos + + pos = y_pred[i_pos] + neg = y_pred[i_neg] + + assert np.isfinite(pos).all() + assert np.isfinite(neg).all() if pos.size <= 1: raise ValueError('Not enough positive samples to estimate the variance') if neg.size <= 1: raise ValueError('Not enough negative samples to estimate the variance') + # -- compute d' pos_mean = pos.mean() neg_mean = neg.mean() pos_var = pos.var(ddof=1) @@ -110,8 +100,8 @@ def dprime_from_samp(pos, neg, max_value=np.inf, min_value=-np.inf): return dp -def dprime_from_confusion(M, max_value=np.inf, min_value=-np.inf, **kwargs): - """Computes the d-prime sensitivity index of the given confusion matrix. +def dprime_from_confusion_matrix(M, max_value=np.inf, min_value=-np.inf, **kwargs): + """Computes the d-prime sensitivity indices of the given confusion matrix. This function is designed mostly for when there is no access to internal representations and/or decision making mechanisms (like human data). If no ``collation`` is defined in ``kwargs`` this function computes @@ -121,8 +111,8 @@ def dprime_from_confusion(M, max_value=np.inf, min_value=-np.inf, **kwargs): ---------- M: array-like, shape = [n_classes (true), n_classes (pred)] Confusion matrix, where the element M_{rc} means the number of - times when the classifier guesses that a test sample in the r-th class - belongs to the c-th class. + times when the classifier/subject guesses that a test sample in + the r-th class belongs to the c-th class. max_value: float, optional Maximum possible d-prime value. Default is ``np.inf``. @@ -135,12 +125,11 @@ def dprime_from_confusion(M, max_value=np.inf, min_value=-np.inf, **kwargs): ``fudge_factor``, etc. one can change the behavior of d-prime computation (see ``confusion_stats()`` for details). - Returns ------- dp: array, shape = [n_groupings] Array of d-primes, where each element corresponds to each grouping - defined by `collation`. + defined by `collation` (see ``confusion_stats()`` for details). References ---------- @@ -157,3 +146,29 @@ def dprime_from_confusion(M, max_value=np.inf, min_value=-np.inf, **kwargs): return dp + + """Computes the population d-primes from the given set of confusion matrices. + Note: it is advised to read the documentation of ``dprime_from_confusion()`` + for understanding of ``kwargs``. + + Parameters + ---------- + M: array-like, shape = [n_individuals, n_classes (true), n_classes (pred)] + Set of confusion matrices, where the element M_{irc} means the number of + times when the i-th individual guesses that a test sample in the r-th class + belongs to the c-th class. + + kwargs: named arguments, optional + Passed to ``dprime_from_confusion()``. + + Returns + ------- + dp: array, shape = [n_groupings] + Array of population d-primes, where each element corresponds to each + grouping defined by `collation` (see ``confusion_stats()`` for details). 
+ + References + ---------- + http://en.wikipedia.org/wiki/D' + http://en.wikipedia.org/wiki/Confusion_matrix + """ From 69f89ec3730619e8d37c54a5a1ec77f4f43406cd Mon Sep 17 00:00:00 2001 From: Ha Hong Date: Fri, 20 Jul 2012 23:02:19 -0400 Subject: [PATCH 19/33] COSMIT --- bangmetric/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bangmetric/utils.py b/bangmetric/utils.py index fc75db3..adfbdb9 100644 --- a/bangmetric/utils.py +++ b/bangmetric/utils.py @@ -4,7 +4,7 @@ # # License: BSD -__all__ = ['confusion_stats'] +__all__ = ['confusion_matrix_stats'] import numpy as np @@ -12,7 +12,7 @@ DEFAULT_FUDGE_MODE = 'correction' -def confusion_stats(M, collation=None, \ +def confusion_matrix_stats(M, collation=None, \ fudge_mode=DEFAULT_FUDGE_MODE, fudge_factor=DEFAULT_FUDGE_FACTOR): """Computes classification statistics of sub-confusion matrices inside the given original confusion matrix M. If no ``collation`` is given, From 341d29a158e7a9caf11ebdadd68c73595ff2f774 Mon Sep 17 00:00:00 2001 From: Ha Hong Date: Sat, 21 Jul 2012 00:36:57 -0400 Subject: [PATCH 20/33] COSMIT --- bangmetric/dprime.py | 35 +++++------------------------------ 1 file changed, 5 insertions(+), 30 deletions(-) diff --git a/bangmetric/dprime.py b/bangmetric/dprime.py index 59c8e94..73757ec 100644 --- a/bangmetric/dprime.py +++ b/bangmetric/dprime.py @@ -17,8 +17,8 @@ def dprime(y_pred=None, y_true=None, pos=None, neg=None, max_value=np.inf, min_v """Computes the d-prime sensitivity index. One must provide either y_pred and y_true or pos and neg. This function computes the d-prime of predictions given by - y_pred and y_true by default. If pos and neg are provided - both y_pred and y_true are ignored and this function + y_pred and y_true by default. If pos and neg are provided, + both y_pred and y_true are ignored, and this function computes the d-prime from positive and negative samples given by pos and neg. @@ -37,7 +37,8 @@ def dprime(y_pred=None, y_true=None, pos=None, neg=None, max_value=np.inf, min_v the positive classifier). neg: array-like, optional - Negative sample values. + Negative sample values. If both pos and neg are + provided, y_true and y_pred are ignored. max_value: float, optional Maximum possible d-prime value. Default is ``np.inf``. @@ -138,7 +139,7 @@ def dprime_from_confusion_matrix(M, max_value=np.inf, min_value=-np.inf, **kwarg """ # M: confusion matrix, row means true classes, col means predicted classes - P, N, TP, _, FP, _ = confusion_stats(M, **kwargs) + P, N, TP, _, FP, _ = confusion_matrix_stats(M, **kwargs) TPR = TP / P FPR = FP / N @@ -146,29 +147,3 @@ def dprime_from_confusion_matrix(M, max_value=np.inf, min_value=-np.inf, **kwarg return dp - - """Computes the population d-primes from the given set of confusion matrices. - Note: it is advised to read the documentation of ``dprime_from_confusion()`` - for understanding of ``kwargs``. - - Parameters - ---------- - M: array-like, shape = [n_individuals, n_classes (true), n_classes (pred)] - Set of confusion matrices, where the element M_{irc} means the number of - times when the i-th individual guesses that a test sample in the r-th class - belongs to the c-th class. - - kwargs: named arguments, optional - Passed to ``dprime_from_confusion()``. - - Returns - ------- - dp: array, shape = [n_groupings] - Array of population d-primes, where each element corresponds to each - grouping defined by `collation` (see ``confusion_stats()`` for details). 
- - References - ---------- - http://en.wikipedia.org/wiki/D' - http://en.wikipedia.org/wiki/Confusion_matrix - """ From de48e4696ed7fab11003c1eee8a04755a5ecf6a1 Mon Sep 17 00:00:00 2001 From: Ha Hong Date: Tue, 24 Jul 2012 11:36:40 -0400 Subject: [PATCH 21/33] MISC: small errors and cosmetic changes --- bangmetric/dprime.py | 139 +++++++++++++++++++++++++++---------------- bangmetric/utils.py | 63 ++++++++++---------- 2 files changed, 120 insertions(+), 82 deletions(-) diff --git a/bangmetric/dprime.py b/bangmetric/dprime.py index 73757ec..b85b075 100644 --- a/bangmetric/dprime.py +++ b/bangmetric/dprime.py @@ -10,35 +10,53 @@ import numpy as np from scipy.stats import norm -from .utils import confusion_stats +from .utils import confusion_matrix_stats +DEFAULT_DPRIME_MODE = 'binary' -def dprime(y_pred=None, y_true=None, pos=None, neg=None, max_value=np.inf, min_value=-np.inf): - """Computes the d-prime sensitivity index. - One must provide either y_pred and y_true or pos and neg. - This function computes the d-prime of predictions given by - y_pred and y_true by default. If pos and neg are provided, - both y_pred and y_true are ignored, and this function - computes the d-prime from positive and negative samples - given by pos and neg. + +def dprime(A, B=None, mode=DEFAULT_DPRIME_MODE, max_value=np.inf,\ + min_value=-np.inf): + """Computes the d-prime sensitivity index of predictions + from various data formats. Depending on the choice of + `mode`, this function can take one of the following format: + + * Binary classification outputs (`mode='binary'`; default) + * Positive and negative samples (`mode='sample'`) + * True positive and false positive rate (`mode='rate'`) Parameters ---------- - y_true: array, shape = [n_samples], optional - True values, interpreted as strictly positive or not - (i.e. converted to binary). - Could be in {-1, +1} or {0, 1} or {False, True}. + A, B: + If `mode` is 'binary': + + A: array, shape = [n_samples], + True values, interpreted as strictly positive or not + (i.e. converted to binary). + Could be in {-1, +1} or {0, 1} or {False, True}. + + B: array, shape = [n_samples], + Predicted values (real). + + If `mode` is 'sample': - y_pred: array, shape = [n_samples], optional - Predicted values (real). + A: array-like, + Positive sample values (e.g., raw projection values + of the positive classifier). - pos: array-like, optional - Positive sample values (e.g., raw projection values of - the positive classifier). + B: array-like, + Negative sample values. - neg: array-like, optional - Negative sample values. If both pos and neg are - provided, y_true and y_pred are ignored. + If `mode` is 'rate': + + A: array-like, shape = [n_groupings] + True positive rates + + B: array-like, shape = [n_groupings] + False positive rates + + mode: {'binary', 'sample', 'rate'}, optional + Directs the interpretation of A and B max_value: float, optional Maximum possible d-prime value. Default is ``np.inf``. 
@@ -57,11 +75,12 @@ def dprime(y_pred=None, y_true=None, pos=None, neg=None, max_value=np.inf, min_v """ # -- basic checks and conversion - if pos is not None and neg is not None: - pos = np.array(pos) - neg = np.array(neg) + if mode == 'sample': + pos, neg = np.array(A), np.array(B) + + elif mode == 'binary': + y_true, y_pred = A, B - else: assert len(y_true) == len(y_pred) assert np.isfinite(y_true).all() @@ -71,48 +90,64 @@ def dprime(y_pred=None, y_true=None, pos=None, neg=None, max_value=np.inf, min_v y_pred = np.array(y_pred) assert y_pred.ndim == 1 - # -- actual computation i_pos = y_true > 0 i_neg = ~i_pos pos = y_pred[i_pos] neg = y_pred[i_neg] - assert np.isfinite(pos).all() - assert np.isfinite(neg).all() + elif mode == 'rate': + TPR, FPR = np.array(A), np.array(B) + assert TPR.shape == FPR.shape - if pos.size <= 1: - raise ValueError('Not enough positive samples to estimate the variance') - if neg.size <= 1: - raise ValueError('Not enough negative samples to estimate the variance') + else: + raise ValueError('Invalid mode') # -- compute d' - pos_mean = pos.mean() - neg_mean = neg.mean() - pos_var = pos.var(ddof=1) - neg_var = neg.var(ddof=1) + if mode == 'sample' or mode == 'binary': + assert np.isfinite(pos).all() + assert np.isfinite(neg).all() + + if pos.size <= 1: + raise ValueError('Not enough positive samples'\ + 'to estimate the variance') + if neg.size <= 1: + raise ValueError('Not enough negative samples'\ + 'to estimate the variance') + + pos_mean = pos.mean() + neg_mean = neg.mean() + pos_var = pos.var(ddof=1) + neg_var = neg.var(ddof=1) - num = pos_mean - neg_mean - div = np.sqrt((pos_var + neg_var) / 2.) + num = pos_mean - neg_mean + div = np.sqrt((pos_var + neg_var) / 2.) + + dp = num / div + + else: # mode == 'rate' + dp = norm.ppf(TPR) - norm.ppf(FPR) # from Dan's suggestion about clipping d' values... - dp = np.clip(num / div, min_value, max_value) + dp = np.clip(dp, min_value, max_value) return dp -def dprime_from_confusion_matrix(M, max_value=np.inf, min_value=-np.inf, **kwargs): - """Computes the d-prime sensitivity indices of the given confusion matrix. - This function is designed mostly for when there is no access to internal - representations and/or decision making mechanisms (like human data). - If no ``collation`` is defined in ``kwargs`` this function computes +def dprime_from_confusion_matrix(M, max_value=np.inf, \ + min_value=-np.inf, **kwargs): + """Computes the d-prime sensitivity indices of predictions from + the given confusion matrix. This function is designed mostly for + when there is no access to internal representations and/or + decision making mechanisms (like human data). If no ``collation`` + is defined in ``kwargs`` this function computes one vs. rest d-prime for each class. Parameters ---------- - M: array-like, shape = [n_classes (true), n_classes (pred)] + M: array-like, shape = [n_classes (true), n_classes (pred)] Confusion matrix, where the element M_{rc} means the number of - times when the classifier/subject guesses that a test sample in + times when the classifier/subject guesses that a test sample in the r-th class belongs to the c-th class. max_value: float, optional @@ -122,9 +157,10 @@ def dprime_from_confusion_matrix(M, max_value=np.inf, min_value=-np.inf, **kwarg Minimum possible d-prime value. Default is ``-np.inf``. kwargs: named arguments, optional - Passed to ``confusion_stats()``. By passing ``collation``, ``fudge_mode``, - ``fudge_factor``, etc. 
one can change the behavior of d-prime computation - (see ``confusion_stats()`` for details). + Passed to ``confusion_stats()``. By assigning ``collation``, + ``fudge_mode``, ``fudge_factor``, etc. one can change the + behavior of d-prime computation + (see ``confusion_stats()`` for details). Returns ------- @@ -143,7 +179,6 @@ def dprime_from_confusion_matrix(M, max_value=np.inf, min_value=-np.inf, **kwarg TPR = TP / P FPR = FP / N - dp = np.clip(norm.ppf(TPR) - norm.ppf(FPR), min_value, max_value) - - return dp + return dprime(TPR, FPR, mode='rate', \ + max_value=max_value, min_value=min_value) diff --git a/bangmetric/utils.py b/bangmetric/utils.py index adfbdb9..979de10 100644 --- a/bangmetric/utils.py +++ b/bangmetric/utils.py @@ -14,23 +14,23 @@ def confusion_matrix_stats(M, collation=None, \ fudge_mode=DEFAULT_FUDGE_MODE, fudge_factor=DEFAULT_FUDGE_FACTOR): - """Computes classification statistics of sub-confusion matrices inside + """Computes classification statistics of sub-confusion matrices inside the given original confusion matrix M. If no ``collation`` is given, statistics for each one vs. rest sub-confusion matrix will be computed. Parameters ---------- - M: array-like, shape = [n_classes (true), n_classes (pred)] + M: array-like, shape = [n_classes (true), n_classes (pred)] Confusion matrix, where the element M_{rc} means the number of times when the classifier guesses that a test sample in the r-th class belongs to the c-th class. - collation: None or array-like with shape = [n_groupings, n_classes], optional - Defines how to group entries in `M` to make sub-confusion matrices. + collation: None or array-like of shape = [n_groupings, n_classes], optional + Defines how to group entries in `M` to make sub-confusion matrices. Entries shoule be {+1, 0, -1}. A row defines one instance of grouping, where +1, -1, and 0 designate the corresponding class as a - positive, negative, and ignored class, respectively. For example, - the following `collation` defines a 3-way one vs. rest grouping + positive, negative, and ignored class, respectively. For example, + the following `collation` defines a 3-way one vs. rest grouping (given that `M` is a 3x3 matrix): [[+1, -1, -1], [-1, +1, -1], @@ -47,23 +47,21 @@ def confusion_matrix_stats(M, collation=None, \ 'always': always apply the fudge factor 'none': no fudging --- equivalent to ``fudge_factor=0`` - Returns ------- P: array, shape = [n_groupings] - Array of the number of positives, where each element corresponds to each - grouping (row) defined by `collation`. + Array of the number of positives, where each element corresponds to + each grouping (row) defined by `collation`. N: array, shape = [n_groupings] Same as P, except that this is an array of the number of negatives. TP: array, shape = [n_groupings] - Same as P, except that this is an array of the number of true positives. + Same as P, except an array of the number of true positives. TN: array, shape = [n_groupings] - Same as P, except that this is an array of the number of true negatives. + Same as P, except an array of the number of true negatives. FP: array, shape = [n_groupings] - Same as P, except that this is an array of the number of false positives. + Same as P, except an array of the number of false positives. FN: array, shape = [n_groupings] - Same as P, except that this is an array of the number of false negatives. - + Same as P, except an array of the number of false negatives. 
References ---------- @@ -77,7 +75,7 @@ def confusion_matrix_stats(M, collation=None, \ assert M.shape[0] == M.shape[1] n_classes = M.shape[0] - if collation is None: + if collation is None: # make it one vs. rest. E.g., for a 3-classes case: # [[+1, -1, -1], # [-1, +1, -1], @@ -88,19 +86,25 @@ def confusion_matrix_stats(M, collation=None, \ collation = np.array(collation, dtype='int8') assert collation.ndim == 2 assert collation.shape[1] == n_classes - + # P0: number of positives, for each class # P: number of positives, for each grouping # N: number of negatives, for each grouping # TP: number of true positives, for each grouping # FP: number of false positives, for each grouping - P0 = np.sum(M, axis=1) - P = np.array([np.sum(P0[coll == +1]) for coll in collation], dtype='float64') - N = np.array([np.sum(P0[coll == -1]) for coll in collation], dtype='float64') - TP = np.array([np.sum(M[coll == +1][:, coll == +1]) for coll in collation], dtype='float64') - TN = np.array([np.sum(M[coll == -1][:, coll == -1]) for coll in collation], dtype='float64') - FP = np.array([np.sum(M[coll == -1][:, coll == +1]) for coll in collation], dtype='float64') - FN = np.array([np.sum(M[coll == +1][:, coll == -1]) for coll in collation], dtype='float64') + P0 = np.sum(M, axis=1) + P = np.array([np.sum(P0[coll == +1]) \ + for coll in collation], dtype='float64') + N = np.array([np.sum(P0[coll == -1]) \ + for coll in collation], dtype='float64') + TP = np.array([np.sum(M[coll == +1][:, coll == +1]) \ + for coll in collation], dtype='float64') + TN = np.array([np.sum(M[coll == -1][:, coll == -1]) \ + for coll in collation], dtype='float64') + FP = np.array([np.sum(M[coll == -1][:, coll == +1]) \ + for coll in collation], dtype='float64') + FN = np.array([np.sum(M[coll == +1][:, coll == -1]) \ + for coll in collation], dtype='float64') # -- application of fudge factor if fudge_mode == 'none': # no fudging @@ -111,8 +115,8 @@ def confusion_matrix_stats(M, collation=None, \ FP += fudge_factor TN += fudge_factor FN += fudge_factor - P += 2.*fudge_factor - N += 2.*fudge_factor + P += 2. * fudge_factor + N += 2. 
* fudge_factor elif fudge_mode == 'correction': # apply fudge factor only when needed TP[TP == P] = P[TP == P] - fudge_factor # 100% correct @@ -120,14 +124,13 @@ def confusion_matrix_stats(M, collation=None, \ FP[FP == N] = N[FP == N] - fudge_factor # always FAR FP[FP == 0] = fudge_factor # no false alarm - TN[TN == N] = N[TN == N] - fudge_factor - TN[TN == 0] = fudge_factor - FN[FN == P] = P[FN == P] - fudge_factor - FN[FN == 0] = fudge_factor + TN[TN == N] = N[TN == N] - fudge_factor + TN[TN == 0] = fudge_factor + FN[FN == P] = P[FN == P] - fudge_factor + FN[FN == 0] = fudge_factor else: raise ValueError('Invalid fudge_mode') # -- done return P, N, TP, TN, FP, FN - From cad170b30fd6162acc6e8819a0de3f6e70c44e0f Mon Sep 17 00:00:00 2001 From: Ha Hong Date: Tue, 24 Jul 2012 11:53:52 -0400 Subject: [PATCH 22/33] MISC: merge dprime_from_confusion_matrix and dprime --- bangmetric/dprime.py | 95 +++++++++++++++++--------------------------- 1 file changed, 37 insertions(+), 58 deletions(-) diff --git a/bangmetric/dprime.py b/bangmetric/dprime.py index b85b075..a5b65de 100644 --- a/bangmetric/dprime.py +++ b/bangmetric/dprime.py @@ -6,7 +6,7 @@ # # License: BSD -__all__ = ['dprime', 'dprime_from_confusion_matrix'] +__all__ = ['dprime'] import numpy as np from scipy.stats import norm @@ -16,7 +16,7 @@ def dprime(A, B=None, mode=DEFAULT_DPRIME_MODE, max_value=np.inf,\ - min_value=-np.inf): + min_value=-np.inf, **kwargs): """Computes the d-prime sensitivity index of predictions from various data formats. Depending on the choice of `mode`, this function can take one of the following format: @@ -24,11 +24,12 @@ def dprime(A, B=None, mode=DEFAULT_DPRIME_MODE, max_value=np.inf,\ * Binary classification outputs (`mode='binary'`; default) * Positive and negative samples (`mode='sample'`) * True positive and false positive rate (`mode='rate'`) + * Confusion matrix (`mode='confusionmat'`) Parameters ---------- A, B: - If `mode` is 'binary': + If `mode` is 'binary' (default): A: array, shape = [n_samples], True values, interpreted as strictly positive or not @@ -55,8 +56,18 @@ def dprime(A, B=None, mode=DEFAULT_DPRIME_MODE, max_value=np.inf,\ B: array-like, shape = [n_groupings] False positive rates + if `mode` is 'confusionmat': + + A: array-like, shape = [n_classes (true), n_classes (pred)] + Confusion matrix, where the element M_{rc} means + the number of times when the classifier or subject + guesses that a test sample in the r-th class + belongs to the c-th class. + + B: ignored + mode: {'binary', 'sample', 'rate'}, optional - Directs the interpretation of A and B + Directs the interpretation of A and B. Default is 'binary'. max_value: float, optional Maximum possible d-prime value. Default is ``np.inf``. @@ -64,14 +75,24 @@ def dprime(A, B=None, mode=DEFAULT_DPRIME_MODE, max_value=np.inf,\ min_value: float, optional Minimum possible d-prime value. Default is ``-np.inf``. + kwargs: named arguments, optional + Passed to ``confusion_stats()`` and used only when `mode` + is 'confusionmat'. By assigning ``collation``, + ``fudge_mode``, ``fudge_factor``, etc. one can + change the behavior of d-prime computation + (see ``confusion_stats()`` for details). 
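+
+    A sketch of the four call patterns (argument names follow the
+    descriptions above; the data contents are hypothetical):
+
+        dprime(y_true, y_pred)            # mode='binary' (default)
+        dprime(pos, neg, mode='sample')
+        dprime(TPR, FPR, mode='rate')
+        dprime(M, mode='confusionmat')    # one vs. rest by default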
+ Returns ------- - dp: float - d-prime + dp: float or array of shape = [n_groupings] + A d-prime value or array of d-primes, where each element + corresponds to each grouping of positives and negatives + (when `mode` is 'rate' or 'confusionmat') References ---------- http://en.wikipedia.org/wiki/D' + http://en.wikipedia.org/wiki/Confusion_matrix """ # -- basic checks and conversion @@ -100,11 +121,19 @@ def dprime(A, B=None, mode=DEFAULT_DPRIME_MODE, max_value=np.inf,\ TPR, FPR = np.array(A), np.array(B) assert TPR.shape == FPR.shape + elif mode == 'confusionmat': + # A: confusion mat + # row means true classes, col means predicted classes + P, N, TP, _, FP, _ = confusion_matrix_stats(A, **kwargs) + + TPR = TP / P + FPR = FP / N + else: raise ValueError('Invalid mode') # -- compute d' - if mode == 'sample' or mode == 'binary': + if mode in ['sample', 'binary']: assert np.isfinite(pos).all() assert np.isfinite(neg).all() @@ -125,60 +154,10 @@ def dprime(A, B=None, mode=DEFAULT_DPRIME_MODE, max_value=np.inf,\ dp = num / div - else: # mode == 'rate' + else: # mode is rate or confusionmat dp = norm.ppf(TPR) - norm.ppf(FPR) # from Dan's suggestion about clipping d' values... dp = np.clip(dp, min_value, max_value) return dp - - -def dprime_from_confusion_matrix(M, max_value=np.inf, \ - min_value=-np.inf, **kwargs): - """Computes the d-prime sensitivity indices of predictions from - the given confusion matrix. This function is designed mostly for - when there is no access to internal representations and/or - decision making mechanisms (like human data). If no ``collation`` - is defined in ``kwargs`` this function computes - one vs. rest d-prime for each class. - - Parameters - ---------- - M: array-like, shape = [n_classes (true), n_classes (pred)] - Confusion matrix, where the element M_{rc} means the number of - times when the classifier/subject guesses that a test sample in - the r-th class belongs to the c-th class. - - max_value: float, optional - Maximum possible d-prime value. Default is ``np.inf``. - - min_value: float, optional - Minimum possible d-prime value. Default is ``-np.inf``. - - kwargs: named arguments, optional - Passed to ``confusion_stats()``. By assigning ``collation``, - ``fudge_mode``, ``fudge_factor``, etc. one can change the - behavior of d-prime computation - (see ``confusion_stats()`` for details). - - Returns - ------- - dp: array, shape = [n_groupings] - Array of d-primes, where each element corresponds to each grouping - defined by `collation` (see ``confusion_stats()`` for details). - - References - ---------- - http://en.wikipedia.org/wiki/D' - http://en.wikipedia.org/wiki/Confusion_matrix - """ - - # M: confusion matrix, row means true classes, col means predicted classes - P, N, TP, _, FP, _ = confusion_matrix_stats(M, **kwargs) - - TPR = TP / P - FPR = FP / N - - return dprime(TPR, FPR, mode='rate', \ - max_value=max_value, min_value=min_value) From f0a4f1b0be473aaa6cb0fcf7f8d25ab57041bddb Mon Sep 17 00:00:00 2001 From: Ha Hong Date: Tue, 24 Jul 2012 11:55:41 -0400 Subject: [PATCH 23/33] DOC: small changes --- bangmetric/accuracy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bangmetric/accuracy.py b/bangmetric/accuracy.py index f34b06d..6a119a3 100644 --- a/bangmetric/accuracy.py +++ b/bangmetric/accuracy.py @@ -11,7 +11,7 @@ def accuracy(y_true, y_pred, balanced=False): - """Computes the Accuracy of the predictions (also known as the + """Computes the accuracy of the predictions (also known as the zero-one score). 
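+
+    With ``balanced=True`` the score is the mean of the accuracies over
+    the positive and the negative samples, e.g. (hypothetical values):
+
+        accuracy([1, 1, 1, 0], [1, 1, 0, 0], balanced=True)  # (2/3 + 1)/2
+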
Parameters From 5b07e4ca6fa310fa0a4fa561f0425c8383f67d96 Mon Sep 17 00:00:00 2001 From: Ha Hong Date: Tue, 24 Jul 2012 13:12:26 -0400 Subject: [PATCH 24/33] COSMIT --- bangmetric/utils.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/bangmetric/utils.py b/bangmetric/utils.py index 979de10..1c30c46 100644 --- a/bangmetric/utils.py +++ b/bangmetric/utils.py @@ -10,6 +10,7 @@ DEFAULT_FUDGE_FACTOR = 0.5 DEFAULT_FUDGE_MODE = 'correction' +DTYPE = np.float64 def confusion_matrix_stats(M, collation=None, \ @@ -25,7 +26,8 @@ def confusion_matrix_stats(M, collation=None, \ times when the classifier guesses that a test sample in the r-th class belongs to the c-th class. - collation: None or array-like of shape = [n_groupings, n_classes], optional + collation: None or array-like of shape = [n_groupings, + n_classes], optional (default=None) Defines how to group entries in `M` to make sub-confusion matrices. Entries shoule be {+1, 0, -1}. A row defines one instance of grouping, where +1, -1, and 0 designate the corresponding class as a @@ -37,13 +39,13 @@ def confusion_matrix_stats(M, collation=None, \ [-1, -1, +1]] If `None` (default), one vs. rest grouping is assumed. - fudge_factor: float, optional + fudge_factor: float, optional (default=0.5) A small factor to avoid TPR, FPR, TNR, or FNR becoming 0 or 1. - Mostly intended for d-prime calculation. Default is 0.5. + Mostly intended for d-prime calculation. - fudge_mode: str, optional + fudge_mode: str, optional (default='correction') Determins how to apply the fudge factor. Can be one of: - 'correction': apply only when needed (default) + 'correction': apply only when needed 'always': always apply the fudge factor 'none': no fudging --- equivalent to ``fudge_factor=0`` @@ -94,17 +96,17 @@ def confusion_matrix_stats(M, collation=None, \ # FP: number of false positives, for each grouping P0 = np.sum(M, axis=1) P = np.array([np.sum(P0[coll == +1]) \ - for coll in collation], dtype='float64') + for coll in collation], dtype=DTYPE) N = np.array([np.sum(P0[coll == -1]) \ - for coll in collation], dtype='float64') + for coll in collation], dtype=DTYPE) TP = np.array([np.sum(M[coll == +1][:, coll == +1]) \ - for coll in collation], dtype='float64') + for coll in collation], dtype=DTYPE) TN = np.array([np.sum(M[coll == -1][:, coll == -1]) \ - for coll in collation], dtype='float64') + for coll in collation], dtype=DTYPE) FP = np.array([np.sum(M[coll == -1][:, coll == +1]) \ - for coll in collation], dtype='float64') + for coll in collation], dtype=DTYPE) FN = np.array([np.sum(M[coll == +1][:, coll == -1]) \ - for coll in collation], dtype='float64') + for coll in collation], dtype=DTYPE) # -- application of fudge factor if fudge_mode == 'none': # no fudging From 0748c3a5bd84fb67b73cd5b5b2d501172284953e Mon Sep 17 00:00:00 2001 From: Ha Hong Date: Tue, 24 Jul 2012 14:06:17 -0400 Subject: [PATCH 25/33] ENH: added metrics for human data --- bangmetric/__init__.py | 2 + bangmetric/human_metric.py | 86 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+) create mode 100644 bangmetric/human_metric.py diff --git a/bangmetric/__init__.py b/bangmetric/__init__.py index e4b0765..26ae973 100644 --- a/bangmetric/__init__.py +++ b/bangmetric/__init__.py @@ -5,3 +5,5 @@ from rmse import * # pyflakes.ignore from kernel_analysis import * # pyflakes.ignore from nk import * # pyflakes.ignore +from utils import * # pyflakes.ignore +from human_metric import * # pyflakes.ignore diff --git 
a/bangmetric/human_metric.py b/bangmetric/human_metric.py new file mode 100644 index 0000000..906a28a --- /dev/null +++ b/bangmetric/human_metric.py @@ -0,0 +1,86 @@ +"""Metrics designed to compute the similarity to human data""" + +# Authors: Ha Hong +# +# License: BSD + +__all__ = ['central_ratio', 'consistency'] + +import numpy as np +from .correlation import spearman + +DTYPE = np.float64 + + +def central_ratio(num, dnm, center=np.median, finite=True): + """Computes the central tendency (median, by default) of the ratios + between `num` and `dnm`. By default, this function gives the + "Turing ratio" used in the paper by Majaj, Hong, Solomon, and DiCarlo. + + Parameters + ---------- + num: array-like + Numerators of ratios + + dnm: array-lie, shape = `num.shape()` + Denominators of ratios. `num` and `dnm` must have the same shape. + + center: function, optional (default=np.median) + Function to compute the central tendency. + + finite: boolean, optional (default=True) + If True, only finite numbers in `num` and `dnm` will be used for + the computation of the central tendency. + """ + + num = np.array(num, dtype=DTYPE) + dnm = np.array(dnm, dtype=DTYPE) + assert num.shape == dnm.shape + + num = num.ravel() + dnm = dnm.ravel() + + if finite: + fi = np.isfinite(dnm) & np.isfinite(num) + num = num[fi] + dnm = dnm[fi] + + return center(num / dnm) + + +def consistency(A, B, corrcoef=spearman, finite=True): + """Computes the consistency (Spearman rank correlation coefficient, + by default) between two sets of data points (e.g., d' scores) `A` + and `B`. By default, this function gives the "consistency" + used in the paper by Majaj, Hong, Solomon, and DiCarlo. + + Parameters + ---------- + A: array-like + A set of data points + + B: array-lie, shape = `A.shape()` + Another set of data points to compare with `A`. + `A` and `B` must have the same shape. + + corrcoef: function, optional (default=bangmetric.spearman) + Function to compute the "consistency." + + finite: boolean, optional (default=True) + If True, only finite numbers in `A` and `B` will be used for + the computation of the consistency. + """ + + A = np.array(A, dtype=DTYPE) + B = np.array(B, dtype=DTYPE) + assert A.shape == B.shape + + A = A.ravel() + B = B.ravel() + + if finite: + fi = np.isfinite(B) & np.isfinite(A) + A = A[fi] + B = B[fi] + + return corrcoef(A, B) From 95ed1fc0b3972159e69e01c6537f76ddb9a671b2 Mon Sep 17 00:00:00 2001 From: Ha Hong Date: Tue, 24 Jul 2012 14:27:29 -0400 Subject: [PATCH 26/33] COSMIT --- bangmetric/human_metric.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/bangmetric/human_metric.py b/bangmetric/human_metric.py index 906a28a..866c1bb 100644 --- a/bangmetric/human_metric.py +++ b/bangmetric/human_metric.py @@ -12,7 +12,7 @@ DTYPE = np.float64 -def central_ratio(num, dnm, center=np.median, finite=True): +def central_ratio(num, dnm, centerfn=np.median, finite=True): """Computes the central tendency (median, by default) of the ratios between `num` and `dnm`. By default, this function gives the "Turing ratio" used in the paper by Majaj, Hong, Solomon, and DiCarlo. @@ -25,7 +25,7 @@ def central_ratio(num, dnm, center=np.median, finite=True): dnm: array-lie, shape = `num.shape()` Denominators of ratios. `num` and `dnm` must have the same shape. - center: function, optional (default=np.median) + centerfn: function, optional (default=np.median) Function to compute the central tendency. 
finite: boolean, optional (default=True) @@ -45,10 +45,10 @@ def central_ratio(num, dnm, center=np.median, finite=True): num = num[fi] dnm = dnm[fi] - return center(num / dnm) + return centerfn(num / dnm) -def consistency(A, B, corrcoef=spearman, finite=True): +def consistency(A, B, consistencyfn=spearman, finite=True): """Computes the consistency (Spearman rank correlation coefficient, by default) between two sets of data points (e.g., d' scores) `A` and `B`. By default, this function gives the "consistency" @@ -63,7 +63,7 @@ def consistency(A, B, corrcoef=spearman, finite=True): Another set of data points to compare with `A`. `A` and `B` must have the same shape. - corrcoef: function, optional (default=bangmetric.spearman) + consistencyfn: function, optional (default=bangmetric.spearman) Function to compute the "consistency." finite: boolean, optional (default=True) @@ -83,4 +83,4 @@ def consistency(A, B, corrcoef=spearman, finite=True): A = A[fi] B = B[fi] - return corrcoef(A, B) + return consistencyfn(A, B) From ab6df56decdf2059a97e419f938dc8210285e505 Mon Sep 17 00:00:00 2001 From: Ha Hong Date: Tue, 24 Jul 2012 15:14:01 -0400 Subject: [PATCH 27/33] ENH: added confusion matrix support to accuracy() --- bangmetric/accuracy.py | 99 ++++++++++++++++++++++++++++++++---------- 1 file changed, 75 insertions(+), 24 deletions(-) diff --git a/bangmetric/accuracy.py b/bangmetric/accuracy.py index 6a119a3..6d67d12 100644 --- a/bangmetric/accuracy.py +++ b/bangmetric/accuracy.py @@ -8,49 +8,100 @@ __all__ = ['accuracy'] import numpy as np +from .utils import confusion_matrix_stats +DEFAULT_ACCURACY_MODE = 'binary' -def accuracy(y_true, y_pred, balanced=False): + +def accuracy(A, B=None, mode=DEFAULT_ACCURACY_MODE, \ + balanced=False, collation=None): """Computes the accuracy of the predictions (also known as the - zero-one score). + zero-one score). Depending on the choice of `mode`, this + function can take one of the following data format: + + * Binary classification outputs (`mode='binary'`; default) + * Confusion matrix (`mode='confusionmat'`) Parameters ---------- - y_true: array, shape = [n_samples] - True values, interpreted as strictly positive or not - (i.e. converted to binary). + A, B: + If `mode` is 'binary' (default): + + A: array, shape = [n_samples] + True values, interpreted as strictly positive or not + (i.e. converted to binary). + + B: array, shape = [n_samples] + Predicted values, interpreted as strictly positive or not + (i.e. converted to binary). - y_pred: array, shape = [n_samples] - Predicted values, interpreted as strictly positive or not - (i.e. converted to binary). + if `mode` is 'confusionmat': + + A: array-like, shape = [n_classes (true), n_classes (pred)] + Confusion matrix, where the element M_{rc} means + the number of times when the classifier or subject + guesses that a test sample in the r-th class + belongs to the c-th class. + + B: ignored balanced: bool, optional (default=False) Returns the balanced accuracy (equal weight for positive and negative values). + collation: None or array-like of shape = [n_groupings, + n_classes], optional (default=None) + Defines how to group entries in `M` to make sub-confusion matrices + when `mode` is 'confusionmat'. See `confusion_matrix_stats()` + for details. + Returns ------- - acc: float - Accuracy (zero-one score). 
+ acc: float or array of shape = [n_groupings] + An accuracy score (zero-one score) or array of accuracies, + where each element corresponds to each grouping of + positives and negatives (when `mode` is 'confusionmat'). + + References + ---------- + http://en.wikipedia.org/wiki/Accuracy """ - assert len(y_true) == len(y_pred) - assert np.isfinite(y_true).all() - assert np.isfinite(y_pred).all() - # -- "binarize" the arguments - y_true = np.array(y_true) > 0 - assert y_true.ndim == 1 + if mode == 'binary': + y_true, y_pred = A, B + assert len(y_true) == len(y_pred) + assert np.isfinite(y_true).all() + assert np.isfinite(y_pred).all() + + # -- "binarize" the arguments + y_true = np.array(y_true) > 0 + assert y_true.ndim == 1 + + y_pred = np.array(y_pred) > 0 + assert y_pred.ndim == 1 + + i_pos = y_true > 0 + i_neg = ~i_pos - y_pred = np.array(y_pred) > 0 - assert y_pred.ndim == 1 + P = float(i_pos.sum()) + N = float(i_neg.sum()) + TP = float((y_true[i_pos] == y_pred[i_pos]).sum()) + TN = float((y_true[i_neg] == y_pred[i_neg]).sum()) + + elif mode == 'confusionmat': + # A: confusion mat + # row means true classes, col means predicted classes + P, N, TP, TN, _, _ = confusion_matrix_stats(A, \ + collation=collation, fudge_mode='none') + + else: + raise ValueError('Invalid mode') if balanced: - pos = y_true > 0 - neg = ~pos - pos_acc = (y_true[pos] == y_pred[pos]).mean() - neg_acc = (y_true[neg] == y_pred[neg]).mean() - acc = (pos_acc + neg_acc) / 2. + sensitivity = TP / P + specificity = TN / N + acc = (sensitivity + specificity) / 2. else: - acc = (y_true == y_pred).mean() + acc = (TP + TN) / (P + N) return acc From b1dedff7dd75dde00375f50efc88a69d5d4f1256 Mon Sep 17 00:00:00 2001 From: Ha Hong Date: Tue, 24 Jul 2012 15:15:42 -0400 Subject: [PATCH 28/33] DOC: misc changes --- bangmetric/dprime.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bangmetric/dprime.py b/bangmetric/dprime.py index a5b65de..f2be4fa 100644 --- a/bangmetric/dprime.py +++ b/bangmetric/dprime.py @@ -76,11 +76,11 @@ def dprime(A, B=None, mode=DEFAULT_DPRIME_MODE, max_value=np.inf,\ Minimum possible d-prime value. Default is ``-np.inf``. kwargs: named arguments, optional - Passed to ``confusion_stats()`` and used only when `mode` + Passed to ``confusion_matrix_stats()`` and used only when `mode` is 'confusionmat'. By assigning ``collation``, ``fudge_mode``, ``fudge_factor``, etc. one can change the behavior of d-prime computation - (see ``confusion_stats()`` for details). + (see ``confusion_matrix_stats()`` for details). 
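+
+    For instance, a hypothetical two-group collation over a 4-class
+    confusion matrix M could be requested as:
+
+        collation = [[+1, +1, -1, -1],
+                     [-1, -1, +1, +1]]
+        dprime(M, mode='confusionmat', collation=collation)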
 
     Returns
     -------

From 2e34b76ab6f6abc4e6286c5b4b49398224b07ed8 Mon Sep 17 00:00:00 2001
From: Ha Hong
Date: Tue, 24 Jul 2012 15:38:30 -0400
Subject: [PATCH 29/33] TST: fixed bugs in reference values

---
 bangmetric/tests/test_dprime.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bangmetric/tests/test_dprime.py b/bangmetric/tests/test_dprime.py
index 374db1a..002f512 100644
--- a/bangmetric/tests/test_dprime.py
+++ b/bangmetric/tests/test_dprime.py
@@ -18,7 +18,7 @@ def test_basic():
     y_true = np.array([False, True, True, True, False, False, False, True])
     y_pred = np.array([0.491, -0.1, 0.64, 1.52, -0.23, -0.23, 1.579, 0.76])
     dp = dprime(y_true, y_pred)
-    reference = 0.47387910220727386
+    reference = 0.39541092958803298
     assert abs(dp - reference) < ATOL
 
 
@@ -27,7 +27,7 @@ def test_basic100():
     y_true = rng.binomial(1, 0.5, size=100)
     y_pred = rng.randn(y_true.size)
     dp = dprime(y_true, y_pred)
-    reference = -0.39852816153409176
+    reference = -0.20652941441924857
     assert abs(dp - reference) < ATOL

From f3fd043a6a227292bdc6541339caa2d270b6c718 Mon Sep 17 00:00:00 2001
From: Ha Hong
Date: Tue, 24 Jul 2012 22:12:47 -0400
Subject: [PATCH 30/33] MISC: small changes to clip ppf values in dprime()

---
 bangmetric/dprime.py | 32 +++++++++++++++++++++++---------
 1 file changed, 23 insertions(+), 9 deletions(-)

diff --git a/bangmetric/dprime.py b/bangmetric/dprime.py
index f2be4fa..658381d 100644
--- a/bangmetric/dprime.py
+++ b/bangmetric/dprime.py
@@ -15,8 +15,10 @@
 DEFAULT_DPRIME_MODE = 'binary'
 
 
-def dprime(A, B=None, mode=DEFAULT_DPRIME_MODE, max_value=np.inf,\
-        min_value=-np.inf, **kwargs):
+def dprime(A, B=None, mode=DEFAULT_DPRIME_MODE,\
+        max_value=np.inf, min_value=-np.inf,\
+        max_ppf_value=np.inf, min_ppf_value=-np.inf,\
+        **kwargs):
     """Computes the d-prime sensitivity index of predictions from various
     data formats. Depending on the choice of `mode`, this function
     can take one of the following format:
@@ -66,14 +68,22 @@ def dprime(A, B=None, mode=DEFAULT_DPRIME_MODE, max_value=np.inf,\
 
         B: ignored
 
-    mode: {'binary', 'sample', 'rate'}, optional
-        Directs the interpretation of A and B. Default is 'binary'.
+    mode: {'binary', 'sample', 'rate', 'confusionmat'}, optional (default='binary')
+        Directs the interpretation of A and B.
 
-    max_value: float, optional
-        Maximum possible d-prime value. Default is ``np.inf``.
+    max_value: float, optional (default=np.inf)
+        Maximum possible d-prime value.
 
-    min_value: float, optional
-        Minimum possible d-prime value. Default is ``-np.inf``.
+    min_value: float, optional (default=-np.inf)
+        Minimum possible d-prime value.
+
+    max_ppf_value: float, optional (default=np.inf)
+        Maximum possible ppf value.
+        Used only when mode is 'rate' or 'confusionmat'.
+
+    min_ppf_value: float, optional (default=-np.inf)
+        Minimum possible ppf value.
+        Used only when mode is 'rate' or 'confusionmat'.
 
     kwargs: named arguments, optional
         Passed to ``confusion_matrix_stats()`` and used only when `mode`
@@ -155,7 +165,11 @@ def dprime(A, B=None, mode=DEFAULT_DPRIME_MODE, max_value=np.inf,\
 
         dp = num / div
 
     else: # mode is rate or confusionmat
-        dp = norm.ppf(TPR) - norm.ppf(FPR)
+        ppfTPR = norm.ppf(TPR)
+        ppfFPR = norm.ppf(FPR)
+        ppfTPR = np.clip(ppfTPR, min_ppf_value, max_ppf_value)
+        ppfFPR = np.clip(ppfFPR, min_ppf_value, max_ppf_value)
+        dp = ppfTPR - ppfFPR
 
     # from Dan's suggestion about clipping d' values...
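     # (min/max_ppf_value above bound each z-score before subtraction;
     # the clip below bounds the resulting d' itself)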
dp = np.clip(dp, min_value, max_value) From b58a0fe265fb1c71a42e2dd1b26a494851cc095a Mon Sep 17 00:00:00 2001 From: Ha Hong Date: Wed, 25 Jul 2012 00:45:22 -0400 Subject: [PATCH 31/33] DOC: small typos.. --- bangmetric/human_metric.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bangmetric/human_metric.py b/bangmetric/human_metric.py index 866c1bb..9045226 100644 --- a/bangmetric/human_metric.py +++ b/bangmetric/human_metric.py @@ -22,7 +22,7 @@ def central_ratio(num, dnm, centerfn=np.median, finite=True): num: array-like Numerators of ratios - dnm: array-lie, shape = `num.shape()` + dnm: array-like, shape = `num.shape()` Denominators of ratios. `num` and `dnm` must have the same shape. centerfn: function, optional (default=np.median) @@ -59,7 +59,7 @@ def consistency(A, B, consistencyfn=spearman, finite=True): A: array-like A set of data points - B: array-lie, shape = `A.shape()` + B: array-like, shape = `A.shape()` Another set of data points to compare with `A`. `A` and `B` must have the same shape. From 9f0cbd75131504dd5d3c1a7881a1b3f1c41ab4f6 Mon Sep 17 00:00:00 2001 From: Charles Cadieu Date: Fri, 26 Oct 2012 15:33:20 -0400 Subject: [PATCH 32/33] fixed a bug: np.sort makes a copy while array.sort is inplace --- bangmetric/kernel_analysis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bangmetric/kernel_analysis.py b/bangmetric/kernel_analysis.py index 2094e27..c45ef13 100644 --- a/bangmetric/kernel_analysis.py +++ b/bangmetric/kernel_analysis.py @@ -106,7 +106,7 @@ def kanalysis(X, Y_true, n_components='all', quantiles=DEFAULT_QUANTILES): # Sort them l2_squared_sorted = l2_squared.ravel() - np.sort(l2_squared_sorted) + l2_squared_sorted.sort() # ------------------------------------------------------------------------ # -- Compute Kernel Analysis for each quantile From d575111d2af839baddaf29c9509f3152badbd851 Mon Sep 17 00:00:00 2001 From: Jonas Kubilius Date: Wed, 7 Dec 2016 16:43:17 -0500 Subject: [PATCH 33/33] updated installation --- README.md | 5 + bangmetric/__init__.py | 2 + requirements.txt | 2 - setup.py | 358 ++++++++++++----------------------------- 4 files changed, 113 insertions(+), 254 deletions(-) create mode 100644 README.md delete mode 100644 requirements.txt mode change 100755 => 100644 setup.py diff --git a/README.md b/README.md new file mode 100644 index 0000000..bac209a --- /dev/null +++ b/README.md @@ -0,0 +1,5 @@ +# bangmetric + +# License + +New BSD \ No newline at end of file diff --git a/bangmetric/__init__.py b/bangmetric/__init__.py index 26ae973..2f7a683 100644 --- a/bangmetric/__init__.py +++ b/bangmetric/__init__.py @@ -7,3 +7,5 @@ from nk import * # pyflakes.ignore from utils import * # pyflakes.ignore from human_metric import * # pyflakes.ignore + +__version__ = '0.0.1' \ No newline at end of file diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 7f2ecdb..0000000 --- a/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -numpy>=1.6.1 -scikit-learn>=0.10 diff --git a/setup.py b/setup.py old mode 100755 new mode 100644 index 8802ad3..e989deb --- a/setup.py +++ b/setup.py @@ -1,252 +1,106 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -""" distribute- and pip-enabled setup.py """ - -import logging -import os -import re - -# ----- overrides ----- - -# set these to anything but None to override the automatic defaults -packages = None -package_name = None -package_data = None -scripts = None -requirements_file = None -requirements = None -dependency_links = None -use_numpy = True - 
-# --------------------- - - -# ----- control flags ----- - -# fallback to setuptools if distribute isn't found -setup_tools_fallback = False - -# don't include subdir named 'tests' in package_data -skip_tests = True - -# print some extra debugging info -debug = True - -# ------------------------- - -if debug: - logging.basicConfig(level=logging.DEBUG) -# distribute import and testing -try: - import distribute_setup - distribute_setup.use_setuptools() - logging.debug("distribute_setup.py imported and used") -except ImportError: - # fallback to setuptools? - # distribute_setup.py was not in this directory - if not (setup_tools_fallback): - import setuptools - if not (hasattr(setuptools, '_distribute') and \ - setuptools._distribute): - raise ImportError(\ - "distribute was not found and fallback " \ - "to setuptools was not allowed") - else: - logging.debug("distribute_setup.py not found, \ - defaulted to system distribute") - else: - logging.debug("distribute_setup.py not found, " \ - "defaulting to system setuptools") - -import setuptools - - -def find_scripts(): - return [s for s in setuptools.findall('scripts/') \ - if os.path.splitext(s)[1] != '.pyc'] - - -def package_to_path(package): - """ - Convert a package (as found by setuptools.find_packages) - e.g. "foo.bar" to usable path - e.g. "foo/bar" - - No idea if this works on windows - """ - return package.replace('.', '/') - - -def find_subdirectories(package): - """ - Get the subdirectories within a package - This will include resources (non-submodules) and submodules - """ - try: - subdirectories = os.walk(package_to_path(package)).next()[1] - except StopIteration: - subdirectories = [] - return subdirectories - - -def subdir_findall(dir, subdir): - """ - Find all files in a subdirectory and return paths relative to dir - - This is similar to (and uses) setuptools.findall - However, the paths returned are in the form needed for package_data - """ - strip_n = len(dir.split('/')) - path = '/'.join((dir, subdir)) - return ['/'.join(s.split('/')[strip_n:]) for s in setuptools.findall(path)] - - -def find_package_data(packages): - """ - For a list of packages, find the package_data - - This function scans the subdirectories of a package and considers all - non-submodule subdirectories as resources, including them in - the package_data - - Returns a dictionary suitable for setup(package_data=) - """ - package_data = {} - for package in packages: - package_data[package] = [] - for subdir in find_subdirectories(package): - if '.'.join((package, subdir)) in packages: # skip submodules - logging.debug("skipping submodule %s/%s" % (package, subdir)) - continue - if skip_tests and (subdir == 'tests'): # skip tests - logging.debug("skipping tests %s/%s" % (package, subdir)) - continue - package_data[package] += \ - subdir_findall(package_to_path(package), subdir) - return package_data - - -def parse_requirements(file_name): - """ - from: - http://cburgmer.posterous.com/pip-requirementstxt-and-setuppy - """ - requirements = [] - with open(file_name, 'r') as f: - for line in f: - if re.match(r'(\s*#)|(\s*$)', line): - continue - if re.match(r'\s*-e\s+', line): - requirements.append(re.sub(r'\s*-e\s+.*#egg=(.*)$',\ - r'\1', line).strip()) - elif re.match(r'\s*-f\s+', line): - pass - else: - requirements.append(line.strip()) - return requirements - - -def parse_dependency_links(file_name): - """ - from: - http://cburgmer.posterous.com/pip-requirementstxt-and-setuppy - """ - dependency_links = [] - with open(file_name) as f: - for line in f: - if 
re.match(r'\s*-[ef]\s+', line): - dependency_links.append(re.sub(r'\s*-[ef]\s+',\ - '', line)) - return dependency_links - -# ----------- Override defaults here ---------------- -if packages is None: - packages = setuptools.find_packages() - -if len(packages) == 0: - raise Exception("No valid packages found") - -if package_name is None: - package_name = packages[0] - -if package_data is None: - package_data = find_package_data(packages) - -if scripts is None: - scripts = find_scripts() - -if requirements_file is None: - requirements_file = 'requirements.txt' - -if os.path.exists(requirements_file): - if requirements is None: - requirements = parse_requirements(requirements_file) - if dependency_links is None: - dependency_links = parse_dependency_links(requirements_file) -else: - if requirements is None: - requirements = [] - if dependency_links is None: - dependency_links = [] - -if debug: - logging.debug("Module name: %s" % package_name) - for package in packages: - logging.debug("Package: %s" % package) - logging.debug("\tData: %s" % str(package_data[package])) - logging.debug("Scripts:") - for script in scripts: - logging.debug("\tScript: %s" % script) - logging.debug("Requirements:") - for req in requirements: - logging.debug("\t%s" % req) - logging.debug("Dependency links:") - for dl in dependency_links: - logging.debug("\t%s" % dl) - -from distutils.core import Command -class PyTest(Command): - user_options = [] - def initialize_options(self): - pass - def finalize_options(self): - pass - def run(self): - import sys,subprocess - errno = subprocess.call([sys.executable, 'runtests.py']) - raise SystemExit(errno) - - -if __name__ == '__main__': - - sub_packages = packages - - if use_numpy: - from numpy.distutils.misc_util import Configuration - config = Configuration(package_name, '', None) - - for sub_package in sub_packages: - print 'adding', sub_package - config.add_subpackage(sub_package) - - from numpy.distutils.core import setup - kwargs = config.todict() - kwargs['cmdclass'] = dict(test=PyTest) - setup(**kwargs) - - else: - setuptools.setup( - name=package_name, - version='dev', - packages=packages, - scripts=scripts, - - package_data=package_data, - include_package_data=True, - - install_requires=requirements, - dependency_links=dependency_links, - - cmdclass=dict(test=PyTest), - ) +"""A setuptools based setup module. + +See: +https://packaging.python.org/en/latest/distributing.html +https://github.com/pypa/sampleproject +""" + +# Always prefer setuptools over distutils +from setuptools import setup, find_packages +# To use a consistent encoding +from codecs import open +from os import path + +import bangmetric + + +here = path.abspath(path.dirname(__file__)) + +# Get the long description from the README file +with open(path.join(here, 'README.md'), encoding='utf-8') as f: + long_description = f.read() + +setup( + name='bangmetric', + + # Versions should comply with PEP440. For a discussion on single-sourcing + # the version across setup.py and the project code, see + # https://packaging.python.org/en/latest/single_source_version.html + version=bangmetric.__version__, + + description='', + long_description=long_description, + + # The project's main homepage. + url='https://github.com/dicarlolab/bangmetric', + + # Author details + author='DiCarlo Lab', + + # Choose your license + license='New BSD', + + # See https://pypi.python.org/pypi?%3Aaction=list_classifiers + classifiers=[ + # How mature is this project? 
+    # Common values are
+    #   3 - Alpha
+    #   4 - Beta
+    #   5 - Production/Stable
+
+    # Pick your license as you wish (should match "license" above)
+    'License :: OSI Approved :: BSD License',
+
+    # Specify the Python versions you support here. In particular, ensure
+    # that you indicate whether you support Python 2, Python 3 or both.
+    'Programming Language :: Python :: 2',
+    'Programming Language :: Python :: 2.7',
+    ],
+
+    # What does your project relate to?
+    keywords='machine learning metrics',
+
+    # You can just specify the packages manually here if your project is
+    # simple. Or you can use find_packages().
+    packages=find_packages(exclude=['contrib', 'docs', 'tests']),
+
+    # Alternatively, if you want to distribute just a my_module.py, uncomment
+    # this:
+    # py_modules=["my_module"],
+
+    # List run-time dependencies here. These will be installed by pip when
+    # your project is installed. For an analysis of "install_requires" vs pip's
+    # requirements files see:
+    # https://packaging.python.org/en/latest/requirements.html
+    install_requires=['numpy', 'scipy', 'scikit-learn'],
+
+    # List additional groups of dependencies here (e.g. development
+    # dependencies). You can install these using the following syntax,
+    # for example:
+    # $ pip install -e .[dev,test]
+    # extras_require={
+    #     'dev': ['check-manifest'],
+    #     'test': ['coverage'],
+    # },
+
+    # If there are data files included in your packages that need to be
+    # installed, specify them here. If using Python 2.6 or less, then these
+    # have to be included in MANIFEST.in as well.
+    # package_data={
+    #     'sample': ['package_data.dat'],
+    # },
+
+    # Although 'package_data' is the preferred approach, in some cases you may
+    # need to place data files outside of your packages. See:
+    # http://docs.python.org/3.4/distutils/setupscript.html#installing-additional-files # noqa
+    # In this case, 'data_file' will be installed into '<sys.prefix>/my_data'
+    # data_files=[('my_data', ['data/data_file'])],
+
+    # To provide executable scripts, use entry points in preference to the
+    # "scripts" keyword. Entry points provide cross-platform support and allow
+    # pip to create the appropriate form of executable for the target platform.
+    # entry_points={
+    #     'console_scripts': [
+    #         'sample=sample:main',
+    #     ],
+    # },
+)
\ No newline at end of file
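
With this last patch applied, a quick sanity check of the packaging (a
sketch; assumes an in-place install such as ``pip install -e .`` from the
repository root):

    import bangmetric
    print(bangmetric.__version__)   # '0.0.1', the version set above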