add roc curve
nicholasloveday committed Oct 6, 2023
1 parent 7614b23 commit e3e6d0d
Showing 19 changed files with 1,732 additions and 23 deletions.
3 changes: 3 additions & 0 deletions docs/api.md
@@ -25,11 +25,14 @@
.. autofunction:: scores.probability.crps_cdf_brier_decomposition
.. autofunction:: scores.probability.murphy
.. autofunction:: scores.probability.murphy_thetas
.. autofunction:: scores.probability.roc_curve_data
```

## scores.categorical
```{eval-rst}
.. autofunction:: scores.categorical.firm
.. autofunction:: scores.categorical.probability_of_detection
.. autofunction:: scores.categorical.probability_of_false_detection
```

## scores.stats
6 changes: 3 additions & 3 deletions docs/summary_table_of_scores.md
@@ -1,3 +1,3 @@
| continuous | probability | categorical | statistical tests |
| ---------- | ----------- | ----------- | ----------- |
| MAE, MSE, RMSE, Murphy score | CRPS, Murphy score | FIRM | Diebold Mariano (with the Harvey et al. 1997 and the Hering and Genton 2011 modifications) |
| continuous | probability | categorical | statistical tests |
| ---------- | ----------- | ----------- | ----------- |
| MAE, MSE, RMSE, Murphy score | CRPS, Murphy score, ROC | FIRM, POD, POFD | Diebold Mariano (with the Harvey et al. 1997 and the Hering and Genton 2011 modifications) |
4 changes: 4 additions & 0 deletions src/scores/categorical/__init__.py
@@ -1,4 +1,8 @@
"""
Import the functions from the implementations into the public API
"""
from scores.categorical.binary_impl import (
probability_of_detection,
probability_of_false_detection,
)
from scores.categorical.multicategorical_impl import firm
149 changes: 149 additions & 0 deletions src/scores/categorical/binary_impl.py
@@ -0,0 +1,149 @@
"""
This module contains methods for binary categories
"""
from typing import Optional, Sequence

import numpy as np
import pandas as pd
import xarray as xr

from scores.functions import apply_weights
from scores.processing import check_binary
from scores.utils import gather_dimensions


def probability_of_detection(
fcst: xr.DataArray,
obs: xr.DataArray,
reduce_dims: Optional[Sequence[str]] = None,
preserve_dims: Optional[Sequence[str]] = None,
weights: Optional[xr.DataArray] = None,
check_args: Optional[bool] = True,
) -> xr.DataArray:
"""
Calculates the Probability of Detection (POD), also known as the Hit Rate.
This is the proportion of observed events (obs = 1) that were correctly
forecast as an event (fcst = 1).
Args:
fcst: An array containing binary values in the set {0, 1, np.nan}
obs: An array containing binary values in the set {0, 1, np.nan}
reduce_dims: Optionally specify which dimensions to sum when
calculating the POD. All other dimensions will not be summed. As a
special case, 'all' will allow all dimensions to be summed. Only one
of `reduce_dims` and `preserve_dims` can be supplied. The default behaviour
if neither is supplied is to sum across all dims.
preserve_dims: Optionally specify which dimensions to not sum
when calculating the POD. All other dimensions will be summed.
As a special case, 'all' will preserve all dimensions (none will
be summed). In this case, the result will be in the same
shape/dimensionality as the forecast, and the values will be
the POD score at each point (i.e. single-value comparison
against observed), and the forecast and observed dimensions
must match precisely. Only one of `reduce_dims` and `preserve_dims` can be
supplied. The default behaviour if neither is supplied is to reduce all dims.
weights: Optionally provide an array for weighted averaging (e.g. by area, by latitude,
by population, custom)
check_args: Checks if `fcst` and `obs` data only contain values in the set
{0, 1, np.nan}. You may want to skip this check if you are sure about your
input data and want to improve the performance when working with dask.
Returns:
A DataArray of the Probability of Detection.
Raises:
ValueError: if there are values in `fcst` or `obs` that are not in the
set {0, 1, np.nan} and `check_args` is true.
"""
# fcst & obs must be 0s and 1s
if check_args:
check_binary(fcst, "fcst")
check_binary(obs, "obs")
dims_to_sum = gather_dimensions(fcst.dims, obs.dims, reduce_dims, preserve_dims)

misses = (obs == 1) & (fcst == 0)
hits = (obs == 1) & (fcst == 1)

# preserve NaNs
misses = misses.where((~np.isnan(fcst)) & (~np.isnan(obs)))
hits = hits.where((~np.isnan(fcst)) & (~np.isnan(obs)))

misses = apply_weights(misses, weights)
hits = apply_weights(hits, weights)

misses = misses.sum(dim=dims_to_sum)
hits = hits.sum(dim=dims_to_sum)

pod = hits / (hits + misses)

pod.name = "ctable_probability_of_detection"
return pod


def probability_of_false_detection(
fcst: xr.DataArray,
obs: xr.DataArray,
reduce_dims: Optional[Sequence[str]] = None,
preserve_dims: Optional[Sequence[str]] = None,
weights: Optional[xr.DataArray] = None,
check_args: Optional[bool] = True,
) -> xr.DataArray:
"""
Calculates the Probability of False Detection (POFD), also known as False
Alarm Rate (not to be confused with the False Alarm Ratio). The POFD is
the proportion of observed non-events (obs = 0) that were incorrectly
forecast as event (i.e. fcst = 1).
Args:
fcst: An array containing binary values in the set {0, 1, np.nan}
obs: An array containing binary values in the set {0, 1, np.nan}
reduce_dims: Optionally specify which dimensions to sum when
calculating the POFD. All other dimensions will not be summed. As a
special case, 'all' will allow all dimensions to be summed. Only one
of `reduce_dims` and `preserve_dims` can be supplied. The default behaviour
if neither is supplied is to sum across all dims.
preserve_dims: Optionally specify which dimensions to not sum
when calculating the POFD. All other dimensions will be summed.
As a special case, 'all' will preserve all dimensions (none will
be summed). In this case, the result will be in the same
shape/dimensionality as the forecast, and the values will be
the POFD score at each point (i.e. single-value comparison
against observed), and the forecast and observed dimensions
must match precisely. Only one of `reduce_dims` and `preserve_dims` can be
supplied. The default behaviour if neither is supplied is to reduce all dims.
weights: Optionally provide an array for weighted averaging (e.g. by area, by latitude,
by population, custom)
check_args: Checks if `fcst` and `obs` data only contain values in the set
{0, 1, np.nan}. You may want to skip this check if you are sure about your
input data and want to improve the performance when working with dask.
Returns:
A DataArray of the Probability of False Detection.
Raises:
ValueError: if there are values in `fcst` or `obs` that are not in the
set {0, 1, np.nan} and `check_args` is true.
"""
# fcst & obs must be 0s and 1s
if check_args:
check_binary(fcst, "fcst")
check_binary(obs, "obs")
dims_to_sum = gather_dimensions(fcst.dims, obs.dims, reduce_dims, preserve_dims)

false_alarms = (obs == 0) & (fcst == 1)
correct_negatives = (obs == 0) & (fcst == 0)

# preserve NaNs
false_alarms = false_alarms.where((~np.isnan(fcst)) & (~np.isnan(obs)))
correct_negatives = correct_negatives.where((~np.isnan(fcst)) & (~np.isnan(obs)))

false_alarms = apply_weights(false_alarms, weights)
correct_negatives = apply_weights(correct_negatives, weights)

false_alarms = false_alarms.sum(dim=dims_to_sum)
correct_negatives = correct_negatives.sum(dim=dims_to_sum)

pofd = false_alarms / (false_alarms + correct_negatives)
pofd.name = "ctable_probability_of_false_detection"
return pofd
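
Below is a minimal usage sketch of the two new binary scores (not part of this commit); the toy forecast and observation arrays are hypothetical.

import numpy as np
import xarray as xr

from scores.categorical import probability_of_detection, probability_of_false_detection

# Hypothetical binary forecast and observations; NaN marks a missing forecast.
fcst = xr.DataArray([1.0, 1.0, 0.0, 0.0, np.nan], dims="time")
obs = xr.DataArray([1.0, 0.0, 1.0, 0.0, 1.0], dims="time")

# POD = hits / (hits + misses) = 1 / (1 + 1) = 0.5
pod = probability_of_detection(fcst, obs)

# POFD = false alarms / (false alarms + correct negatives) = 1 / (1 + 1) = 0.5
pofd = probability_of_false_detection(fcst, obs)

# The pair containing NaN is excluded from both counts rather than treated as 0 or 1.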
2 changes: 1 addition & 1 deletion src/scores/categorical/multicategorical_impl.py
@@ -222,4 +222,4 @@ def _single_category_score(
}
)
score = score.transpose(*fcst.dims)
return score
return score
2 changes: 1 addition & 1 deletion src/scores/continuous/__init__.py
@@ -1,5 +1,5 @@
"""
Import the functions from the implementations into the public API
"""
from scores.continuous.standard_impl import mse, rmse, mae
from scores.continuous.murphy_impl import murphy_score, murphy_thetas
from scores.continuous.standard_impl import mae, mse, rmse
3 changes: 2 additions & 1 deletion src/scores/continuous/murphy_impl.py
@@ -5,8 +5,9 @@

import numpy as np
import xarray as xr
from scores.utils import gather_dimensions

from scores.processing import broadcast_and_match_nan
from scores.utils import gather_dimensions

QUANTILE = "quantile"
HUBER = "huber"
1 change: 1 addition & 0 deletions src/scores/probability/__init__.py
@@ -7,3 +7,4 @@
crps_cdf,
crps_cdf_brier_decomposition,
)
from scores.probability.roc_impl import roc_curve_data
124 changes: 124 additions & 0 deletions src/scores/probability/roc_impl.py
@@ -0,0 +1,124 @@
"""
Implementation of Receiver Operating Characteristic (ROC) calculations
"""
from typing import Iterable, Optional, Sequence

import numpy as np
import xarray as xr

from scores.categorical import probability_of_detection, probability_of_false_detection
from scores.processing import binary_discretise
from scores.utils import gather_dimensions


def roc_curve_data(
fcst: xr.DataArray,
obs: xr.DataArray,
thresholds: Iterable[float],
reduce_dims: Optional[Sequence[str]] = None,
preserve_dims: Optional[Sequence[str]] = None,
weights: Optional[xr.DataArray] = None,
check_args: bool = True,
) -> xr.Dataset:
"""
Calculates data required for plotting a Receiver (Relative) Operating Characteristic (ROC)
curve including the AUC. The ROC curve is used as a way to measure the discrimination
ability of a particular forecast.
The AUC is the probability that the forecast probability of a random event is higher
than the forecast probability of a random non-event.
Args:
fcst: An array of probabilistic forecasts for a binary event in the range [0, 1].
obs: An array of binary values where 1 is an event and 0 is a non-event.
thresholds: Monotonically increasing values between 0 and 1, the thresholds at and
above which to convert the probabilistic forecast to a value of 1 (an 'event').
reduce_dims: Optionally specify which dimensions to reduce when
calculating the ROC curve data. All other dimensions will be preserved. As a
special case, 'all' will allow all dimensions to be reduced. Only one
of `reduce_dims` and `preserve_dims` can be supplied. The default behaviour
if neither are supplied is to reduce all dims.
preserve_dims: Optionally specify which dimensions to preserve
when calculating ROC curve data. All other dimensions will be reduced.
As a special case, 'all' will allow all dimensions to be
preserved. In this case, the result will be in the same
shape/dimensionality as the forecast, and the values will be
the ROC curve at each point (i.e. single-value comparison
against observed) for each threshold, and the forecast and observed dimensions
must match precisely. Only one of `reduce_dims` and `preserve_dims` can be
supplied. The default behaviour if neither are supplied is to reduce all dims.
weights: Optionally provide an array for weighted averaging (e.g. by area, by latitude,
by population, custom).
check_args: Checks if `obs` data only contains values in the set
{0, 1, np.nan}. You may want to skip this check if you are sure about your
input data and want to improve the performance when working with dask.
Returns:
An xarray.Dataset with data variables:
- 'POD' (the probability of detection)
- 'POFD' (the probability of false detection)
- 'AUC' (the area under the ROC curve)
`POD` and `POFD` have the preserved dimensions plus a 'threshold' dimension,
while `AUC` has only the preserved dimensions.
Notes:
The probabilistic `fcst` is converted to a deterministic forecast
for each threshold in `thresholds`. If a value in `fcst` is greater
than or equal to the threshold, then it is converted into a
'forecast event' (fcst = 1), and a 'forecast non-event' (fcst = 0)
otherwise. The probability of detection (POD) and probability of false
detection (POFD) are calculated for the converted forecast. From the
POD and POFD data, the area under the ROC curve is calculated.
Ideally concave ROC curves should be generated rather than traditional
ROC curves.
Raises:
ValueError: if `fcst` contains values outside of the range [0, 1]
ValueError: if `obs` contains non-nan values not in the set {0, 1}
ValueError: if 'threshold' is a dimension in `fcst`.
ValueError: if values in `thresholds` are not monotonically increasing or are outside
the range [0, 1]
"""
if fcst.max().item() > 1 or fcst.min().item() < 0:
raise ValueError("`fcst` contains values outside of the range [0, 1]")

if np.max(thresholds) > 1 or np.min(thresholds) < 0:
raise ValueError("`thresholds` contains values outside of the range [0, 1]")

if not np.all(np.array(thresholds)[1:] >= np.array(thresholds)[:-1]):
raise ValueError("`thresholds` is not monotonic increasing between 0 and 1")

# make a discrete forecast for each threshold in thresholds
# discrete_fcst has an extra dimension 'threshold'
discrete_fcst = binary_discretise(fcst, thresholds, ">=")

all_dims = set(fcst.dims).union(set(obs.dims))
final_reduce_dims = gather_dimensions(fcst.dims, obs.dims, reduce_dims, preserve_dims)
final_preserve_dims = all_dims - set(final_reduce_dims)
auc_dims = () if final_preserve_dims is None else tuple(final_preserve_dims)
final_preserve_dims = auc_dims + ("threshold",)

pod = probability_of_detection(
discrete_fcst, obs, preserve_dims=final_preserve_dims, weights=weights, check_args=check_args
)
pofd = probability_of_false_detection(
discrete_fcst, obs, preserve_dims=final_preserve_dims, weights=weights, check_args=check_args
)

# Need to ensure ordering of dims is consistent for xr.apply_ufunc
pod = pod.transpose(*final_preserve_dims)
pofd = pofd.transpose(*final_preserve_dims)

auc = -1 * xr.apply_ufunc(
np.trapz,
pod,
pofd,
input_core_dims=[pod.dims, pofd.dims],
output_core_dims=[auc_dims],
dask="allowed",
)

return xr.Dataset({"POD": pod, "POFD": pofd, "AUC": auc})
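
A minimal sketch of calling roc_curve_data follows (not part of this commit); the uninformative random forecast is purely illustrative, so its AUC should come out near 0.5. Note the -1 factor above: both POD and POFD fall as the threshold rises, so np.trapz integrates over a decreasing x-coordinate and returns a negative area, which the -1 turns back into a positive AUC.

import numpy as np
import xarray as xr

from scores.probability import roc_curve_data

rng = np.random.default_rng(42)
# Hypothetical forecast probabilities and binary observations with no real relationship.
fcst = xr.DataArray(rng.uniform(size=1000), dims="sample")
obs = xr.DataArray(rng.integers(0, 2, size=1000).astype(float), dims="sample")

roc = roc_curve_data(fcst, obs, thresholds=np.linspace(0, 1, 11))
print(roc["POD"].values)   # one POD value per threshold, decreasing from 1 towards 0
print(roc["POFD"].values)  # one POFD value per threshold, decreasing from 1 towards 0
print(roc["AUC"].item())   # close to 0.5 for a forecast with no discrimination ability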