Let users optionally derive bulk statistics of the data points belonging to each feature #293

Merged

Commits (55)
4d0e9d2
added optional function to derive bulk statistics of segmented features
May 23, 2023
796d1e3
some fixes
May 23, 2023
58dca76
fixed the saving of percentiles for each feature in dataframe
May 25, 2023
fc636d6
black formatting
May 25, 2023
a3cd532
black formatting
May 25, 2023
d642596
added documentation of new output columns
Jun 5, 2023
0b28766
added csv
Jun 5, 2023
a585359
added tests, example, and modified documentation
Jun 5, 2023
3b570a4
merged in latest changes from RC_v1.5.0 and solved merge conflict
Jun 5, 2023
a265c16
and black formatting again
Jun 5, 2023
b577f22
modified example to avoid error when grouping pd.DataFrame
Jun 5, 2023
d359c44
addressed Kolyas review points
Jun 8, 2023
5dd3ba2
solve merge conflict and merge in latest changes v1.5.0
Aug 21, 2023
d81a4f2
changed get_statistics according to Wills approach
Aug 21, 2023
18b3cfd
allow to give functions that have array-like output or require input …
Aug 22, 2023
4942d9a
some fixes for implementation in segmentation_timestep
Aug 22, 2023
86dbabf
allow input parameter for all given functions and calculate ncells wi…
Aug 23, 2023
29f6830
fixes test_utils
Aug 23, 2023
3291179
added function for off-line calculation of bulk statistics
Aug 24, 2023
aefb6d6
implement bulk statistics in feature detection as well
Aug 25, 2023
165cc74
add example notebooks
Aug 25, 2023
ced3997
added two more example notebooks
Aug 28, 2023
0abc765
fixed test issues and added a better unit test
Aug 28, 2023
02037ab
black formatting
Aug 28, 2023
6712b35
fixed issue in test_bulk_statistics()
Aug 28, 2023
275ff9b
fixed notebook and test failure
Aug 28, 2023
7ac006f
formatting
Aug 28, 2023
e0cad23
Fix indexing of get_statistics results
w-k-jones Oct 7, 2023
cc8932a
merge in RC_v1.5.x and solve merge conflict in tobac/utils/__init__.py
Oct 9, 2023
21ab402
Update tobac/utils/general.py
JuliaKukulies Oct 11, 2023
708e7f2
Update tobac/utils/general.py
JuliaKukulies Oct 11, 2023
593d6d6
Update tobac/segmentation.py
JuliaKukulies Oct 11, 2023
7bdabe6
Fixed some of Wills suggestions
Oct 11, 2023
d39e1e6
Merge branch 'feature_statistics' of https://github.com/JuliaKukulies…
Oct 11, 2023
3f9d737
replace input parameter name for func_dict with 'statistic'
Oct 11, 2023
ce5c7d0
fixed tests and formatting
Oct 11, 2023
77a6d55
fixed example notebooks
Oct 11, 2023
296dafa
move code for statistics computation to end of feature detection outs…
Oct 11, 2023
e9c4212
avoid code duplicates by using functools.partial
Oct 11, 2023
26a1e02
black formatting
Oct 11, 2023
9c8b01b
added notebooks for RTD page
Oct 11, 2023
be432c4
fixed inpu parameter name change here as well
Oct 11, 2023
0f43cca
create submodule and fixed Union typehints
Oct 12, 2023
bb5d909
black formatting, fixed imports and init files for utils
Oct 12, 2023
9607b4d
addressed Seans comments for documentation
Oct 12, 2023
67b383f
fixed import as functions moved to new submodule
Oct 12, 2023
1c84353
Update tobac/segmentation.py
JuliaKukulies Oct 25, 2023
8c6d670
Update path to zenodo files in notebooks
w-k-jones Oct 25, 2023
838357f
Update tobac/utils/bulk_statistics.py
JuliaKukulies Nov 1, 2023
def442a
Fix initial features being missed and wrong dtype for result datafram…
w-k-jones Nov 2, 2023
8f8dea5
Fix setting with copy warnings
w-k-jones Nov 2, 2023
bccab49
Fix merge conflict
w-k-jones Nov 2, 2023
80ef32c
updated zenodo paths in these new notebooks as well
Nov 7, 2023
09adaf2
fixed bug in transform_featurepoints where feature IDs are converted …
Nov 7, 2023
55f1d66
formatting
Nov 7, 2023
7 changes: 7 additions & 0 deletions doc/segmentation_out_vars_statistics.csv
@@ -0,0 +1,7 @@
Variable Name,Description,Units,Type
feature_mean,Mean of feature data points,same as input field,float
feature_max,Maximum value of feature data points,same as input field,float
feature_min,Minimum value of feature data points,same as input field,float
feature_sum,Sum of feature data points,same as input field,float
major_axis_length,The length of the major axis of the ellipse that has the same normalized second central moments as the feature area,"number of grid cells, multiply by dx to get distance unit",float
feature_percentiles,Percentiles from 0 to 100 (in steps of 1) of the feature data distribution,same as input field,ndarray
9 changes: 9 additions & 0 deletions doc/segmentation_output.rst
@@ -9,3 +9,12 @@ Segmentation also outputs the same `pandas` dataframe as obtained by Feature Detection
:file: ./segmentation_out_vars.csv
:widths: 3, 35, 3, 3
:header-rows: 1

One can optionally get bulk statistics of the data points belonging to each segmented feature (i.e. either the 2D area or the 3D volume assigned to the feature). This is done by setting `statistics=True` when calling :func:`tobac.segmentation.segmentation`, which adds the following columns to the output dataframe:

.. csv-table:: tobac Segmentation Output Variables
:file: ./segmentation_out_vars_statistics.csv
:widths: 3, 35, 3, 3
:header-rows: 1

Note that these statistics refer to the data field used as input for the segmentation. It is possible to run the segmentation with different input data (see :doc:`transform segmentation`) to get statistics of a feature based on different variables (e.g. statistics of cloud top temperatures as well as rain rates for a certain storm object).
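
A minimal usage sketch of this option (a sketch only: the dxy and threshold values are illustrative assumptions, and the synthetic data comes from tobac's testing utilities rather than from this PR):

from tobac.testing import make_simple_sample_data_2D
from tobac.feature_detection import feature_detection_multithreshold
from tobac.segmentation import segmentation

# synthetic single-blob sample field (iris.cube.Cube)
field = make_simple_sample_data_2D()

# detect features, then segment with bulk statistics enabled
features = feature_detection_multithreshold(field, dxy=1000, threshold=8)
seg_mask, features_stats = segmentation(
    features, field, dxy=1000, threshold=8, statistics=True
)

# the output dataframe now carries the statistics columns listed above
print(features_stats[["feature", "feature_mean", "feature_max", "feature_sum"]])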
170 changes: 109 additions & 61 deletions examples/Example_Precip_Tracking/Example_Precip_Tracking.ipynb

Large diffs are not rendered by default.

57 changes: 57 additions & 0 deletions tobac/segmentation.py
@@ -34,6 +34,7 @@

import skimage
import numpy as np
import pandas as pd

from . import utils as tb_utils
from .utils import periodic_boundaries as pbc_utils
@@ -263,6 +264,7 @@ def segmentation_3D(
max_distance=None,
PBC_flag="none",
seed_3D_flag="column",
statistics=False,
):
"""Wrapper for the segmentation()-function."""

@@ -277,6 +279,7 @@
max_distance=max_distance,
PBC_flag=PBC_flag,
seed_3D_flag=seed_3D_flag,
statistics=statistics,
)


@@ -291,6 +294,7 @@ def segmentation_2D(
max_distance=None,
PBC_flag="none",
seed_3D_flag="column",
statistics=False,
):
"""Wrapper for the segmentation()-function."""
return segmentation(
@@ -304,6 +308,7 @@
max_distance=max_distance,
PBC_flag=PBC_flag,
seed_3D_flag=seed_3D_flag,
statistics=statistics,
)


@@ -322,6 +327,7 @@ def segmentation_timestep(
seed_3D_size=5,
segment_number_below_threshold=0,
segment_number_unassigned=0,
statistics=False,
):
"""Perform watershedding for an individual time step of the data. Works
for both 2D and 3D data
@@ -383,6 +389,9 @@
the marker to use to indicate a segmentation point is below the threshold.
segment_number_unassigned: int
the marker to use to indicate a segmentation point is above the threshold but unsegmented.
statistics: boolean, optional
Default is False. If True, bulk statistics for the data points assigned to each feature are saved in the output dataframe.


Returns
-------
@@ -1011,6 +1020,47 @@ def segmentation_timestep(
row["feature"]
]

if statistics:
(
feature_mean,
feature_max,
feature_min,
feature_percentiles,
feature_sum,
feature_axis,
) = tb_utils.general.get_statistics(
row["feature"], segmentation_mask.copy(), field_in.data.copy()
)
# write the statistics to feature output dataframe
features_out.loc[
features_out.feature == row["feature"], "feature_mean"
] = feature_mean
features_out.loc[
features_out.feature == row["feature"], "feature_max"
] = feature_max
features_out.loc[
features_out.feature == row["feature"], "feature_min"
] = feature_min
features_out.loc[
features_out.feature == row["feature"], "feature_sum"
] = feature_sum
features_out.loc[
features_out.feature == row["feature"], "major_axis_length"
] = feature_axis

# save percentiles of data distribution within feature
if index == features_out[features_out.ncells > 0].index[0]:
# here, we need to initialize the column first since .loc indexing does not work with pd.Series
features_out["feature_percentiles"] = np.nan
# store numpy array with percentiles as cell in data frame
df = pd.DataFrame({"feature_percentiles": [feature_percentiles]})
# get row index rather than pd.Dataframe index value since we need to use .iloc indexing
row_idx = np.where(features_out.feature == row["feature"])[0]
features_out.iloc[
row_idx,
features_out.columns.get_loc("feature_percentiles"),
] = df.apply(lambda r: tuple(r), axis=1)

return segmentation_out, features_out


@@ -1093,6 +1143,7 @@ def segmentation(
seed_3D_size=5,
segment_number_below_threshold=0,
segment_number_unassigned=0,
statistics=False,
):
"""Use watershedding to determine region above a threshold
value around initial seeding position for all time steps of
@@ -1114,6 +1165,9 @@
dxy : float
Grid spacing of the input data.

statistics : boolean, optional
Default is False. If True, bulk statistics for the data points assigned to each feature are saved in the output dataframe.

Output:
segmentation_out: iris.cube.Cube
Cloud mask, 0 outside and integer numbers according to track inside the cloud
@@ -1163,6 +1217,8 @@
the marker to use to indicate a segmentation point is below the threshold.
segment_number_unassigned: int
the marker to use to indicate a segmentation point is above the threshold but unsegmented.
statistics: boolean, optional
Default is False. If True, bulk statistics for the data points assigned to each feature are saved in the output dataframe.


Returns
@@ -1222,6 +1278,7 @@
seed_3D_size=seed_3D_size,
segment_number_unassigned=segment_number_unassigned,
segment_number_below_threshold=segment_number_below_threshold,
statistics=statistics,
)
segmentation_out_list.append(segmentation_out_i)
features_out_list.append(features_out_i)
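The percentile block above works around the fact that a whole ndarray cannot be assigned to a dataframe cell through a plain `.loc` call, which is why the column is initialized first and then written positionally. A standalone sketch of an equivalent pattern (using `.at` on an object column; not the PR's exact code, and the values are made up):

import numpy as np
import pandas as pd

features_out = pd.DataFrame({"feature": [1, 2], "ncells": [10, 4]})

# initialize the column first and give it object dtype, so a single cell
# can hold an array instead of a scalar
features_out["feature_percentiles"] = np.nan
features_out["feature_percentiles"] = features_out["feature_percentiles"].astype(object)

# 101 percentile values (increments of 1) of some made-up feature data
percentiles = np.percentile(np.arange(10.0), range(101))

# .at writes one cell, so the ndarray is stored intact rather than broadcast
idx = features_out.index[features_out.feature == 1][0]
features_out.at[idx, "feature_percentiles"] = percentiles
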
13 changes: 12 additions & 1 deletion tobac/tests/test_segmentation.py
Review comment:
We should add a test to check that the values output by statistics are correct

@@ -620,13 +620,24 @@ def test_segmentation_multiple_features():

# perform segmentation
out_seg_mask, out_df = segmentation.segmentation_timestep(
field_in=test_data_iris, features_in=fd_output, dxy=test_dxy, threshold=1.5
field_in=test_data_iris,
features_in=fd_output,
dxy=test_dxy,
threshold=1.5,
statistics=True,
)
out_seg_mask_arr = out_seg_mask.core_data()

# assure that the number of grid cells belonging to each feature (ncells) are consistent with segmentation mask
assert int(out_df[out_df.feature == 1].ncells.values) == size_feature1
assert int(out_df[out_df.feature == 2].ncells.values) == size_feature2
# assure that bulk statistic columns are created in output
assert out_df.columns.size - fd_output.columns.size > 1
# assure that statistics are calculated everywhere where an area for ncells is found
assert (
out_df.ncells[out_df["ncells"] > 0].shape
== out_df.ncells[out_df["feature_mean"] > 0].shape
)


# TODO: add more tests to make sure buddy box code is run.
57 changes: 57 additions & 0 deletions tobac/tests/test_utils.py
Review comment:
Would it be possible to split the 2D and 3D into separate tests?

@@ -398,3 +398,60 @@ def test_combine_tobac_feats():
)
assert np.all(list(combined_feat["old_feat_column"].values) == [1, 1])
assert np.all(list(combined_feat["feature"].values) == [1, 2])


def test_get_statistics():
"""
Test to assure that bulk statistics for identified features are computed as expected.

"""
featureID = 1

### Test 2D data
data_2D = tb_test.make_simple_sample_data_2D()[10].data
data_2D[data_2D > 8] = 10
data_2D[data_2D < 8] = 0
segmentation_mask = data_2D.copy()
segmentation_mask[segmentation_mask > 8] = 1
segmentation_mask = segmentation_mask.astype(int)

# get bulk statistics of identified features
(
feature_mean,
feature_max,
feature_min,
feature_percentiles,
feature_sum,
feature_axis,
) = tb_utils.general.get_statistics(featureID, segmentation_mask, data_2D)

# expected results
assert feature_mean == feature_max == feature_min == 10
assert feature_percentiles.size == 101
assert np.unique(feature_percentiles)[0] == 10
assert feature_sum == 10 * data_2D[data_2D == 10].size

### Test 3D data
data_3D = tb_test.make_sample_data_3D_3blobs()[10].data

data_3D[data_3D > 8] = 10
data_3D[data_3D < 8] = 0
segmentation_mask_3D = data_3D.copy()
segmentation_mask_3D[segmentation_mask_3D > 8] = 1
segmentation_mask_3D = segmentation_mask_3D.astype(int)

# get bulk statistics
(
feature_mean,
feature_max,
feature_min,
feature_percentiles,
feature_sum,
feature_axis,
) = tb_utils.general.get_statistics(featureID, segmentation_mask_3D, data_3D)

# expected results
assert feature_mean == feature_max == feature_min == 10
assert feature_percentiles.size == 101
assert np.unique(feature_percentiles)[0] == 10
assert feature_sum == 10 * data_3D[data_3D == 10].size
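
One way the reviewer's suggestion of separate 2D and 3D cases could be implemented is via pytest parametrization. A sketch only, assuming the tb_test/tb_utils imports already used in this module and, as in the test above, that no grid point equals exactly 8:

import numpy as np
import pytest

@pytest.mark.parametrize(
    "sample_data_func",
    [
        lambda: tb_test.make_simple_sample_data_2D()[10].data,
        lambda: tb_test.make_sample_data_3D_3blobs()[10].data,
    ],
    ids=["2D", "3D"],
)
def test_get_statistics_parametrized(sample_data_func):
    # build a binary feature field and matching segmentation mask
    data = sample_data_func()
    data[data > 8] = 10
    data[data < 8] = 0
    segmentation_mask = (data > 8).astype(int)

    (
        feature_mean,
        feature_max,
        feature_min,
        feature_percentiles,
        feature_sum,
        feature_axis,
    ) = tb_utils.general.get_statistics(1, segmentation_mask, data)

    # all points in the feature have value 10
    assert feature_mean == feature_max == feature_min == 10
    assert feature_percentiles.size == 101
    assert feature_sum == 10 * np.count_nonzero(data == 10)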
60 changes: 60 additions & 0 deletions tobac/utils/general.py
@@ -551,3 +551,63 @@ def combine_tobac_feats(list_of_feats, preserve_old_feat_nums=None):
combined_sorted["feature"] = np.arange(1, len(combined_sorted) + 1)
combined_sorted = combined_sorted.reset_index(drop=True)
return combined_sorted


def get_statistics(feature_ID, segmentation_mask, field_in):
"""
Derive bulk statistics of all data points that are attributed to a certain feature
after segmentation.

Parameters
----------
feature_ID: int
The ID of a certain feature for which to extract the statistics.
segmentation_mask: ndarray
2D or 3D segmentation mask for the timestep wherein the feature occurs.
field_in: ndarray
2D or 3D field with data points for specific timestep (should have the same shape as the segmentation mask).

Returns
-------

feature_mean: float
mean value of data points within feature
feature_max: float
max value of data points within feature
feature_min: float
min value of data points within feature
feature_percentiles: ndarray
percentiles from 0 to 100 of data distribution within feature
feature_sum: float
sum of all data points within feature (e.g. total precipitation)
feature_axis: float
length of major axis of feature

"""
from skimage.measure import regionprops

# get data points that belong to feature
data_points = field_in[segmentation_mask == feature_ID]

# get statistics for these data points
feature_mean = np.nanmean(data_points)
feature_max = np.nanmax(data_points)
feature_min = np.nanmin(data_points)
feature_percentiles = np.percentile(data_points, range(101))
feature_sum = np.nansum(data_points)

# get other region properties
segmentation_mask[
segmentation_mask != feature_ID
] = 0 # set segmentation mask for other features to 0
regions = regionprops(segmentation_mask)
feature_axis = regions[0].major_axis_length

return (
feature_mean,
feature_max,
feature_min,
feature_percentiles,
feature_sum,
feature_axis,
)
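
A minimal usage sketch of get_statistics on synthetic data (the field and mask here are made up for illustration):

import numpy as np

# 10x10 field with a single 3x3 feature labelled 1 in the mask
field = np.zeros((10, 10))
field[3:6, 3:6] = 5.0
mask = (field > 0).astype(int)

mean, vmax, vmin, percentiles, total, axis_length = get_statistics(1, mask, field)
# mean == vmax == vmin == 5.0, total == 45.0, and percentiles holds 101 values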