From 78dedf0aef1303ac2a71d577e84366b55b16da9e Mon Sep 17 00:00:00 2001 From: Tom Vo Date: Wed, 21 Feb 2024 09:51:24 -0800 Subject: [PATCH] [PR]: Update documentation on temporal averaging, usage of bounds, and generation of weights (#601) Co-authored-by: Stephen Po-Chedley --- xcdat/axis.py | 10 +++--- xcdat/bounds.py | 16 ++++++--- xcdat/dataset.py | 61 +++++++++++++++------------------- xcdat/temporal.py | 85 ++++++++++++++++++++++++++++++++++++++++++++--- 4 files changed, 123 insertions(+), 49 deletions(-) diff --git a/xcdat/axis.py b/xcdat/axis.py index 1fc36008..b870fc5f 100644 --- a/xcdat/axis.py +++ b/xcdat/axis.py @@ -241,13 +241,11 @@ def swap_lon_axis( dataset : xr.Dataset The Dataset containing a longitude axis. to : Tuple[float, float] - The orientation to swap the Dataset's longitude axis to. - - Supported orientations: - - * (-180, 180): represents [-180, 180) in math notation - * (0, 360): represents [0, 360) in math notation + The orientation to swap the Dataset's longitude axis to. Supported + orientations include: + * (-180, 180): represents [-180, 180) in math notation + * (0, 360): represents [0, 360) in math notation sort_ascending : bool After swapping, sort in ascending order (True), or keep existing order (False). diff --git a/xcdat/bounds.py b/xcdat/bounds.py index f59ea3ff..13dd6e43 100644 --- a/xcdat/bounds.py +++ b/xcdat/bounds.py @@ -146,7 +146,10 @@ def add_missing_bounds( # noqa: C901 ``"time_bnds"`` and ``ds.time_bnds`` is present in the dataset. 5. For the "T" axis, its coordinates must be composed of datetime-like - objects (``np.datetime64`` or ``cftime``). + objects (``np.datetime64`` or ``cftime``). This method is designed to + operate on time axes that have constant temporal resolution with + annual, monthly, daily, or sub-daily time frequencies. Alternate + frequencies (e.g., pentad) are not supported. 
Parameters ---------- @@ -326,9 +329,14 @@ def add_time_bounds( ) -> xr.Dataset: """Add bounds for an axis using its coordinate points. - This method loops over the time axis coordinate variables and attempts - to add bounds for each of them if they don't exist. To add time bounds - for the time axis, its coordinates must be the following criteria: + This method is designed to operate on time axes that have constant temporal + resolution with annual, monthly, daily, or sub-daily time frequencies. + Alternate frequencies (e.g., pentad) are not supported. It loops over + the time axis coordinate variables and attempts to add bounds for each + of them if they don't exist. + + To add time bounds for the time axis, its coordinates must meet the + following criteria: 1. Coordinates are single dimensional, not multidimensional 2. Coordinates are a length > 1 (not singleton) diff --git a/xcdat/dataset.py b/xcdat/dataset.py index 147d9db7..f3fe3e01 100644 --- a/xcdat/dataset.py +++ b/xcdat/dataset.py @@ -80,7 +80,6 @@ def open_dataset( of the coordinates. If desired, refer to :py:func:`xarray.Dataset.bounds.add_time_bounds` if you require more granular configuration for how "T" bounds are generated. - decode_times: bool, optional If True, attempt to decode times encoded in the standard NetCDF datetime format into cftime.datetime objects. Otherwise, leave them @@ -95,11 +94,10 @@ def open_dataset( Either `(-180, 180)` or `(0, 360)`, by default None. Supported options include: - * None: use the current orientation (if the longitude axis exists) - * (-180, 180): represents [-180, 180) in math notation - * (0, 360): represents [0, 360) in math notation - - kwargs : Dict[str, Any] + * None: use the current orientation (if the longitude axis exists) + * (-180, 180): represents [-180, 180) in math notation + * (0, 360): represents [0, 360) in math notation + **kwargs : Dict[str, Any] Additional arguments passed on to ``xarray.open_dataset``. 
Refer to the [1]_ xarray docs for accepted keyword arguments. @@ -155,25 +153,24 @@ def open_mfdataset( Paths to dataset files. Paths can be given as strings or as pathlib.Path objects. Supported options include: - * Directory path (e.g., ``"path/to/files"``), which is converted - to a string glob of `*.nc` files - * String glob (e.g., ``"path/to/files/*.nc"``), which is expanded - to a 1-dimensional list of file paths - * File path to dataset (e.g., ``"path/to/files/file1.nc"``) - * List of file paths (e.g., ``["path/to/files/file1.nc", ...]``). - If concatenation along more than one dimension is desired, then - ``paths`` must be a nested list-of-lists (see [2]_ - ``xarray.combine_nested`` for details). - * File path to an XML file with a ``directory`` attribute (e.g., - ``"path/to/files"``). If ``directory`` is set to a blank string - (""), then the current directory is substituted ("."). This option - is intended to support the CDAT CDML dialect of XML files, but it - can work with any XML file that has the ``directory`` attribute. - Refer to [4]_ for more information on CDML. NOTE: This feature is - deprecated in v0.6.0 and will be removed in the subsequent release. - CDAT (including cdms2/CDML) is in maintenance only mode and marked - for end-of-life by the end of 2023. - + * Directory path (e.g., ``"path/to/files"``), which is converted + to a string glob of `*.nc` files + * String glob (e.g., ``"path/to/files/*.nc"``), which is expanded + to a 1-dimensional list of file paths + * File path to dataset (e.g., ``"path/to/files/file1.nc"``) + * List of file paths (e.g., ``["path/to/files/file1.nc", ...]``). + If concatenation along more than one dimension is desired, then + ``paths`` must be a nested list-of-lists (see [2]_ + ``xarray.combine_nested`` for details). + * File path to an XML file with a ``directory`` attribute (e.g., + ``"path/to/files"``). If ``directory`` is set to a blank string + (""), then the current directory is substituted ("."). 
This option + is intended to support the CDAT CDML dialect of XML files, but it + can work with any XML file that has the ``directory`` attribute. + Refer to [4]_ for more information on CDML. NOTE: This feature is + deprecated in v0.6.0 and will be removed in the subsequent release. + CDAT (including cdms2/CDML) is in maintenance only mode and marked + for end-of-life by the end of 2023. add_bounds: List[CFAxisKey] | None | bool List of CF axes to try to add bounds for (if missing), by default ["X", "Y"]. Set to None to not add any missing bounds. Please note that @@ -185,7 +182,6 @@ def open_mfdataset( of the coordinates. If desired, refer to :py:func:`xarray.Dataset.bounds.add_time_bounds` if you require more granular configuration for how "T" bounds are generated. - data_var: Optional[str], optional The key of the data variable to keep in the Dataset, by default None. decode_times: bool, optional @@ -201,10 +197,9 @@ def open_mfdataset( The orientation to use for the Dataset's longitude axis (if it exists), by default None. Supported options include: - * None: use the current orientation (if the longitude axis exists) - * (-180, 180): represents [-180, 180) in math notation - * (0, 360): represents [0, 360) in math notation - + * None: use the current orientation (if the longitude axis exists) + * (-180, 180): represents [-180, 180) in math notation + * (0, 360): represents [0, 360) in math notation data_vars: {"minimal", "different", "all" or list of str}, optional These data variables will be concatenated together: * "minimal": Only data variables in which the dimension already @@ -222,15 +217,14 @@ def open_mfdataset( data variables in a manner where only data variables in which the dimension already appears are included. For example, the time dimension will not be concatenated to the dimensions of non-time data variables - such as "lat_bnds" or "lon_bnds". `data_vars="minimal"` is required for + such as "lat_bnds" or "lon_bnds". 
``data_vars="minimal"`` is required for some xCDAT functions, including spatial averaging where a reduction is performed using the lat/lon bounds. - preprocess : Optional[Callable], optional If provided, call this function on each dataset prior to concatenation. You can find the file-name from which each dataset was loaded in ``ds.encoding["source"]``. - kwargs : Dict[str, Any] + **kwargs : Dict[str, Any] Additional arguments passed on to ``xarray.open_mfdataset``. Refer to the [3]_ xarray docs for accepted keyword arguments. @@ -587,7 +581,6 @@ def _postprocess_dataset( * If desired, use :py:func:`xarray.Dataset.bounds.add_time_bounds` if you require more granular configuration for how "T" bounds are generated - lon_orient: Optional[Tuple[float, float]], optional The orientation to use for the Dataset's longitude axis (if it exists), by default None. diff --git a/xcdat/temporal.py b/xcdat/temporal.py index 05e7db57..834eb394 100644 --- a/xcdat/temporal.py +++ b/xcdat/temporal.py @@ -175,7 +175,6 @@ def average(self, data_var: str, weighted: bool = True, keep_weights: bool = Fal ---------- data_var: str The key of the data variable for calculating averages - weighted : bool, optional Calculate averages using weights, by default True. @@ -187,6 +186,12 @@ def average(self, data_var: str, weighted: bool = True, keep_weights: bool = Fal The weight of masked (missing) data is excluded when averages are taken. This is the same as giving them a weight of 0. + + Note that weights are assigned by the labeled time point. If the + dataset includes timepoints that span across typical boundaries + (e.g., a timepoint on 2020-06-01 with bounds that begin in May 2020 + and end in June 2020), the weights will not be assigned properly. + See explanation in the Notes section below. keep_weights : bool, optional If calculating averages using weights, keep the weights in the final dataset output, by default False. 
@@ -197,6 +202,20 @@ def average(self, data_var: str, weighted: bool = True, keep_weights: bool = Fal Dataset with the average of the data variable and the time dimension removed. + Notes + ----- + When using weighted averages, the weights are assigned based on the + timepoint value. For example, a time point of 2020-06-15 with bounds + (2020-06-01, 2020-06-30) has 30 days of weight assigned to June, 2020 + (e.g., for an annual average calculation). This would be expected + behavior, but it's possible that data could span across typical temporal + boundaries. For example, a time point of 2020-06-01 with bounds + (2020-05-16, 2020-06-15) would have 30 days of weight, but this weight + would be assigned to June, 2020, which would be incorrect (15 days of + weight should be assigned to May and 15 days of weight should be + assigned to June). This issue could plausibly arise when using pentad + data. + Examples -------- @@ -224,6 +243,7 @@ def group_average( ): """Returns a Dataset with average of a data variable by time group. + Data is grouped into the labeled time point for the averaging operation. Time bounds are used for generating weights to calculate weighted group averages (refer to the ``weighted`` parameter documentation below). @@ -239,7 +259,6 @@ def group_average( * "month": groups by (year, month) for monthly averages. * "day": groups by (year, month, day) for daily averages. * "hour": groups by (year, month, day, hour) for hourly averages. - weighted : bool, optional Calculate averages using weights, by default True. @@ -251,6 +270,12 @@ def group_average( The weight of masked (missing) data is excluded when averages are calculated. This is the same as giving them a weight of 0. + + Note that weights are assigned by the labeled time point. If the + dataset includes timepoints that span across typical boundaries + (e.g., a timepoint on 2020-06-01 with bounds that begin in May 2020 + and end in June 2020), the weights will not be assigned properly. 
+ See explanation in the Notes section below. keep_weights : bool, optional If calculating averages using weights, keep the weights in the final dataset output, by default False. @@ -299,6 +324,20 @@ def group_average( xr.Dataset Dataset with the average of a data variable by time group. + Notes + ----- + When using weighted averages, the weights are assigned based on the + timepoint value. For example, a time point of 2020-06-15 with bounds + (2020-06-01, 2020-06-30) has 30 days of weight assigned to June, 2020 + (e.g., for an annual average calculation). This would be expected + behavior, but it's possible that data could span across typical temporal + boundaries. For example, a time point of 2020-06-01 with bounds + (2020-05-16, 2020-06-15) would have 30 days of weight, but this weight + would be assigned to June, 2020, which would be incorrect (15 days of + weight should be assigned to May and 15 days of weight should be + assigned to June). This issue could plausibly arise when using pentad + data. + Examples -------- @@ -370,6 +409,7 @@ def climatology( ): """Returns a Dataset with the climatology of a data variable. + Data is grouped into the labeled time point for the averaging operation. Time bounds are used for generating weights to calculate weighted climatology (refer to the ``weighted`` parameter documentation below). @@ -388,7 +428,6 @@ def climatology( present) are dropped to avoid inconsistencies when calculating climatologies. Refer to [1]_ for more details on this implementation decision. - weighted : bool, optional Calculate averages using weights, by default True. @@ -400,6 +439,12 @@ def climatology( The weight of masked (missing) data is excluded when averages are taken. This is the same as giving them a weight of 0. + + Note that weights are assigned by the labeled time point. 
If the + dataset includes timepoints that span across typical boundaries + (e.g., a timepoint on 2020-06-01 with bounds that begin in May 2020 + and end in June 2020), the weights will not be assigned properly. + See explanation in the Notes section below. keep_weights : bool, optional If calculating averages using weights, keep the weights in the final dataset output, by default False. @@ -458,6 +503,20 @@ def climatology( ---------- .. [1] https://github.com/xCDAT/xcdat/discussions/332 + Notes + ----- + When using weighted averages, the weights are assigned based on the + timepoint value. For example, a time point of 2020-06-15 with bounds + (2020-06-01, 2020-06-30) has 30 days of weight assigned to June, 2020 + (e.g., for an annual average calculation). This would be expected + behavior, but it's possible that data could span across typical temporal + boundaries. For example, a time point of 2020-06-01 with bounds + (2020-05-16, 2020-06-15) would have 30 days of weight, but this weight + would be assigned to June, 2020, which would be incorrect (15 days of + weight should be assigned to May and 15 days of weight should be + assigned to June). This issue could plausibly arise when using pentad + data. + Examples -------- @@ -544,7 +603,6 @@ def departures( ---------- data_var: str The key of the data variable for calculating departures. - freq : Frequency The frequency of time to group by. @@ -556,7 +614,6 @@ def departures( present) are dropped to avoid inconsistencies when calculating climatologies. Refer to [2]_ for more details on this implementation decision. - weighted : bool, optional Calculate averages using weights, by default True. @@ -568,6 +625,12 @@ def departures( The weight of masked (missing) data is excluded when averages are taken. This is the same as giving them a weight of 0. + + Note that weights are assigned by the labeled time point. 
If the + dataset includes timepoints that span across typical boundaries + (e.g., a timepoint on 2020-06-01 with bounds that begin in May 2020 + and end in June 2020), the weights will not be assigned properly. + See explanation in the Notes section below. keep_weights : bool, optional If calculating averages using weights, keep the weights in the final dataset output, by default False. @@ -625,6 +688,18 @@ def departures( Notes ----- + When using weighted averages, the weights are assigned based on the + timepoint value. For example, a time point of 2020-06-15 with bounds + (2020-06-01, 2020-06-30) has 30 days of weight assigned to June, 2020 + (e.g., for an annual average calculation). This would be expected + behavior, but it's possible that data could span across typical temporal + boundaries. For example, a time point of 2020-06-01 with bounds + (2020-05-16, 2020-06-15) would have 30 days of weight, but this weight + would be assigned to June, 2020, which would be incorrect (15 days of + weight should be assigned to May and 15 days of weight should be + assigned to June). This issue could plausibly arise when using pentad + data. + This method uses xarray's grouped arithmetic as a shortcut for mapping over all unique labels. Grouped arithmetic works by assigning a grouping label to each time coordinate of the observation data based on the