From 78dedf0aef1303ac2a71d577e84366b55b16da9e Mon Sep 17 00:00:00 2001 From: Tom Vo Date: Wed, 21 Feb 2024 09:51:24 -0800 Subject: [PATCH] [PR]: Update documentation on temporal averaging, usage of bounds, and generation of weights (#601) Co-authored-by: Stephen Po-Chedley --- xcdat/axis.py | 10 +++--- xcdat/bounds.py | 16 ++++++--- xcdat/dataset.py | 61 +++++++++++++++------------------- xcdat/temporal.py | 85 ++++++++++++++++++++++++++++++++++++++++++++--- 4 files changed, 123 insertions(+), 49 deletions(-) diff --git a/xcdat/axis.py b/xcdat/axis.py index 1fc36008..b870fc5f 100644 --- a/xcdat/axis.py +++ b/xcdat/axis.py @@ -241,13 +241,11 @@ def swap_lon_axis( dataset : xr.Dataset The Dataset containing a longitude axis. to : Tuple[float, float] - The orientation to swap the Dataset's longitude axis to. - - Supported orientations: - - * (-180, 180): represents [-180, 180) in math notation - * (0, 360): represents [0, 360) in math notation + The orientation to swap the Dataset's longitude axis to. Supported + orientations include: + * (-180, 180): represents [-180, 180) in math notation + * (0, 360): represents [0, 360) in math notation sort_ascending : bool After swapping, sort in ascending order (True), or keep existing order (False). diff --git a/xcdat/bounds.py b/xcdat/bounds.py index f59ea3ff..13dd6e43 100644 --- a/xcdat/bounds.py +++ b/xcdat/bounds.py @@ -146,7 +146,10 @@ def add_missing_bounds( # noqa: C901 ``"time_bnds"`` and ``ds.time_bnds`` is present in the dataset. 5. For the "T" axis, its coordinates must be composed of datetime-like - objects (``np.datetime64`` or ``cftime``). + objects (``np.datetime64`` or ``cftime``). This method is designed to + operate on time axes that have constant temporal resolution with + annual, monthly, daily, or sub-daily time frequencies. Alternate + frequencies (e.g., pentad) are not supported. 
Parameters ---------- @@ -326,9 +329,14 @@ def add_time_bounds( ) -> xr.Dataset: """Add bounds for an axis using its coordinate points. - This method loops over the time axis coordinate variables and attempts - to add bounds for each of them if they don't exist. To add time bounds - for the time axis, its coordinates must be the following criteria: + This method is designed to operate on time axes that have constant temporal + resolution with annual, monthly, daily, or sub-daily time frequencies. + Alternate frequencies (e.g., pentad) are not supported. It loops over + the time axis coordinate variables and attempts to add bounds for each + of them if they don't exist. + + To add time bounds for the time axis, its coordinates must meet the + following criteria: 1. Coordinates are single dimensional, not multidimensional 2. Coordinates are a length > 1 (not singleton) diff --git a/xcdat/dataset.py b/xcdat/dataset.py index 147d9db7..f3fe3e01 100644 --- a/xcdat/dataset.py +++ b/xcdat/dataset.py @@ -80,7 +80,6 @@ def open_dataset( of the coordinates. If desired, refer to :py:func:`xarray.Dataset.bounds.add_time_bounds` if you require more granular configuration for how "T" bounds are generated. - decode_times: bool, optional If True, attempt to decode times encoded in the standard NetCDF datetime format into cftime.datetime objects. Otherwise, leave them @@ -95,11 +94,10 @@ def open_dataset( Either `(-180, 180)` or `(0, 360)`, by default None. Supported options include: - * None: use the current orientation (if the longitude axis exists) - * (-180, 180): represents [-180, 180) in math notation - * (0, 360): represents [0, 360) in math notation - - kwargs : Dict[str, Any] + * None: use the current orientation (if the longitude axis exists) + * (-180, 180): represents [-180, 180) in math notation + * (0, 360): represents [0, 360) in math notation + **kwargs : Dict[str, Any] Additional arguments passed on to ``xarray.open_dataset``. 
Refer to the [1]_ xarray docs for accepted keyword arguments. @@ -155,25 +153,24 @@ def open_mfdataset( Paths to dataset files. Paths can be given as strings or as pathlib.Path objects. Supported options include: - * Directory path (e.g., ``"path/to/files"``), which is converted - to a string glob of `*.nc` files - * String glob (e.g., ``"path/to/files/*.nc"``), which is expanded - to a 1-dimensional list of file paths - * File path to dataset (e.g., ``"path/to/files/file1.nc"``) - * List of file paths (e.g., ``["path/to/files/file1.nc", ...]``). - If concatenation along more than one dimension is desired, then - ``paths`` must be a nested list-of-lists (see [2]_ - ``xarray.combine_nested`` for details). - * File path to an XML file with a ``directory`` attribute (e.g., - ``"path/to/files"``). If ``directory`` is set to a blank string - (""), then the current directory is substituted ("."). This option - is intended to support the CDAT CDML dialect of XML files, but it - can work with any XML file that has the ``directory`` attribute. - Refer to [4]_ for more information on CDML. NOTE: This feature is - deprecated in v0.6.0 and will be removed in the subsequent release. - CDAT (including cdms2/CDML) is in maintenance only mode and marked - for end-of-life by the end of 2023. - + * Directory path (e.g., ``"path/to/files"``), which is converted + to a string glob of `*.nc` files + * String glob (e.g., ``"path/to/files/*.nc"``), which is expanded + to a 1-dimensional list of file paths + * File path to dataset (e.g., ``"path/to/files/file1.nc"``) + * List of file paths (e.g., ``["path/to/files/file1.nc", ...]``). + If concatenation along more than one dimension is desired, then + ``paths`` must be a nested list-of-lists (see [2]_ + ``xarray.combine_nested`` for details). + * File path to an XML file with a ``directory`` attribute (e.g., + ``"path/to/files"``). If ``directory`` is set to a blank string + (""), then the current directory is substituted ("."). 
This option + is intended to support the CDAT CDML dialect of XML files, but it + can work with any XML file that has the ``directory`` attribute. + Refer to [4]_ for more information on CDML. NOTE: This feature is + deprecated in v0.6.0 and will be removed in the subsequent release. + CDAT (including cdms2/CDML) is in maintenance only mode and marked + for end-of-life by the end of 2023. add_bounds: List[CFAxisKey] | None | bool List of CF axes to try to add bounds for (if missing), by default ["X", "Y"]. Set to None to not add any missing bounds. Please note that @@ -185,7 +182,6 @@ def open_mfdataset( of the coordinates. If desired, refer to :py:func:`xarray.Dataset.bounds.add_time_bounds` if you require more granular configuration for how "T" bounds are generated. - data_var: Optional[str], optional The key of the data variable to keep in the Dataset, by default None. decode_times: bool, optional @@ -201,10 +197,9 @@ def open_mfdataset( The orientation to use for the Dataset's longitude axis (if it exists), by default None. Supported options include: - * None: use the current orientation (if the longitude axis exists) - * (-180, 180): represents [-180, 180) in math notation - * (0, 360): represents [0, 360) in math notation - + * None: use the current orientation (if the longitude axis exists) + * (-180, 180): represents [-180, 180) in math notation + * (0, 360): represents [0, 360) in math notation data_vars: {"minimal", "different", "all" or list of str}, optional These data variables will be concatenated together: * "minimal": Only data variables in which the dimension already @@ -222,15 +217,14 @@ def open_mfdataset( data variables in a manner where only data variables in which the dimension already appears are included. For example, the time dimension will not be concatenated to the dimensions of non-time data variables - such as "lat_bnds" or "lon_bnds". `data_vars="minimal"` is required for + such as "lat_bnds" or "lon_bnds". 
``data_vars="minimal"`` is required for some xCDAT functions, including spatial averaging where a reduction is performed using the lat/lon bounds. - preprocess : Optional[Callable], optional If provided, call this function on each dataset prior to concatenation. You can find the file-name from which each dataset was loaded in ``ds.encoding["source"]``. - kwargs : Dict[str, Any] + **kwargs : Dict[str, Any] Additional arguments passed on to ``xarray.open_mfdataset``. Refer to the [3]_ xarray docs for accepted keyword arguments. @@ -587,7 +581,6 @@ def _postprocess_dataset( * If desired, use :py:func:`xarray.Dataset.bounds.add_time_bounds` if you require more granular configuration for how "T" bounds are generated - lon_orient: Optional[Tuple[float, float]], optional The orientation to use for the Dataset's longitude axis (if it exists), by default None. diff --git a/xcdat/temporal.py b/xcdat/temporal.py index 05e7db57..834eb394 100644 --- a/xcdat/temporal.py +++ b/xcdat/temporal.py @@ -175,7 +175,6 @@ def average(self, data_var: str, weighted: bool = True, keep_weights: bool = Fal ---------- data_var: str The key of the data variable for calculating averages - weighted : bool, optional Calculate averages using weights, by default True. @@ -187,6 +186,12 @@ def average(self, data_var: str, weighted: bool = True, keep_weights: bool = Fal The weight of masked (missing) data is excluded when averages are taken. This is the same as giving them a weight of 0. + + Note that weights are assigned by the labeled time point. If the + dataset includes timepoints that span across typical boundaries + (e.g., a timepoint on 2020-06-01 with bounds that begin in May 2020 + and end in June 2020), the weights will not be assigned properly. + See explanation in the Notes section below. keep_weights : bool, optional If calculating averages using weights, keep the weights in the final dataset output, by default False. 
@@ -197,6 +202,20 @@ def average(self, data_var: str, weighted: bool = True, keep_weights: bool = Fal Dataset with the average of the data variable and the time dimension removed. + Notes + ----- + When using weighted averages, the weights are assigned based on the + timepoint value. For example, a time point of 2020-06-15 with bounds + (2020-06-01, 2020-06-30) has 30 days of weight assigned to June, 2020 + (e.g., for an annual average calculation). This would be expected + behavior, but it's possible that data could span across typical temporal + boundaries. For example, a time point of 2020-06-01 with bounds + (2020-05-16, 2020-06-15) would have 30 days of weight, but this weight + would be assigned to June, 2020, which would be incorrect (15 days of + weight should be assigned to May and 15 days of weight should be + assigned to June). This issue could plausibly arise when using pentad + data. + Examples -------- @@ -224,6 +243,7 @@ def group_average( ): """Returns a Dataset with average of a data variable by time group. + Data is grouped into the labeled time point for the averaging operation. Time bounds are used for generating weights to calculate weighted group averages (refer to the ``weighted`` parameter documentation below). @@ -239,7 +259,6 @@ def group_average( * "month": groups by (year, month) for monthly averages. * "day": groups by (year, month, day) for daily averages. * "hour": groups by (year, month, day, hour) for hourly averages. - weighted : bool, optional Calculate averages using weights, by default True. @@ -251,6 +270,12 @@ def group_average( The weight of masked (missing) data is excluded when averages are calculated. This is the same as giving them a weight of 0. + + Note that weights are assigned by the labeled time point. If the + dataset includes timepoints that span across typical boundaries + (e.g., a timepoint on 2020-06-01 with bounds that begin in May 2020 + and end in June 2020), the weights will not be assigned properly. 
+ See explanation in the Notes section below. keep_weights : bool, optional If calculating averages using weights, keep the weights in the final dataset output, by default False. @@ -299,6 +324,20 @@ def group_average( xr.Dataset Dataset with the average of a data variable by time group. + Notes + ----- + When using weighted averages, the weights are assigned based on the + timepoint value. For example, a time point of 2020-06-15 with bounds + (2020-06-01, 2020-06-30) has 30 days of weight assigned to June, 2020 + (e.g., for an annual average calculation). This would be expected + behavior, but it's possible that data could span across typical temporal + boundaries. For example, a time point of 2020-06-01 with bounds + (2020-05-16, 2020-06-15) would have 30 days of weight, but this weight + would be assigned to June, 2020, which would be incorrect (15 days of + weight should be assigned to May and 15 days of weight should be + assigned to June). This issue could plausibly arise when using pentad + data. + Examples -------- @@ -370,6 +409,7 @@ def climatology( ): """Returns a Dataset with the climatology of a data variable. + Data is grouped into the labeled time point for the averaging operation. Time bounds are used for generating weights to calculate weighted climatology (refer to the ``weighted`` parameter documentation below). @@ -388,7 +428,6 @@ def climatology( present) are dropped to avoid inconsistencies when calculating climatologies. Refer to [1]_ for more details on this implementation decision. - weighted : bool, optional Calculate averages using weights, by default True. @@ -400,6 +439,12 @@ def climatology( The weight of masked (missing) data is excluded when averages are taken. This is the same as giving them a weight of 0. + + Note that weights are assigned by the labeled time point. 
If the + dataset includes timepoints that span across typical boundaries + (e.g., a timepoint on 2020-06-01 with bounds that begin in May 2020 + and end in June 2020), the weights will not be assigned properly. + See explanation in the Notes section below. keep_weights : bool, optional If calculating averages using weights, keep the weights in the final dataset output, by default False. @@ -458,6 +503,20 @@ def climatology( ---------- .. [1] https://github.com/xCDAT/xcdat/discussions/332 + Notes + ----- + When using weighted averages, the weights are assigned based on the + timepoint value. For example, a time point of 2020-06-15 with bounds + (2020-06-01, 2020-06-30) has 30 days of weight assigned to June, 2020 + (e.g., for an annual average calculation). This would be expected + behavior, but it's possible that data could span across typical temporal + boundaries. For example, a time point of 2020-06-01 with bounds + (2020-05-16, 2020-06-15) would have 30 days of weight, but this weight + would be assigned to June, 2020, which would be incorrect (15 days of + weight should be assigned to May and 15 days of weight should be + assigned to June). This issue could plausibly arise when using pentad + data. + Examples -------- @@ -544,7 +603,6 @@ def departures( ---------- data_var: str The key of the data variable for calculating departures. - freq : Frequency The frequency of time to group by. @@ -556,7 +614,6 @@ def departures( present) are dropped to avoid inconsistencies when calculating climatologies. Refer to [2]_ for more details on this implementation decision. - weighted : bool, optional Calculate averages using weights, by default True. @@ -568,6 +625,12 @@ def departures( The weight of masked (missing) data is excluded when averages are taken. This is the same as giving them a weight of 0. + + Note that weights are assigned by the labeled time point. 
If the + dataset includes timepoints that span across typical boundaries + (e.g., a timepoint on 2020-06-01 with bounds that begin in May 2020 + and end in June 2020), the weights will not be assigned properly. + See explanation in the Notes section below. keep_weights : bool, optional If calculating averages using weights, keep the weights in the final dataset output, by default False. @@ -625,6 +688,18 @@ def departures( Notes ----- + When using weighted averages, the weights are assigned based on the + timepoint value. For example, a time point of 2020-06-15 with bounds + (2020-06-01, 2020-06-30) has 30 days of weight assigned to June, 2020 + (e.g., for an annual average calculation). This would be expected + behavior, but it's possible that data could span across typical temporal + boundaries. For example, a time point of 2020-06-01 with bounds + (2020-05-16, 2020-06-15) would have 30 days of weight, but this weight + would be assigned to June, 2020, which would be incorrect (15 days of + weight should be assigned to May and 15 days of weight should be + assigned to June). This issue could plausibly arise when using pentad + data. + This method uses xarray's grouped arithmetic as a shortcut for mapping over all unique labels. Grouped arithmetic works by assigning a grouping label to each time coordinate of the observation data based on the