From 8dba291d3f049f899636f9cfb1f8f483f215eddb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Pfl=C3=BCger?= Date: Fri, 29 Nov 2024 12:38:08 +0100 Subject: [PATCH 1/9] docs: more informative docstring for set_priority_coords --- primap2/csg/_wrapper.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/primap2/csg/_wrapper.py b/primap2/csg/_wrapper.py index a62aee3..07d7afd 100644 --- a/primap2/csg/_wrapper.py +++ b/primap2/csg/_wrapper.py @@ -10,12 +10,19 @@ def set_priority_coords( ds: xr.Dataset, dims: dict[str, dict[str, str]], ) -> xr.Dataset: - """Set values for priority coordinates in output dataset - - coords: Dictionary - Format is 'name': {'value': value, 'terminology': terminology} - terminology is optional + """Set values for priority coordinates. + Parameters + ---------- + ds: cr.Dataset + Dataset to change + dims: dict + Dictionary containing coordinate names as keys and as values a dictionary + with the value to be set and optionally a terminology. + Examples: + {"source": {"value": "PRIMAP-hist"}} sets the "source" to "PRIMAP-hist". + {"area": {"value": "WORLD", "terminology": "ISO3_primap"}} adds the dimension + "area (ISO3_primap)" to "WORLD". """ for dim in dims.keys(): terminology = dims[dim].get("terminology", None) From 4ecbcbb9099930e69fbf35ccbff53c141d55fa56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Pfl=C3=BCger?= Date: Fri, 29 Nov 2024 12:38:55 +0100 Subject: [PATCH 2/9] refactor: enforce giving names for keyword arguments in create_composite_source --- primap2/csg/_wrapper.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/primap2/csg/_wrapper.py b/primap2/csg/_wrapper.py index 07d7afd..f847cff 100644 --- a/primap2/csg/_wrapper.py +++ b/primap2/csg/_wrapper.py @@ -33,6 +33,7 @@ def set_priority_coords( def create_composite_source( input_ds: xr.Dataset, + *, priority_definition: PriorityDefinition, strategy_definition: StrategyDefinition, result_prio_coords: dict[str, dict[str, str]], @@ -46,7 +47,6 @@ def create_composite_source( This is a wrapper around `primap2.csg.compose` that prepares the input data and sets result values for the priority coordinates. - Parameters ---------- input_ds @@ -97,9 +97,7 @@ def create_composite_source( ------- xr.Dataset with composed data according to the given priority and strategy definitions - """ - # limit input data to these values if limit_coords is not None: if "variable" in limit_coords.keys(): From 414c2c633380279b39e8386c1e7c9a4a2cbcbaf1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Pfl=C3=BCger?= Date: Fri, 29 Nov 2024 12:40:00 +0100 Subject: [PATCH 3/9] docs: move method information to method docstrings. --- primap2/csg/_strategies/gaps.py | 32 ++++++++------------------------ 1 file changed, 8 insertions(+), 24 deletions(-) diff --git a/primap2/csg/_strategies/gaps.py b/primap2/csg/_strategies/gaps.py index 58b753c..22f921b 100644 --- a/primap2/csg/_strategies/gaps.py +++ b/primap2/csg/_strategies/gaps.py @@ -12,23 +12,16 @@ class Gap: Attributes ---------- - type : + type type of the gap possible types: 'start': start of timeseries boundary (nan, nan, X, X) 'end': end of timeseries boundary (X, X, nan, nan) 'gap': gap (X, nan, nan, X) - left : + left left end of the gap - right : + right right end of the gap - - Methods - _______ - get_date_slice() - Return a xr.loc type filter for 'time' with a slice from left to right - end of the gap - """ type: str = None @@ -37,6 +30,8 @@ class Gap: right: np.datetime64 = None # right end of the gap def get_date_slice(self) -> dict[str, slice]: + """Return a xr.loc type filter for 'time' with a slice from left to right + end of the gap.""" return {"time": slice(self.left, self.right)} @@ -71,15 +66,6 @@ class FitParameters: minimal number of points to calculate the trend. Default is 1, but if the degree of the fit polynomial is higher than 1, the minimal number of data points the degree of the fit polynomial - - Methods - ------- - log_string(fallback=False): - Create a string with the classes parameters - get_fallback(): - Return FitParameters object with the `fit_degree` set to the `fallback_degree` - of the original object. - """ fit_degree: int = 1 @@ -97,6 +83,7 @@ def __attrs_post_init__(self): ) def log_string(self, fallback: bool = False) -> str: + """Create a string with the classes parameters.""" log_str = ( f"fit_degree: {self.fit_degree}, " f"trend_length: {self.trend_length}, " @@ -110,6 +97,8 @@ def log_string(self, fallback: bool = False) -> str: return log_str def get_fallback(self): + """Return FitParameters object with the `fit_degree` set to the `fallback_degree` + of the original object.""" return FitParameters( fit_degree=self.fallback_degree, trend_length=self.trend_length, @@ -130,7 +119,6 @@ def get_gaps(ts: xr.DataArray) -> list[Gap]: Returns ------- list of Gaps - """ ts_roll = ts.rolling(time=3, min_periods=1, center=True).sum() gaps = [] @@ -194,7 +182,6 @@ def calculate_boundary_trend_with_fallback( Tuple with calculated trend values for left and right boundary of the gap. If trend calculation is not possible, `None` is returned so the calling strategy can raise the StrategyUnableToProcess error. - """ trend_ts = calculate_boundary_trend( ts, @@ -246,9 +233,7 @@ def calculate_boundary_trend( Tuple with calculated trend values for left and right boundary of the gap. If trend calculation is not possible, `None` is returned so the calling strategy can raise the StrategyUnableToProcess error. - """ - if gap.type == "gap": # right boundary right = calculate_right_boundary_trend( @@ -308,7 +293,6 @@ def calculate_right_boundary_trend( Calculated trend value for boundary point. If trend calculation is not possible, `None` is returned so the calling strategy can raise the StrategyUnableToProcess error. - """ point_to_modify = get_shifted_time_value(ts, original_value=boundary, shift=1) trend_index = pd.date_range( From 27b2742d71d1d3f51204b0a0d8898b602e546027 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Pfl=C3=BCger?= Date: Fri, 29 Nov 2024 12:40:45 +0100 Subject: [PATCH 4/9] refactor: more efficient implementation of get_shifted_time_value --- primap2/csg/_strategies/gaps.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/primap2/csg/_strategies/gaps.py b/primap2/csg/_strategies/gaps.py index 22f921b..bf13c9f 100644 --- a/primap2/csg/_strategies/gaps.py +++ b/primap2/csg/_strategies/gaps.py @@ -472,15 +472,13 @@ def get_shifted_time_value( Returns ------- time coordinate value at desired relative position - """ - # TODO: the following is not very elegant. I struggle with tasks like getting the coordinate - # value of the next item in xarray - mask = ts.copy() - mask.data = mask.data * np.nan - mask.pr.loc[{"time": original_value}] = 1 - mask = mask.shift(time=shift, fill_value=np.nan) - return mask.coords["time"].where(mask == 1, drop=True).data[0] + # For actually getting the index of a value in an index, it is easiest to work with + # the underlying numpy arrays. + time_points = ts["time"].values + original_index = np.where(time_points == original_value)[0][0] + new_index = original_index + shift + return time_points[new_index] def timeseries_coord_repr(ts: xr.DataArray) -> str: From 726b992719db1b6946f2c819481f2d42724a9211 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Pfl=C3=BCger?= Date: Fri, 29 Nov 2024 12:41:26 +0100 Subject: [PATCH 5/9] refactor: merge calculate_left_boundary_trend and calculate_right_boundary_trend into single function --- primap2/csg/_strategies/gaps.py | 85 +++++++++------------------------ primap2/tests/csg/test_gaps.py | 30 +++++++----- 2 files changed, 41 insertions(+), 74 deletions(-) diff --git a/primap2/csg/_strategies/gaps.py b/primap2/csg/_strategies/gaps.py index bf13c9f..6b847c9 100644 --- a/primap2/csg/_strategies/gaps.py +++ b/primap2/csg/_strategies/gaps.py @@ -1,3 +1,5 @@ +import typing + import numpy as np import pandas as pd import xarray as xr @@ -236,29 +238,33 @@ def calculate_boundary_trend( """ if gap.type == "gap": # right boundary - right = calculate_right_boundary_trend( + right = calculate_boundary_trend_inner( ts, + side="right", boundary=gap.right, fit_params=fit_params, ) # left boundary - left = calculate_left_boundary_trend( + left = calculate_boundary_trend_inner( ts, + side="left", boundary=gap.left, fit_params=fit_params, ) elif gap.type == "end": # left boundary - left = calculate_left_boundary_trend( + left = calculate_boundary_trend_inner( ts, + side="left", boundary=gap.left, fit_params=fit_params, ) right = left elif gap.type == "start": # right boundary - right = calculate_right_boundary_trend( + right = calculate_boundary_trend_inner( ts, + side="right", boundary=gap.right, fit_params=fit_params, ) @@ -269,20 +275,23 @@ def calculate_boundary_trend( return np.array([left, right]) -def calculate_right_boundary_trend( +def calculate_boundary_trend_inner( ts: xr.DataArray, + side: typing.Literal["left", "right"], boundary: np.datetime64, fit_params: FitParameters, ) -> float: """ - Replace right boundary point by trend value + Calculate trend value for leftmost or rightmost boundary point. Parameters ---------- ts : Time-series to calculate trend for + side : "left" or "right" + If the left or right boundary point should be processed. boundary : - boundary point (last NaN value) + time index boundary point (last NaN value) fit_params : FitParameters object which holds all parameters for the fit. This function does not handle fallback options thus the fallback @@ -294,62 +303,12 @@ def calculate_right_boundary_trend( calculation is not possible, `None` is returned so the calling strategy can raise the StrategyUnableToProcess error. """ - point_to_modify = get_shifted_time_value(ts, original_value=boundary, shift=1) - trend_index = pd.date_range( - start=point_to_modify, - periods=fit_params.trend_length, - freq=fit_params.trend_length_unit, + point_to_modify = get_shifted_time_value( + ts, original_value=boundary, shift=1 if side == "right" else -1 ) - trend_index = trend_index.intersection(ts.coords["time"]) - ts_fit = ts.pr.loc[{"time": trend_index}] - - if len(ts_fit.where(ts_fit.notnull(), drop=True)) >= fit_params.min_trend_points: - fit = ts_fit.polyfit(dim="time", deg=fit_params.fit_degree, skipna=True) - value = xr.polyval( - ts_fit.coords["time"].pr.loc[{"time": point_to_modify}], - fit.polyfit_coefficients, - ) - return float(value.data) - else: - logger.info( - f"Not enough values to calculate fit for right boundary at " - f"{point_to_modify}.\n" - f"{fit_params.log_string(fallback=False)}" - f"Timeseries info: {timeseries_coord_repr(ts)}" - ) - return np.nan - - -def calculate_left_boundary_trend( - ts: xr.DataArray, - boundary: np.datetime64, - fit_params: FitParameters, -) -> float: - """ - Replace left boundary point by trend value - - The function assumes equally spaced - - Parameters - ---------- - ts : - Time-series to calculate trend for - boundary : - boundary point (last NaN value) - fit_params : - FitParameters object which holds all parameters for the fit. This function does - not handle fallback options thus the fallback attribute is ignored. - - Returns - ------- - Calculated trend value for boundary point. If trend - calculation is not possible, `None` is returned so the calling strategy can - raise the StrategyUnableToProcess error. - - """ - point_to_modify = get_shifted_time_value(ts, original_value=boundary, shift=-1) trend_index = pd.date_range( - end=point_to_modify, + start=point_to_modify if side == "right" else None, + end=point_to_modify if side == "left" else None, periods=fit_params.trend_length, freq=fit_params.trend_length_unit, ) @@ -365,7 +324,7 @@ def calculate_left_boundary_trend( return float(value.data) else: logger.info( - f"Not enough values to calculate fit for left boundary at " + f"Not enough values to calculate fit for {side} boundary at " f"{point_to_modify}.\n" f"{fit_params.log_string(fallback=False)}" f"Timeseries info: {timeseries_coord_repr(ts)}" @@ -483,7 +442,7 @@ def get_shifted_time_value( def timeseries_coord_repr(ts: xr.DataArray) -> str: """Make short string representation for coordinate values for logging""" - dims = set(ts.coords._names) - {"time"} + dims = set(ts.coords.keys()) - {"time"} coords: dict[str, str] = {str(k): ts[k].item() for k in dims} coords = dict(sorted(coords.items())) return repr(coords) diff --git a/primap2/tests/csg/test_gaps.py b/primap2/tests/csg/test_gaps.py index 8047999..fa2b3fb 100644 --- a/primap2/tests/csg/test_gaps.py +++ b/primap2/tests/csg/test_gaps.py @@ -9,9 +9,8 @@ FitParameters, Gap, calculate_boundary_trend, + calculate_boundary_trend_inner, calculate_boundary_trend_with_fallback, - calculate_left_boundary_trend, - calculate_right_boundary_trend, calculate_scaling_factor, fill_gap, get_gaps, @@ -202,7 +201,9 @@ def test_get_shifted_time_value(test_ts): assert shifted_value == np.datetime64("1952-01-01") -def test_calculate_left_boundary_trend(test_ts, fit_params_linear, fit_params_constant, caplog): +def test_calculate_boundary_trend_inner_left( + test_ts, fit_params_linear, fit_params_constant, caplog +): gaps = get_gaps(test_ts) # linear trend for a left boundary @@ -211,8 +212,9 @@ def test_calculate_left_boundary_trend(test_ts, fit_params_linear, fit_params_co data_to_interpolate = test_ts.pr.loc[{"time": slice("1996", "2005")}].data coeff = np.polyfit(range(0, 10), data_to_interpolate, deg=fit_degree) expected_value = np.polyval(coeff, 9) - trend_value = calculate_left_boundary_trend( + trend_value = calculate_boundary_trend_inner( test_ts, + side="left", boundary=gaps[1].left, fit_params=fit_params_linear, ) @@ -223,8 +225,9 @@ def test_calculate_left_boundary_trend(test_ts, fit_params_linear, fit_params_co # expected result coeff = np.polyfit(range(0, 10), data_to_interpolate, deg=fit_degree) expected_value = np.polyval(coeff, 9) - trend_value = calculate_left_boundary_trend( + trend_value = calculate_boundary_trend_inner( test_ts, + side="left", boundary=gaps[1].left, fit_params=fit_params_constant, ) @@ -234,8 +237,8 @@ def test_calculate_left_boundary_trend(test_ts, fit_params_linear, fit_params_co test_ts.loc[{"time": slice("1997", "2002")}] = ( test_ts.loc[{"time": slice("1997", "2002")}] * np.nan ) - trend_value = calculate_left_boundary_trend( - test_ts, boundary=gaps[1].left, fit_params=fit_params_linear + trend_value = calculate_boundary_trend_inner( + test_ts, side="left", boundary=gaps[1].left, fit_params=fit_params_linear ) assert np.isnan(trend_value) @@ -249,7 +252,9 @@ def test_calculate_left_boundary_trend(test_ts, fit_params_linear, fit_params_co assert log_str in caplog.text -def test_calculate_right_boundary_trend(test_ts, fit_params_linear, fit_params_constant, caplog): +def test_calculate_boundary_trend_inner_right( + test_ts, fit_params_linear, fit_params_constant, caplog +): gaps = get_gaps(test_ts) # linear trend for a right boundary @@ -259,8 +264,9 @@ def test_calculate_right_boundary_trend(test_ts, fit_params_linear, fit_params_c coeff = np.polyfit(range(0, 10), data_to_interpolate, deg=fit_degree) expected_value = np.polyval(coeff, 0) - trend_value = calculate_right_boundary_trend( + trend_value = calculate_boundary_trend_inner( test_ts, + side="right", boundary=gaps[0].right, fit_params=fit_params_linear, ) @@ -273,8 +279,9 @@ def test_calculate_right_boundary_trend(test_ts, fit_params_linear, fit_params_c coeff = np.polyfit(range(0, 10), data_to_interpolate, deg=fit_degree) expected_value = np.polyval(coeff, 0) - trend_value = calculate_right_boundary_trend( + trend_value = calculate_boundary_trend_inner( test_ts, + side="right", boundary=gaps[0].right, fit_params=fit_params_constant, ) @@ -285,8 +292,9 @@ def test_calculate_right_boundary_trend(test_ts, fit_params_linear, fit_params_c test_ts.loc[{"time": slice("1958", "1964")}] = (test_ts.loc)[ {"time": slice("1958", "1964")} ] * np.nan - trend_value = calculate_right_boundary_trend( + trend_value = calculate_boundary_trend_inner( test_ts, + side="right", boundary=gaps[0].right, fit_params=fit_params_linear, ) From eaaa9728164817cde6cc18d8cf04275a4cf9b22b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Pfl=C3=BCger?= Date: Fri, 29 Nov 2024 12:57:35 +0100 Subject: [PATCH 6/9] docs: regenerate api docs --- docs/source/api/index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/api/index.rst b/docs/source/api/index.rst index 3a87a52..e49727d 100644 --- a/docs/source/api/index.rst +++ b/docs/source/api/index.rst @@ -57,6 +57,7 @@ source priorities and matching algorithms. .. autosummary:: :toctree: generated_csg/ + csg.FitParameters csg.GlobalLSStrategy csg.LocalTrendsStrategy csg.PriorityDefinition From 5ca472757b17743a44cf40fd79c6eebab0e65cb3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Pfl=C3=BCger?= Date: Fri, 29 Nov 2024 12:58:05 +0100 Subject: [PATCH 7/9] docs: regenerate example yaml file after sec_cats were removed --- docs/source/data_reading/test_csv_data_sec_cat_if.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/docs/source/data_reading/test_csv_data_sec_cat_if.yaml b/docs/source/data_reading/test_csv_data_sec_cat_if.yaml index 6c26cb4..17d5cfa 100644 --- a/docs/source/data_reading/test_csv_data_sec_cat_if.yaml +++ b/docs/source/data_reading/test_csv_data_sec_cat_if.yaml @@ -2,9 +2,6 @@ attrs: area: area (ISO3) cat: category (IPCC2006) scen: scenario (general) - sec_cats: - - Class (class) - - Type (type) data_file: test_csv_data_sec_cat_if.csv dimensions: '*': From d679f8709364b74f61bca517fd560a1e5004c90c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Pfl=C3=BCger?= Date: Fri, 29 Nov 2024 12:58:14 +0100 Subject: [PATCH 8/9] docs: typos --- primap2/csg/_strategies/local_trends.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/primap2/csg/_strategies/local_trends.py b/primap2/csg/_strategies/local_trends.py index 0cd29c0..ca50d70 100644 --- a/primap2/csg/_strategies/local_trends.py +++ b/primap2/csg/_strategies/local_trends.py @@ -37,14 +37,14 @@ class LocalTrendsStrategy: where :math:`\\textrm{fill_ts}_t(t_b)` is the trend value calculated for :math:`\\textrm{fill_ts}(t_b)` and equally for :math:`\\textrm{ts}_t(t_b)`. :math:`t_b` is the last (in case of a right boundary) or first (in case of a left - boundary) non-NaN data pint in :math:`\\textrm{ts}`. The trend value is calculated + boundary) non-NaN data point in :math:`\\textrm{ts}`. The trend value is calculated using a linear trend of length `trend_length` or less data points if a time-series does not cover the full period. By setting `min_trend_points` a minimal number of points necessary for the trend calculation can be set. If less points are available a :py:class:`StrategyUnableToProcess` error will be raised. This enables the user to define a fallback strategy, e.g. single point matching. TODO: for the case of gaps this leads to the situation that we can't use trends on - one side of the gap and single year matching as fallback on the other + one side of the gap and single year matching as fallback on the other By setting `trend_length` to 1 single year matching is used. @@ -81,7 +81,7 @@ class LocalTrendsStrategy: Filling multiple gaps and boundaries with this function is scientifically questionable as they will all use different scaling factors and thus don't use a consistent model to harmonize one time-series :math:`\\textrm{fill_ts}(t)` to :math:`\\textrm{ts}(t)`. - Use with case. + Use with care. Attributes ---------- From 46b3a2602b33f58723a0e9a62bfea78a93bafcfd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Pfl=C3=BCger?= Date: Fri, 29 Nov 2024 12:58:28 +0100 Subject: [PATCH 9/9] docs: add LocalTrendsStrategy to CSG usage docs --- docs/source/usage/csg.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/usage/csg.md b/docs/source/usage/csg.md index f3b9297..bde091c 100644 --- a/docs/source/usage/csg.md +++ b/docs/source/usage/csg.md @@ -223,3 +223,4 @@ complete_result_ds Currently the following filling strategies are implemented * Global least square matching: {py:class}`primap2.csg.GlobalLSStrategy` * Straight substitution: {py:class}`primap2.csg.SubstitutionStrategy` +* Local trend matching: {py:class}`primap2.csg.LocalTrendsStrategy`