primap-community · JGuetschow · Dec 10, 2024 · Nov 29, 2024 · Nov 29, 2024 · Nov 29, 2024
diff --git a/docs/source/api/index.rst b/docs/source/api/index.rst
@@ -57,6 +57,7 @@ source priorities and matching algorithms.
 .. autosummary::
     :toctree: generated_csg/
 
+    csg.FitParameters
     csg.GlobalLSStrategy
     csg.LocalTrendsStrategy
     csg.PriorityDefinition

diff --git a/docs/source/data_reading/test_csv_data_sec_cat_if.yaml b/docs/source/data_reading/test_csv_data_sec_cat_if.yaml
@@ -2,9 +2,6 @@ attrs:
   area: area (ISO3)
   cat: category (IPCC2006)
   scen: scenario (general)
-  sec_cats:
-  - Class (class)
-  - Type (type)
 data_file: test_csv_data_sec_cat_if.csv
 dimensions:
   '*':

diff --git a/docs/source/usage/csg.md b/docs/source/usage/csg.md
@@ -223,3 +223,4 @@ complete_result_ds
 Currently the following filling strategies are implemented
 * Global least square matching: {py:class}`primap2.csg.GlobalLSStrategy`
 * Straight substitution: {py:class}`primap2.csg.SubstitutionStrategy`
+* Local trend matching: {py:class}`primap2.csg.LocalTrendsStrategy`
diff --git a/primap2/csg/_strategies/gaps.py b/primap2/csg/_strategies/gaps.py
@@ -1,3 +1,5 @@
+import typing
+
 import numpy as np
 import pandas as pd
 import xarray as xr
@@ -12,23 +14,16 @@ class Gap:
 
     Attributes
     ----------
-    type :
+    type
         type of the gap
         possible types:
             'start': start of timeseries boundary (nan, nan, X, X)
             'end': end of timeseries boundary (X, X, nan, nan)
             'gap': gap (X, nan, nan, X)
-    left :
+    left
         left end of the gap
-    right :
+    right
         right end of the gap
-
-    Methods
-    _______
-    get_date_slice()
-        Return a xr.loc type filter for 'time' with a slice from left to right
-        end of the gap
-
     """
 
     type: str = None
@@ -37,6 +32,8 @@ class Gap:
     right: np.datetime64 = None  # right end of the gap
 
     def get_date_slice(self) -> dict[str, slice]:
+        """Return a xr.loc type filter for 'time' with a slice from left to right
+        end of the gap."""
         return {"time": slice(self.left, self.right)}
 
 
@@ -71,15 +68,6 @@ class FitParameters:
         minimal number of points to calculate the trend. Default is 1, but if the degree
         of the fit polynomial is higher than 1, the minimal number of data points
         the degree of the fit polynomial
-
-    Methods
-    -------
-    log_string(fallback=False):
-        Create a string with the classes parameters
-    get_fallback():
-        Return FitParameters object with the `fit_degree` set to the `fallback_degree`
-        of the original object.
-
     """
 
     fit_degree: int = 1
@@ -97,6 +85,7 @@ def __attrs_post_init__(self):
             )
 
     def log_string(self, fallback: bool = False) -> str:
+        """Create a string with the class's parameters."""
         log_str = (
             f"fit_degree: {self.fit_degree}, "
             f"trend_length: {self.trend_length}, "
@@ -110,6 +99,8 @@ def log_string(self, fallback: bool = False) -> str:
         return log_str
 
     def get_fallback(self):
+        """Return FitParameters object with the `fit_degree` set to the `fallback_degree`
+        of the original object."""
         return FitParameters(
             fit_degree=self.fallback_degree,
             trend_length=self.trend_length,
@@ -130,7 +121,6 @@ def get_gaps(ts: xr.DataArray) -> list[Gap]:
     Returns
     -------
         list of Gaps
-
     """
     ts_roll = ts.rolling(time=3, min_periods=1, center=True).sum()
     gaps = []
@@ -194,7 +184,6 @@ def calculate_boundary_trend_with_fallback(
         Tuple with calculated trend values for left and right boundary of the gap. If trend
         calculation is not possible, `None` is returned so the calling strategy can
         raise the StrategyUnableToProcess error.
-
     """
     trend_ts = calculate_boundary_trend(
         ts,
@@ -246,34 +235,36 @@ def calculate_boundary_trend(
         Tuple with calculated trend values for left and right boundary of the gap. If trend
         calculation is not possible, `None` is returned so the calling strategy can
         raise the StrategyUnableToProcess error.
-
     """
-
     if gap.type == "gap":
         # right boundary
-        right = calculate_right_boundary_trend(
+        right = calculate_boundary_trend_inner(
             ts,
+            side="right",
             boundary=gap.right,
             fit_params=fit_params,
         )
         # left boundary
-        left = calculate_left_boundary_trend(
+        left = calculate_boundary_trend_inner(
             ts,
+            side="left",
             boundary=gap.left,
             fit_params=fit_params,
         )
     elif gap.type == "end":
         # left boundary
-        left = calculate_left_boundary_trend(
+        left = calculate_boundary_trend_inner(
             ts,
+            side="left",
             boundary=gap.left,
             fit_params=fit_params,
         )
         right = left
     elif gap.type == "start":
         # right boundary
-        right = calculate_right_boundary_trend(
+        right = calculate_boundary_trend_inner(
             ts,
+            side="right",
             boundary=gap.right,
             fit_params=fit_params,
         )
@@ -284,20 +275,23 @@ def calculate_boundary_trend(
     return np.array([left, right])
 
 
-def calculate_right_boundary_trend(
+def calculate_boundary_trend_inner(
     ts: xr.DataArray,
+    side: typing.Literal["left", "right"],
     boundary: np.datetime64,
     fit_params: FitParameters,
 ) -> float:
     """
-    Replace right boundary point by trend value
+    Calculate trend value for leftmost or rightmost boundary point.
 
     Parameters
     ----------
     ts :
         Time-series to calculate trend for
+    side : "left" or "right"
+        Whether the left or the right boundary point should be processed.
     boundary :
-        boundary point (last NaN value)
+        time index boundary point (last NaN value)
     fit_params :
         FitParameters object which holds all parameters for the fit.
         This function does not handle fallback options thus the fallback
@@ -308,64 +302,13 @@ def calculate_right_boundary_trend(
         Calculated trend value for boundary point. If trend
         calculation is not possible, `None` is returned so the calling strategy can
         raise the StrategyUnableToProcess error.
-
     """
-    point_to_modify = get_shifted_time_value(ts, original_value=boundary, shift=1)
-    trend_index = pd.date_range(
-        start=point_to_modify,
-        periods=fit_params.trend_length,
-        freq=fit_params.trend_length_unit,
+    point_to_modify = get_shifted_time_value(
+        ts, original_value=boundary, shift=1 if side == "right" else -1
     )
-    trend_index = trend_index.intersection(ts.coords["time"])
-    ts_fit = ts.pr.loc[{"time": trend_index}]
-
-    if len(ts_fit.where(ts_fit.notnull(), drop=True)) >= fit_params.min_trend_points:
-        fit = ts_fit.polyfit(dim="time", deg=fit_params.fit_degree, skipna=True)
-        value = xr.polyval(
-            ts_fit.coords["time"].pr.loc[{"time": point_to_modify}],
-            fit.polyfit_coefficients,
-        )
-        return float(value.data)
-    else:
-        logger.info(
-            f"Not enough values to calculate fit for right boundary at "
-            f"{point_to_modify}.\n"
-            f"{fit_params.log_string(fallback=False)}"
-            f"Timeseries info: {timeseries_coord_repr(ts)}"
-        )
-        return np.nan
-
-
-def calculate_left_boundary_trend(
-    ts: xr.DataArray,
-    boundary: np.datetime64,
-    fit_params: FitParameters,
-) -> float:
-    """
-    Replace left boundary point by trend value
-
-    The function assumes equally spaced
-
-    Parameters
-    ----------
-    ts :
-        Time-series to calculate trend for
-    boundary :
-        boundary point (last NaN value)
-    fit_params :
-        FitParameters object which holds all parameters for the fit. This function does
-        not handle fallback options thus the fallback attribute is ignored.
-
-    Returns
-    -------
-        Calculated trend value for boundary point. If trend
-        calculation is not possible, `None` is returned so the calling strategy can
-        raise the StrategyUnableToProcess error.
-
-    """
-    point_to_modify = get_shifted_time_value(ts, original_value=boundary, shift=-1)
     trend_index = pd.date_range(
-        end=point_to_modify,
+        start=point_to_modify if side == "right" else None,
+        end=point_to_modify if side == "left" else None,
         periods=fit_params.trend_length,
         freq=fit_params.trend_length_unit,
     )
@@ -381,7 +324,7 @@ def calculate_left_boundary_trend(
         return float(value.data)
     else:
         logger.info(
-            f"Not enough values to calculate fit for left boundary at "
+            f"Not enough values to calculate fit for {side} boundary at "
             f"{point_to_modify}.\n"
             f"{fit_params.log_string(fallback=False)}"
             f"Timeseries info: {timeseries_coord_repr(ts)}"
@@ -488,20 +431,18 @@ def get_shifted_time_value(
     Returns
     -------
         time coordinate value at desired relative position
-
     """
-    # TODO: the following is not very elegant. I struggle with tasks like getting the coordinate
-    #  value of the next item in xarray
-    mask = ts.copy()
-    mask.data = mask.data * np.nan
-    mask.pr.loc[{"time": original_value}] = 1
-    mask = mask.shift(time=shift, fill_value=np.nan)
-    return mask.coords["time"].where(mask == 1, drop=True).data[0]
+    # Find the position of original_value in the underlying numpy time array and
+    # offset it by shift. NOTE(review): a negative result indexes from the end.
+    time_points = ts["time"].values
+    original_index = np.where(time_points == original_value)[0][0]
+    new_index = original_index + shift
+    return time_points[new_index]
 
 
 def timeseries_coord_repr(ts: xr.DataArray) -> str:
     """Make short string representation for coordinate values for logging"""
-    dims = set(ts.coords._names) - {"time"}
+    dims = set(ts.coords.keys()) - {"time"}
     coords: dict[str, str] = {str(k): ts[k].item() for k in dims}
     coords = dict(sorted(coords.items()))
     return repr(coords)
diff --git a/primap2/csg/_strategies/local_trends.py b/primap2/csg/_strategies/local_trends.py
@@ -37,14 +37,14 @@ class LocalTrendsStrategy:
     where :math:`\\textrm{fill_ts}_t(t_b)` is the trend value calculated for
     :math:`\\textrm{fill_ts}(t_b)` and equally for :math:`\\textrm{ts}_t(t_b)`.
     :math:`t_b` is the last (in case of a right boundary) or first (in case of a left
-    boundary) non-NaN data pint in :math:`\\textrm{ts}`. The trend value is calculated
+    boundary) non-NaN data point in :math:`\\textrm{ts}`. The trend value is calculated
     using a linear trend of length `trend_length` or less data points if a time-series
     does not cover the full period. By setting `min_trend_points` a minimal number of
     points necessary for the trend calculation can be set. If less points are available a
     :py:class:`StrategyUnableToProcess` error will be raised. This enables the user to
     define a fallback strategy, e.g. single point matching.
     TODO: for the case of gaps this leads to the situation that we can't use trends on
-     one side of the gap and single year matching as fallback on the other
+    one side of the gap and single year matching as fallback on the other
 
     By setting `trend_length` to 1 single year matching is used.
 
@@ -81,7 +81,7 @@ class LocalTrendsStrategy:
     Filling multiple gaps and boundaries with this function is scientifically questionable
     as they will all use different scaling factors and thus don't use a consistent model to
     harmonize one time-series :math:`\\textrm{fill_ts}(t)` to :math:`\\textrm{ts}(t)`.
-    Use with case.
+    Use with care.
 
     Attributes
     ----------

diff --git a/primap2/csg/_wrapper.py b/primap2/csg/_wrapper.py
@@ -10,12 +10,19 @@ def set_priority_coords(
     ds: xr.Dataset,
     dims: dict[str, dict[str, str]],
 ) -> xr.Dataset:
-    """Set values for priority coordinates in output dataset
-
-    coords: Dictionary
-        Format is 'name': {'value': value, 'terminology': terminology}
-        terminology is optional
+    """Set values for priority coordinates.
 
+    Parameters
+    ----------
+    ds: xr.Dataset
+        Dataset to change
+    dims: dict
+        Dictionary containing coordinate names as keys and as values a dictionary
+        with the value to be set and optionally a terminology.
+        Examples:
+        {"source": {"value": "PRIMAP-hist"}} sets the "source" to "PRIMAP-hist".
+        {"area": {"value": "WORLD", "terminology": "ISO3_primap"}} sets the dimension
+        "area (ISO3_primap)" to "WORLD".
     """
     for dim in dims.keys():
         terminology = dims[dim].get("terminology", None)
@@ -26,6 +33,7 @@ def set_priority_coords(
 
 def create_composite_source(
     input_ds: xr.Dataset,
+    *,
     priority_definition: PriorityDefinition,
     strategy_definition: StrategyDefinition,
     result_prio_coords: dict[str, dict[str, str]],
@@ -39,7 +47,6 @@ def create_composite_source(
     This is a wrapper around `primap2.csg.compose` that prepares the input data and sets result
     values for the priority coordinates.
 
-
     Parameters
     ----------
     input_ds
@@ -90,9 +97,7 @@ def create_composite_source(
     -------
         xr.Dataset with composed data according to the given priority and strategy
         definitions
-
     """
-
     # limit input data to these values
     if limit_coords is not None:
         if "variable" in limit_coords.keys():