Merge pull request #291 from primap-community/1-to-n-mapping

primap-community · Nov 11, 2024 · fcdaf32 · fcdaf32
2 parents 032e77c + 3da2728
commit fcdaf32
Show file tree

Hide file tree

Showing 6 changed files with 143 additions and 92 deletions.
diff --git a/changelog/291.improvement.md b/changelog/291.improvement.md
@@ -0,0 +1 @@
+In the conversion function, splitting into multiple categories is disabled; an aggregated category is created instead.
diff --git a/primap2/_convert.py b/primap2/_convert.py
@@ -166,6 +166,9 @@ def _fill_category(
             already_converted = set(output_selection[new_dim]).intersection(
                 set(already_converted_categories)
             )
+            # if there are several categories on the target side
+            # we can still convert: their sum is stored in a
+            # newly created aggregate category
             if already_converted:
                 logger.warning(
                     f"For category {category!r}, would want to use a "
@@ -175,35 +178,26 @@ def _fill_category(
                 )
                 continue
 
-            try:
-                effective_input_weights = derive_weights(
-                    dim=dim,
-                    category=category,
-                    rule=rule,
-                    operation_type="input",
-                )
-                effective_output_weights = derive_weights(
-                    dim=new_dim,
-                    category=category,
-                    rule=rule,
-                    operation_type="output",
-                )
-            except WeightingInfoMissing as err:
-                logger.warning(str(err))
-                continue
-
             # the left-hand side of the conversion formula summed up
-            lhs = (input_factors * effective_input_weights * self._da.loc[input_selection]).sum(
-                dim=dim
-            )
+            lhs = (input_factors * self._da.loc[input_selection]).sum(dim=dim)
             # the right-hand side of the conversion formula split up
-            rhs = lhs / output_factors / effective_output_weights
-
-            da.loc[output_selection] = rhs
-
-            if not rule.is_restricted:
-                # stop processing rules for this category
+            rhs = lhs / output_factors
+
+            # if there is more than one category on the target side
+            if len(output_selection[new_dim]) > 1:
+                new_category = create_category_name(rule)
+                new_categories = [*da.indexes["category (IPCC2006)"], new_category]
+                da = da.reindex({"category (IPCC2006)": new_categories}, fill_value=np.nan)
+                new_output_selection = output_selection.copy()
+                new_output_selection[new_dim] = new_category
+                da.loc[new_output_selection] = rhs.sum(dim=new_dim)
                 return output_selection[new_dim], da
+            else:
+                da.loc[output_selection] = rhs
+
+                if not rule.is_restricted:
+                    # stop processing rules for this category
+                    return output_selection[new_dim], da
 
         logger.debug(
             f"No unrestricted rule to derive data for {category!r} applied, some or "
@@ -394,67 +388,6 @@ def factors_categories_to_xarray(
     return selection, factors
 
 
-class WeightingInfoMissing(ValueError):
-    """Some information to derive weighting factors for a rule is missing."""
-
-    def __init__(
-        self,
-        category: climate_categories.Category,
-        rule: climate_categories.ConversionRule,
-        message: str,
-    ):
-        full_message = (
-            f"Can not derive data for category {category!r} using rule"
-            f" '{rule}': {message} Skipping this rule."
-        )
-        ValueError.__init__(self, full_message)
-
-
-def derive_weights(
-    *,
-    dim: str,
-    category: climate_categories.Category,
-    rule: climate_categories.ConversionRule,
-    operation_type: str,
-) -> xr.DataArray | float:
-    """Derive the weights to use for applying a specific rule.
-
-    Parameters
-    ----------
-    dim: str
-        Dimension which contains the categories.
-    category: climate_categories.Category
-        Category which should be derived.
-    rule: climate_categories.ConversionRule
-        Rule that should be used to derive the category.
-    operation_type: ``input`` or ``output``
-        If weights for the source data (input) or the result data (output) should
-        be derived.
-
-    Returns
-    -------
-    factors: float or xr.DataArray
-        Object which can be multiplied with the input or output DataArray to apply
-        weights.
-    """
-    # TODO this may change again in the next PR
-    if operation_type == "input":
-        return 1.0
-    elif operation_type == "output":
-        if rule.cardinality_b == "one":
-            return 1.0
-        else:
-            raise NotImplementedError(
-                "Splitting input categories into multiple"
-                " output categories is currently not supported. "
-                f"{rule.csv_original_text=}, {category=}"
-            )
-    else:
-        raise NotImplementedError(
-            f"operation_type must be either input or output. Got {operation_type}"
-        )
-
-
 def prepare_auxiliary_dimensions(
     conversion: climate_categories.Conversion,
     auxiliary_dimensions: dict[str, str] | None,
@@ -497,3 +430,24 @@ def prepare_auxiliary_dimensions(
     return {
         climate_categories.cats[name]: auxiliary_dimensions[name] for name in auxiliary_dimensions
     }
+
+
+def create_category_name(rule: climate_categories.ConversionRule) -> str:
+    """
+    Create the name of the aggregated category for the target side of a rule.
+
+    Parameters
+    ----------
+    rule : climate_categories.ConversionRule
+        Rule whose target categories (``factors_categories_b``) are combined.
+
+    Returns
+    -------
+    str
+        ``"A_(...)"`` holding the signed first codes, e.g. ``"A_(-3+4)"``.
+    """
+    factor_to_string = {1: "+", -1: "-"}
+    targets = rule.factors_categories_b.items()
+    components = [factor_to_string[factor] + cat.codes[0] for cat, factor in targets]
+    components[0] = components[0].removeprefix("+")  # a leading "-" is kept
+    return "A_(" + "".join(components) + ")"
diff --git a/primap2/tests/data/BURDI_conversion.csv b/primap2/tests/data/BURDI_conversion.csv
@@ -17,16 +17,19 @@ BURDI,IPCC2006_PRIMAP,comment
 2.C,2.C
 2.F,2.F
 2.G + 2.D, 2.H
+2.G, 2.H.3
 3,2.D
 4,M.AG
 4.A,3.A.1
 4.B,3.A.2
 4.C,3.C.7
+4.D, M.3.C.45.AG
 4.D + 4.C + 4.E + 4.F + 4.G,3.C
 4.E,3.C.1.c
 4.F,3.C.1.b
 4.G,3.C.8
 5,M.LULUCF
+4+5,3
 6,4
 6.A,4.A
 6.B,4.D

diff --git a/primap2/tests/data/simple_categorisation_b.yaml b/primap2/tests/data/simple_categorisation_b.yaml
@@ -25,3 +25,12 @@ categories:
     alternative_codes:
       - B
       - CatB
+  3:
+    title: Category 3
+    comment: The third category
+  4:
+    title: Category 4
+    comment: The fourth category
+  5:
+    title: Category 5
+    comment: The fifth category
diff --git a/primap2/tests/data/test_create_category_name_conversion.csv b/primap2/tests/data/test_create_category_name_conversion.csv
@@ -0,0 +1,6 @@
+# references: test
+# last_update: 2024-10-14
+A,B,comment
+1,1+2, no comment
+2,-3+4
+3,5-1
diff --git a/primap2/tests/test_convert.py b/primap2/tests/test_convert.py
@@ -16,6 +16,7 @@ def get_test_data_filepath(fname: str):
     return importlib.resources.files("primap2.tests.data").joinpath(fname)
 
 
+@pytest.mark.xfail
 def test_conversion_source_does_not_match_dataset_dimension(empty_ds):
     # make a data set with IPCC1996 categories
     da = empty_ds["CO2"]
@@ -40,7 +41,6 @@ def test_conversion_source_does_not_match_dataset_dimension(empty_ds):
         )
 
 
-@pytest.mark.xfail
 def test_convert_ipcc(empty_ds: xr.Dataset):
     # build a DA categorized by IPCC1996 and with 1 everywhere so results are easy
     # to see
@@ -64,12 +64,48 @@ def test_convert_ipcc(empty_ds: xr.Dataset):
         conversion=conversion,
         auxiliary_dimensions={"gas": "source (gas)"},
     )
-
+    # rule 1 -> 1
     assert (result.pr.loc[{"category": "1"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item()
+    # rule 2 + 3 -> 2
     assert (result.pr.loc[{"category": "2"}] == 2.0 * primap2.ureg("Gg CO2 / year")).all().item()
+    # rule 1.A.2.f -> 1.A.2.f + 1.A.2.g + 1.A.2.h + 1.A.2.i + 1.A.2.j + 1.A.2.k + 1.A.2.l + 1.A.2.m
+    autocat = "A_(1.A.2.f+1.A.2.g+1.A.2.h+1.A.2.i+1.A.2.j+1.A.2.k+1.A.2.l+1.A.2.m)"
+    assert (
+        (result.pr.loc[{"category": autocat}] == 8.0 * primap2.ureg("Gg CO2 / year")).all().item()
+    )
+    # rule 4.D for N2O only -> 3.C.4 + 3.C.5
+    autocat = "A_(3.C.4+3.C.5)"
+    assert (
+        (
+            result.pr.loc[{"category": autocat, "source (gas)": "N2O"}]
+            == 2.0 * primap2.ureg("Gg CO2 / year")
+        )
+        .all()
+        .item()
+    )
+    # all other gases should be nan
+    all_gases_but_N2O = list(result.indexes["source (gas)"])
+    all_gases_but_N2O.remove("N2O")
+    assert np.isnan(
+        result.pr.loc[{"category": autocat, "source (gas)": all_gases_but_N2O}].values
+    ).all()
+    # rule 7 -> 5
+    assert (result.pr.loc[{"category": "5"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item()
+    # rule 2.F.6 -> 2.E + 2.F.6 + 2.G.1 + 2.G.2 + 2.G.4,
+    # rule 2.F.6 + 3.D -> 2.E + 2.F.6 + 2.G - ignored because 2.F.6 already converted
+    # rule 2.G -> 2.H.3 - 1-to-1-conversion
+    autocat = "A_(2.E+2.F.6+2.G.1+2.G.2+2.G.4)"
+    assert (
+        (result.pr.loc[{"category": autocat}] == 5.0 * primap2.ureg("Gg CO2 / year")).all().item()
+    )
+    assert "A_(2.E+2.F.6+2.G)" not in list(result.indexes["category (IPCC2006)"])
+    assert (
+        (result.pr.loc[{"category": "2.H.3"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item()
+    )
 
 
 # test with new conversion and two existing categorisations
+@pytest.mark.xfail
 def test_convert_BURDI(empty_ds: xr.Dataset):
     # make a sample conversion object in climate categories
     filepath = get_test_data_filepath("BURDI_conversion.csv")
@@ -150,12 +186,24 @@ def test_convert_BURDI(empty_ds: xr.Dataset):
     assert (
         (result.pr.loc[{"category": "3.C.7"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item()
     )
-    # 2.E + 2.B = 2.E, 2.E should not be part of new data set
+    # rule 2.E + 2.B -> 2.B
+    # 2.E is part of PRIMAP categories, but cannot be retrieved from conversion
     assert np.isnan(result.pr.loc[{"category": "2.E"}].values).all()
     # cat 14638 in BURDI equals cat M.BIO in IPCC2006_PRIMAP
     assert (
         (result.pr.loc[{"category": "M.BIO"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item()
     )
+    # 4.D -> M.3.C.45.AG
+    # TODO This category is only available on M3C45AG branch in climate categories
+    # test locally with:
+    # `source venv/bin/activate`
+    # `pip install -e ../climate_categories`
+    # Will pass after climate categories release
+    assert (
+        (result.pr.loc[{"category": "M.3.C.45.AG"}] == 1.0 * primap2.ureg("Gg CO2 / year"))
+        .all()
+        .item()
+    )
 
 
 # test with new conversion and new categorisations
@@ -198,5 +246,35 @@ def test_custom_conversion_and_two_custom_categorisations(empty_ds):
     assert (result.pr.loc[{"category": "2"}] == 2.0 * primap2.ureg("Gg CO2 / year")).all().item()
 
     # check result has 2 categories (input categorisation had 3)
-    # TODO this is ambiguous when order changes
-    assert result.shape == (2, 21, 4, 1)
+    # TODO this is ambiguous, order may change
+    assert result.shape == (5, 21, 4, 1)
+
+
+def test_create_category_name():
+    """Check the names generated for rules with several target categories."""
+    # both categorisations are custom, so they are loaded from yaml and handed
+    # to the conversion explicitly
+    custom_categorisations = {
+        name: cc.from_yaml(get_test_data_filepath(fname))
+        for name, fname in (
+            ("A", "simple_categorisation_a.yaml"),
+            ("B", "simple_categorisation_b.yaml"),
+        )
+    }
+
+    # read the conversion rules from csv
+    conversion_file = get_test_data_filepath("test_create_category_name_conversion.csv")
+    conversion = cc.Conversion.from_csv(conversion_file, cats=custom_categorisations)
+
+    expected_names = [
+        # a leading positive category carries no '+' sign
+        "A_(1+2)",
+        # a leading negative category keeps its '-' sign
+        "A_(-3+4)",
+        # a '-' sign inside the name is kept as well
+        "A_(5-1)",
+    ]
+    generated_names = [
+        primap2._convert.create_category_name(rule) for rule in conversion.rules[:3]
+    ]
+    assert generated_names == expected_names
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		In the conversion function, splitting into multiple categories is disabled; an aggregated category is created instead.