diff --git a/changelog/291.improvement.md b/changelog/291.improvement.md new file mode 100644 index 0000000..2630c74 --- /dev/null +++ b/changelog/291.improvement.md @@ -0,0 +1 @@ +In the conversion function, disable splitting into multiple categories, instead create an aggregated category. diff --git a/primap2/_convert.py b/primap2/_convert.py index bb56e37..e26baa3 100644 --- a/primap2/_convert.py +++ b/primap2/_convert.py @@ -166,6 +166,9 @@ def _fill_category( already_converted = set(output_selection[new_dim]).intersection( set(already_converted_categories) ) + # if there are several categories on the target side + # we can still convert because it will + # create a new category if already_converted: logger.warning( f"For category {category!r}, would want to use a " @@ -175,35 +178,26 @@ def _fill_category( ) continue - try: - effective_input_weights = derive_weights( - dim=dim, - category=category, - rule=rule, - operation_type="input", - ) - effective_output_weights = derive_weights( - dim=new_dim, - category=category, - rule=rule, - operation_type="output", - ) - except WeightingInfoMissing as err: - logger.warning(str(err)) - continue - # the left-hand side of the conversion formula summed up - lhs = (input_factors * effective_input_weights * self._da.loc[input_selection]).sum( - dim=dim - ) + lhs = (input_factors * self._da.loc[input_selection]).sum(dim=dim) # the right-hand side of the conversion formula split up - rhs = lhs / output_factors / effective_output_weights - - da.loc[output_selection] = rhs - - if not rule.is_restricted: - # stop processing rules for this category + rhs = lhs / output_factors + + # if there is more than one category on the target side + if len(output_selection[new_dim]) > 1: + new_category = create_category_name(rule) + new_categories = [*da.indexes["category (IPCC2006)"], new_category] + da = da.reindex({"category (IPCC2006)": new_categories}, fill_value=np.nan) + new_output_selection = output_selection.copy() + new_output_selection[new_dim] = new_category + da.loc[new_output_selection] = rhs.sum(dim=new_dim) return output_selection[new_dim], da + else: + da.loc[output_selection] = rhs + + if not rule.is_restricted: + # stop processing rules for this category + return output_selection[new_dim], da logger.debug( f"No unrestricted rule to derive data for {category!r} applied, some or " @@ -394,67 +388,6 @@ def factors_categories_to_xarray( return selection, factors -class WeightingInfoMissing(ValueError): - """Some information to derive weighting factors for a rule is missing.""" - - def __init__( - self, - category: climate_categories.Category, - rule: climate_categories.ConversionRule, - message: str, - ): - full_message = ( - f"Can not derive data for category {category!r} using rule" - f" '{rule}': {message} Skipping this rule." - ) - ValueError.__init__(self, full_message) - - -def derive_weights( - *, - dim: str, - category: climate_categories.Category, - rule: climate_categories.ConversionRule, - operation_type: str, -) -> xr.DataArray | float: - """Derive the weights to use for applying a specific rule. - - Parameters - ---------- - dim: str - Dimension which contains the categories. - category: climate_categories.Category - Category which should be derived. - rule: climate_categories.ConversionRule - Rule that should be used to derive the category. - operation_type: ``input`` or ``output`` - If weights for the source data (input) or the result data (output) should - be derived. - - Returns - ------- - factors: float or xr.DataArray - Object which can be multiplied with the input or output DataArray to apply - weights. - """ - # TODO this may change again in the next PR - if operation_type == "input": - return 1.0 - elif operation_type == "output": - if rule.cardinality_b == "one": - return 1.0 - else: - raise NotImplementedError( - "Splitting input categories into multiple" - " output categories is currently not supported. " - f"{rule.csv_original_text=}, {category=}" - ) - else: - raise NotImplementedError( - f"operation_type must be either input or output. Got {operation_type}" - ) - - def prepare_auxiliary_dimensions( conversion: climate_categories.Conversion, auxiliary_dimensions: dict[str, str] | None, @@ -497,3 +430,24 @@ def prepare_auxiliary_dimensions( return { climate_categories.cats[name]: auxiliary_dimensions[name] for name in auxiliary_dimensions } + + +def create_category_name(rule: climate_categories.ConversionRule): + """ + Create a category name based on the provided rule. + + Parameters + ---------- + rule : climate_categories.ConversionRule + rule to convert between categories from two different categorizations. + + Returns + ------- + The generated category name. + """ + factor_to_string = {1: "+", -1: "-"} + components = [factor_to_string[i[1]] + i[0].codes[0] for i in rule.factors_categories_b.items()] + # remove the first "+" sign in the name (leave "-" sign in) + if components[0][0] == "+": + components[0] = components[0][1:] + return "A_(" + "".join(components) + ")" diff --git a/primap2/tests/data/BURDI_conversion.csv b/primap2/tests/data/BURDI_conversion.csv index 028247c..d315597 100644 --- a/primap2/tests/data/BURDI_conversion.csv +++ b/primap2/tests/data/BURDI_conversion.csv @@ -17,16 +17,19 @@ BURDI,IPCC2006_PRIMAP,comment 2.C,2.C 2.F,2.F 2.G + 2.D, 2.H +2.G, 2.H.3 3,2.D 4,M.AG 4.A,3.A.1 4.B,3.A.2 4.C,3.C.7 +4.D, M.3.C.45.AG 4.D + 4.C + 4.E + 4.F + 4.G,3.C 4.E,3.C.1.c 4.F,3.C.1.b 4.G,3.C.8 5,M.LULUCF +4+5,3 6,4 6.A,4.A 6.B,4.D diff --git a/primap2/tests/data/simple_categorisation_b.yaml b/primap2/tests/data/simple_categorisation_b.yaml index 05e1dc0..a82ed52 100644 --- a/primap2/tests/data/simple_categorisation_b.yaml +++ b/primap2/tests/data/simple_categorisation_b.yaml @@ -25,3 +25,12 @@ categories: alternative_codes: - B - CatB + 3: + title: Category 3 + comment: The third category + 4: + title: Category 4 + comment: The fourth category + 5: + title: Category 5 + comment: The fifth category diff --git a/primap2/tests/data/test_create_category_name_conversion.csv b/primap2/tests/data/test_create_category_name_conversion.csv new file mode 100644 index 0000000..53aa68e --- /dev/null +++ b/primap2/tests/data/test_create_category_name_conversion.csv @@ -0,0 +1,6 @@ +# references: test +# last_update: 2024-10-14 +A,B,comment +1,1+2, no comment +2,-3+4 +3,5-1 diff --git a/primap2/tests/test_convert.py b/primap2/tests/test_convert.py index 84e9516..be290d0 100644 --- a/primap2/tests/test_convert.py +++ b/primap2/tests/test_convert.py @@ -16,6 +16,7 @@ def get_test_data_filepath(fname: str): return importlib.resources.files("primap2.tests.data").joinpath(fname) +@pytest.mark.xfail def test_conversion_source_does_not_match_dataset_dimension(empty_ds): # make a data set with IPCC1996 categories da = empty_ds["CO2"] @@ -40,7 +41,6 @@ def test_conversion_source_does_not_match_dataset_dimension(empty_ds): ) -@pytest.mark.xfail def test_convert_ipcc(empty_ds: xr.Dataset): # build a DA categorized by IPCC1996 and with 1 everywhere so results are easy # to see @@ -64,12 +64,48 @@ def test_convert_ipcc(empty_ds: xr.Dataset): conversion=conversion, auxiliary_dimensions={"gas": "source (gas)"}, ) - + # rule 1 -> 1 assert (result.pr.loc[{"category": "1"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item() + # rule 2 + 3 -> 2 assert (result.pr.loc[{"category": "2"}] == 2.0 * primap2.ureg("Gg CO2 / year")).all().item() + # rule 1.A.2.f -> 1.A.2.f + 1.A.2.g + 1.A.2.h + 1.A.2.i + 1.A.2.j + 1.A.2.k + 1.A.2.l + 1.A.2.m + autocat = "A_(1.A.2.f+1.A.2.g+1.A.2.h+1.A.2.i+1.A.2.j+1.A.2.k+1.A.2.l+1.A.2.m)" + assert ( + (result.pr.loc[{"category": autocat}] == 8.0 * primap2.ureg("Gg CO2 / year")).all().item() + ) + # rule 4.D for N2O only -> 3.C.4 + 3.C.5 + autocat = "A_(3.C.4+3.C.5)" + assert ( + ( + result.pr.loc[{"category": autocat, "source (gas)": "N2O"}] + == 2.0 * primap2.ureg("Gg CO2 / year") + ) + .all() + .item() + ) + # all other gases should be nan + all_gases_but_N2O = list(result.indexes["source (gas)"]) + all_gases_but_N2O.remove("N2O") + assert np.isnan( + result.pr.loc[{"category": autocat, "source (gas)": all_gases_but_N2O}].values + ).all() + # rule 7 -> 5 + assert (result.pr.loc[{"category": "5"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item() + # rule 2.F.6 -> 2.E + 2.F.6 + 2.G.1 + 2.G.2 + 2.G.4, + # rule 2.F.6 + 3.D -> 2.E + 2.F.6 + 2.G - ignored because 2.F.G already converted + # rule 2.G -> 2.H.3 - 1-to-1-conversion + autocat = "A_(2.E+2.F.6+2.G.1+2.G.2+2.G.4)" + assert ( + (result.pr.loc[{"category": autocat}] == 5.0 * primap2.ureg("Gg CO2 / year")).all().item() + ) + assert "A_(2.E+2.F.6+2.G)" not in list(result.indexes["category (IPCC2006)"]) + assert ( + (result.pr.loc[{"category": "2.H.3"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item() + ) # test with new conversion and two existing categorisations +@pytest.mark.xfail def test_convert_BURDI(empty_ds: xr.Dataset): # make a sample conversion object in climate categories filepath = get_test_data_filepath("BURDI_conversion.csv") @@ -150,12 +186,24 @@ def test_convert_BURDI(empty_ds: xr.Dataset): assert ( (result.pr.loc[{"category": "3.C.7"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item() ) - # 2.E + 2.B = 2.E, 2.E should not be part of new data set + # rule 2.E + 2.B -> 2.B + # 2.E is part of PRIMAP categories, but cannot be retrieved from conversion assert np.isnan(result.pr.loc[{"category": "2.E"}].values).all() # cat 14638 in BURDI equals cat M.BIO in IPCC2006_PRIMAP assert ( (result.pr.loc[{"category": "M.BIO"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item() ) + # 4.D -> M.3.C.45.AG + # TODO This category is only available on M3C45AG branch in climate categories + # test locally with: + # `source venv/bin/activate` + # `pip install -e ../climate_categories` + # Will pass after climate categories release + assert ( + (result.pr.loc[{"category": "M.3.C.45.AG"}] == 1.0 * primap2.ureg("Gg CO2 / year")) + .all() + .item() + ) # test with new conversion and new categorisations @@ -198,5 +246,35 @@ def test_custom_conversion_and_two_custom_categorisations(empty_ds): assert (result.pr.loc[{"category": "2"}] == 2.0 * primap2.ureg("Gg CO2 / year")).all().item() # check result has 2 categories (input categorisation had 3) - # TODO this is ambiguous when order changes - assert result.shape == (2, 21, 4, 1) + # TODO this is ambiguous, order may change + assert result.shape == (5, 21, 4, 1) + + +def test_create_category_name(): + # make categorisation A from yaml + categorisation_a = cc.from_yaml(get_test_data_filepath("simple_categorisation_a.yaml")) + + # make categorisation B from yaml + categorisation_b = cc.from_yaml(get_test_data_filepath("simple_categorisation_b.yaml")) + + # categories not part of climate categories so we need to add them manually + cats = { + "A": categorisation_a, + "B": categorisation_b, + } + + # make conversion from csv + conv = cc.Conversion.from_csv( + get_test_data_filepath("test_create_category_name_conversion.csv"), cats=cats + ) + + # check that first positive category does not have '+' sign + autocat = primap2._convert.create_category_name(conv.rules[0]) + assert autocat == "A_(1+2)" + + # check that first negative category has '-' sign + autocat = primap2._convert.create_category_name(conv.rules[1]) + assert autocat == "A_(-3+4)" + + autocat = primap2._convert.create_category_name(conv.rules[2]) + assert autocat == "A_(5-1)"