Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

one to many mapping #291

Merged
merged 13 commits into from
Nov 11, 2024
1 change: 1 addition & 0 deletions changelog/291.improvement.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
In the conversion function, disable splitting into multiple categories, instead create an aggregated category.
128 changes: 41 additions & 87 deletions primap2/_convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,9 @@ def _fill_category(
already_converted = set(output_selection[new_dim]).intersection(
set(already_converted_categories)
)
# if there are several categories on the target side
# we can still convert because it will
# create a new category
if already_converted:
logger.warning(
f"For category {category!r}, would want to use a "
Expand All @@ -175,35 +178,26 @@ def _fill_category(
)
continue

try:
effective_input_weights = derive_weights(
dim=dim,
category=category,
rule=rule,
operation_type="input",
)
effective_output_weights = derive_weights(
dim=new_dim,
category=category,
rule=rule,
operation_type="output",
)
except WeightingInfoMissing as err:
logger.warning(str(err))
continue

# the left-hand side of the conversion formula summed up
lhs = (input_factors * effective_input_weights * self._da.loc[input_selection]).sum(
dim=dim
)
lhs = (input_factors * self._da.loc[input_selection]).sum(dim=dim)
# the right-hand side of the conversion formula split up
rhs = lhs / output_factors / effective_output_weights

da.loc[output_selection] = rhs

if not rule.is_restricted:
# stop processing rules for this category
rhs = lhs / output_factors

# if there is more than one category on the target side
if len(output_selection[new_dim]) > 1:
new_category = create_category_name(rule)
new_categories = [*da.indexes["category (IPCC2006)"], new_category]
da = da.reindex({"category (IPCC2006)": new_categories}, fill_value=np.nan)
new_output_selection = output_selection.copy()
new_output_selection[new_dim] = new_category
da.loc[new_output_selection] = rhs.sum(dim=new_dim)
return output_selection[new_dim], da
else:
da.loc[output_selection] = rhs

if not rule.is_restricted:
# stop processing rules for this category
return output_selection[new_dim], da

logger.debug(
f"No unrestricted rule to derive data for {category!r} applied, some or "
Expand Down Expand Up @@ -394,67 +388,6 @@ def factors_categories_to_xarray(
return selection, factors


class WeightingInfoMissing(ValueError):
"""Some information to derive weighting factors for a rule is missing."""

def __init__(
self,
category: climate_categories.Category,
rule: climate_categories.ConversionRule,
message: str,
):
full_message = (
f"Can not derive data for category {category!r} using rule"
f" '{rule}': {message} Skipping this rule."
)
ValueError.__init__(self, full_message)


def derive_weights(
*,
dim: str,
category: climate_categories.Category,
rule: climate_categories.ConversionRule,
operation_type: str,
) -> xr.DataArray | float:
"""Derive the weights to use for applying a specific rule.

Parameters
----------
dim: str
Dimension which contains the categories.
category: climate_categories.Category
Category which should be derived.
rule: climate_categories.ConversionRule
Rule that should be used to derive the category.
operation_type: ``input`` or ``output``
If weights for the source data (input) or the result data (output) should
be derived.

Returns
-------
factors: float or xr.DataArray
Object which can be multiplied with the input or output DataArray to apply
weights.
"""
# TODO this may change again in the next PR
if operation_type == "input":
return 1.0
elif operation_type == "output":
if rule.cardinality_b == "one":
return 1.0
else:
raise NotImplementedError(
"Splitting input categories into multiple"
" output categories is currently not supported. "
f"{rule.csv_original_text=}, {category=}"
)
else:
raise NotImplementedError(
f"operation_type must be either input or output. Got {operation_type}"
)


def prepare_auxiliary_dimensions(
conversion: climate_categories.Conversion,
auxiliary_dimensions: dict[str, str] | None,
Expand Down Expand Up @@ -497,3 +430,24 @@ def prepare_auxiliary_dimensions(
return {
climate_categories.cats[name]: auxiliary_dimensions[name] for name in auxiliary_dimensions
}


def create_category_name(rule: climate_categories.ConversionRule):
"""
Create a category name based on the provided rule.

Parameters
----------
rule : climate_categories.ConversionRule
rule to convert between categories from two different categorizations.

Returns
-------
The generated category name.
"""
factor_to_string = {1: "+", -1: "-"}
components = [factor_to_string[i[1]] + i[0].codes[0] for i in rule.factors_categories_b.items()]
# remove the first "+" sign in the name (leave "-" sign in)
if components[0][0] == "+":
components[0] = components[0][1:]
return "A_(" + "".join(components) + ")"
3 changes: 3 additions & 0 deletions primap2/tests/data/BURDI_conversion.csv
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,19 @@ BURDI,IPCC2006_PRIMAP,comment
2.C,2.C
2.F,2.F
2.G + 2.D, 2.H
2.G, 2.H.3
3,2.D
4,M.AG
4.A,3.A.1
4.B,3.A.2
4.C,3.C.7
4.D, M.3.C.45.AG
4.D + 4.C + 4.E + 4.F + 4.G,3.C
4.E,3.C.1.c
4.F,3.C.1.b
4.G,3.C.8
5,M.LULUCF
4+5,3
6,4
6.A,4.A
6.B,4.D
Expand Down
9 changes: 9 additions & 0 deletions primap2/tests/data/simple_categorisation_b.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,12 @@ categories:
alternative_codes:
- B
- CatB
3:
title: Category 3
comment: The third category
4:
title: Category 4
comment: The fourth category
5:
title: Category 5
comment: The fifth category
6 changes: 6 additions & 0 deletions primap2/tests/data/test_create_category_name_conversion.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# references: test
# last_update: 2024-10-14
A,B,comment
1,1+2, no comment
2,-3+4
3,5-1
88 changes: 83 additions & 5 deletions primap2/tests/test_convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
return importlib.resources.files("primap2.tests.data").joinpath(fname)


@pytest.mark.xfail
def test_conversion_source_does_not_match_dataset_dimension(empty_ds):
# make a data set with IPCC1996 categories
da = empty_ds["CO2"]
Expand All @@ -40,7 +41,6 @@
)


@pytest.mark.xfail
def test_convert_ipcc(empty_ds: xr.Dataset):
# build a DA categorized by IPCC1996 and with 1 everywhere so results are easy
# to see
Expand All @@ -64,12 +64,48 @@
conversion=conversion,
auxiliary_dimensions={"gas": "source (gas)"},
)

# rule 1 -> 1
assert (result.pr.loc[{"category": "1"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item()
# rule 2 + 3 -> 2
assert (result.pr.loc[{"category": "2"}] == 2.0 * primap2.ureg("Gg CO2 / year")).all().item()
# rule 1.A.2.f -> 1.A.2.f + 1.A.2.g + 1.A.2.h + 1.A.2.i + 1.A.2.j + 1.A.2.k + 1.A.2.l + 1.A.2.m
autocat = "A_(1.A.2.f+1.A.2.g+1.A.2.h+1.A.2.i+1.A.2.j+1.A.2.k+1.A.2.l+1.A.2.m)"
assert (
(result.pr.loc[{"category": autocat}] == 8.0 * primap2.ureg("Gg CO2 / year")).all().item()
)
# rule 4.D for N2O only -> 3.C.4 + 3.C.5
autocat = "A_(3.C.4+3.C.5)"
assert (
(
result.pr.loc[{"category": autocat, "source (gas)": "N2O"}]
== 2.0 * primap2.ureg("Gg CO2 / year")
)
.all()
.item()
)
# all other gases should be nan
all_gases_but_N2O = list(result.indexes["source (gas)"])
all_gases_but_N2O.remove("N2O")
assert np.isnan(
result.pr.loc[{"category": autocat, "source (gas)": all_gases_but_N2O}].values
).all()
# rule 7 -> 5
assert (result.pr.loc[{"category": "5"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item()
# rule 2.F.6 -> 2.E + 2.F.6 + 2.G.1 + 2.G.2 + 2.G.4,
# rule 2.F.6 + 3.D -> 2.E + 2.F.6 + 2.G - ignored because 2.F.G already converted
# rule 2.G -> 2.H.3 - 1-to-1-conversion
autocat = "A_(2.E+2.F.6+2.G.1+2.G.2+2.G.4)"
assert (
(result.pr.loc[{"category": autocat}] == 5.0 * primap2.ureg("Gg CO2 / year")).all().item()
)
assert "A_(2.E+2.F.6+2.G)" not in list(result.indexes["category (IPCC2006)"])
assert (
(result.pr.loc[{"category": "2.H.3"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item()
)


# test with new conversion and two existing categorisations
@pytest.mark.xfail
def test_convert_BURDI(empty_ds: xr.Dataset):
# make a sample conversion object in climate categories
filepath = get_test_data_filepath("BURDI_conversion.csv")
Expand Down Expand Up @@ -150,12 +186,24 @@
assert (
(result.pr.loc[{"category": "3.C.7"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item()
)
# 2.E + 2.B = 2.E, 2.E should not be part of new data set
# rule 2.E + 2.B -> 2.B
# 2.E is part of PRIMAP categories, but cannot be retrieved from conversion
assert np.isnan(result.pr.loc[{"category": "2.E"}].values).all()
# cat 14638 in BURDI equals cat M.BIO in IPCC2006_PRIMAP
assert (
(result.pr.loc[{"category": "M.BIO"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item()
)
# 4.D -> M.3.C.45.AG
# TODO This category is only available on M3C45AG branch in climate categories
# test locally with:
# `source venv/bin/activate`
# `pip install -e ../climate_categories`
# Will pass after climate categories release
assert (

Check warning on line 202 in primap2/tests/test_convert.py

View check run for this annotation

Codecov / codecov/patch

primap2/tests/test_convert.py#L202

Added line #L202 was not covered by tests
(result.pr.loc[{"category": "M.3.C.45.AG"}] == 1.0 * primap2.ureg("Gg CO2 / year"))
.all()
.item()
)


# test with new conversion and new categorisations
Expand Down Expand Up @@ -198,5 +246,35 @@
assert (result.pr.loc[{"category": "2"}] == 2.0 * primap2.ureg("Gg CO2 / year")).all().item()

# check result has 2 categories (input categorisation had 3)
# TODO this is ambiguous when order changes
assert result.shape == (2, 21, 4, 1)
# TODO this is ambiguous, order may change
assert result.shape == (5, 21, 4, 1)


def test_create_category_name():
# make categorisation A from yaml
categorisation_a = cc.from_yaml(get_test_data_filepath("simple_categorisation_a.yaml"))

# make categorisation B from yaml
categorisation_b = cc.from_yaml(get_test_data_filepath("simple_categorisation_b.yaml"))

# categories not part of climate categories so we need to add them manually
cats = {
"A": categorisation_a,
"B": categorisation_b,
}

# make conversion from csv
conv = cc.Conversion.from_csv(
get_test_data_filepath("test_create_category_name_conversion.csv"), cats=cats
)

# check that first positive category does not have '+' sign
autocat = primap2._convert.create_category_name(conv.rules[0])
assert autocat == "A_(1+2)"

# check that first negative category has '-' sign
autocat = primap2._convert.create_category_name(conv.rules[1])
assert autocat == "A_(-3+4)"

autocat = primap2._convert.create_category_name(conv.rules[2])
assert autocat == "A_(5-1)"