Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

one-to-many conversion #284

Merged
merged 11 commits into from
Nov 7, 2024
109 changes: 22 additions & 87 deletions primap2/_convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,9 @@ def _fill_category(
already_converted = set(output_selection[new_dim]).intersection(
set(already_converted_categories)
)
# if there are several categories on the target side
# we can still convert because it will
# create a new category
if already_converted:
logger.warning(
f"For category {category!r}, would want to use a "
Expand All @@ -175,35 +178,28 @@ def _fill_category(
)
continue

try:
effective_input_weights = derive_weights(
dim=dim,
category=category,
rule=rule,
operation_type="input",
)
effective_output_weights = derive_weights(
dim=new_dim,
category=category,
rule=rule,
operation_type="output",
)
except WeightingInfoMissing as err:
logger.warning(str(err))
continue

# the left-hand side of the conversion formula summed up
lhs = (input_factors * effective_input_weights * self._da.loc[input_selection]).sum(
dim=dim
)
lhs = (input_factors * self._da.loc[input_selection]).sum(dim=dim)
# the right-hand side of the conversion formula split up
rhs = lhs / output_factors / effective_output_weights

da.loc[output_selection] = rhs

if not rule.is_restricted:
# stop processing rules for this category
rhs = lhs / output_factors

# if there is more than one category on the target side
if len(output_selection[new_dim]) > 1:
# TODO this leads to very long category names
new_category = "M_" + "_".join(output_selection[new_dim])
JGuetschow marked this conversation as resolved.
Show resolved Hide resolved
# add newly created category to da
new_categories = list(da.indexes["category (IPCC2006)"]) + [new_category]
da = da.reindex({"category (IPCC2006)": new_categories}, fill_value=np.nan)
new_output_selection = output_selection.copy()
new_output_selection[new_dim] = new_category
da.loc[new_output_selection] = rhs.sum(dim=new_dim)
return output_selection[new_dim], da
else:
da.loc[output_selection] = rhs

if not rule.is_restricted:
# stop processing rules for this category
return output_selection[new_dim], da

logger.debug(
f"No unrestricted rule to derive data for {category!r} applied, some or "
Expand Down Expand Up @@ -394,67 +390,6 @@ def factors_categories_to_xarray(
return selection, factors


class WeightingInfoMissing(ValueError):
"""Some information to derive weighting factors for a rule is missing."""

def __init__(
self,
category: climate_categories.Category,
rule: climate_categories.ConversionRule,
message: str,
):
full_message = (
f"Can not derive data for category {category!r} using rule"
f" '{rule}': {message} Skipping this rule."
)
ValueError.__init__(self, full_message)


def derive_weights(
*,
dim: str,
category: climate_categories.Category,
rule: climate_categories.ConversionRule,
operation_type: str,
) -> xr.DataArray | float:
"""Derive the weights to use for applying a specific rule.

Parameters
----------
dim: str
Dimension which contains the categories.
category: climate_categories.Category
Category which should be derived.
rule: climate_categories.ConversionRule
Rule that should be used to derive the category.
operation_type: ``input`` or ``output``
If weights for the source data (input) or the result data (output) should
be derived.

Returns
-------
factors: float or xr.DataArray
Object which can be multiplied with the input or output DataArray to apply
weights.
"""
# TODO this may change again in the next PR
if operation_type == "input":
return 1.0
elif operation_type == "output":
if rule.cardinality_b == "one":
return 1.0
else:
raise NotImplementedError(
"Splitting input categories into multiple"
" output categories is currently not supported. "
f"{rule.csv_original_text=}, {category=}"
)
else:
raise NotImplementedError(
f"operation_type must be either input or output. Got {operation_type}"
)


def prepare_auxiliary_dimensions(
conversion: climate_categories.Conversion,
auxiliary_dimensions: dict[str, str] | None,
Expand Down
3 changes: 3 additions & 0 deletions primap2/tests/data/BURDI_conversion.csv
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,19 @@ BURDI,IPCC2006_PRIMAP,comment
2.C,2.C
2.F,2.F
2.G + 2.D, 2.H
2.G, 2.H.3
3,2.D
4,M.AG
4.A,3.A.1
4.B,3.A.2
4.C,3.C.7
4.D, 3.C.45.AG
JGuetschow marked this conversation as resolved.
Show resolved Hide resolved
4.D + 4.C + 4.E + 4.F + 4.G,3.C
4.E,3.C.1.c
4.F,3.C.1.b
4.G,3.C.8
5,M.LULUCF
4+5,3
6,4
6.A,4.A
6.B,4.D
Expand Down
51 changes: 47 additions & 4 deletions primap2/tests/test_convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ def test_conversion_source_does_not_match_dataset_dimension(empty_ds):
)


@pytest.mark.xfail
def test_convert_ipcc(empty_ds: xr.Dataset):
# build a DA categorized by IPCC1996 and with 1 everywhere so results are easy
# to see
Expand All @@ -64,12 +63,44 @@ def test_convert_ipcc(empty_ds: xr.Dataset):
conversion=conversion,
auxiliary_dimensions={"gas": "source (gas)"},
)

# rule 1 -> 1
assert (result.pr.loc[{"category": "1"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item()
# rule 2 + 3 -> 2
assert (result.pr.loc[{"category": "2"}] == 2.0 * primap2.ureg("Gg CO2 / year")).all().item()
# rule 1.A.2.f -> 1.A.2.f + 1.A.2.g + 1.A.2.h + 1.A.2.i + 1.A.2.j + 1.A.2.k + 1.A.2.l + 1.A.2.m
mcat = "M_1.A.2.f_1.A.2.g_1.A.2.h_1.A.2.i_1.A.2.j_1.A.2.k_1.A.2.l_1.A.2.m"
JGuetschow marked this conversation as resolved.
Show resolved Hide resolved
assert (result.pr.loc[{"category": mcat}] == 8.0 * primap2.ureg("Gg CO2 / year")).all().item()
# rule 4.D for N2O only -> 3.C.4 + 3.C.5
mcat = "M_3.C.4_3.C.5"
assert (
(
result.pr.loc[{"category": mcat, "source (gas)": "N2O"}]
== 2.0 * primap2.ureg("Gg CO2 / year")
)
.all()
.item()
)
# all other gases should be nan
all_gases_but_N2O = list(result.indexes["source (gas)"])
all_gases_but_N2O.remove("N2O")
assert np.isnan(
result.pr.loc[{"category": mcat, "source (gas)": all_gases_but_N2O}].values
).all()
# rule 7 -> 5
assert (result.pr.loc[{"category": "5"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item()
# rule 2.F.6 -> 2.E + 2.F.6 + 2.G.1 + 2.G.2 + 2.G.4,
# rule 2.F.6 + 3.D -> 2.E + 2.F.6 + 2.G - ignored because 2.F.G already converted
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ignoring rules can be dangerous, because if we convert more than one country data coverage is likely different for the different countries and while one rule might apply for one country another rule might be needed for another country. I know this is not the place where the ignoring goes on, but I wanted to raise the point anyway.

Copy link
Collaborator Author

@crdanielbusch crdanielbusch Nov 5, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agree. Maybe that's something I can tackle in a following pull request, since we would like to add a feature to add/remove rules dynamically anyway.

About this particular case: Am I missing something or can we change the rules from

2.F.6 -> 2.E + 2.F.6 + 2.G.1 + 2.G.2 + 2.G.4
2.F.6 + 3.D -> 2.E + 2.F.6 + 2.G

to

2.F.6 -> 2.E + 2.F.6 + 2.G.1 + 2.G.2 + 2.G.4
3.D -> 2.G.3

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If 2.G has only subcategories 1-4 it looks like it can be rewritten. But I did not make the mapping, so there might be some motivation behind the rules that I don't know of.

# rule 2.G -> 2.H.3 - 1-to-1-conversion
mcat = "M_2.E_2.F.6_2.G.1_2.G.2_2.G.4"
assert (result.pr.loc[{"category": mcat}] == 5.0 * primap2.ureg("Gg CO2 / year")).all().item()
assert "M_2.E_2.F.6_2.G" not in list(result.indexes["category (IPCC2006)"])
assert (
(result.pr.loc[{"category": "2.H.3"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item()
)


# test with new conversion and two existing categorisations
@pytest.mark.xfail
def test_convert_BURDI(empty_ds: xr.Dataset):
# make a sample conversion object in climate categories
filepath = get_test_data_filepath("BURDI_conversion.csv")
Expand Down Expand Up @@ -150,12 +181,24 @@ def test_convert_BURDI(empty_ds: xr.Dataset):
assert (
(result.pr.loc[{"category": "3.C.7"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item()
)
# 2.E + 2.B = 2.E, 2.E should not be part of new data set
# rule 2.E + 2.B -> 2.B
# 2.B is part of PRIMAP categories, but cannot be retrieved from conversion
JGuetschow marked this conversation as resolved.
Show resolved Hide resolved
assert np.isnan(result.pr.loc[{"category": "2.E"}].values).all()
# cat 14638 in BURDI equals cat M.BIO in IPCC2006_PRIMAP
assert (
(result.pr.loc[{"category": "M.BIO"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item()
)
# 4.D -> M.3.C.45.AG
# This category is only available on M3C45AG branch in climate categories
# test locally with:
# `source venv/bin/activate`
# `pip install -e ../climate_categories`
# Will pass after climate categories release
assert (
(result.pr.loc[{"category": "3.C.45.AG"}] == 1.0 * primap2.ureg("Gg CO2 / year"))
.all()
.item()
)


# test with new conversion and new categorisations
Expand Down Expand Up @@ -198,5 +241,5 @@ def test_custom_conversion_and_two_custom_categorisations(empty_ds):
assert (result.pr.loc[{"category": "2"}] == 2.0 * primap2.ureg("Gg CO2 / year")).all().item()

# check result has 2 categories (input categorisation had 3)
# TODO this is ambiguous when order changes
# TODO this is ambiguous, order may change
assert result.shape == (2, 21, 4, 1)
Loading