From 952d9aca23bb2b21e9ea91841c235e1c9b39ff18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Pfl=C3=BCger?= Date: Thu, 23 Sep 2021 19:55:50 +0200 Subject: [PATCH 01/36] Add initial support for converting DataArrays. --- primap2/_aggregate.py | 28 ++- primap2/_alias_selection.py | 28 ++- primap2/_convert.py | 309 ++++++++++++++++++++++++++++++++++ primap2/_downscale.py | 8 +- primap2/accessors.py | 2 + primap2/tests/test_convert.py | 20 +++ setup.cfg | 1 + 7 files changed, 369 insertions(+), 27 deletions(-) create mode 100644 primap2/_convert.py create mode 100644 primap2/tests/test_convert.py diff --git a/primap2/_aggregate.py b/primap2/_aggregate.py index b1e30afb..59539030 100644 --- a/primap2/_aggregate.py +++ b/primap2/_aggregate.py @@ -33,16 +33,15 @@ def select_no_scalar_dimension( """ if sel is None: return obj - else: - sele: DatasetOrDataArray = obj.loc[sel] - if dim_names(obj) != dim_names(sele): - raise ValueError( - "The dimension of the selection doesn't match the dimension of the " - "orginal dataset. Likely you used a selection casting to a scalar " - "dimension, like sel={'axis': 'value'}. Please use " - "sel={'axis': ['value']} instead." - ) - return sele + selection: DatasetOrDataArray = obj.loc[sel] + if dim_names(obj) != dim_names(selection): + raise ValueError( + "The dimension of the selection doesn't match the dimension of the " + "orginal dataset. Likely you used a selection casting to a scalar " + "dimension, like sel={'axis': 'value'}. Please use " + "sel={'axis': ['value']} instead." + ) + return selection class DataArrayAggregationAccessor(BaseDataArrayAccessor): @@ -54,11 +53,10 @@ def _reduce_dim( "Only one of 'dim' and 'reduce_to_dim' may be supplied, not both." ) - if dim is None: - if reduce_to_dim is not None: - if isinstance(reduce_to_dim, str): - reduce_to_dim = [reduce_to_dim] - dim = set(self._da.dims) - set(reduce_to_dim) + if dim is None and reduce_to_dim is not None: + if isinstance(reduce_to_dim, str): + reduce_to_dim = [reduce_to_dim] + dim = set(self._da.dims) - set(reduce_to_dim) return dim diff --git a/primap2/_alias_selection.py b/primap2/_alias_selection.py index 9b16749c..73f75335 100644 --- a/primap2/_alias_selection.py +++ b/primap2/_alias_selection.py @@ -16,6 +16,22 @@ def __init__(self, dim): def translate(item: KeyT, translations: typing.Mapping[typing.Hashable, str]) -> KeyT: + """Translates a single str key or the keys of a dict using the given translations. + + If a key is not found in the translations, return it untranslated. + + Parameters + ---------- + item : str or dict with str keys + The input to translate. Either a str or a dict with str keys. + translations : dict + The translations to apply. + + Returns + ------- + translated : str or dict with str keys + The same type as the input item, but translated. 
+ """ if isinstance(item, str): if item in translations: return translations[item] @@ -54,10 +70,9 @@ def translations_from_dims( ) -> typing.Dict[typing.Hashable, str]: ret: typing.Dict[typing.Hashable, str] = {} for dim in dims: - if isinstance(dim, str): - if " (" in dim: - key: str = dim.split("(")[0][:-1] - ret[key] = dim + if isinstance(dim, str) and " (" in dim: + key: str = dim.split("(")[0][:-1] + ret[key] = dim if "scenario" in ret: ret["scen"] = ret["scenario"] if "category" in ret: @@ -77,10 +92,7 @@ def alias( return dim else: try: - rdim = [] - for idim in dim: - rdim.append(alias(idim, translations, dims)) - return rdim + return [alias(idim, translations, dims) for idim in dim] except TypeError: # not iterable, so some other hashable like int if dim not in dims: raise DimensionNotExistingError(dim) diff --git a/primap2/_convert.py b/primap2/_convert.py new file mode 100644 index 00000000..96439f44 --- /dev/null +++ b/primap2/_convert.py @@ -0,0 +1,309 @@ +import typing +from typing import Hashable + +import climate_categories +import numpy as np +import xarray as xr +from loguru import logger + +from . import _accessor_base +from ._alias_selection import alias_dims + + +def extract_categorization_from_dim(dim: str) -> (str, str): + """Extract the pure dimension and the categorization from a composite dim. + + Parameters + ---------- + dim : str + Composite dim name like ``area (ISO3)`` where ``area`` is the pure dimension + name and ``ISO3`` is the used categorization. + + Examples + -------- + >>> extract_categorization_from_dim("area (ISO3)") + ('area', 'ISO3') + >>> extract_categorization_from_dim("area") + Traceback (most recent call last): + ... + ValueError: No categorization specified: 'area'. + + + Returns + ------- + pure_dim, categorization : str, str + The pure_dim without categorization information and the categorization. If the + input dim does not contain categorization information, a ValueError is raised. + """ + try: + pure, cat = dim.split("(", 1) + except ValueError: + raise ValueError(f"No categorization specified: {dim!r}.") + return pure[:-1], cat[:-1] + + +def applicable_rule(conversion, category): + """Choose the best rule to derive the given category using the given conversion. + + If there are multiple relevant rules, will prefer rules with: + 1. the given category as the only target category. + 2. only one source category + 3. rules defined earlier in the CSV. + + TODO: how to deal with restricted rules? 
+ """ + rules = conversion.relevant_rules({conversion.categorization_b[category]}) + # a + b = c - d can not be used to derive c nor d, only a and b + rules = [r for r in rules if all(f > 0 for f in r.factors_categories_b.values())] + # drop all restricted rules + # TODO do something smart with restricted rules + rules = [r for r in rules if not any(r.auxiliary_categories.values())] + + if not rules: + raise KeyError(category) + # narrow down rules until we have exactly one rule to apply + # prefer rules where the target category is the only summand + if len(rules) != 1: + cardinalities = [r.cardinality_b for r in rules] + if "one" in cardinalities: + for i in range(len(rules)): + if cardinalities[i] == "many": + rules.pop(i) + # prefer rules with exactly one source category + if len(rules) != 1: + cardinalities = [r.cardinality_a for r in rules] + if "one" in cardinalities: + for i in range(len(rules)): + if cardinalities[i] == "many": + rules.pop(i) + # if we still have multiple eligible rules, just use the first + if len(rules) != 1: + rule_str = str(rules[0]) + logger.info( + f"There are {len(rules)} rules to derive data for" + f" {category!r}, will" + f" use {rule_str!r} because it was defined earlier." + ) + + return rules[0] + + +class DataArrayConversionAccessor(_accessor_base.BaseDataArrayAccessor): + @alias_dims(["dim"]) + def convert( + self, + dim: typing.Union[Hashable, str], + categorization: typing.Union[climate_categories.Categorization, str], + *, + sum_rule: typing.Optional[str] = None, + input_weights: typing.Optional[xr.DataArray] = None, + output_weights: typing.Optional[xr.DataArray] = None, + ) -> xr.DataArray: + """Convert the data along the given dimension into the new categorization. + + Maps the given dimension from one categorization (terminology) into another. + Fetches the rules to do the mapping from the climate_categories package, and + therefore will only work if there are conversions rules to convert from the + current categorization to the new categorization. + + Parameters + ---------- + dim : str + Dimension to convert. Has to be a dimension from ``da.dims``. + categorization : climate_categories.Categorization or str + New categorization to convert the given dimension to. Either give the title + of the new categorization (like ``IPCC1996``) or a + ``climate_categories.Categorization`` object. + sum_rule : ``extensive``, ``intensive``, or None (default) + If data of categories has to be summed up or divided, we need information + whether the quantity measured is extensive (like, for example, total + emissions in a year subdivided into multiple sectoral categories) or + intensive (like, for example, average per-person emissions in a year + subdivided into different territorial entities). By default (None), a + warning is issued if data has to be summed up or divided. + input_weights : xr.DataArray, optional + If data in input categories has to be summed up and the sum_rule is + ``intensive``, weights for the input categories are required. + The weights can be given in any shape compatible with the DataArray that + is converted, e.g. to give different weights for industrial sectors by + country. However, at least the ``dim`` that is converted needs to be in + ``input_weights.dims``. + If no weights are specified but a rule requiring weights is specified + in the conversion rules, a warning is issued and the respective rule is + skipped (probably resulting in more NaNs in the output). 
+ output_weights : xr.DataArray, optional + If data has to be divided into several output categories and the sum_rule is + ``extensive``, weights for the output categories are required. + The weights can be given in any shape compatible with the DataArray that + is converted, e.g. to give different weights for industrial sectors by + country. However, at least the ``dim`` that is converted needs to be in + ``output_weights.dims``. + If no weights are specified but a rule requiring weights is specified + in the conversion rules, a warning is issued and the respective rule is + skipped (probably resulting in more NaNs in the output). + + Returns + ------- + converted : xr.DataArray + A copy of the DataArray with the given dimension converted in the new + categorization. + """ + if not isinstance(categorization, climate_categories.Categorization): + categorization = climate_categories.cats[categorization] + + if sum_rule not in (None, "extensive", "intensive"): + raise ValueError( + f"sum_rule must bei either 'extensive' or 'intensive', not {sum_rule}" + ) + + dim_name, old_categorization_name = extract_categorization_from_dim(dim) + old_categorization: climate_categories.Categorization = climate_categories.cats[ + old_categorization_name + ] + conversion = old_categorization.conversion_to(categorization) + new_dim = f"{dim_name} ({categorization.name})" + + new_dims = [] + new_shape = [] + for i, old_dim in enumerate(self._da.dims): + if old_dim == dim: + new_dims.append(new_dim) + new_shape.append(len(categorization)) + else: + new_dims.append(old_dim) + new_shape.append(self._da.shape[i]) + + new_coords = {} + for coord in self._da.coords: + if coord == dim: + new_coords[new_dim] = np.array(list(categorization.keys())) + elif dim in self._da.coords[coord].dims: + logger.info( + f"Additional coordinate {coord} can not be converted automatically" + f" and is skipped." + ) + continue + else: + new_coords[coord] = self._da.coords[coord] + + # initialize the converted array using all NA + all_na_array = np.empty(new_shape) + all_na_array[:] = np.nan + converted = xr.DataArray( + data=all_na_array, + dims=new_dims, + coords=new_coords, + name=self._da.name, + attrs=self._da.attrs, + ) + + converted_categories = [] + for category in converted[new_dim]: + category = category.item() + if category in converted_categories: + continue + try: + rule = applicable_rule(conversion, category) + except KeyError: + logger.debug(f"No rule to derive data for {category!r}, will be NaN.") + continue + + # convert rule into xarray objects that will cleanly multiply regardless + # of dimensionality + input_selection = { + dim: [cat.codes[0] for cat in rule.factors_categories_a.keys()] + } + input_factors = xr.DataArray( + data=list(rule.factors_categories_a.values()), + dims=[dim], + coords=input_selection, + ) + new_dim_values = [cat.codes[0] for cat in rule.factors_categories_b.keys()] + output_selection = {new_dim: new_dim_values} + output_factors = xr.DataArray( + data=list(rule.factors_categories_b.values()), + dims=[new_dim], + coords=output_selection, + ) + + # if the applicable rule is a multi-output rule, but some of the + # outputs are already converted, give up + already_converted = set(new_dim_values).intersection( + set(converted_categories) + ) + if already_converted: + # TODO: maybe we can do better? + logger.warning( + f"For category {category!r}, would want to use a " + "rule with multiple outputs, but the following outputs " + f"are already converted: {already_converted!r}. 
" + "Skipping this category and leaving it NaN." + ) + continue + + # derive input and output weights (maybe trivial) + if rule.cardinality_a == "one" or sum_rule == "extensive": + effective_input_weights = 1 + elif sum_rule == "intensive": + # summing intensive units requires weights + if input_weights is None: + logger.warning( + f"To derive data for {category!r}, we need to sum up" + " multiple input categories. For sum_rule='intensive'," + " this requires input_weights, but none are specified." + " Will continue with NaN, specify input_weights to avoid this." + ) + continue + effective_input_weights = input_weights.loc[input_selection] + # normalize so it is actually a weight, not a factor + effective_input_weights /= effective_input_weights.sum(dim=dim) + else: # no sum rule specified, but needed + logger.warning( + f"To derive data for {category!r}, we need to sum up" + " multiple input categories, but the sum_rule is" + " not specified. Will continue with NaN, specify the" + " sum_rule to avoid this." + ) + continue + + if rule.cardinality_b == "one" or sum_rule == "intensive": + effective_output_weights = 1 + elif sum_rule == "extensive": + # dividing extensive units requires weights + if output_weights is None: + logger.warning( + f"To derive data for {category!r}, we need to split up" + " multiple output categories. For sum_rule='extensive'," + " this requires output_weights, but none are specified." + " Will continue with NaN, specify output_weights to avoid this." + ) + continue + effective_output_weights = output_weights.loc[output_selection] + # normalize so it is actually a weight, not a factor + effective_output_weights /= effective_output_weights.sum(dim=dim) + else: # no sum rule specified, but needed + logger.warning( + f"To derive data for {category!r}, we need to split up" + " multiple output categories, but the sum_rule is" + " not specified. Will continue with NaN, specify the" + " sum_rule to avoid this." + ) + continue + + # the left-hand side of the conversion formula summed up + lhs = ( + input_factors * effective_input_weights * self._da.loc[input_selection] + ).sum(dim=dim) + # the right-hand side of the conversion formula split up + rhs = lhs / output_factors / effective_output_weights + # TODO: using pr.set here is not efficient because it makes copies + converted = converted.pr.set( + dim=new_dim, + key=new_dim_values, + value=rhs, + ) + + # mark all filled categories as converted + converted_categories += new_dim_values + + return converted diff --git a/primap2/_downscale.py b/primap2/_downscale.py index 16b09134..a9eaf172 100644 --- a/primap2/_downscale.py +++ b/primap2/_downscale.py @@ -36,13 +36,13 @@ def downscale_timeseries( ---------- dim: str The name of the dimension which contains the basket and its contents, has to - be one of the dimensions in ``ds.dims``. + be one of the dimensions in ``da.dims``. basket: str The name of the super-category for which values are known at higher temporal - resolution and/or for a wider range. A value from ``ds[dimension]``. + resolution and/or for a wider range. A value from ``da[dimension]``. basket_contents: list of str The name of the sub-categories. The sum of all sub-categories equals the - basket. Values from ``ds[dimension]``. + basket. Values from ``da[dimension]``. 
check_consistency: bool, default True If for all points where the basket and all basket_contents are defined, it should be checked if the sum of the basket_contents actually equals @@ -50,7 +50,7 @@ def downscale_timeseries( sel: Selection dict, optional If the downscaling should only be done on a subset of the Dataset while retaining all other values unchanged, give a selection dictionary. The - downscaling will be done on ``ds.loc[sel]``. + downscaling will be done on ``da.loc[sel]``. skipna_evaluation_dims: list of str, optional Dimensions which should be evaluated to determine if NA values should be skipped entirely if missing fully. By default, no NA values are skipped. diff --git a/primap2/accessors.py b/primap2/accessors.py index 1323b188..e177362e 100644 --- a/primap2/accessors.py +++ b/primap2/accessors.py @@ -7,6 +7,7 @@ DataArrayAliasSelectionAccessor, DatasetAliasSelectionAccessor, ) +from ._convert import DataArrayConversionAccessor from ._data_format import DatasetDataFormatAccessor from ._downscale import DataArrayDownscalingAccessor, DatasetDownscalingAccessor from ._metadata import DatasetMetadataAccessor @@ -33,6 +34,7 @@ class PRIMAP2DatasetAccessor( class PRIMAP2DataArrayAccessor( DataArrayAggregationAccessor, DataArrayAliasSelectionAccessor, + DataArrayConversionAccessor, DataArrayDownscalingAccessor, DataArrayOverviewAccessor, DataArraySettersAccessor, diff --git a/primap2/tests/test_convert.py b/primap2/tests/test_convert.py new file mode 100644 index 00000000..fbd7b9fb --- /dev/null +++ b/primap2/tests/test_convert.py @@ -0,0 +1,20 @@ +"""Tests for _convert.py""" + +import climate_categories as cc +import xarray as xr + +import primap2 + + +def test_convert_ipcc(empty_ds: xr.Dataset): + # build a DA categorized by IPCC1996 and with 1 everywhere so results are easy + # to see + da = empty_ds["CO2"] + da = da.expand_dims({"category (IPCC1996)": list(cc.IPCC1996.keys())}) + arr = da.data.copy() + arr[:] = 1 * primap2.ureg("Gg CO2 / year") + da.data = arr + + da.pr.convert("category", "IPCC2006", sum_rule="extensive") + + # TODO test that values actually make sense diff --git a/setup.cfg b/setup.cfg index 8668f10d..4269e994 100644 --- a/setup.cfg +++ b/setup.cfg @@ -47,6 +47,7 @@ install_requires = ruamel.yaml strictyaml openpyxl + climate_categories>=0.5.0 [options.extras_require] test = From 2e5bb708f9c36a03d34714c3ea11b322eec3e16f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Pfl=C3=BCger?= Date: Tue, 2 Nov 2021 16:15:54 +0100 Subject: [PATCH 02/36] Depend on version of climate_categories with conversion support. --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 4269e994..8acb6943 100644 --- a/setup.cfg +++ b/setup.cfg @@ -47,7 +47,7 @@ install_requires = ruamel.yaml strictyaml openpyxl - climate_categories>=0.5.0 + climate_categories>=0.6.0 [options.extras_require] test = From 2afb274364303c62f5e9fb0129dd7e1df521d99f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Pfl=C3=BCger?= Date: Tue, 2 Nov 2021 17:32:46 +0100 Subject: [PATCH 03/36] Conversions: refactor some things into own sub-functions, fix some oversights. 
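This patch splits the conversion logic into small helpers (ensure_categorization_instance, check_valid_sum_rule_types, initialize_empty_converted_da). A minimal sketch of how the first two behave, assuming primap2 from this branch is importable; it only uses the definitions visible in the diff below:

    from primap2._convert import (
        check_valid_sum_rule_types,
        ensure_categorization_instance,
    )

    # accepts either a categorization name or a Categorization object and
    # always returns the Categorization object
    cat = ensure_categorization_instance("IPCC2006")
    assert cat is ensure_categorization_instance(cat)

    # passes silently for None, "extensive" and "intensive";
    # raises ValueError for anything else
    check_valid_sum_rule_types("extensive")
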
--- primap2/_convert.py | 157 +++++++++++++++++++++++++++++++------------- 1 file changed, 112 insertions(+), 45 deletions(-) diff --git a/primap2/_convert.py b/primap2/_convert.py index 96439f44..3edb5dba 100644 --- a/primap2/_convert.py +++ b/primap2/_convert.py @@ -1,3 +1,4 @@ +import copy import typing from typing import Hashable @@ -88,6 +89,106 @@ def applicable_rule(conversion, category): return rules[0] +def ensure_categorization_instance( + cat: typing.Union[str, climate_categories.Categorization] +) -> climate_categories.Categorization: + """Takes a categorization name or object and returns the corresponding + categorization object.""" + if isinstance(cat, climate_categories.Categorization): + return cat + return climate_categories.cats[cat] + + +def check_valid_sum_rule_types(sum_rule: typing.Optional[str]): + """Checks if the sum_rule is either "intensive", "extensive", or None. + + Raises a ValueError if an invalid sum_rule is used.""" + if sum_rule not in (None, "extensive", "intensive"): + raise ValueError( + f"if defined, sum_rule must be either 'extensive' or 'intensive', not" + f" {sum_rule}" + ) + + +def initialize_empty_converted_da( + *, + old_da: xr.DataArray, + old_dim: typing.Union[Hashable, str], + new_dim: str, + new_categorization: climate_categories.Categorization, +) -> xr.DataArray: + """Build a DataArray which can hold the data after conversion to a new + categorization. + + Returns a new DataArray with the same dimensions and coordinates as the old + DataArray, but with the old_dim dimension replaced by new_dim using the + new_categorization. + The returned DataArray is filled with NaN. + + Parameters + ---------- + old_da: xr.DataArray + The unconverted array. + old_dim: str + The name of the dimension (including the categorization) which will be + converted. Example: "area (ISO3)" + new_dim: str + The name of the dimension (including the categorization) after conversion. + Example: "area (ISO2)" + new_categorization: climate_categories.Categorization + The new categorization object. + + Returns + ------- + new_da: xr.DataArray + An empty array with the right shape to hold the data after conversion. + """ + new_dims = [] + new_shape = [] + for i, idim in enumerate(old_da.dims): + if idim == old_dim: + new_dims.append(new_dim) + new_shape.append(len(new_categorization)) + else: + new_dims.append(idim) + new_shape.append(old_da.shape[i]) + + new_coords = {} + for coord in old_da.coords: + if coord == old_dim: + new_coords[new_dim] = np.array(list(new_categorization.keys())) + elif old_dim in old_da.coords[coord].dims: + # The additional coordinate has the old_dim as one dimension, but we + # won't be able to convert it + logger.info( + f"Additional coordinate {coord} can not be converted automatically" + f" and is skipped." 
+ ) + continue + else: + new_coords[coord] = old_da.coords[coord] + + new_attrs = copy.deepcopy(old_da.attrs) + for pdim in ("area", "cat", "scen"): + if pdim in new_attrs and new_attrs[pdim] == old_dim: + new_attrs[pdim] = new_dim + + if "sec cats" in new_attrs and old_dim in new_attrs["sec_cats"]: + new_attrs["sec_cats"].remove(old_dim) + new_attrs["sec_cats"].append(new_dim) + + # initialize the converted array using all NA + all_na_array = np.empty(new_shape) + all_na_array[:] = np.nan + return xr.DataArray( + data=all_na_array, + dims=new_dims, + coords=new_coords, + name=old_da.name, + attrs=new_attrs, + ) + + class DataArrayConversionAccessor(_accessor_base.BaseDataArrayAccessor): @alias_dims(["dim"]) def convert( @@ -148,53 +249,19 @@ def convert( A copy of the DataArray with the given dimension converted in the new categorization. """ - if not isinstance(categorization, climate_categories.Categorization): - categorization = climate_categories.cats[categorization] - - if sum_rule not in (None, "extensive", "intensive"): - raise ValueError( - f"sum_rule must bei either 'extensive' or 'intensive', not {sum_rule}" - ) + new_categorization = ensure_categorization_instance(categorization) + check_valid_sum_rule_types(sum_rule) dim_name, old_categorization_name = extract_categorization_from_dim(dim) - old_categorization: climate_categories.Categorization = climate_categories.cats[ - old_categorization_name - ] - conversion = old_categorization.conversion_to(categorization) - new_dim = f"{dim_name} ({categorization.name})" - - new_dims = [] - new_shape = [] - for i, old_dim in enumerate(self._da.dims): - if old_dim == dim: - new_dims.append(new_dim) - new_shape.append(len(categorization)) - else: - new_dims.append(old_dim) - new_shape.append(self._da.shape[i]) - - new_coords = {} - for coord in self._da.coords: - if coord == dim: - new_coords[new_dim] = np.array(list(categorization.keys())) - elif dim in self._da.coords[coord].dims: - logger.info( - f"Additional coordinate {coord} can not be converted automatically" - f" and is skipped." - ) - continue - else: - new_coords[coord] = self._da.coords[coord] - - # initialize the converted array using all NA - all_na_array = np.empty(new_shape) - all_na_array[:] = np.nan - converted = xr.DataArray( - data=all_na_array, - dims=new_dims, - coords=new_coords, - name=self._da.name, - attrs=self._da.attrs, + old_categorization = ensure_categorization_instance(old_categorization_name) + conversion = old_categorization.conversion_to(new_categorization) + new_dim = f"{dim_name} ({new_categorization.name})" + + converted = initialize_empty_converted_da( + old_da=self._da, + old_dim=dim, + new_dim=new_dim, + new_categorization=new_categorization, ) converted_categories = [] From 000dfc1d49204caf3a00a7b3f03e1db2018fc175 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Pfl=C3=BCger?= Date: Fri, 5 Nov 2021 18:58:21 +0100 Subject: [PATCH 04/36] Conversions: refactor, do something useful with rules restricted to specific categories. 
--- primap2/_convert.py | 624 +++++++++++++++++++++++----------- primap2/tests/test_convert.py | 12 +- setup.cfg | 2 +- 3 files changed, 429 insertions(+), 209 deletions(-) diff --git a/primap2/_convert.py b/primap2/_convert.py index 3edb5dba..5917190d 100644 --- a/primap2/_convert.py +++ b/primap2/_convert.py @@ -11,6 +11,246 @@ from ._alias_selection import alias_dims +class DataArrayConversionAccessor(_accessor_base.BaseDataArrayAccessor): + @alias_dims(["dim"]) + def convert( + self, + dim: typing.Union[Hashable, str], + categorization: typing.Union[climate_categories.Categorization, str], + *, + sum_rule: typing.Optional[str] = None, + input_weights: typing.Optional[xr.DataArray] = None, + output_weights: typing.Optional[xr.DataArray] = None, + auxiliary_dimensions: typing.Optional[typing.Dict[str, str]] = None, + ) -> xr.DataArray: + """Convert the data along the given dimension into the new categorization. + + Maps the given dimension from one categorization (terminology) into another. + Fetches the rules to do the mapping from the climate_categories package, and + therefore will only work if there are conversions rules to convert from the + current categorization to the new categorization. + + Parameters + ---------- + dim : str + Dimension to convert. Has to be a dimension from ``da.dims``. + categorization : climate_categories.Categorization or str + New categorization to convert the given dimension to. Either give the title + of the new categorization (like ``IPCC1996``) or a + ``climate_categories.Categorization`` object. + sum_rule : ``extensive``, ``intensive``, or None (default) + If data of categories has to be summed up or divided, we need information + whether the quantity measured is extensive (like, for example, total + emissions in a year subdivided into multiple sectoral categories) or + intensive (like, for example, average per-person emissions in a year + subdivided into different territorial entities). By default (None), a + warning is issued if data has to be summed up or divided. + input_weights : xr.DataArray, optional + If data in input categories has to be summed up and the sum_rule is + ``intensive``, weights for the input categories are required. + The weights can be given in any shape compatible with the DataArray that + is converted, e.g. to give different weights for industrial sectors by + country. However, at least the ``dim`` that is converted needs to be in + ``input_weights.dims``. + If no weights are specified but a rule requiring weights is specified + in the conversion rules, a warning is issued and the respective rule is + skipped (probably resulting in more NaNs in the output). + output_weights : xr.DataArray, optional + If data has to be divided into several output categories and the sum_rule is + ``extensive``, weights for the output categories are required. + The weights can be given in any shape compatible with the DataArray that + is converted, e.g. to give different weights for industrial sectors by + country. However, at least the ``dim`` that is converted needs to be in + ``output_weights.dims``. + If no weights are specified but a rule requiring weights is specified + in the conversion rules, a warning is issued and the respective rule is + skipped (probably resulting in more NaNs in the output). + auxiliary_dimensions : dict[str, str], optional + Mapping of auxiliary categorizations to dimension names used in this + DataArray. In conversions which contain rules which are valid only for + certain orthogonal dimensions (e.g. 
a conversion between different sectoral + terminologies, but some rules are only valid for specific countries), only + the categorization is specified. Therefore, in this case you have to specify + a mapping from categorization name to dimension name. + Example: {"ISO3": "area (ISO3)"}) . + + Returns + ------- + converted : xr.DataArray + A copy of the DataArray with the given dimension converted in the new + categorization. + """ + new_categorization = ensure_categorization_instance(categorization) + check_valid_sum_rule_types(sum_rule) + + dim_name, old_categorization_name = extract_categorization_from_dim(dim) + old_categorization = ensure_categorization_instance(old_categorization_name) + conversion = old_categorization.conversion_to(new_categorization) + auxiliary_dimensions = prepare_auxiliary_dimensions( + conversion, auxiliary_dimensions + ) + new_dim = f"{dim_name} ({new_categorization.name})" + + converted_da = initialize_empty_converted_da( + old_da=self._da, + old_dim=dim, + new_dim=new_dim, + new_categorization=new_categorization, + ) + + converted_categories = [] + for category in converted_da[new_dim]: + if category in converted_categories: + continue + newly_converted_categories, converted_da = self._fill_category( + da=converted_da, + dim=dim, + new_dim=new_dim, + already_converted_categories=converted_categories, + category=category.item(), + conversion=conversion, + sum_rule=sum_rule, + auxiliary_dimensions=auxiliary_dimensions, + input_weights=input_weights, + output_weights=output_weights, + ) + converted_categories += newly_converted_categories + + return converted_da + + def _fill_category( + self, + da: xr.DataArray, + dim: str, + new_dim: str, + already_converted_categories: typing.List[climate_categories.Category], + category: climate_categories.Category, + conversion: climate_categories.Conversion, + sum_rule: typing.Optional[str], + auxiliary_dimensions: typing.Optional[ + typing.Dict[climate_categories.Categorization, str] + ], + input_weights: typing.Optional[xr.DataArray] = None, + output_weights: typing.Optional[xr.DataArray] = None, + ) -> typing.Tuple[typing.List[climate_categories.Category], xr.DataArray]: + """Return a copy of da with the given category filled by values converted + using the given conversion. + + Parameters + ---------- + da: xr.DataArray + The array which should be filled with the newly converted values. + dim: str + The source dimension. + new_dim: str + The target dimension. + already_converted_categories: list of climate_categories.Category + Categories which are already converted and should not be overwritten. + This is important if the category that should be filled can be filled + using compound rules which fill additional categories. + category: climate_categories.Category + The category from the new dimension which should be filled. + conversion: climate_categories.Conversion + The conversion to use to compute the values for the given category. + sum_rule: str, optional + See docstring of `convert`. + auxiliary_dimensions: + See docstring of `convert`. + input_weights: xr.DataArray, optional + See docstring of `convert`. + output_weights: xr.DataArray, optional + See docstring of `convert`. + + Returns + ------- + filled_categories, filled: list of climate_categories.category, xr.DataArray + The categories that were filled and the new DataArray. 
+ """ + try: + rules = applicable_rules(conversion, category) + except KeyError: + logger.debug(f"No rule to derive data for {category!r}, will be NaN.") + return [], da + + for rule in rules: + logger.debug(f"Processing rule {rule}.") + # iterate until a non-restricted rule was applied or all rules are + # exhausted + input_selection, input_factors = factors_categories_to_xarray( + dim=dim, + factors_categories=rule.factors_categories_a, + auxiliary_categories=rule.auxiliary_categories, + auxiliary_dimensions=auxiliary_dimensions, + ) + output_selection, output_factors = factors_categories_to_xarray( + dim=new_dim, + factors_categories=rule.factors_categories_b, + auxiliary_categories=rule.auxiliary_categories, + auxiliary_dimensions=auxiliary_dimensions, + ) + + # if it is a multi-output rule, but some of the + # outputs are already converted, we can't use it + # TODO: instead, we could use the already converted output as + # *input*, which would probably be more correct, but also pretty + # difficult. + already_converted = set(output_selection[new_dim]).intersection( + set(already_converted_categories) + ) + if already_converted: + logger.warning( + f"For category {category!r}, would want to use a " + "rule with multiple outputs, but the following outputs " + f"are already converted: {already_converted!r}. " + "Skipping this rule." + ) + continue + + try: + effective_input_weights = weights( + dim=dim, + category=category, + rule=rule, + operation_type="input", + selection=input_selection, + sum_rule=sum_rule, + weights=input_weights, + ) + effective_output_weights = weights( + dim=new_dim, + category=category, + rule=rule, + operation_type="output", + selection=output_selection, + sum_rule=sum_rule, + weights=output_weights, + ) + except WeightingInfoMissing as err: + logger.warning(str(err)) + continue + + # the left-hand side of the conversion formula summed up + lhs = ( + input_factors * effective_input_weights * self._da.loc[input_selection] + ).sum(dim=dim) + # the right-hand side of the conversion formula split up + rhs = lhs / output_factors / effective_output_weights + + # TODO: this is slow because it makes copies + # fillna behaviour (only overwrites NaN in converted) + da = da.combine_first(rhs) + + if not rule.is_restricted: + # stop processing rules for this category + return output_selection[new_dim], da + + logger.debug( + f"No unrestricted rule to derive data for {category!r} applied, some or " + f"all data for the category will be NaN." + ) + return [], da + + def extract_categorization_from_dim(dim: str) -> (str, str): """Extract the pure dimension and the categorization from a composite dim. @@ -43,50 +283,17 @@ def extract_categorization_from_dim(dim: str) -> (str, str): return pure[:-1], cat[:-1] -def applicable_rule(conversion, category): - """Choose the best rule to derive the given category using the given conversion. - - If there are multiple relevant rules, will prefer rules with: - 1. the given category as the only target category. - 2. only one source category - 3. rules defined earlier in the CSV. - - TODO: how to deal with restricted rules? 
- """ +def applicable_rules( + conversion, category +) -> typing.List[climate_categories.ConversionRule]: + """Find the possible rules to derive the category using the given conversion.""" rules = conversion.relevant_rules({conversion.categorization_b[category]}) # a + b = c - d can not be used to derive c nor d, only a and b rules = [r for r in rules if all(f > 0 for f in r.factors_categories_b.values())] - # drop all restricted rules - # TODO do something smart with restricted rules - rules = [r for r in rules if not any(r.auxiliary_categories.values())] if not rules: raise KeyError(category) - # narrow down rules until we have exactly one rule to apply - # prefer rules where the target category is the only summand - if len(rules) != 1: - cardinalities = [r.cardinality_b for r in rules] - if "one" in cardinalities: - for i in range(len(rules)): - if cardinalities[i] == "many": - rules.pop(i) - # prefer rules with exactly one source category - if len(rules) != 1: - cardinalities = [r.cardinality_a for r in rules] - if "one" in cardinalities: - for i in range(len(rules)): - if cardinalities[i] == "many": - rules.pop(i) - # if we still have multiple eligible rules, just use the first - if len(rules) != 1: - rule_str = str(rules[0]) - logger.info( - f"There are {len(rules)} rules to derive data for" - f" {category!r}, will" - f" use {rule_str!r} because it was defined earlier." - ) - - return rules[0] + return rules def ensure_categorization_instance( @@ -189,188 +396,191 @@ def initialize_empty_converted_da( ) -class DataArrayConversionAccessor(_accessor_base.BaseDataArrayAccessor): - @alias_dims(["dim"]) - def convert( - self, - dim: typing.Union[Hashable, str], - categorization: typing.Union[climate_categories.Categorization, str], - *, - sum_rule: typing.Optional[str] = None, - input_weights: typing.Optional[xr.DataArray] = None, - output_weights: typing.Optional[xr.DataArray] = None, - ) -> xr.DataArray: - """Convert the data along the given dimension into the new categorization. +def factors_categories_to_xarray( + *, + dim: str, + factors_categories: typing.Dict[climate_categories.Category, int], + auxiliary_categories: typing.Dict[ + climate_categories.Categorization, typing.Set[climate_categories.Category] + ], + auxiliary_dimensions: typing.Dict[climate_categories.Categorization, str], +) -> typing.Tuple[typing.Dict[str, typing.List[str]], xr.DataArray]: + """Convert dictionary mapping categories to factors into xarray-compatible objects. + + Using the xarray objects ensures that in subsequent calculations, everything + will cleanly multiply reagardless of the dimensionality of the data. - Maps the given dimension from one categorization (terminology) into another. - Fetches the rules to do the mapping from the climate_categories package, and - therefore will only work if there are conversions rules to convert from the - current categorization to the new categorization. + Parameters + ---------- + dim: str + Dimension which contains the categories. + factors_categories: dict[climate_categories.Category, int] + Dictionary mapping categories to factors. + auxiliary_categories: dict + If the rule is limited to specific categories from other dimensions, + their categorizations and categories are given here. + auxiliary_dimensions: dict[climate_categories.Categorization, str] + If the rule is limited to specific categories from other dimensions, the mapping + from the used Categorizations to the dimension names used in the data to be + converted has to be given. 
- Parameters - ---------- - dim : str - Dimension to convert. Has to be a dimension from ``da.dims``. - categorization : climate_categories.Categorization or str - New categorization to convert the given dimension to. Either give the title - of the new categorization (like ``IPCC1996``) or a - ``climate_categories.Categorization`` object. - sum_rule : ``extensive``, ``intensive``, or None (default) - If data of categories has to be summed up or divided, we need information - whether the quantity measured is extensive (like, for example, total - emissions in a year subdivided into multiple sectoral categories) or - intensive (like, for example, average per-person emissions in a year - subdivided into different territorial entities). By default (None), a - warning is issued if data has to be summed up or divided. - input_weights : xr.DataArray, optional - If data in input categories has to be summed up and the sum_rule is - ``intensive``, weights for the input categories are required. - The weights can be given in any shape compatible with the DataArray that - is converted, e.g. to give different weights for industrial sectors by - country. However, at least the ``dim`` that is converted needs to be in - ``input_weights.dims``. - If no weights are specified but a rule requiring weights is specified - in the conversion rules, a warning is issued and the respective rule is - skipped (probably resulting in more NaNs in the output). - output_weights : xr.DataArray, optional - If data has to be divided into several output categories and the sum_rule is - ``extensive``, weights for the output categories are required. - The weights can be given in any shape compatible with the DataArray that - is converted, e.g. to give different weights for industrial sectors by - country. However, at least the ``dim`` that is converted needs to be in - ``output_weights.dims``. - If no weights are specified but a rule requiring weights is specified - in the conversion rules, a warning is issued and the respective rule is - skipped (probably resulting in more NaNs in the output). + Returns + ------- + selection, factors: dict[str, list[str]], xr.DataArray + selection is a dictionary which can be used as a selector to select the + appropriate categories from an xarray object. + factors is an xarray DataArray which can be multiplied with an xarray object + after applying the selection. + """ + selection = {dim: [cat.codes[0] for cat in factors_categories.keys()]} + factors = xr.DataArray( + data=list(factors_categories.values()), + dims=[dim], + coords=selection, + ) - Returns - ------- - converted : xr.DataArray - A copy of the DataArray with the given dimension converted in the new - categorization. 
- """ - new_categorization = ensure_categorization_instance(categorization) - check_valid_sum_rule_types(sum_rule) + for aux_categorization, aux_categories in auxiliary_categories.items(): + if aux_categories: + aux_dim = auxiliary_dimensions[aux_categorization] + selection[aux_dim] = [cat.codes[0] for cat in aux_categories] - dim_name, old_categorization_name = extract_categorization_from_dim(dim) - old_categorization = ensure_categorization_instance(old_categorization_name) - conversion = old_categorization.conversion_to(new_categorization) - new_dim = f"{dim_name} ({new_categorization.name})" + return selection, factors - converted = initialize_empty_converted_da( - old_da=self._da, - old_dim=dim, - new_dim=new_dim, - new_categorization=new_categorization, + +class WeightingInfoMissing(ValueError): + """Some information to derive weighting factors for a rule is missing.""" + + def __init__( + self, + category: climate_categories.Category, + rule: climate_categories.ConversionRule, + message: str, + ): + full_message = ( + f"Can not derive data for category {category!r} using rule" + f" '{rule}': {message} Skipping this rule." ) + ValueError.__init__(self, full_message) - converted_categories = [] - for category in converted[new_dim]: - category = category.item() - if category in converted_categories: - continue - try: - rule = applicable_rule(conversion, category) - except KeyError: - logger.debug(f"No rule to derive data for {category!r}, will be NaN.") - continue - # convert rule into xarray objects that will cleanly multiply regardless - # of dimensionality - input_selection = { - dim: [cat.codes[0] for cat in rule.factors_categories_a.keys()] - } - input_factors = xr.DataArray( - data=list(rule.factors_categories_a.values()), - dims=[dim], - coords=input_selection, - ) - new_dim_values = [cat.codes[0] for cat in rule.factors_categories_b.keys()] - output_selection = {new_dim: new_dim_values} - output_factors = xr.DataArray( - data=list(rule.factors_categories_b.values()), - dims=[new_dim], - coords=output_selection, - ) +def weights( + *, + dim: str, + category: climate_categories.Category, + rule: climate_categories.ConversionRule, + sum_rule: typing.Optional[str], + operation_type: str, + weights: typing.Optional[xr.DataArray], + selection: typing.Dict[str, typing.List[str]], +) -> typing.Union[xr.DataArray, float]: + """Derive the weights to use for applying a specific rule. - # if the applicable rule is a multi-output rule, but some of the - # outputs are already converted, give up - already_converted = set(new_dim_values).intersection( - set(converted_categories) + Parameters + ---------- + dim: str + Dimension which contains the categories. + category: climate_categories.Category + Category which should be derived. + rule: climate_categories.ConversionRule + Rule that should be used to derive the category. + sum_rule : ``extensive``, ``intensive``, or None (default) + If data of categories has to be summed up or divided, we need information + whether the quantity measured is extensive (like, for example, total + emissions in a year subdivided into multiple sectoral categories) or + intensive (like, for example, average per-person emissions in a year + subdivided into different territorial entities). By default (None), a + warning is issued if data has to be summed up or divided. + operation_type: ``input`` or ``output`` + If weights for the source data (input) or the result data (output) should + be derived. 
+ weights: xr.DataArray, optional + Weights for the individual categories. + selection: dict[str, list[str]] + Selection derived from the rule. + + Returns + ------- + factors: float or xr.DataArray + Object which can be multiplied with the input or output DataArray to apply + weights. + """ + if operation_type == "input": + operation_verb = "sum up" + trivial_sum_rule = "extensive" + nontrivial_sum_rule = "intensive" + rule_cardinality = rule.cardinality_a + else: + operation_verb = "split" + trivial_sum_rule = "intensive" + nontrivial_sum_rule = "extensive" + rule_cardinality = rule.cardinality_b + + # just one category or trivial sum rule, so no weights required + if rule_cardinality == "one" or sum_rule == trivial_sum_rule: + return 1.0 + if sum_rule == nontrivial_sum_rule: + if weights is None: + raise WeightingInfoMissing( + category=category, + rule=rule, + message=f"We need to {operation_verb} multiple categories with" + f" sum_rule={nontrivial_sum_rule}, but no {operation_type}_weights are" + f" specified.", ) - if already_converted: - # TODO: maybe we can do better? - logger.warning( - f"For category {category!r}, would want to use a " - "rule with multiple outputs, but the following outputs " - f"are already converted: {already_converted!r}. " - "Skipping this category and leaving it NaN." - ) - continue + effective_weights = weights.loc[selection] + # normalize so it is actually a weight, not a factor + return effective_weights / effective_weights.sum(dim=dim) + + raise WeightingInfoMissing( + category=category, + rule=rule, + message=f"We need to {operation_verb} multiple categories, but the sum_rule is" + f" not specified. Rule can only be used if sum_rule={trivial_sum_rule!r} or" + f" sum_rule={nontrivial_sum_rule} and {operation_type}_weights are" + f" specified.", + ) - # derive input and output weights (maybe trivial) - if rule.cardinality_a == "one" or sum_rule == "extensive": - effective_input_weights = 1 - elif sum_rule == "intensive": - # summing intensive units requires weights - if input_weights is None: - logger.warning( - f"To derive data for {category!r}, we need to sum up" - " multiple input categories. For sum_rule='intensive'," - " this requires input_weights, but none are specified." - " Will continue with NaN, specify input_weights to avoid this." - ) - continue - effective_input_weights = input_weights.loc[input_selection] - # normalize so it is actually a weight, not a factor - effective_input_weights /= effective_input_weights.sum(dim=dim) - else: # no sum rule specified, but needed - logger.warning( - f"To derive data for {category!r}, we need to sum up" - " multiple input categories, but the sum_rule is" - " not specified. Will continue with NaN, specify the" - " sum_rule to avoid this." - ) - continue - if rule.cardinality_b == "one" or sum_rule == "intensive": - effective_output_weights = 1 - elif sum_rule == "extensive": - # dividing extensive units requires weights - if output_weights is None: - logger.warning( - f"To derive data for {category!r}, we need to split up" - " multiple output categories. For sum_rule='extensive'," - " this requires output_weights, but none are specified." - " Will continue with NaN, specify output_weights to avoid this." 
- ) - continue - effective_output_weights = output_weights.loc[output_selection] - # normalize so it is actually a weight, not a factor - effective_output_weights /= effective_output_weights.sum(dim=dim) - else: # no sum rule specified, but needed - logger.warning( - f"To derive data for {category!r}, we need to split up" - " multiple output categories, but the sum_rule is" - " not specified. Will continue with NaN, specify the" - " sum_rule to avoid this." - ) - continue +def prepare_auxiliary_dimensions( + conversion: climate_categories.Conversion, + auxiliary_dimensions: typing.Optional[typing.Dict[str, str]], +) -> typing.Optional[typing.Dict[climate_categories.Categorization, str]]: + """Prepare and check the auxiliary dimension mapping. - # the left-hand side of the conversion formula summed up - lhs = ( - input_factors * effective_input_weights * self._da.loc[input_selection] - ).sum(dim=dim) - # the right-hand side of the conversion formula split up - rhs = lhs / output_factors / effective_output_weights - # TODO: using pr.set here is not efficient because it makes copies - converted = converted.pr.set( - dim=new_dim, - key=new_dim_values, - value=rhs, + Check if all auxiliary categorizations used in the conversion are matched in + auxiliary_dimensions. + + Raises a ValueError if any dimension is missing. + + Returns + ------- + auxiliary_dimensions: dict mapping Categorization -> str + the auxiliary dimensions, but using Categorization objects instead of their + names. + """ + if conversion.auxiliary_categorizations_names: + if auxiliary_dimensions is None: + raise ValueError( + "The conversion uses auxiliary categories, but a translation to" + " dimension names was not provided using the argument" + " auxiliary_dimensions. Please provide auxiliary_dimensions mapping" + f" {conversion.auxiliary_categorizations_names} to the dimension" + " names used in the data." + ) + missing = set(conversion.auxiliary_categorizations_names).difference( + auxiliary_dimensions.keys() + ) + if missing: + raise ValueError( + "A dimension name was not given for all auxiliary categories:" + f" {missing} are missing in the auxiliary_dimensions argument, please" + " provide translations to the dimension names used in the data." 
) - # mark all filled categories as converted - converted_categories += new_dim_values + if not auxiliary_dimensions: + return auxiliary_dimensions - return converted + return { + climate_categories.cats[name]: auxiliary_dimensions[name] + for name in auxiliary_dimensions + } diff --git a/primap2/tests/test_convert.py b/primap2/tests/test_convert.py index fbd7b9fb..32c3670d 100644 --- a/primap2/tests/test_convert.py +++ b/primap2/tests/test_convert.py @@ -1,6 +1,7 @@ """Tests for _convert.py""" import climate_categories as cc +import pytest import xarray as xr import primap2 @@ -11,10 +12,19 @@ def test_convert_ipcc(empty_ds: xr.Dataset): # to see da = empty_ds["CO2"] da = da.expand_dims({"category (IPCC1996)": list(cc.IPCC1996.keys())}) + da = da.expand_dims({"source (gas)": list(cc.gas.keys())}) arr = da.data.copy() arr[:] = 1 * primap2.ureg("Gg CO2 / year") da.data = arr - da.pr.convert("category", "IPCC2006", sum_rule="extensive") + with pytest.raises(ValueError, match="The conversion uses auxiliary categories"): + da.pr.convert("category", "IPCC2006", sum_rule="extensive") + + da.pr.convert( + "category", + "IPCC2006", + sum_rule="extensive", + auxiliary_dimensions={"gas": "source (gas)"}, + ) # TODO test that values actually make sense diff --git a/setup.cfg b/setup.cfg index 8acb6943..537fd060 100644 --- a/setup.cfg +++ b/setup.cfg @@ -47,7 +47,7 @@ install_requires = ruamel.yaml strictyaml openpyxl - climate_categories>=0.6.0 + climate_categories>=0.6.2 [options.extras_require] test = From 8b9b86f69f28420759923cab8248b033830523e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Pfl=C3=BCger?= Date: Fri, 5 Nov 2021 19:14:30 +0100 Subject: [PATCH 05/36] Avoid name clash between weight function and its own argument. --- primap2/_convert.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/primap2/_convert.py b/primap2/_convert.py index 5917190d..df0b0936 100644 --- a/primap2/_convert.py +++ b/primap2/_convert.py @@ -207,7 +207,7 @@ def _fill_category( continue try: - effective_input_weights = weights( + effective_input_weights = derive_weights( dim=dim, category=category, rule=rule, @@ -216,7 +216,7 @@ def _fill_category( sum_rule=sum_rule, weights=input_weights, ) - effective_output_weights = weights( + effective_output_weights = derive_weights( dim=new_dim, category=category, rule=rule, @@ -463,7 +463,7 @@ def __init__( ValueError.__init__(self, full_message) -def weights( +def derive_weights( *, dim: str, category: climate_categories.Category, From f4cc526d15ddaaf001e54e8e02beb7a634871446 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Pfl=C3=BCger?= Date: Fri, 5 Nov 2021 20:48:32 +0100 Subject: [PATCH 06/36] Require climate_categories >= 0.6.3, which introduces some API we need. 
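The conversion code added in the previous patches uses conversion and rule attributes such as conversion.auxiliary_categorizations_names and rule.is_restricted; the version bump below is assumed to be what makes this API available. A quick sketch to check against an installed version:

    import climate_categories

    conversion = climate_categories.IPCC1996.conversion_to(
        climate_categories.cats["IPCC2006"]
    )
    # for this conversion the auxiliary categorization is "gas", matching the
    # auxiliary_dimensions={"gas": "source (gas)"} mapping used in the tests
    print(conversion.auxiliary_categorizations_names)
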
--- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 537fd060..1c873c9d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -47,7 +47,7 @@ install_requires = ruamel.yaml strictyaml openpyxl - climate_categories>=0.6.2 + climate_categories>=0.6.3 [options.extras_require] test = From fdbec61cb5b8083d90fca4600b531fa54e1a3b1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Pfl=C3=BCger?= Date: Tue, 14 Nov 2023 16:50:15 +0100 Subject: [PATCH 07/36] style: ruff --- primap2/_convert.py | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/primap2/_convert.py b/primap2/_convert.py index df0b0936..3f803251 100644 --- a/primap2/_convert.py +++ b/primap2/_convert.py @@ -1,6 +1,6 @@ import copy import typing -from typing import Hashable +from collections.abc import Hashable import climate_categories import numpy as np @@ -21,7 +21,7 @@ def convert( sum_rule: typing.Optional[str] = None, input_weights: typing.Optional[xr.DataArray] = None, output_weights: typing.Optional[xr.DataArray] = None, - auxiliary_dimensions: typing.Optional[typing.Dict[str, str]] = None, + auxiliary_dimensions: typing.Optional[dict[str, str]] = None, ) -> xr.DataArray: """Convert the data along the given dimension into the new categorization. @@ -123,16 +123,16 @@ def _fill_category( da: xr.DataArray, dim: str, new_dim: str, - already_converted_categories: typing.List[climate_categories.Category], + already_converted_categories: list[climate_categories.Category], category: climate_categories.Category, conversion: climate_categories.Conversion, sum_rule: typing.Optional[str], auxiliary_dimensions: typing.Optional[ - typing.Dict[climate_categories.Categorization, str] + dict[climate_categories.Categorization, str] ], input_weights: typing.Optional[xr.DataArray] = None, output_weights: typing.Optional[xr.DataArray] = None, - ) -> typing.Tuple[typing.List[climate_categories.Category], xr.DataArray]: + ) -> tuple[list[climate_categories.Category], xr.DataArray]: """Return a copy of da with the given category filled by values converted using the given conversion. 
@@ -279,13 +279,11 @@ def extract_categorization_from_dim(dim: str) -> (str, str): try: pure, cat = dim.split("(", 1) except ValueError: - raise ValueError(f"No categorization specified: {dim!r}.") + raise ValueError(f"No categorization specified: {dim!r}.") from None return pure[:-1], cat[:-1] -def applicable_rules( - conversion, category -) -> typing.List[climate_categories.ConversionRule]: +def applicable_rules(conversion, category) -> list[climate_categories.ConversionRule]: """Find the possible rules to derive the category using the given conversion.""" rules = conversion.relevant_rules({conversion.categorization_b[category]}) # a + b = c - d can not be used to derive c nor d, only a and b @@ -399,12 +397,12 @@ def initialize_empty_converted_da( def factors_categories_to_xarray( *, dim: str, - factors_categories: typing.Dict[climate_categories.Category, int], - auxiliary_categories: typing.Dict[ - climate_categories.Categorization, typing.Set[climate_categories.Category] + factors_categories: dict[climate_categories.Category, int], + auxiliary_categories: dict[ + climate_categories.Categorization, set[climate_categories.Category] ], - auxiliary_dimensions: typing.Dict[climate_categories.Categorization, str], -) -> typing.Tuple[typing.Dict[str, typing.List[str]], xr.DataArray]: + auxiliary_dimensions: dict[climate_categories.Categorization, str], +) -> tuple[dict[str, list[str]], xr.DataArray]: """Convert dictionary mapping categories to factors into xarray-compatible objects. Using the xarray objects ensures that in subsequent calculations, everything @@ -471,7 +469,7 @@ def derive_weights( sum_rule: typing.Optional[str], operation_type: str, weights: typing.Optional[xr.DataArray], - selection: typing.Dict[str, typing.List[str]], + selection: dict[str, list[str]], ) -> typing.Union[xr.DataArray, float]: """Derive the weights to use for applying a specific rule. @@ -543,8 +541,8 @@ def derive_weights( def prepare_auxiliary_dimensions( conversion: climate_categories.Conversion, - auxiliary_dimensions: typing.Optional[typing.Dict[str, str]], -) -> typing.Optional[typing.Dict[climate_categories.Categorization, str]]: + auxiliary_dimensions: typing.Optional[dict[str, str]], +) -> typing.Optional[dict[climate_categories.Categorization, str]]: """Prepare and check the auxiliary dimension mapping. 
Check if all auxiliary categorizations used in the conversion are matched in From 1ecee182369312c497fac13f0fd5e710f4d0eee8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Pfl=C3=BCger?= Date: Wed, 15 Nov 2023 10:40:46 +0100 Subject: [PATCH 08/36] fix: stub file generation --- primap-stubs.patch | 54 +++++++++++++++++++++------------------------- 1 file changed, 24 insertions(+), 30 deletions(-) diff --git a/primap-stubs.patch b/primap-stubs.patch index 334b78ce..63e28fde 100644 --- a/primap-stubs.patch +++ b/primap-stubs.patch @@ -1,40 +1,34 @@ -diff '--color=auto' -r -u xarray-orig/core/dataarray.pyi xarray/core/dataarray.pyi ---- xarray-orig/core/dataarray.pyi 2022-05-09 19:44:00.059497745 +0200 -+++ xarray/core/dataarray.pyi 2022-05-09 19:45:46.028378457 +0200 -@@ -26,6 +26,7 @@ - from dask.delayed import Delayed - from iris.cube import Cube as iris_Cube - from typing import Any, Callable, Hashable, Iterable, Literal, Mapping, Sequence +diff '--color=auto' -ru xarray.orig/core/dataarray.pyi xarray/core/dataarray.pyi +--- xarray.orig/core/dataarray.pyi 2023-11-15 10:36:00.509607027 +0100 ++++ xarray/core/dataarray.pyi 2023-11-15 10:35:25.561354898 +0100 +@@ -1,3 +1,4 @@ +import primap2 + import datetime + import numpy as np + import pandas as pd +@@ -48,6 +49,8 @@ + def __setitem__(self, key, value) -> None: ... - class _LocIndexer: - data_array: Incomplete -@@ -36,6 +37,8 @@ - class DataArray(AbstractArray, DataWithCoords, DataArrayArithmetic): - dt: Incomplete - def __init__(self, data: Any = ..., coords: Union[Sequence[tuple], Mapping[Any, Any], None] = ..., dims: Union[Hashable, Sequence[Hashable], None] = ..., name: Hashable = ..., attrs: Mapping = ..., indexes: dict[Hashable, pd.Index] = ..., fastpath: bool = ...) -> None: ... + class DataArray(AbstractArray, DataWithCoords, DataArrayArithmetic, DataArrayAggregations): + @property + def pr(self) -> primap2.accessors.PRIMAP2DataArrayAccessor: ... + dt: Incomplete + def __init__(self, data: Any = ..., coords: Sequence[Sequence[Any] | pd.Index | DataArray] | Mapping[Any, Any] | None = ..., dims: Hashable | Sequence[Hashable] | None = ..., name: Hashable | None = ..., attrs: Mapping | None = ..., indexes: Mapping[Any, Index] | None = ..., fastpath: bool = ...) -> None: ... def to_dataset(self, dim: Hashable = ..., *, name: Hashable = ..., promote_attrs: bool = ...) -> Dataset: ... - @property - def name(self) -> Union[Hashable, None]: ... -diff '--color=auto' -r -u xarray-orig/core/dataset.pyi xarray/core/dataset.pyi ---- xarray-orig/core/dataset.pyi 2022-05-09 19:44:00.059497745 +0200 -+++ xarray/core/dataset.pyi 2022-05-09 19:45:37.980311296 +0200 -@@ -27,6 +27,7 @@ - from numbers import Number - from os import PathLike - from typing import Any, Callable, Collection, Hashable, Iterable, Iterator, Literal, Mapping, MutableMapping, Sequence, overload +diff '--color=auto' -ru xarray.orig/core/dataset.pyi xarray/core/dataset.pyi +--- xarray.orig/core/dataset.pyi 2023-11-15 10:36:00.513607056 +0100 ++++ xarray/core/dataset.pyi 2023-11-15 10:35:25.577355012 +0100 +@@ -1,3 +1,4 @@ +import primap2 + import datetime + import numpy as np + import pandas as pd +@@ -59,6 +60,8 @@ + def __setitem__(self, key, value) -> None: ... - def calculate_dimensions(variables: Mapping[Any, Variable]) -> dict[Hashable, int]: ... - def merge_indexes(indexes: Mapping[Any, Union[Hashable, Sequence[Hashable]]], variables: Mapping[Any, Variable], coord_names: set[Hashable], append: bool = ...) -> tuple[dict[Hashable, Variable], set[Hashable]]: ... 
-@@ -50,6 +51,8 @@ - - class Dataset(DataWithCoords, DatasetArithmetic, Mapping): - def __init__(self, data_vars: Mapping[Any, Any] = ..., coords: Mapping[Any, Any] = ..., attrs: Mapping[Any, Any] = ...) -> None: ... + class Dataset(DataWithCoords, DatasetAggregations, DatasetArithmetic, Mapping[Hashable, 'DataArray']): + @property + def pr(self) -> primap2.accessors.PRIMAP2DatasetAccessor: ... + def __init__(self, data_vars: DataVars | None = ..., coords: Mapping[Any, Any] | None = ..., attrs: Mapping[Any, Any] | None = ...) -> None: ... + def __eq__(self, other: DsCompatible) -> Self: ... @classmethod - def load_store(cls, store, decoder: Incomplete | None = ...) -> Dataset: ... - @property From a7a1d9c54364ee3a927b7c3b1e951197bcd92824 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Pfl=C3=BCger?= Date: Wed, 15 Nov 2023 11:10:06 +0100 Subject: [PATCH 09/36] types: better typing for sum_rule --- primap2/_convert.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/primap2/_convert.py b/primap2/_convert.py index 3f803251..c38e33fd 100644 --- a/primap2/_convert.py +++ b/primap2/_convert.py @@ -18,7 +18,9 @@ def convert( dim: typing.Union[Hashable, str], categorization: typing.Union[climate_categories.Categorization, str], *, - sum_rule: typing.Optional[str] = None, + sum_rule: typing.Literal["intensive"] + | typing.Literal["extensive"] + | None = None, input_weights: typing.Optional[xr.DataArray] = None, output_weights: typing.Optional[xr.DataArray] = None, auxiliary_dimensions: typing.Optional[dict[str, str]] = None, From 1a2f2ba8909236df9d2ebe9f22f9e975cbb9be4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Pfl=C3=BCger?= Date: Wed, 15 Nov 2023 12:26:11 +0100 Subject: [PATCH 10/36] test: some tests for correct results of convert() --- primap2/tests/test_convert.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/primap2/tests/test_convert.py b/primap2/tests/test_convert.py index 32c3670d..5cdf3092 100644 --- a/primap2/tests/test_convert.py +++ b/primap2/tests/test_convert.py @@ -20,11 +20,20 @@ def test_convert_ipcc(empty_ds: xr.Dataset): with pytest.raises(ValueError, match="The conversion uses auxiliary categories"): da.pr.convert("category", "IPCC2006", sum_rule="extensive") - da.pr.convert( + result = da.pr.convert( "category", "IPCC2006", sum_rule="extensive", auxiliary_dimensions={"gas": "source (gas)"}, ) - # TODO test that values actually make sense + assert ( + (result.pr.loc[{"category": "1"}] == 1.0 * primap2.ureg("Gg CO2 / year")) + .all() + .item() + ) + assert ( + (result.pr.loc[{"category": "2"}] == 2.0 * primap2.ureg("Gg CO2 / year")) + .all() + .item() + ) From 7e66953e5e66251b83bdf45fbe3a909fe7284046 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Pfl=C3=BCger?= Date: Wed, 15 Nov 2023 12:27:42 +0100 Subject: [PATCH 11/36] perf: convert in-place --- primap2/_convert.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/primap2/_convert.py b/primap2/_convert.py index c38e33fd..5ce8f719 100644 --- a/primap2/_convert.py +++ b/primap2/_convert.py @@ -238,9 +238,7 @@ def _fill_category( # the right-hand side of the conversion formula split up rhs = lhs / output_factors / effective_output_weights - # TODO: this is slow because it makes copies - # fillna behaviour (only overwrites NaN in converted) - da = da.combine_first(rhs) + da.loc[output_selection] = rhs if not rule.is_restricted: # stop processing rules for this category @@ -387,6 +385,7 @@ def initialize_empty_converted_da( # initialize the converted 
array using all NA all_na_array = np.empty(new_shape) all_na_array[:] = np.nan + all_na_array = all_na_array * old_da.pint.units return xr.DataArray( data=all_na_array, dims=new_dims, From a76feea442c7b5834153a572806e279afc137c80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Pfl=C3=BCger?= Date: Wed, 15 Nov 2023 12:34:07 +0100 Subject: [PATCH 12/36] fix: types for 3.9 --- primap2/_convert.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/primap2/_convert.py b/primap2/_convert.py index 5ce8f719..64f6f35a 100644 --- a/primap2/_convert.py +++ b/primap2/_convert.py @@ -18,9 +18,9 @@ def convert( dim: typing.Union[Hashable, str], categorization: typing.Union[climate_categories.Categorization, str], *, - sum_rule: typing.Literal["intensive"] - | typing.Literal["extensive"] - | None = None, + sum_rule: typing.Optional[ + typing.Union[typing.Literal["intensive"], typing.Literal["extensive"]] + ] = None, input_weights: typing.Optional[xr.DataArray] = None, output_weights: typing.Optional[xr.DataArray] = None, auxiliary_dimensions: typing.Optional[dict[str, str]] = None, From 91f9b76919b5ad706739b0e19ea2a45e6ad7ffa9 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 7 Oct 2024 12:48:54 +0000 Subject: [PATCH 13/36] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- primap2/_convert.py | 60 +++++++++++++++-------------------- primap2/tests/test_convert.py | 12 ++----- 2 files changed, 27 insertions(+), 45 deletions(-) diff --git a/primap2/_convert.py b/primap2/_convert.py index 64f6f35a..d755d69a 100644 --- a/primap2/_convert.py +++ b/primap2/_convert.py @@ -15,15 +15,13 @@ class DataArrayConversionAccessor(_accessor_base.BaseDataArrayAccessor): @alias_dims(["dim"]) def convert( self, - dim: typing.Union[Hashable, str], - categorization: typing.Union[climate_categories.Categorization, str], + dim: Hashable | str, + categorization: climate_categories.Categorization | str, *, - sum_rule: typing.Optional[ - typing.Union[typing.Literal["intensive"], typing.Literal["extensive"]] - ] = None, - input_weights: typing.Optional[xr.DataArray] = None, - output_weights: typing.Optional[xr.DataArray] = None, - auxiliary_dimensions: typing.Optional[dict[str, str]] = None, + sum_rule: typing.Literal["intensive", "extensive"] | None = None, + input_weights: xr.DataArray | None = None, + output_weights: xr.DataArray | None = None, + auxiliary_dimensions: dict[str, str] | None = None, ) -> xr.DataArray: """Convert the data along the given dimension into the new categorization. 
@@ -88,9 +86,7 @@ def convert( dim_name, old_categorization_name = extract_categorization_from_dim(dim) old_categorization = ensure_categorization_instance(old_categorization_name) conversion = old_categorization.conversion_to(new_categorization) - auxiliary_dimensions = prepare_auxiliary_dimensions( - conversion, auxiliary_dimensions - ) + auxiliary_dimensions = prepare_auxiliary_dimensions(conversion, auxiliary_dimensions) new_dim = f"{dim_name} ({new_categorization.name})" converted_da = initialize_empty_converted_da( @@ -128,12 +124,10 @@ def _fill_category( already_converted_categories: list[climate_categories.Category], category: climate_categories.Category, conversion: climate_categories.Conversion, - sum_rule: typing.Optional[str], - auxiliary_dimensions: typing.Optional[ - dict[climate_categories.Categorization, str] - ], - input_weights: typing.Optional[xr.DataArray] = None, - output_weights: typing.Optional[xr.DataArray] = None, + sum_rule: str | None, + auxiliary_dimensions: dict[climate_categories.Categorization, str] | None, + input_weights: xr.DataArray | None = None, + output_weights: xr.DataArray | None = None, ) -> tuple[list[climate_categories.Category], xr.DataArray]: """Return a copy of da with the given category filled by values converted using the given conversion. @@ -232,9 +226,9 @@ def _fill_category( continue # the left-hand side of the conversion formula summed up - lhs = ( - input_factors * effective_input_weights * self._da.loc[input_selection] - ).sum(dim=dim) + lhs = (input_factors * effective_input_weights * self._da.loc[input_selection]).sum( + dim=dim + ) # the right-hand side of the conversion formula split up rhs = lhs / output_factors / effective_output_weights @@ -295,7 +289,7 @@ def applicable_rules(conversion, category) -> list[climate_categories.Conversion def ensure_categorization_instance( - cat: typing.Union[str, climate_categories.Categorization] + cat: str | climate_categories.Categorization, ) -> climate_categories.Categorization: """Takes a categorization name or object and returns the corresponding categorization object.""" @@ -304,21 +298,20 @@ def ensure_categorization_instance( return climate_categories.cats[cat] -def check_valid_sum_rule_types(sum_rule: typing.Optional[str]): +def check_valid_sum_rule_types(sum_rule: str | None): """Checks if the sum_rule is either "intensive", "extensive", or None. Raises a ValueError if an invalid sum_rule is used.""" if sum_rule not in (None, "extensive", "intensive"): raise ValueError( - f"if defined, sum_rule must be either 'extensive' or 'intensive', not" - f" {sum_rule}" + f"if defined, sum_rule must be either 'extensive' or 'intensive', not" f" {sum_rule}" ) def initialize_empty_converted_da( *, old_da: xr.DataArray, - old_dim: typing.Union[Hashable, str], + old_dim: Hashable | str, new_dim: str, new_categorization: climate_categories.Categorization, ) -> xr.DataArray: @@ -399,9 +392,7 @@ def factors_categories_to_xarray( *, dim: str, factors_categories: dict[climate_categories.Category, int], - auxiliary_categories: dict[ - climate_categories.Categorization, set[climate_categories.Category] - ], + auxiliary_categories: dict[climate_categories.Categorization, set[climate_categories.Category]], auxiliary_dimensions: dict[climate_categories.Categorization, str], ) -> tuple[dict[str, list[str]], xr.DataArray]: """Convert dictionary mapping categories to factors into xarray-compatible objects. 
@@ -467,11 +458,11 @@ def derive_weights( dim: str, category: climate_categories.Category, rule: climate_categories.ConversionRule, - sum_rule: typing.Optional[str], + sum_rule: str | None, operation_type: str, - weights: typing.Optional[xr.DataArray], + weights: xr.DataArray | None, selection: dict[str, list[str]], -) -> typing.Union[xr.DataArray, float]: +) -> xr.DataArray | float: """Derive the weights to use for applying a specific rule. Parameters @@ -542,8 +533,8 @@ def derive_weights( def prepare_auxiliary_dimensions( conversion: climate_categories.Conversion, - auxiliary_dimensions: typing.Optional[dict[str, str]], -) -> typing.Optional[dict[climate_categories.Categorization, str]]: + auxiliary_dimensions: dict[str, str] | None, +) -> dict[climate_categories.Categorization, str] | None: """Prepare and check the auxiliary dimension mapping. Check if all auxiliary categorizations used in the conversion are matched in @@ -580,6 +571,5 @@ def prepare_auxiliary_dimensions( return auxiliary_dimensions return { - climate_categories.cats[name]: auxiliary_dimensions[name] - for name in auxiliary_dimensions + climate_categories.cats[name]: auxiliary_dimensions[name] for name in auxiliary_dimensions } diff --git a/primap2/tests/test_convert.py b/primap2/tests/test_convert.py index 5cdf3092..81dfb8a1 100644 --- a/primap2/tests/test_convert.py +++ b/primap2/tests/test_convert.py @@ -27,13 +27,5 @@ def test_convert_ipcc(empty_ds: xr.Dataset): auxiliary_dimensions={"gas": "source (gas)"}, ) - assert ( - (result.pr.loc[{"category": "1"}] == 1.0 * primap2.ureg("Gg CO2 / year")) - .all() - .item() - ) - assert ( - (result.pr.loc[{"category": "2"}] == 2.0 * primap2.ureg("Gg CO2 / year")) - .all() - .item() - ) + assert (result.pr.loc[{"category": "1"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item() + assert (result.pr.loc[{"category": "2"}] == 2.0 * primap2.ureg("Gg CO2 / year")).all().item() From 084980512be0c58c189022593e38a17c33f79a7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Pfl=C3=BCger?= Date: Mon, 7 Oct 2024 14:51:24 +0200 Subject: [PATCH 14/36] fix: _alias_selection is called _selection now --- primap2/_convert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/primap2/_convert.py b/primap2/_convert.py index d755d69a..ff70aedd 100644 --- a/primap2/_convert.py +++ b/primap2/_convert.py @@ -8,7 +8,7 @@ from loguru import logger from . 
import _accessor_base -from ._alias_selection import alias_dims +from ._selection import alias_dims class DataArrayConversionAccessor(_accessor_base.BaseDataArrayAccessor): From e88f760d7a8b2822a19ee252a3d9fcf3cc9b90c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Pfl=C3=BCger?= Date: Mon, 7 Oct 2024 14:54:48 +0200 Subject: [PATCH 15/36] fix: bump minimum required version of climate_categories to something non-ancient --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index d09c22c9..b6dd2b41 100644 --- a/setup.cfg +++ b/setup.cfg @@ -53,7 +53,7 @@ install_requires = openpyxl>=3.1 tqdm>=4.66 msgpack>=1 - climate_categories>=0.6.3 + climate_categories>=0.10.1 [options.extras_require] test = From 57c8a5f753c47e699511192eaccf4024d71cee33 Mon Sep 17 00:00:00 2001 From: Daniel Busch Date: Thu, 10 Oct 2024 09:33:03 +0200 Subject: [PATCH 16/36] BURDI test draft --- primap2/tests/test_convert.py | 64 +++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/primap2/tests/test_convert.py b/primap2/tests/test_convert.py index 81dfb8a1..a4f66791 100644 --- a/primap2/tests/test_convert.py +++ b/primap2/tests/test_convert.py @@ -29,3 +29,67 @@ def test_convert_ipcc(empty_ds: xr.Dataset): assert (result.pr.loc[{"category": "1"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item() assert (result.pr.loc[{"category": "2"}] == 2.0 * primap2.ureg("Gg CO2 / year")).all().item() + +def test_convert_BURDI(empty_ds: xr.Dataset): + # build a DA categorized by BURDI and with 1 everywhere so results are easy + # to see + + # TODO this should come from climate categories + mapping_BURDI_to_IPCC2006_PRIMAP = { + "1" : "1", + "1.A" : "1.A", + "1.A.1" : "1.A.1", + "1.A.2" : "1.A.2", + "1.A.3" : "1.A.3", + "1.A.4" : "1.A.4", + "1.A.5" : "1.A.5", + "1.B" : "1.B", + "1.B.1" : "1.B.1", + "1.B.2" : "1.B.2", + "2" : "M.2.BURDI", + "2.A" : "2.A", + "2.B" : "M.2.B_2.B", + "2.C" : "2.C", + "2.D" : "M.2.H.1_2", + "2.E" : "M.2.B_2.E", + "2.F" : "2.F", + "2.G" : "2.H.3", + "3" : "2.D", + "4" : "M.AG", + "4.A" : "3.A.1", + "4.B" : "3.A.2", + "4.C" : "3.C.7", + "4.D" : "M.3.C.45.AG", + "4.E" : "3.C.1.c", + "4.F" : "3.C.1.b", + "4.G" : "3.C.8", + "5" : "M.LULUCF", + "6" : "4", + "6.A" : "4.A", + "6.B" : "4.D", + "6.C" : "4.C", + "6.D" : "4.E", + "24540" : "0", + "15163" : "M.0.EL", + "14637" : "M.BK", + "14424" : "M.BK.A", + "14423" : "M.BK.M", + "14638" : "M.BIO", + "7" : "5", + } # 5.A-D ignored as not fitting 2006 cats + + + da = empty_ds["CO2"] + da = da.expand_dims({"category (IPCC1996)": list(mapping_BURDI_to_IPCC2006_PRIMAP.keys())}) + da = da.expand_dims({"source (gas)": list(cc.gas.keys())}) + arr = da.data.copy() + + result = da.pr.convert( + "category", + "IPCC2006", + sum_rule="extensive", + auxiliary_dimensions={"gas" : "source (gas)"}, + ) + + # TODO + assert False From b9973cdc8783464f75d1a29d734c303f9df55887 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Pfl=C3=BCger?= Date: Thu, 10 Oct 2024 15:06:35 +0200 Subject: [PATCH 17/36] docs: add some algorithm notes --- primap2/_convert.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/primap2/_convert.py b/primap2/_convert.py index ff70aedd..b908b32e 100644 --- a/primap2/_convert.py +++ b/primap2/_convert.py @@ -96,6 +96,10 @@ def convert( new_categorization=new_categorization, ) + # idea: convert 1-to-1 mappings first, should be easy in a single xarray + # operation + # note: if you have multiple rules to fill a single category, we should + # use something like fillna converted_categories 
= [] for category in converted_da[new_dim]: if category in converted_categories: From 920999c53c072d79016e08dacd182465fd193ba0 Mon Sep 17 00:00:00 2001 From: Daniel Busch Date: Mon, 14 Oct 2024 15:42:10 +0200 Subject: [PATCH 18/36] test for BURDI conversion --- primap2/_convert.py | 222 +++++++++++++----------- primap2/tests/data/BURDI_conversion.csv | 40 +++++ primap2/tests/test_convert.py | 117 +++++++------ 3 files changed, 225 insertions(+), 154 deletions(-) create mode 100644 primap2/tests/data/BURDI_conversion.csv diff --git a/primap2/_convert.py b/primap2/_convert.py index b908b32e..99c5b4da 100644 --- a/primap2/_convert.py +++ b/primap2/_convert.py @@ -11,18 +11,18 @@ from ._selection import alias_dims -class DataArrayConversionAccessor(_accessor_base.BaseDataArrayAccessor): +class DataArrayConversionAccessor(_accessor_base.BaseDataArrayAccessor) : @alias_dims(["dim"]) def convert( - self, - dim: Hashable | str, - categorization: climate_categories.Categorization | str, - *, - sum_rule: typing.Literal["intensive", "extensive"] | None = None, - input_weights: xr.DataArray | None = None, - output_weights: xr.DataArray | None = None, - auxiliary_dimensions: dict[str, str] | None = None, - ) -> xr.DataArray: + self, + dim: Hashable | str, + categorization: climate_categories.Categorization | climate_categories._conversions.Conversion | str, + *, + sum_rule: typing.Literal["intensive", "extensive"] | None = None, + input_weights: xr.DataArray | None = None, + output_weights: xr.DataArray | None = None, + auxiliary_dimensions: dict[str, str] | None = None, + ) -> xr.DataArray : """Convert the data along the given dimension into the new categorization. Maps the given dimension from one categorization (terminology) into another. @@ -37,7 +37,8 @@ def convert( categorization : climate_categories.Categorization or str New categorization to convert the given dimension to. Either give the title of the new categorization (like ``IPCC1996``) or a - ``climate_categories.Categorization`` object. + ``climate_categories.Categorization`` object or a + ``climate_categories._conversions.Conversion`` object. sum_rule : ``extensive``, ``intensive``, or None (default) If data of categories has to be summed up or divided, we need information whether the quantity measured is extensive (like, for example, total @@ -80,12 +81,25 @@ def convert( A copy of the DataArray with the given dimension converted in the new categorization. """ - new_categorization = ensure_categorization_instance(categorization) + dim_name, old_categorization_name = extract_categorization_from_dim(dim) + + # user put in str of new category or categorisation object + if isinstance(categorization, (climate_categories.Categorization, str)) : + new_categorization = ensure_categorization_instance(categorization) + old_categorization = ensure_categorization_instance(old_categorization_name) + conversion = old_categorization.conversion_to(new_categorization) + elif isinstance(categorization, climate_categories._conversions.Conversion): + new_categorization = ensure_categorization_instance(categorization.categorization_b_name) + conversion = categorization + else: + raise ValueError( + f"categorization must be of instance climate_categories.Categorization " + f"or climate_categories._conversions.Conversion. 
Got {type(categorization)}" + ) + check_valid_sum_rule_types(sum_rule) - dim_name, old_categorization_name = extract_categorization_from_dim(dim) - old_categorization = ensure_categorization_instance(old_categorization_name) - conversion = old_categorization.conversion_to(new_categorization) + auxiliary_dimensions = prepare_auxiliary_dimensions(conversion, auxiliary_dimensions) new_dim = f"{dim_name} ({new_categorization.name})" @@ -101,8 +115,8 @@ def convert( # note: if you have multiple rules to fill a single category, we should # use something like fillna converted_categories = [] - for category in converted_da[new_dim]: - if category in converted_categories: + for category in converted_da[new_dim] : + if category in converted_categories : continue newly_converted_categories, converted_da = self._fill_category( da=converted_da, @@ -121,18 +135,18 @@ def convert( return converted_da def _fill_category( - self, - da: xr.DataArray, - dim: str, - new_dim: str, - already_converted_categories: list[climate_categories.Category], - category: climate_categories.Category, - conversion: climate_categories.Conversion, - sum_rule: str | None, - auxiliary_dimensions: dict[climate_categories.Categorization, str] | None, - input_weights: xr.DataArray | None = None, - output_weights: xr.DataArray | None = None, - ) -> tuple[list[climate_categories.Category], xr.DataArray]: + self, + da: xr.DataArray, + dim: str, + new_dim: str, + already_converted_categories: list[climate_categories.Category], + category: climate_categories.Category, + conversion: climate_categories.Conversion, + sum_rule: str | None, + auxiliary_dimensions: dict[climate_categories.Categorization, str] | None, + input_weights: xr.DataArray | None = None, + output_weights: xr.DataArray | None = None, + ) -> tuple[list[climate_categories.Category], xr.DataArray] : """Return a copy of da with the given category filled by values converted using the given conversion. @@ -166,13 +180,13 @@ def _fill_category( filled_categories, filled: list of climate_categories.category, xr.DataArray The categories that were filled and the new DataArray. 
""" - try: + try : rules = applicable_rules(conversion, category) - except KeyError: + except KeyError : logger.debug(f"No rule to derive data for {category!r}, will be NaN.") return [], da - for rule in rules: + for rule in rules : logger.debug(f"Processing rule {rule}.") # iterate until a non-restricted rule was applied or all rules are # exhausted @@ -197,7 +211,7 @@ def _fill_category( already_converted = set(output_selection[new_dim]).intersection( set(already_converted_categories) ) - if already_converted: + if already_converted : logger.warning( f"For category {category!r}, would want to use a " "rule with multiple outputs, but the following outputs " @@ -206,7 +220,7 @@ def _fill_category( ) continue - try: + try : effective_input_weights = derive_weights( dim=dim, category=category, @@ -225,7 +239,7 @@ def _fill_category( sum_rule=sum_rule, weights=output_weights, ) - except WeightingInfoMissing as err: + except WeightingInfoMissing as err : logger.warning(str(err)) continue @@ -238,7 +252,7 @@ def _fill_category( da.loc[output_selection] = rhs - if not rule.is_restricted: + if not rule.is_restricted : # stop processing rules for this category return output_selection[new_dim], da @@ -249,7 +263,7 @@ def _fill_category( return [], da -def extract_categorization_from_dim(dim: str) -> (str, str): +def extract_categorization_from_dim(dim: str) -> (str, str) : """Extract the pure dimension and the categorization from a composite dim. Parameters @@ -274,51 +288,51 @@ def extract_categorization_from_dim(dim: str) -> (str, str): The pure_dim without categorization information and the categorization. If the input dim does not contain categorization information, a ValueError is raised. """ - try: + try : pure, cat = dim.split("(", 1) - except ValueError: + except ValueError : raise ValueError(f"No categorization specified: {dim!r}.") from None return pure[:-1], cat[:-1] -def applicable_rules(conversion, category) -> list[climate_categories.ConversionRule]: +def applicable_rules(conversion, category) -> list[climate_categories.ConversionRule] : """Find the possible rules to derive the category using the given conversion.""" rules = conversion.relevant_rules({conversion.categorization_b[category]}) # a + b = c - d can not be used to derive c nor d, only a and b rules = [r for r in rules if all(f > 0 for f in r.factors_categories_b.values())] - if not rules: + if not rules : raise KeyError(category) return rules def ensure_categorization_instance( - cat: str | climate_categories.Categorization, -) -> climate_categories.Categorization: + cat: str | climate_categories.Categorization, +) -> climate_categories.Categorization : """Takes a categorization name or object and returns the corresponding categorization object.""" - if isinstance(cat, climate_categories.Categorization): + if isinstance(cat, climate_categories.Categorization) : return cat return climate_categories.cats[cat] -def check_valid_sum_rule_types(sum_rule: str | None): +def check_valid_sum_rule_types(sum_rule: str | None) : """Checks if the sum_rule is either "intensive", "extensive", or None. 
Raises a ValueError if an invalid sum_rule is used.""" - if sum_rule not in (None, "extensive", "intensive"): + if sum_rule not in (None, "extensive", "intensive") : raise ValueError( f"if defined, sum_rule must be either 'extensive' or 'intensive', not" f" {sum_rule}" ) def initialize_empty_converted_da( - *, - old_da: xr.DataArray, - old_dim: Hashable | str, - new_dim: str, - new_categorization: climate_categories.Categorization, -) -> xr.DataArray: + *, + old_da: xr.DataArray, + old_dim: Hashable | str, + new_dim: str, + new_categorization: climate_categories.Categorization, +) -> xr.DataArray : """Build a DataArray which can hold the data after conversion to a new categorization. @@ -347,19 +361,19 @@ def initialize_empty_converted_da( """ new_dims = [] new_shape = [] - for i, idim in enumerate(old_da.dims): - if idim == old_dim: + for i, idim in enumerate(old_da.dims) : + if idim == old_dim : new_dims.append(new_dim) new_shape.append(len(new_categorization)) - else: + else : new_dims.append(idim) new_shape.append(old_da.shape[i]) new_coords = {} - for coord in old_da.coords: - if coord == old_dim: + for coord in old_da.coords : + if coord == old_dim : new_coords[new_dim] = np.array(list(new_categorization.keys())) - elif old_dim in old_da.coords[coord].dims: + elif old_dim in old_da.coords[coord].dims : # The additional coordinate has the old_dim as one dimension, but we # won't be able to convert it logger.info( @@ -367,15 +381,15 @@ def initialize_empty_converted_da( f" and is skipped." ) continue - else: + else : new_coords[coord] = old_da.coords[coord] new_attrs = copy.deepcopy(old_da.attrs) - for pdim in ("area", "cat", "scen"): - if pdim in new_attrs and new_attrs[pdim] == old_dim: + for pdim in ("area", "cat", "scen") : + if pdim in new_attrs and new_attrs[pdim] == old_dim : new_attrs[pdim] = new_dim - if "sec cats" in new_attrs and old_dim in new_attrs["sec_cats"]: + if "sec cats" in new_attrs and old_dim in new_attrs["sec_cats"] : new_attrs["sec_cats"].remove(old_dim) new_attrs["sec_cats"].append(new_dim) @@ -393,12 +407,12 @@ def initialize_empty_converted_da( def factors_categories_to_xarray( - *, - dim: str, - factors_categories: dict[climate_categories.Category, int], - auxiliary_categories: dict[climate_categories.Categorization, set[climate_categories.Category]], - auxiliary_dimensions: dict[climate_categories.Categorization, str], -) -> tuple[dict[str, list[str]], xr.DataArray]: + *, + dim: str, + factors_categories: dict[climate_categories.Category, int], + auxiliary_categories: dict[climate_categories.Categorization, set[climate_categories.Category]], + auxiliary_dimensions: dict[climate_categories.Categorization, str], +) -> tuple[dict[str, list[str]], xr.DataArray] : """Convert dictionary mapping categories to factors into xarray-compatible objects. Using the xarray objects ensures that in subsequent calculations, everything @@ -426,30 +440,30 @@ def factors_categories_to_xarray( factors is an xarray DataArray which can be multiplied with an xarray object after applying the selection. 
""" - selection = {dim: [cat.codes[0] for cat in factors_categories.keys()]} + selection = {dim : [cat.codes[0] for cat in factors_categories.keys()]} factors = xr.DataArray( data=list(factors_categories.values()), dims=[dim], coords=selection, ) - for aux_categorization, aux_categories in auxiliary_categories.items(): - if aux_categories: + for aux_categorization, aux_categories in auxiliary_categories.items() : + if aux_categories : aux_dim = auxiliary_dimensions[aux_categorization] selection[aux_dim] = [cat.codes[0] for cat in aux_categories] return selection, factors -class WeightingInfoMissing(ValueError): +class WeightingInfoMissing(ValueError) : """Some information to derive weighting factors for a rule is missing.""" def __init__( - self, - category: climate_categories.Category, - rule: climate_categories.ConversionRule, - message: str, - ): + self, + category: climate_categories.Category, + rule: climate_categories.ConversionRule, + message: str, + ) : full_message = ( f"Can not derive data for category {category!r} using rule" f" '{rule}': {message} Skipping this rule." @@ -458,15 +472,15 @@ def __init__( def derive_weights( - *, - dim: str, - category: climate_categories.Category, - rule: climate_categories.ConversionRule, - sum_rule: str | None, - operation_type: str, - weights: xr.DataArray | None, - selection: dict[str, list[str]], -) -> xr.DataArray | float: + *, + dim: str, + category: climate_categories.Category, + rule: climate_categories.ConversionRule, + sum_rule: str | None, + operation_type: str, + weights: xr.DataArray | None, + selection: dict[str, list[str]], +) -> xr.DataArray | float : """Derive the weights to use for applying a specific rule. Parameters @@ -498,28 +512,28 @@ def derive_weights( Object which can be multiplied with the input or output DataArray to apply weights. """ - if operation_type == "input": + if operation_type == "input" : operation_verb = "sum up" trivial_sum_rule = "extensive" nontrivial_sum_rule = "intensive" rule_cardinality = rule.cardinality_a - else: + else : operation_verb = "split" trivial_sum_rule = "intensive" nontrivial_sum_rule = "extensive" rule_cardinality = rule.cardinality_b # just one category or trivial sum rule, so no weights required - if rule_cardinality == "one" or sum_rule == trivial_sum_rule: + if rule_cardinality == "one" or sum_rule == trivial_sum_rule : return 1.0 - if sum_rule == nontrivial_sum_rule: - if weights is None: + if sum_rule == nontrivial_sum_rule : + if weights is None : raise WeightingInfoMissing( category=category, rule=rule, message=f"We need to {operation_verb} multiple categories with" - f" sum_rule={nontrivial_sum_rule}, but no {operation_type}_weights are" - f" specified.", + f" sum_rule={nontrivial_sum_rule}, but no {operation_type}_weights are" + f" specified.", ) effective_weights = weights.loc[selection] # normalize so it is actually a weight, not a factor @@ -529,16 +543,16 @@ def derive_weights( category=category, rule=rule, message=f"We need to {operation_verb} multiple categories, but the sum_rule is" - f" not specified. Rule can only be used if sum_rule={trivial_sum_rule!r} or" - f" sum_rule={nontrivial_sum_rule} and {operation_type}_weights are" - f" specified.", + f" not specified. 
Rule can only be used if sum_rule={trivial_sum_rule!r} or" + f" sum_rule={nontrivial_sum_rule} and {operation_type}_weights are" + f" specified.", ) def prepare_auxiliary_dimensions( - conversion: climate_categories.Conversion, - auxiliary_dimensions: dict[str, str] | None, -) -> dict[climate_categories.Categorization, str] | None: + conversion: climate_categories.Conversion, + auxiliary_dimensions: dict[str, str] | None, +) -> dict[climate_categories.Categorization, str] | None : """Prepare and check the auxiliary dimension mapping. Check if all auxiliary categorizations used in the conversion are matched in @@ -552,8 +566,8 @@ def prepare_auxiliary_dimensions( the auxiliary dimensions, but using Categorization objects instead of their names. """ - if conversion.auxiliary_categorizations_names: - if auxiliary_dimensions is None: + if conversion.auxiliary_categorizations_names : + if auxiliary_dimensions is None : raise ValueError( "The conversion uses auxiliary categories, but a translation to" " dimension names was not provided using the argument" @@ -564,16 +578,16 @@ def prepare_auxiliary_dimensions( missing = set(conversion.auxiliary_categorizations_names).difference( auxiliary_dimensions.keys() ) - if missing: + if missing : raise ValueError( "A dimension name was not given for all auxiliary categories:" f" {missing} are missing in the auxiliary_dimensions argument, please" " provide translations to the dimension names used in the data." ) - if not auxiliary_dimensions: + if not auxiliary_dimensions : return auxiliary_dimensions return { - climate_categories.cats[name]: auxiliary_dimensions[name] for name in auxiliary_dimensions + climate_categories.cats[name] : auxiliary_dimensions[name] for name in auxiliary_dimensions } diff --git a/primap2/tests/data/BURDI_conversion.csv b/primap2/tests/data/BURDI_conversion.csv new file mode 100644 index 00000000..82e0ce50 --- /dev/null +++ b/primap2/tests/data/BURDI_conversion.csv @@ -0,0 +1,40 @@ +# references: non_annex1_data repo +# last_update: 2024-10-14 +BURDI,IPCC2006_PRIMAP,comment +1,1 +1.A,1.A +1.A.1,1.A.1 +1.A.2,1.A.2 +1.A.3,1.A.3 +1.A.4,1.A.4 +1.A.5,1.A.5 +1.B,1.B +1.B.1,1.B.1 +1.B.2,1.B.2 +2 + 3,2 +2.A,2.A +2.B + 2.E,2.B +2.C,2.C +2.F,2.F +2.G + 2.D, 2.H +3,2.D +4,M.AG +4.A,3.A.1 +4.B,3.A.2 +4.C,3.C.7 +4.D + 4.C + 4.E + 4.F + 4.G,3.C +4.E,3.C.1.c +4.F,3.C.1.b +4.G,3.C.8 +5,M.LULUCF +6,4 +6.A,4.A +6.B,4.D +6.C,4.C +6.D,4.E +24540,0 +15163,M.0.EL +14637,M.BK +14424,M.BK.A +14423,M.BK.M, leaving 14638 --> M.BIO out for now, as it's not in climate categories +7,5, 5.A-D ignored as not fitting 2006 cats diff --git a/primap2/tests/test_convert.py b/primap2/tests/test_convert.py index a4f66791..8202d43a 100644 --- a/primap2/tests/test_convert.py +++ b/primap2/tests/test_convert.py @@ -1,10 +1,12 @@ """Tests for _convert.py""" import climate_categories as cc +import climate_categories._conversions as conversions import pytest import xarray as xr - +import pathlib import primap2 +import numpy as np def test_convert_ipcc(empty_ds: xr.Dataset): @@ -31,65 +33,80 @@ def test_convert_ipcc(empty_ds: xr.Dataset): assert (result.pr.loc[{"category": "2"}] == 2.0 * primap2.ureg("Gg CO2 / year")).all().item() def test_convert_BURDI(empty_ds: xr.Dataset): - # build a DA categorized by BURDI and with 1 everywhere so results are easy - # to see + # make a sample conversion object in climate categories + filepath = pathlib.Path("data/BURDI_conversion.csv") + conv = conversions.ConversionSpec.from_csv(filepath) + conv = 
conv.hydrate(cats=cc.cats["BURDI"]._cats) - # TODO this should come from climate categories - mapping_BURDI_to_IPCC2006_PRIMAP = { - "1" : "1", - "1.A" : "1.A", - "1.A.1" : "1.A.1", - "1.A.2" : "1.A.2", - "1.A.3" : "1.A.3", - "1.A.4" : "1.A.4", - "1.A.5" : "1.A.5", - "1.B" : "1.B", - "1.B.1" : "1.B.1", - "1.B.2" : "1.B.2", - "2" : "M.2.BURDI", - "2.A" : "2.A", - "2.B" : "M.2.B_2.B", - "2.C" : "2.C", - "2.D" : "M.2.H.1_2", - "2.E" : "M.2.B_2.E", - "2.F" : "2.F", - "2.G" : "2.H.3", - "3" : "2.D", - "4" : "M.AG", - "4.A" : "3.A.1", - "4.B" : "3.A.2", - "4.C" : "3.C.7", - "4.D" : "M.3.C.45.AG", - "4.E" : "3.C.1.c", - "4.F" : "3.C.1.b", - "4.G" : "3.C.8", - "5" : "M.LULUCF", - "6" : "4", - "6.A" : "4.A", - "6.B" : "4.D", - "6.C" : "4.C", - "6.D" : "4.E", - "24540" : "0", - "15163" : "M.0.EL", - "14637" : "M.BK", - "14424" : "M.BK.A", - "14423" : "M.BK.M", - "14638" : "M.BIO", - "7" : "5", - } # 5.A-D ignored as not fitting 2006 cats + # taken from UNFCCC_non-AnnexI_data/src/unfccc_ghg_data/unfccc_di_reader/ + # unfccc_di_reader_config.py + BURDI_categories = [ + "1", + "1.A", + "1.A.1", + "1.A.2", + "1.A.3", + "1.A.4", + "1.A.5", + "1.B", + "1.B.1", + "1.B.2", + "2", + "2.A", + "2.B", + "2.C", + "2.D", + "2.E", + "2.F", + "2.G", + "3", + "4", + "4.A", + "4.B", + "4.C", + "4.D", + "4.E", + "4.F", + "4.G", + "5", + "6", + "6.A", + "6.B", + "6.C", + "6.D", + "24540", + "15163", + "14637", + "14424", + "14423", + "14638", + "7"] + # build a DA categorized by BURDI and with 1 everywhere so results are easy + # to see da = empty_ds["CO2"] - da = da.expand_dims({"category (IPCC1996)": list(mapping_BURDI_to_IPCC2006_PRIMAP.keys())}) + da = da.expand_dims({"category (BURDI)": BURDI_categories}) da = da.expand_dims({"source (gas)": list(cc.gas.keys())}) arr = da.data.copy() + arr[:] = 1 * primap2.ureg("Gg CO2 / year") + da.data = arr result = da.pr.convert( "category", - "IPCC2006", + conv, sum_rule="extensive", auxiliary_dimensions={"gas" : "source (gas)"}, ) - # TODO - assert False + # cat 2 + 3 in BURDI equals cat 2 in IPCC2006_PRIMAP + assert (result.pr.loc[{"category": "2"}] == 2.0 * primap2.ureg("Gg CO2 / year")).all().item() + # cat 4.D + 4.C + 4.E + 4.F + 4.G in BURDI equals cat 3.C in IPCC2006_PRIMAP + assert (result.pr.loc[{"category" : "3.C"}] == 5.0 * primap2.ureg("Gg CO2 / year")).all().item() + # cat 5 in BURDI equals cat M.LULUCF in IPCC2006_PRIMAP + assert (result.pr.loc[{"category" : "M.LULUCF"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item() + # 2.E + 2.B = 2.E, 2.E should not be part of new data set + assert np.isnan(result.pr.loc[{"category" : "2.E"}].values).all() + # cat 14638 in BURDI equals cat M.BIO in IPCC2006_PRIMAP + # TODO: This will fail. 
M.BIO is currently not listed in climate categories + assert (result.pr.loc[{"category" : "M.BIO"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item() \ No newline at end of file From 69c637bdcd7194cf1e9a234230b3c02af56fcaaa Mon Sep 17 00:00:00 2001 From: Daniel Busch Date: Mon, 14 Oct 2024 15:44:44 +0200 Subject: [PATCH 19/36] ruff --- primap2/_convert.py | 209 +++++++++++++++++----------------- primap2/tests/test_convert.py | 105 +++++++++-------- 2 files changed, 163 insertions(+), 151 deletions(-) diff --git a/primap2/_convert.py b/primap2/_convert.py index 99c5b4da..f13c8ac1 100644 --- a/primap2/_convert.py +++ b/primap2/_convert.py @@ -11,18 +11,20 @@ from ._selection import alias_dims -class DataArrayConversionAccessor(_accessor_base.BaseDataArrayAccessor) : +class DataArrayConversionAccessor(_accessor_base.BaseDataArrayAccessor): @alias_dims(["dim"]) def convert( - self, - dim: Hashable | str, - categorization: climate_categories.Categorization | climate_categories._conversions.Conversion | str, - *, - sum_rule: typing.Literal["intensive", "extensive"] | None = None, - input_weights: xr.DataArray | None = None, - output_weights: xr.DataArray | None = None, - auxiliary_dimensions: dict[str, str] | None = None, - ) -> xr.DataArray : + self, + dim: Hashable | str, + categorization: climate_categories.Categorization + | climate_categories._conversions.Conversion + | str, + *, + sum_rule: typing.Literal["intensive", "extensive"] | None = None, + input_weights: xr.DataArray | None = None, + output_weights: xr.DataArray | None = None, + auxiliary_dimensions: dict[str, str] | None = None, + ) -> xr.DataArray: """Convert the data along the given dimension into the new categorization. Maps the given dimension from one categorization (terminology) into another. @@ -84,22 +86,23 @@ def convert( dim_name, old_categorization_name = extract_categorization_from_dim(dim) # user put in str of new category or categorisation object - if isinstance(categorization, (climate_categories.Categorization, str)) : + if isinstance(categorization, (climate_categories.Categorization, str)): new_categorization = ensure_categorization_instance(categorization) old_categorization = ensure_categorization_instance(old_categorization_name) conversion = old_categorization.conversion_to(new_categorization) elif isinstance(categorization, climate_categories._conversions.Conversion): - new_categorization = ensure_categorization_instance(categorization.categorization_b_name) + new_categorization = ensure_categorization_instance( + categorization.categorization_b_name + ) conversion = categorization else: raise ValueError( f"categorization must be of instance climate_categories.Categorization " f"or climate_categories._conversions.Conversion. 
Got {type(categorization)}" - ) + ) check_valid_sum_rule_types(sum_rule) - auxiliary_dimensions = prepare_auxiliary_dimensions(conversion, auxiliary_dimensions) new_dim = f"{dim_name} ({new_categorization.name})" @@ -115,8 +118,8 @@ def convert( # note: if you have multiple rules to fill a single category, we should # use something like fillna converted_categories = [] - for category in converted_da[new_dim] : - if category in converted_categories : + for category in converted_da[new_dim]: + if category in converted_categories: continue newly_converted_categories, converted_da = self._fill_category( da=converted_da, @@ -135,18 +138,18 @@ def convert( return converted_da def _fill_category( - self, - da: xr.DataArray, - dim: str, - new_dim: str, - already_converted_categories: list[climate_categories.Category], - category: climate_categories.Category, - conversion: climate_categories.Conversion, - sum_rule: str | None, - auxiliary_dimensions: dict[climate_categories.Categorization, str] | None, - input_weights: xr.DataArray | None = None, - output_weights: xr.DataArray | None = None, - ) -> tuple[list[climate_categories.Category], xr.DataArray] : + self, + da: xr.DataArray, + dim: str, + new_dim: str, + already_converted_categories: list[climate_categories.Category], + category: climate_categories.Category, + conversion: climate_categories.Conversion, + sum_rule: str | None, + auxiliary_dimensions: dict[climate_categories.Categorization, str] | None, + input_weights: xr.DataArray | None = None, + output_weights: xr.DataArray | None = None, + ) -> tuple[list[climate_categories.Category], xr.DataArray]: """Return a copy of da with the given category filled by values converted using the given conversion. @@ -180,13 +183,13 @@ def _fill_category( filled_categories, filled: list of climate_categories.category, xr.DataArray The categories that were filled and the new DataArray. """ - try : + try: rules = applicable_rules(conversion, category) - except KeyError : + except KeyError: logger.debug(f"No rule to derive data for {category!r}, will be NaN.") return [], da - for rule in rules : + for rule in rules: logger.debug(f"Processing rule {rule}.") # iterate until a non-restricted rule was applied or all rules are # exhausted @@ -211,7 +214,7 @@ def _fill_category( already_converted = set(output_selection[new_dim]).intersection( set(already_converted_categories) ) - if already_converted : + if already_converted: logger.warning( f"For category {category!r}, would want to use a " "rule with multiple outputs, but the following outputs " @@ -220,7 +223,7 @@ def _fill_category( ) continue - try : + try: effective_input_weights = derive_weights( dim=dim, category=category, @@ -239,7 +242,7 @@ def _fill_category( sum_rule=sum_rule, weights=output_weights, ) - except WeightingInfoMissing as err : + except WeightingInfoMissing as err: logger.warning(str(err)) continue @@ -252,7 +255,7 @@ def _fill_category( da.loc[output_selection] = rhs - if not rule.is_restricted : + if not rule.is_restricted: # stop processing rules for this category return output_selection[new_dim], da @@ -263,7 +266,7 @@ def _fill_category( return [], da -def extract_categorization_from_dim(dim: str) -> (str, str) : +def extract_categorization_from_dim(dim: str) -> (str, str): """Extract the pure dimension and the categorization from a composite dim. Parameters @@ -288,51 +291,51 @@ def extract_categorization_from_dim(dim: str) -> (str, str) : The pure_dim without categorization information and the categorization. 
If the input dim does not contain categorization information, a ValueError is raised. """ - try : + try: pure, cat = dim.split("(", 1) - except ValueError : + except ValueError: raise ValueError(f"No categorization specified: {dim!r}.") from None return pure[:-1], cat[:-1] -def applicable_rules(conversion, category) -> list[climate_categories.ConversionRule] : +def applicable_rules(conversion, category) -> list[climate_categories.ConversionRule]: """Find the possible rules to derive the category using the given conversion.""" rules = conversion.relevant_rules({conversion.categorization_b[category]}) # a + b = c - d can not be used to derive c nor d, only a and b rules = [r for r in rules if all(f > 0 for f in r.factors_categories_b.values())] - if not rules : + if not rules: raise KeyError(category) return rules def ensure_categorization_instance( - cat: str | climate_categories.Categorization, -) -> climate_categories.Categorization : + cat: str | climate_categories.Categorization, +) -> climate_categories.Categorization: """Takes a categorization name or object and returns the corresponding categorization object.""" - if isinstance(cat, climate_categories.Categorization) : + if isinstance(cat, climate_categories.Categorization): return cat return climate_categories.cats[cat] -def check_valid_sum_rule_types(sum_rule: str | None) : +def check_valid_sum_rule_types(sum_rule: str | None): """Checks if the sum_rule is either "intensive", "extensive", or None. Raises a ValueError if an invalid sum_rule is used.""" - if sum_rule not in (None, "extensive", "intensive") : + if sum_rule not in (None, "extensive", "intensive"): raise ValueError( f"if defined, sum_rule must be either 'extensive' or 'intensive', not" f" {sum_rule}" ) def initialize_empty_converted_da( - *, - old_da: xr.DataArray, - old_dim: Hashable | str, - new_dim: str, - new_categorization: climate_categories.Categorization, -) -> xr.DataArray : + *, + old_da: xr.DataArray, + old_dim: Hashable | str, + new_dim: str, + new_categorization: climate_categories.Categorization, +) -> xr.DataArray: """Build a DataArray which can hold the data after conversion to a new categorization. @@ -361,19 +364,19 @@ def initialize_empty_converted_da( """ new_dims = [] new_shape = [] - for i, idim in enumerate(old_da.dims) : - if idim == old_dim : + for i, idim in enumerate(old_da.dims): + if idim == old_dim: new_dims.append(new_dim) new_shape.append(len(new_categorization)) - else : + else: new_dims.append(idim) new_shape.append(old_da.shape[i]) new_coords = {} - for coord in old_da.coords : - if coord == old_dim : + for coord in old_da.coords: + if coord == old_dim: new_coords[new_dim] = np.array(list(new_categorization.keys())) - elif old_dim in old_da.coords[coord].dims : + elif old_dim in old_da.coords[coord].dims: # The additional coordinate has the old_dim as one dimension, but we # won't be able to convert it logger.info( @@ -381,15 +384,15 @@ def initialize_empty_converted_da( f" and is skipped." 
) continue - else : + else: new_coords[coord] = old_da.coords[coord] new_attrs = copy.deepcopy(old_da.attrs) - for pdim in ("area", "cat", "scen") : - if pdim in new_attrs and new_attrs[pdim] == old_dim : + for pdim in ("area", "cat", "scen"): + if pdim in new_attrs and new_attrs[pdim] == old_dim: new_attrs[pdim] = new_dim - if "sec cats" in new_attrs and old_dim in new_attrs["sec_cats"] : + if "sec cats" in new_attrs and old_dim in new_attrs["sec_cats"]: new_attrs["sec_cats"].remove(old_dim) new_attrs["sec_cats"].append(new_dim) @@ -407,12 +410,12 @@ def initialize_empty_converted_da( def factors_categories_to_xarray( - *, - dim: str, - factors_categories: dict[climate_categories.Category, int], - auxiliary_categories: dict[climate_categories.Categorization, set[climate_categories.Category]], - auxiliary_dimensions: dict[climate_categories.Categorization, str], -) -> tuple[dict[str, list[str]], xr.DataArray] : + *, + dim: str, + factors_categories: dict[climate_categories.Category, int], + auxiliary_categories: dict[climate_categories.Categorization, set[climate_categories.Category]], + auxiliary_dimensions: dict[climate_categories.Categorization, str], +) -> tuple[dict[str, list[str]], xr.DataArray]: """Convert dictionary mapping categories to factors into xarray-compatible objects. Using the xarray objects ensures that in subsequent calculations, everything @@ -440,30 +443,30 @@ def factors_categories_to_xarray( factors is an xarray DataArray which can be multiplied with an xarray object after applying the selection. """ - selection = {dim : [cat.codes[0] for cat in factors_categories.keys()]} + selection = {dim: [cat.codes[0] for cat in factors_categories.keys()]} factors = xr.DataArray( data=list(factors_categories.values()), dims=[dim], coords=selection, ) - for aux_categorization, aux_categories in auxiliary_categories.items() : - if aux_categories : + for aux_categorization, aux_categories in auxiliary_categories.items(): + if aux_categories: aux_dim = auxiliary_dimensions[aux_categorization] selection[aux_dim] = [cat.codes[0] for cat in aux_categories] return selection, factors -class WeightingInfoMissing(ValueError) : +class WeightingInfoMissing(ValueError): """Some information to derive weighting factors for a rule is missing.""" def __init__( - self, - category: climate_categories.Category, - rule: climate_categories.ConversionRule, - message: str, - ) : + self, + category: climate_categories.Category, + rule: climate_categories.ConversionRule, + message: str, + ): full_message = ( f"Can not derive data for category {category!r} using rule" f" '{rule}': {message} Skipping this rule." @@ -472,15 +475,15 @@ def __init__( def derive_weights( - *, - dim: str, - category: climate_categories.Category, - rule: climate_categories.ConversionRule, - sum_rule: str | None, - operation_type: str, - weights: xr.DataArray | None, - selection: dict[str, list[str]], -) -> xr.DataArray | float : + *, + dim: str, + category: climate_categories.Category, + rule: climate_categories.ConversionRule, + sum_rule: str | None, + operation_type: str, + weights: xr.DataArray | None, + selection: dict[str, list[str]], +) -> xr.DataArray | float: """Derive the weights to use for applying a specific rule. Parameters @@ -512,28 +515,28 @@ def derive_weights( Object which can be multiplied with the input or output DataArray to apply weights. 
""" - if operation_type == "input" : + if operation_type == "input": operation_verb = "sum up" trivial_sum_rule = "extensive" nontrivial_sum_rule = "intensive" rule_cardinality = rule.cardinality_a - else : + else: operation_verb = "split" trivial_sum_rule = "intensive" nontrivial_sum_rule = "extensive" rule_cardinality = rule.cardinality_b # just one category or trivial sum rule, so no weights required - if rule_cardinality == "one" or sum_rule == trivial_sum_rule : + if rule_cardinality == "one" or sum_rule == trivial_sum_rule: return 1.0 - if sum_rule == nontrivial_sum_rule : - if weights is None : + if sum_rule == nontrivial_sum_rule: + if weights is None: raise WeightingInfoMissing( category=category, rule=rule, message=f"We need to {operation_verb} multiple categories with" - f" sum_rule={nontrivial_sum_rule}, but no {operation_type}_weights are" - f" specified.", + f" sum_rule={nontrivial_sum_rule}, but no {operation_type}_weights are" + f" specified.", ) effective_weights = weights.loc[selection] # normalize so it is actually a weight, not a factor @@ -543,16 +546,16 @@ def derive_weights( category=category, rule=rule, message=f"We need to {operation_verb} multiple categories, but the sum_rule is" - f" not specified. Rule can only be used if sum_rule={trivial_sum_rule!r} or" - f" sum_rule={nontrivial_sum_rule} and {operation_type}_weights are" - f" specified.", + f" not specified. Rule can only be used if sum_rule={trivial_sum_rule!r} or" + f" sum_rule={nontrivial_sum_rule} and {operation_type}_weights are" + f" specified.", ) def prepare_auxiliary_dimensions( - conversion: climate_categories.Conversion, - auxiliary_dimensions: dict[str, str] | None, -) -> dict[climate_categories.Categorization, str] | None : + conversion: climate_categories.Conversion, + auxiliary_dimensions: dict[str, str] | None, +) -> dict[climate_categories.Categorization, str] | None: """Prepare and check the auxiliary dimension mapping. Check if all auxiliary categorizations used in the conversion are matched in @@ -566,8 +569,8 @@ def prepare_auxiliary_dimensions( the auxiliary dimensions, but using Categorization objects instead of their names. """ - if conversion.auxiliary_categorizations_names : - if auxiliary_dimensions is None : + if conversion.auxiliary_categorizations_names: + if auxiliary_dimensions is None: raise ValueError( "The conversion uses auxiliary categories, but a translation to" " dimension names was not provided using the argument" @@ -578,16 +581,16 @@ def prepare_auxiliary_dimensions( missing = set(conversion.auxiliary_categorizations_names).difference( auxiliary_dimensions.keys() ) - if missing : + if missing: raise ValueError( "A dimension name was not given for all auxiliary categories:" f" {missing} are missing in the auxiliary_dimensions argument, please" " provide translations to the dimension names used in the data." 
) - if not auxiliary_dimensions : + if not auxiliary_dimensions: return auxiliary_dimensions return { - climate_categories.cats[name] : auxiliary_dimensions[name] for name in auxiliary_dimensions + climate_categories.cats[name]: auxiliary_dimensions[name] for name in auxiliary_dimensions } diff --git a/primap2/tests/test_convert.py b/primap2/tests/test_convert.py index 8202d43a..0cdaa97b 100644 --- a/primap2/tests/test_convert.py +++ b/primap2/tests/test_convert.py @@ -1,12 +1,14 @@ """Tests for _convert.py""" +import pathlib + import climate_categories as cc import climate_categories._conversions as conversions +import numpy as np import pytest import xarray as xr -import pathlib + import primap2 -import numpy as np def test_convert_ipcc(empty_ds: xr.Dataset): @@ -32,6 +34,7 @@ def test_convert_ipcc(empty_ds: xr.Dataset): assert (result.pr.loc[{"category": "1"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item() assert (result.pr.loc[{"category": "2"}] == 2.0 * primap2.ureg("Gg CO2 / year")).all().item() + def test_convert_BURDI(empty_ds: xr.Dataset): # make a sample conversion object in climate categories filepath = pathlib.Path("data/BURDI_conversion.csv") @@ -41,47 +44,47 @@ def test_convert_BURDI(empty_ds: xr.Dataset): # taken from UNFCCC_non-AnnexI_data/src/unfccc_ghg_data/unfccc_di_reader/ # unfccc_di_reader_config.py BURDI_categories = [ - "1", - "1.A", - "1.A.1", - "1.A.2", - "1.A.3", - "1.A.4", - "1.A.5", - "1.B", - "1.B.1", - "1.B.2", - "2", - "2.A", - "2.B", - "2.C", - "2.D", - "2.E", - "2.F", - "2.G", - "3", - "4", - "4.A", - "4.B", - "4.C", - "4.D", - "4.E", - "4.F", - "4.G", - "5", - "6", - "6.A", - "6.B", - "6.C", - "6.D", - "24540", - "15163", - "14637", - "14424", - "14423", - "14638", - "7"] - + "1", + "1.A", + "1.A.1", + "1.A.2", + "1.A.3", + "1.A.4", + "1.A.5", + "1.B", + "1.B.1", + "1.B.2", + "2", + "2.A", + "2.B", + "2.C", + "2.D", + "2.E", + "2.F", + "2.G", + "3", + "4", + "4.A", + "4.B", + "4.C", + "4.D", + "4.E", + "4.F", + "4.G", + "5", + "6", + "6.A", + "6.B", + "6.C", + "6.D", + "24540", + "15163", + "14637", + "14424", + "14423", + "14638", + "7", + ] # build a DA categorized by BURDI and with 1 everywhere so results are easy # to see @@ -96,17 +99,23 @@ def test_convert_BURDI(empty_ds: xr.Dataset): "category", conv, sum_rule="extensive", - auxiliary_dimensions={"gas" : "source (gas)"}, + auxiliary_dimensions={"gas": "source (gas)"}, ) # cat 2 + 3 in BURDI equals cat 2 in IPCC2006_PRIMAP assert (result.pr.loc[{"category": "2"}] == 2.0 * primap2.ureg("Gg CO2 / year")).all().item() # cat 4.D + 4.C + 4.E + 4.F + 4.G in BURDI equals cat 3.C in IPCC2006_PRIMAP - assert (result.pr.loc[{"category" : "3.C"}] == 5.0 * primap2.ureg("Gg CO2 / year")).all().item() + assert (result.pr.loc[{"category": "3.C"}] == 5.0 * primap2.ureg("Gg CO2 / year")).all().item() # cat 5 in BURDI equals cat M.LULUCF in IPCC2006_PRIMAP - assert (result.pr.loc[{"category" : "M.LULUCF"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item() + assert ( + (result.pr.loc[{"category": "M.LULUCF"}] == 1.0 * primap2.ureg("Gg CO2 / year")) + .all() + .item() + ) # 2.E + 2.B = 2.E, 2.E should not be part of new data set - assert np.isnan(result.pr.loc[{"category" : "2.E"}].values).all() + assert np.isnan(result.pr.loc[{"category": "2.E"}].values).all() # cat 14638 in BURDI equals cat M.BIO in IPCC2006_PRIMAP # TODO: This will fail. 
M.BIO is currently not listed in climate categories - assert (result.pr.loc[{"category" : "M.BIO"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item() \ No newline at end of file + assert ( + (result.pr.loc[{"category": "M.BIO"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item() + ) From 113abda5e8d674a8d1a25172d03ea3f3eed67d90 Mon Sep 17 00:00:00 2001 From: Daniel Busch Date: Mon, 14 Oct 2024 15:47:19 +0200 Subject: [PATCH 20/36] comments --- primap2/_convert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/primap2/_convert.py b/primap2/_convert.py index f13c8ac1..4577cc4e 100644 --- a/primap2/_convert.py +++ b/primap2/_convert.py @@ -85,11 +85,11 @@ def convert( """ dim_name, old_categorization_name = extract_categorization_from_dim(dim) - # user put in str of new category or categorisation object if isinstance(categorization, (climate_categories.Categorization, str)): new_categorization = ensure_categorization_instance(categorization) old_categorization = ensure_categorization_instance(old_categorization_name) conversion = old_categorization.conversion_to(new_categorization) + # TODO: Refactor or change variable name for categorization. Conversion is not really the same elif isinstance(categorization, climate_categories._conversions.Conversion): new_categorization = ensure_categorization_instance( categorization.categorization_b_name From 7428d51d291694c16841aec5c3eed8528617f81a Mon Sep 17 00:00:00 2001 From: Daniel Busch Date: Thu, 17 Oct 2024 09:51:58 +0200 Subject: [PATCH 21/36] add test for custom categorisations and custom conversion --- primap2/_convert.py | 22 +++++--- .../tests/data/simple_categorisation_a.yaml | 35 ++++++++++++ .../tests/data/simple_categorisation_b.yaml | 27 +++++++++ primap2/tests/data/simple_conversion.csv | 5 ++ primap2/tests/test_convert.py | 56 ++++++++++++++++++- 5 files changed, 137 insertions(+), 8 deletions(-) create mode 100644 primap2/tests/data/simple_categorisation_a.yaml create mode 100644 primap2/tests/data/simple_categorisation_b.yaml create mode 100644 primap2/tests/data/simple_conversion.csv diff --git a/primap2/_convert.py b/primap2/_convert.py index 4577cc4e..05c90416 100644 --- a/primap2/_convert.py +++ b/primap2/_convert.py @@ -16,10 +16,11 @@ class DataArrayConversionAccessor(_accessor_base.BaseDataArrayAccessor): def convert( self, dim: Hashable | str, - categorization: climate_categories.Categorization - | climate_categories._conversions.Conversion - | str, + # TODO naming + categorization: climate_categories.Categorization | str, *, + custom_categorisation_a : climate_categories.Categorization | None = None, + custom_categorisation_b : climate_categories.Categorization | None = None, sum_rule: typing.Literal["intensive", "extensive"] | None = None, input_weights: xr.DataArray | None = None, output_weights: xr.DataArray | None = None, @@ -34,6 +35,7 @@ def convert( Parameters ---------- + # TODO dim : str Dimension to convert. Has to be a dimension from ``da.dims``. categorization : climate_categories.Categorization or str @@ -85,16 +87,22 @@ def convert( """ dim_name, old_categorization_name = extract_categorization_from_dim(dim) + # TODO find better logic for all this if isinstance(categorization, (climate_categories.Categorization, str)): new_categorization = ensure_categorization_instance(categorization) old_categorization = ensure_categorization_instance(old_categorization_name) conversion = old_categorization.conversion_to(new_categorization) # TODO: Refactor or change variable name for categorization. 
Conversion is not really the same elif isinstance(categorization, climate_categories._conversions.Conversion): - new_categorization = ensure_categorization_instance( - categorization.categorization_b_name - ) - conversion = categorization + if custom_categorisation_a and custom_categorisation_b: + old_categorization = ensure_categorization_instance(custom_categorisation_a) + new_categorization = ensure_categorization_instance(custom_categorisation_b) + conversion = categorization + else: + new_categorization = ensure_categorization_instance( + categorization.categorization_b_name + ) + conversion = categorization else: raise ValueError( f"categorization must be of instance climate_categories.Categorization " diff --git a/primap2/tests/data/simple_categorisation_a.yaml b/primap2/tests/data/simple_categorisation_a.yaml new file mode 100644 index 00000000..1656c97b --- /dev/null +++ b/primap2/tests/data/simple_categorisation_a.yaml @@ -0,0 +1,35 @@ +name: A +title: Simple Categorization +comment: A simple example categorization without relationships between categories +references: doi:00000/00000 +institution: PIK +last_update: 2021-02-23 +hierarchical: no +version: 1 +categories: + 1: + title: Category 1 + comment: The first category + alternative_codes: + - A + - CatA + info: + important_data: + - A + - B + - C + other_important_thing: ABC + 2: + title: Category 2 + comment: The second category + alternative_codes: + - B + - CatB + 3: + title: Category 3 + comment: The third category + alternative_codes: + - C + - CatC + unnumbered: + title: The unnumbered category \ No newline at end of file diff --git a/primap2/tests/data/simple_categorisation_b.yaml b/primap2/tests/data/simple_categorisation_b.yaml new file mode 100644 index 00000000..35751f9b --- /dev/null +++ b/primap2/tests/data/simple_categorisation_b.yaml @@ -0,0 +1,27 @@ +name: B +title: Simple Categorization +comment: A simple example categorization without relationships between categories +references: doi:00000/00000 +institution: PIK +last_update: 2021-02-23 +hierarchical: no +version: 1 +categories: + 1: + title: Category 1 + comment: The first category + alternative_codes: + - A + - CatA + info: + important_data: + - A + - B + - C + other_important_thing: ABC + 2: + title: Category 2 + comment: The second category + alternative_codes: + - B + - CatB \ No newline at end of file diff --git a/primap2/tests/data/simple_conversion.csv b/primap2/tests/data/simple_conversion.csv new file mode 100644 index 00000000..724f62d9 --- /dev/null +++ b/primap2/tests/data/simple_conversion.csv @@ -0,0 +1,5 @@ +# references: test +# last_update: 2024-10-14 +A,B,comment +1,1, no comment +2+3,2 diff --git a/primap2/tests/test_convert.py b/primap2/tests/test_convert.py index 0cdaa97b..30644ac3 100644 --- a/primap2/tests/test_convert.py +++ b/primap2/tests/test_convert.py @@ -11,6 +11,7 @@ import primap2 +# test with existing conversion and two existing categorisations def test_convert_ipcc(empty_ds: xr.Dataset): # build a DA categorized by IPCC1996 and with 1 everywhere so results are easy # to see @@ -35,11 +36,12 @@ def test_convert_ipcc(empty_ds: xr.Dataset): assert (result.pr.loc[{"category": "2"}] == 2.0 * primap2.ureg("Gg CO2 / year")).all().item() +# test with new conversion and two existing categorisations def test_convert_BURDI(empty_ds: xr.Dataset): # make a sample conversion object in climate categories filepath = pathlib.Path("data/BURDI_conversion.csv") conv = conversions.ConversionSpec.from_csv(filepath) - conv = 
conv.hydrate(cats=cc.cats["BURDI"]._cats) + conv = conv.hydrate(cats=cc.cats) # taken from UNFCCC_non-AnnexI_data/src/unfccc_ghg_data/unfccc_di_reader/ # unfccc_di_reader_config.py @@ -112,6 +114,11 @@ def test_convert_BURDI(empty_ds: xr.Dataset): .all() .item() ) + # 3.C.7 (converted from 4.C) should still be part of the data set, + # although it apprears in two conversion rules + assert ( + (result.pr.loc[{"category": "3.C.7"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item() + ) # 2.E + 2.B = 2.E, 2.E should not be part of new data set assert np.isnan(result.pr.loc[{"category": "2.E"}].values).all() # cat 14638 in BURDI equals cat M.BIO in IPCC2006_PRIMAP @@ -119,3 +126,50 @@ def test_convert_BURDI(empty_ds: xr.Dataset): assert ( (result.pr.loc[{"category": "M.BIO"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item() ) + + +# test with new conversion and new categorisations +def test_simple__custom_conversion_and_categorisation(empty_ds): + # make categorisation A from yaml + categorisation_a = cc.from_yaml("data/simple_categorisation_a.yaml") + + # make categorisation B from yaml + categorisation_b = cc.from_yaml("data/simple_categorisation_b.yaml") + + # make conversion from csv + conv = conversions.ConversionSpec.from_csv("data/simple_conversion.csv") + # categories not part of climate categories so we need to add them manually + cats = { + "A": categorisation_a, + "B": categorisation_b, + } + conv = conv.hydrate(cats=cats) + + # make a dummy dataset based on A cats + da = empty_ds["CO2"] + da = da.expand_dims({"category (A)": list(categorisation_a.keys())}) + arr = da.data.copy() + arr[:] = 1 * primap2.ureg("Gg CO2 / year") + da.data = arr + + # convert to categorisation B + result = da.pr.convert( + "category", + categorization=conv, + custom_categorisation_a=categorisation_a, + custom_categorisation_b=categorisation_b, + sum_rule="extensive", + ) + + # category name includes B - the target categorisation + assert sorted(result.coords) == ["area (ISO3)", "category (B)", "source", "time"] + + # check 1 -> 1 + assert (result.pr.loc[{"category": "1"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item() + + # check 2 + 3 -> 2 + assert (result.pr.loc[{"category": "2"}] == 2.0 * primap2.ureg("Gg CO2 / year")).all().item() + + # check result has 2 categories (input categorisation had 3) + # TODO this is ambiguous when order changes + assert result.shape == (2, 21, 4, 1) From 868afbc3c0269dba64cd4894c08f1d8385fb7d45 Mon Sep 17 00:00:00 2001 From: Daniel Busch Date: Mon, 21 Oct 2024 12:50:24 +0200 Subject: [PATCH 22/36] refactor convert --- primap2/_convert.py | 302 ++++++++++++++++++---------------- primap2/tests/test_convert.py | 35 ++-- 2 files changed, 186 insertions(+), 151 deletions(-) diff --git a/primap2/_convert.py b/primap2/_convert.py index 05c90416..1284740a 100644 --- a/primap2/_convert.py +++ b/primap2/_convert.py @@ -11,21 +11,22 @@ from ._selection import alias_dims -class DataArrayConversionAccessor(_accessor_base.BaseDataArrayAccessor): - @alias_dims(["dim"]) - def convert( - self, - dim: Hashable | str, - # TODO naming - categorization: climate_categories.Categorization | str, - *, - custom_categorisation_a : climate_categories.Categorization | None = None, - custom_categorisation_b : climate_categories.Categorization | None = None, - sum_rule: typing.Literal["intensive", "extensive"] | None = None, - input_weights: xr.DataArray | None = None, - output_weights: xr.DataArray | None = None, - auxiliary_dimensions: dict[str, str] | None = None, - ) -> xr.DataArray: 
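The custom-categorisation test above pulls the new pieces together. A minimal sketch of the same workflow, assuming an `empty_ds` dataset like the test fixture and the YAML/CSV files added in this patch, and using the call signature from this patch (the argument names change again in the refactor that follows):

    import climate_categories as cc
    import climate_categories._conversions as conversions

    # load the two custom categorisations and the conversion between them
    cat_a = cc.from_yaml("data/simple_categorisation_a.yaml")
    cat_b = cc.from_yaml("data/simple_categorisation_b.yaml")
    conv = conversions.ConversionSpec.from_csv("data/simple_conversion.csv")
    conv = conv.hydrate(cats={"A": cat_a, "B": cat_b})

    # build a DataArray on the source categorisation and convert it to B
    da = empty_ds["CO2"].expand_dims({"category (A)": list(cat_a.keys())})
    result = da.pr.convert(
        "category",
        categorization=conv,
        custom_categorisation_a=cat_a,
        custom_categorisation_b=cat_b,
        sum_rule="extensive",
    )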
+class DataArrayConversionAccessor(_accessor_base.BaseDataArrayAccessor) : + + def convert_inner( + self, + dim: Hashable | str, + # TODO type will change to climate_categories.Conversion when + # https://github.com/primap-community/climate_categories/pull/164 is merged + *, + conversion: climate_categories._conversions.Conversion, + old_categorization: climate_categories.Categorization, + new_categorization: climate_categories.Categorization, + sum_rule: typing.Literal["intensive", "extensive"] | None = None, + input_weights: xr.DataArray | None = None, + output_weights: xr.DataArray | None = None, + auxiliary_dimensions: dict[str, str] | None = None, + ) -> xr.DataArray : """Convert the data along the given dimension into the new categorization. Maps the given dimension from one categorization (terminology) into another. @@ -35,14 +36,22 @@ def convert( Parameters ---------- - # TODO dim : str Dimension to convert. Has to be a dimension from ``da.dims``. - categorization : climate_categories.Categorization or str + # TODO type will change to climate_categories.Conversion when climate_categories/pull/164 is merged + conversion : climate_categories.Categorization or str or climate_categories._conversions.Conversion New categorization to convert the given dimension to. Either give the title of the new categorization (like ``IPCC1996``) or a ``climate_categories.Categorization`` object or a ``climate_categories._conversions.Conversion`` object. + custom_categorization_input + A custom categorization for the input data. Must be provided if conversion uses + input categorisation that is not in ``climate_categories``. + Overwrites categorisation in conversion if both are provided. + custom_categorization_output + A custom categorization for the output data. Must be provided if conversion uses + output categorisation that is not in ``climate_categories``. + Overwrites categorisation in conversion if both are provided. sum_rule : ``extensive``, ``intensive``, or None (default) If data of categories has to be summed up or divided, we need information whether the quantity measured is extensive (like, for example, total @@ -85,33 +94,13 @@ def convert( A copy of the DataArray with the given dimension converted in the new categorization. """ - dim_name, old_categorization_name = extract_categorization_from_dim(dim) - - # TODO find better logic for all this - if isinstance(categorization, (climate_categories.Categorization, str)): - new_categorization = ensure_categorization_instance(categorization) - old_categorization = ensure_categorization_instance(old_categorization_name) - conversion = old_categorization.conversion_to(new_categorization) - # TODO: Refactor or change variable name for categorization. Conversion is not really the same - elif isinstance(categorization, climate_categories._conversions.Conversion): - if custom_categorisation_a and custom_categorisation_b: - old_categorization = ensure_categorization_instance(custom_categorisation_a) - new_categorization = ensure_categorization_instance(custom_categorisation_b) - conversion = categorization - else: - new_categorization = ensure_categorization_instance( - categorization.categorization_b_name - ) - conversion = categorization - else: - raise ValueError( - f"categorization must be of instance climate_categories.Categorization " - f"or climate_categories._conversions.Conversion. 
Got {type(categorization)}" - ) check_valid_sum_rule_types(sum_rule) auxiliary_dimensions = prepare_auxiliary_dimensions(conversion, auxiliary_dimensions) + + # TODO maybe dim_name as argument from one level above + dim_name, old_categorization = extract_categorization_from_dim(dim) new_dim = f"{dim_name} ({new_categorization.name})" converted_da = initialize_empty_converted_da( @@ -126,8 +115,8 @@ def convert( # note: if you have multiple rules to fill a single category, we should # use something like fillna converted_categories = [] - for category in converted_da[new_dim]: - if category in converted_categories: + for category in converted_da[new_dim] : + if category in converted_categories : continue newly_converted_categories, converted_da = self._fill_category( da=converted_da, @@ -145,19 +134,65 @@ def convert( return converted_da + @alias_dims(["dim"]) + def convert(self, + dim, + new_categorization = None, + conversion = None, + sum_rule: typing.Literal["intensive", "extensive"] | None = None, + input_weights: xr.DataArray | None = None, + output_weights: xr.DataArray | None = None, + auxiliary_dimensions: dict[str, str] | None = None, + ): + + if (not new_categorization and not conversion): + raise ValueError( + "conversion or new_categorization must be provided." + ) + + # TODO clean up algorithm + if (conversion and not new_categorization): + old_categorization = conversion.categorization_a + new_categorization = conversion.categorization_b + elif (new_categorization and not conversion): + new_categorization = ensure_categorization_instance(new_categorization) + dim_name, old_categorization = extract_categorization_from_dim(dim) + old_categorization = ensure_categorization_instance(old_categorization) + conversion = old_categorization.conversion_to(new_categorization) + elif (new_categorization and conversion): + new_categorization = ensure_categorization_instance(new_categorization) + if new_categorization != conversion.categorization_b: + raise ValueError( + "New categorization is different to target categorisation in conversion." + ) + old_categorization = conversion.categorization_a + new_categorization = conversion.categorization_b + + + return self.convert_inner( + dim, + conversion = conversion, + old_categorization = old_categorization, + new_categorization = new_categorization, + sum_rule=sum_rule, + input_weights=input_weights, + output_weights=output_weights, + auxiliary_dimensions=auxiliary_dimensions, + ) + def _fill_category( - self, - da: xr.DataArray, - dim: str, - new_dim: str, - already_converted_categories: list[climate_categories.Category], - category: climate_categories.Category, - conversion: climate_categories.Conversion, - sum_rule: str | None, - auxiliary_dimensions: dict[climate_categories.Categorization, str] | None, - input_weights: xr.DataArray | None = None, - output_weights: xr.DataArray | None = None, - ) -> tuple[list[climate_categories.Category], xr.DataArray]: + self, + da: xr.DataArray, + dim: str, + new_dim: str, + already_converted_categories: list[climate_categories.Category], + category: climate_categories.Category, + conversion: climate_categories.Conversion, + sum_rule: str | None, + auxiliary_dimensions: dict[climate_categories.Categorization, str] | None, + input_weights: xr.DataArray | None = None, + output_weights: xr.DataArray | None = None, + ) -> tuple[list[climate_categories.Category], xr.DataArray] : """Return a copy of da with the given category filled by values converted using the given conversion. 
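At the usage level, the dispatch above gives two entry points. A minimal sketch, assuming `da` is a DataArray with a "category (IPCC1996)" dimension and a "source (gas)" dimension as in the tests, and `conv` is a hydrated climate_categories Conversion object:

    # 1) only the target categorization: the conversion is looked up in
    #    climate_categories, so both terminologies must be registered there
    result = da.pr.convert(
        dim="category",
        new_categorization="IPCC2006",
        sum_rule="extensive",
        auxiliary_dimensions={"gas": "source (gas)"},
    )

    # 2) only an explicit Conversion object: source and target categorizations
    #    are taken from conversion.categorization_a / categorization_b
    result = da.pr.convert(
        dim="category",
        conversion=conv,
        sum_rule="extensive",
    )

    # passing neither argument raises
    # ValueError("conversion or new_categorization must be provided.")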
@@ -191,13 +226,13 @@ def _fill_category( filled_categories, filled: list of climate_categories.category, xr.DataArray The categories that were filled and the new DataArray. """ - try: + try : rules = applicable_rules(conversion, category) - except KeyError: + except KeyError : logger.debug(f"No rule to derive data for {category!r}, will be NaN.") return [], da - for rule in rules: + for rule in rules : logger.debug(f"Processing rule {rule}.") # iterate until a non-restricted rule was applied or all rules are # exhausted @@ -222,7 +257,7 @@ def _fill_category( already_converted = set(output_selection[new_dim]).intersection( set(already_converted_categories) ) - if already_converted: + if already_converted : logger.warning( f"For category {category!r}, would want to use a " "rule with multiple outputs, but the following outputs " @@ -231,7 +266,7 @@ def _fill_category( ) continue - try: + try : effective_input_weights = derive_weights( dim=dim, category=category, @@ -250,7 +285,7 @@ def _fill_category( sum_rule=sum_rule, weights=output_weights, ) - except WeightingInfoMissing as err: + except WeightingInfoMissing as err : logger.warning(str(err)) continue @@ -263,7 +298,7 @@ def _fill_category( da.loc[output_selection] = rhs - if not rule.is_restricted: + if not rule.is_restricted : # stop processing rules for this category return output_selection[new_dim], da @@ -273,8 +308,7 @@ def _fill_category( ) return [], da - -def extract_categorization_from_dim(dim: str) -> (str, str): +def extract_categorization_from_dim(dim: str) -> (str, str) : """Extract the pure dimension and the categorization from a composite dim. Parameters @@ -299,51 +333,47 @@ def extract_categorization_from_dim(dim: str) -> (str, str): The pure_dim without categorization information and the categorization. If the input dim does not contain categorization information, a ValueError is raised. """ - try: + try : pure, cat = dim.split("(", 1) - except ValueError: + except ValueError : raise ValueError(f"No categorization specified: {dim!r}.") from None return pure[:-1], cat[:-1] - -def applicable_rules(conversion, category) -> list[climate_categories.ConversionRule]: +def applicable_rules(conversion, category) -> list[climate_categories.ConversionRule] : """Find the possible rules to derive the category using the given conversion.""" rules = conversion.relevant_rules({conversion.categorization_b[category]}) # a + b = c - d can not be used to derive c nor d, only a and b rules = [r for r in rules if all(f > 0 for f in r.factors_categories_b.values())] - if not rules: + if not rules : raise KeyError(category) return rules - def ensure_categorization_instance( - cat: str | climate_categories.Categorization, -) -> climate_categories.Categorization: + cat: str | climate_categories.Categorization, +) -> climate_categories.Categorization : """Takes a categorization name or object and returns the corresponding categorization object.""" - if isinstance(cat, climate_categories.Categorization): + if isinstance(cat, climate_categories.Categorization) : return cat return climate_categories.cats[cat] - -def check_valid_sum_rule_types(sum_rule: str | None): +def check_valid_sum_rule_types(sum_rule: str | None) : """Checks if the sum_rule is either "intensive", "extensive", or None. 
Raises a ValueError if an invalid sum_rule is used.""" - if sum_rule not in (None, "extensive", "intensive"): + if sum_rule not in (None, "extensive", "intensive") : raise ValueError( f"if defined, sum_rule must be either 'extensive' or 'intensive', not" f" {sum_rule}" ) - def initialize_empty_converted_da( - *, - old_da: xr.DataArray, - old_dim: Hashable | str, - new_dim: str, - new_categorization: climate_categories.Categorization, -) -> xr.DataArray: + *, + old_da: xr.DataArray, + old_dim: Hashable | str, + new_dim: str, + new_categorization: climate_categories.Categorization, +) -> xr.DataArray : """Build a DataArray which can hold the data after conversion to a new categorization. @@ -372,19 +402,19 @@ def initialize_empty_converted_da( """ new_dims = [] new_shape = [] - for i, idim in enumerate(old_da.dims): - if idim == old_dim: + for i, idim in enumerate(old_da.dims) : + if idim == old_dim : new_dims.append(new_dim) new_shape.append(len(new_categorization)) - else: + else : new_dims.append(idim) new_shape.append(old_da.shape[i]) new_coords = {} - for coord in old_da.coords: - if coord == old_dim: + for coord in old_da.coords : + if coord == old_dim : new_coords[new_dim] = np.array(list(new_categorization.keys())) - elif old_dim in old_da.coords[coord].dims: + elif old_dim in old_da.coords[coord].dims : # The additional coordinate has the old_dim as one dimension, but we # won't be able to convert it logger.info( @@ -392,15 +422,15 @@ def initialize_empty_converted_da( f" and is skipped." ) continue - else: + else : new_coords[coord] = old_da.coords[coord] new_attrs = copy.deepcopy(old_da.attrs) - for pdim in ("area", "cat", "scen"): - if pdim in new_attrs and new_attrs[pdim] == old_dim: + for pdim in ("area", "cat", "scen") : + if pdim in new_attrs and new_attrs[pdim] == old_dim : new_attrs[pdim] = new_dim - if "sec cats" in new_attrs and old_dim in new_attrs["sec_cats"]: + if "sec cats" in new_attrs and old_dim in new_attrs["sec_cats"] : new_attrs["sec_cats"].remove(old_dim) new_attrs["sec_cats"].append(new_dim) @@ -416,14 +446,13 @@ def initialize_empty_converted_da( attrs=new_attrs, ) - def factors_categories_to_xarray( - *, - dim: str, - factors_categories: dict[climate_categories.Category, int], - auxiliary_categories: dict[climate_categories.Categorization, set[climate_categories.Category]], - auxiliary_dimensions: dict[climate_categories.Categorization, str], -) -> tuple[dict[str, list[str]], xr.DataArray]: + *, + dim: str, + factors_categories: dict[climate_categories.Category, int], + auxiliary_categories: dict[climate_categories.Categorization, set[climate_categories.Category]], + auxiliary_dimensions: dict[climate_categories.Categorization, str], +) -> tuple[dict[str, list[str]], xr.DataArray] : """Convert dictionary mapping categories to factors into xarray-compatible objects. Using the xarray objects ensures that in subsequent calculations, everything @@ -451,47 +480,45 @@ def factors_categories_to_xarray( factors is an xarray DataArray which can be multiplied with an xarray object after applying the selection. 
""" - selection = {dim: [cat.codes[0] for cat in factors_categories.keys()]} + selection = {dim : [cat.codes[0] for cat in factors_categories.keys()]} factors = xr.DataArray( data=list(factors_categories.values()), dims=[dim], coords=selection, ) - for aux_categorization, aux_categories in auxiliary_categories.items(): - if aux_categories: + for aux_categorization, aux_categories in auxiliary_categories.items() : + if aux_categories : aux_dim = auxiliary_dimensions[aux_categorization] selection[aux_dim] = [cat.codes[0] for cat in aux_categories] return selection, factors - -class WeightingInfoMissing(ValueError): +class WeightingInfoMissing(ValueError) : """Some information to derive weighting factors for a rule is missing.""" def __init__( - self, - category: climate_categories.Category, - rule: climate_categories.ConversionRule, - message: str, - ): + self, + category: climate_categories.Category, + rule: climate_categories.ConversionRule, + message: str, + ) : full_message = ( f"Can not derive data for category {category!r} using rule" f" '{rule}': {message} Skipping this rule." ) ValueError.__init__(self, full_message) - def derive_weights( - *, - dim: str, - category: climate_categories.Category, - rule: climate_categories.ConversionRule, - sum_rule: str | None, - operation_type: str, - weights: xr.DataArray | None, - selection: dict[str, list[str]], -) -> xr.DataArray | float: + *, + dim: str, + category: climate_categories.Category, + rule: climate_categories.ConversionRule, + sum_rule: str | None, + operation_type: str, + weights: xr.DataArray | None, + selection: dict[str, list[str]], +) -> xr.DataArray | float : """Derive the weights to use for applying a specific rule. Parameters @@ -523,28 +550,28 @@ def derive_weights( Object which can be multiplied with the input or output DataArray to apply weights. """ - if operation_type == "input": + if operation_type == "input" : operation_verb = "sum up" trivial_sum_rule = "extensive" nontrivial_sum_rule = "intensive" rule_cardinality = rule.cardinality_a - else: + else : operation_verb = "split" trivial_sum_rule = "intensive" nontrivial_sum_rule = "extensive" rule_cardinality = rule.cardinality_b # just one category or trivial sum rule, so no weights required - if rule_cardinality == "one" or sum_rule == trivial_sum_rule: + if rule_cardinality == "one" or sum_rule == trivial_sum_rule : return 1.0 - if sum_rule == nontrivial_sum_rule: - if weights is None: + if sum_rule == nontrivial_sum_rule : + if weights is None : raise WeightingInfoMissing( category=category, rule=rule, message=f"We need to {operation_verb} multiple categories with" - f" sum_rule={nontrivial_sum_rule}, but no {operation_type}_weights are" - f" specified.", + f" sum_rule={nontrivial_sum_rule}, but no {operation_type}_weights are" + f" specified.", ) effective_weights = weights.loc[selection] # normalize so it is actually a weight, not a factor @@ -554,16 +581,15 @@ def derive_weights( category=category, rule=rule, message=f"We need to {operation_verb} multiple categories, but the sum_rule is" - f" not specified. Rule can only be used if sum_rule={trivial_sum_rule!r} or" - f" sum_rule={nontrivial_sum_rule} and {operation_type}_weights are" - f" specified.", + f" not specified. 
Rule can only be used if sum_rule={trivial_sum_rule!r} or" + f" sum_rule={nontrivial_sum_rule} and {operation_type}_weights are" + f" specified.", ) - def prepare_auxiliary_dimensions( - conversion: climate_categories.Conversion, - auxiliary_dimensions: dict[str, str] | None, -) -> dict[climate_categories.Categorization, str] | None: + conversion: climate_categories.Conversion, + auxiliary_dimensions: dict[str, str] | None, +) -> dict[climate_categories.Categorization, str] | None : """Prepare and check the auxiliary dimension mapping. Check if all auxiliary categorizations used in the conversion are matched in @@ -577,8 +603,8 @@ def prepare_auxiliary_dimensions( the auxiliary dimensions, but using Categorization objects instead of their names. """ - if conversion.auxiliary_categorizations_names: - if auxiliary_dimensions is None: + if conversion.auxiliary_categorizations_names : + if auxiliary_dimensions is None : raise ValueError( "The conversion uses auxiliary categories, but a translation to" " dimension names was not provided using the argument" @@ -589,16 +615,16 @@ def prepare_auxiliary_dimensions( missing = set(conversion.auxiliary_categorizations_names).difference( auxiliary_dimensions.keys() ) - if missing: + if missing : raise ValueError( "A dimension name was not given for all auxiliary categories:" f" {missing} are missing in the auxiliary_dimensions argument, please" " provide translations to the dimension names used in the data." ) - if not auxiliary_dimensions: + if not auxiliary_dimensions : return auxiliary_dimensions return { - climate_categories.cats[name]: auxiliary_dimensions[name] for name in auxiliary_dimensions + climate_categories.cats[name] : auxiliary_dimensions[name] for name in auxiliary_dimensions } diff --git a/primap2/tests/test_convert.py b/primap2/tests/test_convert.py index 30644ac3..f2800172 100644 --- a/primap2/tests/test_convert.py +++ b/primap2/tests/test_convert.py @@ -22,12 +22,18 @@ def test_convert_ipcc(empty_ds: xr.Dataset): arr[:] = 1 * primap2.ureg("Gg CO2 / year") da.data = arr + new_categorization_name = "IPCC2006" + with pytest.raises(ValueError, match="The conversion uses auxiliary categories"): - da.pr.convert("category", "IPCC2006", sum_rule="extensive") + da.pr.convert( + dim="category", + new_categorization=new_categorization_name, + sum_rule="extensive", + ) result = da.pr.convert( - "category", - "IPCC2006", + dim="category", + new_categorization=new_categorization_name, sum_rule="extensive", auxiliary_dimensions={"gas": "source (gas)"}, ) @@ -98,8 +104,8 @@ def test_convert_BURDI(empty_ds: xr.Dataset): da.data = arr result = da.pr.convert( - "category", - conv, + dim="category", + conversion=conv, sum_rule="extensive", auxiliary_dimensions={"gas": "source (gas)"}, ) @@ -123,13 +129,17 @@ def test_convert_BURDI(empty_ds: xr.Dataset): assert np.isnan(result.pr.loc[{"category": "2.E"}].values).all() # cat 14638 in BURDI equals cat M.BIO in IPCC2006_PRIMAP # TODO: This will fail. 
M.BIO is currently not listed in climate categories - assert ( - (result.pr.loc[{"category": "M.BIO"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item() - ) + # assert ( + # (result.pr.loc[{"category": "M.BIO"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item() + # ) + + +# def test_with_custom_conversion_and_one_custom_categorisation(empty_ds): +# assert False # test with new conversion and new categorisations -def test_simple__custom_conversion_and_categorisation(empty_ds): +def test_custom_conversion_and_two_custom_categorisations(empty_ds): # make categorisation A from yaml categorisation_a = cc.from_yaml("data/simple_categorisation_a.yaml") @@ -154,10 +164,9 @@ def test_simple__custom_conversion_and_categorisation(empty_ds): # convert to categorisation B result = da.pr.convert( - "category", - categorization=conv, - custom_categorisation_a=categorisation_a, - custom_categorisation_b=categorisation_b, + dim="category", + conversion=conv, + new_categorization=categorisation_b, sum_rule="extensive", ) From 4ea5398ca7f6f9e2e750302fc510475eba9a021d Mon Sep 17 00:00:00 2001 From: Daniel Busch Date: Mon, 21 Oct 2024 12:52:14 +0200 Subject: [PATCH 23/36] ruff --- primap2/_convert.py | 255 ++++++++++++++++++++++---------------------- 1 file changed, 130 insertions(+), 125 deletions(-) diff --git a/primap2/_convert.py b/primap2/_convert.py index 1284740a..c4fa9faf 100644 --- a/primap2/_convert.py +++ b/primap2/_convert.py @@ -11,22 +11,21 @@ from ._selection import alias_dims -class DataArrayConversionAccessor(_accessor_base.BaseDataArrayAccessor) : - +class DataArrayConversionAccessor(_accessor_base.BaseDataArrayAccessor): def convert_inner( - self, - dim: Hashable | str, - # TODO type will change to climate_categories.Conversion when - # https://github.com/primap-community/climate_categories/pull/164 is merged - *, - conversion: climate_categories._conversions.Conversion, - old_categorization: climate_categories.Categorization, - new_categorization: climate_categories.Categorization, - sum_rule: typing.Literal["intensive", "extensive"] | None = None, - input_weights: xr.DataArray | None = None, - output_weights: xr.DataArray | None = None, - auxiliary_dimensions: dict[str, str] | None = None, - ) -> xr.DataArray : + self, + dim: Hashable | str, + # TODO type will change to climate_categories.Conversion when + # https://github.com/primap-community/climate_categories/pull/164 is merged + *, + conversion: climate_categories._conversions.Conversion, + old_categorization: climate_categories.Categorization, + new_categorization: climate_categories.Categorization, + sum_rule: typing.Literal["intensive", "extensive"] | None = None, + input_weights: xr.DataArray | None = None, + output_weights: xr.DataArray | None = None, + auxiliary_dimensions: dict[str, str] | None = None, + ) -> xr.DataArray: """Convert the data along the given dimension into the new categorization. Maps the given dimension from one categorization (terminology) into another. 
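The sum_rule and weights arguments exercised in these tests interact as implemented in derive_weights: summing source categories of extensive data needs no weights, while intensive data needs input_weights (and, symmetrically, splitting extensive data over several target categories needs output_weights). A minimal sketch; population_weights is a hypothetical DataArray aligned on the source category dimension, not defined anywhere in this patch series:

    # extensive quantity (e.g. total emissions): summing input categories is
    # the trivial direction, no weights required
    converted = da.pr.convert(
        dim="category",
        new_categorization="IPCC2006",
        sum_rule="extensive",
        auxiliary_dimensions={"gas": "source (gas)"},
    )

    # intensive quantity (e.g. per-capita emissions): summing input categories
    # needs input_weights, otherwise the rule is skipped with a warning
    converted = da.pr.convert(
        dim="category",
        new_categorization="IPCC2006",
        sum_rule="intensive",
        input_weights=population_weights,  # hypothetical weights DataArray
        auxiliary_dimensions={"gas": "source (gas)"},
    )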
@@ -115,8 +114,8 @@ def convert_inner( # note: if you have multiple rules to fill a single category, we should # use something like fillna converted_categories = [] - for category in converted_da[new_dim] : - if category in converted_categories : + for category in converted_da[new_dim]: + if category in converted_categories: continue newly_converted_categories, converted_da = self._fill_category( da=converted_da, @@ -135,31 +134,29 @@ def convert_inner( return converted_da @alias_dims(["dim"]) - def convert(self, - dim, - new_categorization = None, - conversion = None, - sum_rule: typing.Literal["intensive", "extensive"] | None = None, - input_weights: xr.DataArray | None = None, - output_weights: xr.DataArray | None = None, - auxiliary_dimensions: dict[str, str] | None = None, - ): - - if (not new_categorization and not conversion): - raise ValueError( - "conversion or new_categorization must be provided." - ) + def convert( + self, + dim, + new_categorization=None, + conversion=None, + sum_rule: typing.Literal["intensive", "extensive"] | None = None, + input_weights: xr.DataArray | None = None, + output_weights: xr.DataArray | None = None, + auxiliary_dimensions: dict[str, str] | None = None, + ): + if not new_categorization and not conversion: + raise ValueError("conversion or new_categorization must be provided.") # TODO clean up algorithm - if (conversion and not new_categorization): + if conversion and not new_categorization: old_categorization = conversion.categorization_a new_categorization = conversion.categorization_b - elif (new_categorization and not conversion): + elif new_categorization and not conversion: new_categorization = ensure_categorization_instance(new_categorization) dim_name, old_categorization = extract_categorization_from_dim(dim) old_categorization = ensure_categorization_instance(old_categorization) conversion = old_categorization.conversion_to(new_categorization) - elif (new_categorization and conversion): + elif new_categorization and conversion: new_categorization = ensure_categorization_instance(new_categorization) if new_categorization != conversion.categorization_b: raise ValueError( @@ -168,12 +165,11 @@ def convert(self, old_categorization = conversion.categorization_a new_categorization = conversion.categorization_b - return self.convert_inner( dim, - conversion = conversion, - old_categorization = old_categorization, - new_categorization = new_categorization, + conversion=conversion, + old_categorization=old_categorization, + new_categorization=new_categorization, sum_rule=sum_rule, input_weights=input_weights, output_weights=output_weights, @@ -181,18 +177,18 @@ def convert(self, ) def _fill_category( - self, - da: xr.DataArray, - dim: str, - new_dim: str, - already_converted_categories: list[climate_categories.Category], - category: climate_categories.Category, - conversion: climate_categories.Conversion, - sum_rule: str | None, - auxiliary_dimensions: dict[climate_categories.Categorization, str] | None, - input_weights: xr.DataArray | None = None, - output_weights: xr.DataArray | None = None, - ) -> tuple[list[climate_categories.Category], xr.DataArray] : + self, + da: xr.DataArray, + dim: str, + new_dim: str, + already_converted_categories: list[climate_categories.Category], + category: climate_categories.Category, + conversion: climate_categories.Conversion, + sum_rule: str | None, + auxiliary_dimensions: dict[climate_categories.Categorization, str] | None, + input_weights: xr.DataArray | None = None, + output_weights: xr.DataArray | None = None, 
+ ) -> tuple[list[climate_categories.Category], xr.DataArray]: """Return a copy of da with the given category filled by values converted using the given conversion. @@ -226,13 +222,13 @@ def _fill_category( filled_categories, filled: list of climate_categories.category, xr.DataArray The categories that were filled and the new DataArray. """ - try : + try: rules = applicable_rules(conversion, category) - except KeyError : + except KeyError: logger.debug(f"No rule to derive data for {category!r}, will be NaN.") return [], da - for rule in rules : + for rule in rules: logger.debug(f"Processing rule {rule}.") # iterate until a non-restricted rule was applied or all rules are # exhausted @@ -257,7 +253,7 @@ def _fill_category( already_converted = set(output_selection[new_dim]).intersection( set(already_converted_categories) ) - if already_converted : + if already_converted: logger.warning( f"For category {category!r}, would want to use a " "rule with multiple outputs, but the following outputs " @@ -266,7 +262,7 @@ def _fill_category( ) continue - try : + try: effective_input_weights = derive_weights( dim=dim, category=category, @@ -285,7 +281,7 @@ def _fill_category( sum_rule=sum_rule, weights=output_weights, ) - except WeightingInfoMissing as err : + except WeightingInfoMissing as err: logger.warning(str(err)) continue @@ -298,7 +294,7 @@ def _fill_category( da.loc[output_selection] = rhs - if not rule.is_restricted : + if not rule.is_restricted: # stop processing rules for this category return output_selection[new_dim], da @@ -308,7 +304,8 @@ def _fill_category( ) return [], da -def extract_categorization_from_dim(dim: str) -> (str, str) : + +def extract_categorization_from_dim(dim: str) -> (str, str): """Extract the pure dimension and the categorization from a composite dim. Parameters @@ -333,47 +330,51 @@ def extract_categorization_from_dim(dim: str) -> (str, str) : The pure_dim without categorization information and the categorization. If the input dim does not contain categorization information, a ValueError is raised. """ - try : + try: pure, cat = dim.split("(", 1) - except ValueError : + except ValueError: raise ValueError(f"No categorization specified: {dim!r}.") from None return pure[:-1], cat[:-1] -def applicable_rules(conversion, category) -> list[climate_categories.ConversionRule] : + +def applicable_rules(conversion, category) -> list[climate_categories.ConversionRule]: """Find the possible rules to derive the category using the given conversion.""" rules = conversion.relevant_rules({conversion.categorization_b[category]}) # a + b = c - d can not be used to derive c nor d, only a and b rules = [r for r in rules if all(f > 0 for f in r.factors_categories_b.values())] - if not rules : + if not rules: raise KeyError(category) return rules + def ensure_categorization_instance( - cat: str | climate_categories.Categorization, -) -> climate_categories.Categorization : + cat: str | climate_categories.Categorization, +) -> climate_categories.Categorization: """Takes a categorization name or object and returns the corresponding categorization object.""" - if isinstance(cat, climate_categories.Categorization) : + if isinstance(cat, climate_categories.Categorization): return cat return climate_categories.cats[cat] -def check_valid_sum_rule_types(sum_rule: str | None) : + +def check_valid_sum_rule_types(sum_rule: str | None): """Checks if the sum_rule is either "intensive", "extensive", or None. 
Raises a ValueError if an invalid sum_rule is used.""" - if sum_rule not in (None, "extensive", "intensive") : + if sum_rule not in (None, "extensive", "intensive"): raise ValueError( f"if defined, sum_rule must be either 'extensive' or 'intensive', not" f" {sum_rule}" ) + def initialize_empty_converted_da( - *, - old_da: xr.DataArray, - old_dim: Hashable | str, - new_dim: str, - new_categorization: climate_categories.Categorization, -) -> xr.DataArray : + *, + old_da: xr.DataArray, + old_dim: Hashable | str, + new_dim: str, + new_categorization: climate_categories.Categorization, +) -> xr.DataArray: """Build a DataArray which can hold the data after conversion to a new categorization. @@ -402,19 +403,19 @@ def initialize_empty_converted_da( """ new_dims = [] new_shape = [] - for i, idim in enumerate(old_da.dims) : - if idim == old_dim : + for i, idim in enumerate(old_da.dims): + if idim == old_dim: new_dims.append(new_dim) new_shape.append(len(new_categorization)) - else : + else: new_dims.append(idim) new_shape.append(old_da.shape[i]) new_coords = {} - for coord in old_da.coords : - if coord == old_dim : + for coord in old_da.coords: + if coord == old_dim: new_coords[new_dim] = np.array(list(new_categorization.keys())) - elif old_dim in old_da.coords[coord].dims : + elif old_dim in old_da.coords[coord].dims: # The additional coordinate has the old_dim as one dimension, but we # won't be able to convert it logger.info( @@ -422,15 +423,15 @@ def initialize_empty_converted_da( f" and is skipped." ) continue - else : + else: new_coords[coord] = old_da.coords[coord] new_attrs = copy.deepcopy(old_da.attrs) - for pdim in ("area", "cat", "scen") : - if pdim in new_attrs and new_attrs[pdim] == old_dim : + for pdim in ("area", "cat", "scen"): + if pdim in new_attrs and new_attrs[pdim] == old_dim: new_attrs[pdim] = new_dim - if "sec cats" in new_attrs and old_dim in new_attrs["sec_cats"] : + if "sec cats" in new_attrs and old_dim in new_attrs["sec_cats"]: new_attrs["sec_cats"].remove(old_dim) new_attrs["sec_cats"].append(new_dim) @@ -446,13 +447,14 @@ def initialize_empty_converted_da( attrs=new_attrs, ) + def factors_categories_to_xarray( - *, - dim: str, - factors_categories: dict[climate_categories.Category, int], - auxiliary_categories: dict[climate_categories.Categorization, set[climate_categories.Category]], - auxiliary_dimensions: dict[climate_categories.Categorization, str], -) -> tuple[dict[str, list[str]], xr.DataArray] : + *, + dim: str, + factors_categories: dict[climate_categories.Category, int], + auxiliary_categories: dict[climate_categories.Categorization, set[climate_categories.Category]], + auxiliary_dimensions: dict[climate_categories.Categorization, str], +) -> tuple[dict[str, list[str]], xr.DataArray]: """Convert dictionary mapping categories to factors into xarray-compatible objects. Using the xarray objects ensures that in subsequent calculations, everything @@ -480,45 +482,47 @@ def factors_categories_to_xarray( factors is an xarray DataArray which can be multiplied with an xarray object after applying the selection. 
""" - selection = {dim : [cat.codes[0] for cat in factors_categories.keys()]} + selection = {dim: [cat.codes[0] for cat in factors_categories.keys()]} factors = xr.DataArray( data=list(factors_categories.values()), dims=[dim], coords=selection, ) - for aux_categorization, aux_categories in auxiliary_categories.items() : - if aux_categories : + for aux_categorization, aux_categories in auxiliary_categories.items(): + if aux_categories: aux_dim = auxiliary_dimensions[aux_categorization] selection[aux_dim] = [cat.codes[0] for cat in aux_categories] return selection, factors -class WeightingInfoMissing(ValueError) : + +class WeightingInfoMissing(ValueError): """Some information to derive weighting factors for a rule is missing.""" def __init__( - self, - category: climate_categories.Category, - rule: climate_categories.ConversionRule, - message: str, - ) : + self, + category: climate_categories.Category, + rule: climate_categories.ConversionRule, + message: str, + ): full_message = ( f"Can not derive data for category {category!r} using rule" f" '{rule}': {message} Skipping this rule." ) ValueError.__init__(self, full_message) + def derive_weights( - *, - dim: str, - category: climate_categories.Category, - rule: climate_categories.ConversionRule, - sum_rule: str | None, - operation_type: str, - weights: xr.DataArray | None, - selection: dict[str, list[str]], -) -> xr.DataArray | float : + *, + dim: str, + category: climate_categories.Category, + rule: climate_categories.ConversionRule, + sum_rule: str | None, + operation_type: str, + weights: xr.DataArray | None, + selection: dict[str, list[str]], +) -> xr.DataArray | float: """Derive the weights to use for applying a specific rule. Parameters @@ -550,28 +554,28 @@ def derive_weights( Object which can be multiplied with the input or output DataArray to apply weights. """ - if operation_type == "input" : + if operation_type == "input": operation_verb = "sum up" trivial_sum_rule = "extensive" nontrivial_sum_rule = "intensive" rule_cardinality = rule.cardinality_a - else : + else: operation_verb = "split" trivial_sum_rule = "intensive" nontrivial_sum_rule = "extensive" rule_cardinality = rule.cardinality_b # just one category or trivial sum rule, so no weights required - if rule_cardinality == "one" or sum_rule == trivial_sum_rule : + if rule_cardinality == "one" or sum_rule == trivial_sum_rule: return 1.0 - if sum_rule == nontrivial_sum_rule : - if weights is None : + if sum_rule == nontrivial_sum_rule: + if weights is None: raise WeightingInfoMissing( category=category, rule=rule, message=f"We need to {operation_verb} multiple categories with" - f" sum_rule={nontrivial_sum_rule}, but no {operation_type}_weights are" - f" specified.", + f" sum_rule={nontrivial_sum_rule}, but no {operation_type}_weights are" + f" specified.", ) effective_weights = weights.loc[selection] # normalize so it is actually a weight, not a factor @@ -581,15 +585,16 @@ def derive_weights( category=category, rule=rule, message=f"We need to {operation_verb} multiple categories, but the sum_rule is" - f" not specified. Rule can only be used if sum_rule={trivial_sum_rule!r} or" - f" sum_rule={nontrivial_sum_rule} and {operation_type}_weights are" - f" specified.", + f" not specified. 
Rule can only be used if sum_rule={trivial_sum_rule!r} or" + f" sum_rule={nontrivial_sum_rule} and {operation_type}_weights are" + f" specified.", ) + def prepare_auxiliary_dimensions( - conversion: climate_categories.Conversion, - auxiliary_dimensions: dict[str, str] | None, -) -> dict[climate_categories.Categorization, str] | None : + conversion: climate_categories.Conversion, + auxiliary_dimensions: dict[str, str] | None, +) -> dict[climate_categories.Categorization, str] | None: """Prepare and check the auxiliary dimension mapping. Check if all auxiliary categorizations used in the conversion are matched in @@ -603,8 +608,8 @@ def prepare_auxiliary_dimensions( the auxiliary dimensions, but using Categorization objects instead of their names. """ - if conversion.auxiliary_categorizations_names : - if auxiliary_dimensions is None : + if conversion.auxiliary_categorizations_names: + if auxiliary_dimensions is None: raise ValueError( "The conversion uses auxiliary categories, but a translation to" " dimension names was not provided using the argument" @@ -615,16 +620,16 @@ def prepare_auxiliary_dimensions( missing = set(conversion.auxiliary_categorizations_names).difference( auxiliary_dimensions.keys() ) - if missing : + if missing: raise ValueError( "A dimension name was not given for all auxiliary categories:" f" {missing} are missing in the auxiliary_dimensions argument, please" " provide translations to the dimension names used in the data." ) - if not auxiliary_dimensions : + if not auxiliary_dimensions: return auxiliary_dimensions return { - climate_categories.cats[name] : auxiliary_dimensions[name] for name in auxiliary_dimensions + climate_categories.cats[name]: auxiliary_dimensions[name] for name in auxiliary_dimensions } From 44c2caef61f893797ac0ae2ffe9943e26d9458fb Mon Sep 17 00:00:00 2001 From: Daniel Busch Date: Mon, 21 Oct 2024 14:56:13 +0200 Subject: [PATCH 24/36] clean up --- primap2/_convert.py | 143 ++++++++++++++++++++++---------------------- 1 file changed, 72 insertions(+), 71 deletions(-) diff --git a/primap2/_convert.py b/primap2/_convert.py index c4fa9faf..e2b16a79 100644 --- a/primap2/_convert.py +++ b/primap2/_convert.py @@ -19,12 +19,65 @@ def convert_inner( # https://github.com/primap-community/climate_categories/pull/164 is merged *, conversion: climate_categories._conversions.Conversion, - old_categorization: climate_categories.Categorization, new_categorization: climate_categories.Categorization, sum_rule: typing.Literal["intensive", "extensive"] | None = None, input_weights: xr.DataArray | None = None, output_weights: xr.DataArray | None = None, auxiliary_dimensions: dict[str, str] | None = None, + ) -> xr.DataArray: + """ + See docstring of `convert` for details on arguments and behavior. 
+ """ + + check_valid_sum_rule_types(sum_rule) + + auxiliary_dimensions = prepare_auxiliary_dimensions(conversion, auxiliary_dimensions) + + # TODO maybe dim_name as argument from one level above + dim_name, old_categorization = extract_categorization_from_dim(dim) + new_dim = f"{dim_name} ({new_categorization.name})" + + converted_da = initialize_empty_converted_da( + old_da=self._da, + old_dim=dim, + new_dim=new_dim, + new_categorization=new_categorization, + ) + + # idea: convert 1-to-1 mappings first, should be easy in a single xarray + # operation + # note: if you have multiple rules to fill a single category, we should + # use something like fillna + converted_categories = [] + for category in converted_da[new_dim]: + if category in converted_categories: + continue + newly_converted_categories, converted_da = self._fill_category( + da=converted_da, + dim=dim, + new_dim=new_dim, + already_converted_categories=converted_categories, + category=category.item(), + conversion=conversion, + sum_rule=sum_rule, + auxiliary_dimensions=auxiliary_dimensions, + input_weights=input_weights, + output_weights=output_weights, + ) + converted_categories += newly_converted_categories + + return converted_da + + @alias_dims(["dim"]) + def convert( + self, + dim: Hashable | str, + conversion: climate_categories._conversions.Conversion = None, + new_categorization: climate_categories.Categorization = None, + sum_rule: typing.Literal["intensive", "extensive"] | None = None, + input_weights: xr.DataArray | None = None, + output_weights: xr.DataArray | None = None, + auxiliary_dimensions: dict[str, str] | None = None, ) -> xr.DataArray: """Convert the data along the given dimension into the new categorization. @@ -37,20 +90,15 @@ def convert_inner( ---------- dim : str Dimension to convert. Has to be a dimension from ``da.dims``. - # TODO type will change to climate_categories.Conversion when climate_categories/pull/164 is merged - conversion : climate_categories.Categorization or str or climate_categories._conversions.Conversion - New categorization to convert the given dimension to. Either give the title - of the new categorization (like ``IPCC1996``) or a - ``climate_categories.Categorization`` object or a - ``climate_categories._conversions.Conversion`` object. - custom_categorization_input - A custom categorization for the input data. Must be provided if conversion uses - input categorisation that is not in ``climate_categories``. - Overwrites categorisation in conversion if both are provided. - custom_categorization_output - A custom categorization for the output data. Must be provided if conversion uses - output categorisation that is not in ``climate_categories``. - Overwrites categorisation in conversion if both are provided. + conversion : climate_categories.Conversion + The conversion rules that describe the conversion from the old to the new categorization. + Contains ``climate_categories.Categorization`` object for old and new categorisation. + new_categorization: str + New categorization to convert the given dimension to. If the categorization + is part of climate categories the title of the new categorization (like ``IPCC1996``) + will work. A ``climate_categories.Categorization`` object can be used regardless + of wether it is part of climate_categories. When providing just the new categorization, + the old categorization as well as the conversion must be part of climate_categories. 
sum_rule : ``extensive``, ``intensive``, or None (default) If data of categories has to be summed up or divided, we need information whether the quantity measured is extensive (like, for example, total @@ -94,69 +142,20 @@ def convert_inner( categorization. """ - check_valid_sum_rule_types(sum_rule) - - auxiliary_dimensions = prepare_auxiliary_dimensions(conversion, auxiliary_dimensions) - - # TODO maybe dim_name as argument from one level above - dim_name, old_categorization = extract_categorization_from_dim(dim) - new_dim = f"{dim_name} ({new_categorization.name})" - - converted_da = initialize_empty_converted_da( - old_da=self._da, - old_dim=dim, - new_dim=new_dim, - new_categorization=new_categorization, - ) - - # idea: convert 1-to-1 mappings first, should be easy in a single xarray - # operation - # note: if you have multiple rules to fill a single category, we should - # use something like fillna - converted_categories = [] - for category in converted_da[new_dim]: - if category in converted_categories: - continue - newly_converted_categories, converted_da = self._fill_category( - da=converted_da, - dim=dim, - new_dim=new_dim, - already_converted_categories=converted_categories, - category=category.item(), - conversion=conversion, - sum_rule=sum_rule, - auxiliary_dimensions=auxiliary_dimensions, - input_weights=input_weights, - output_weights=output_weights, - ) - converted_categories += newly_converted_categories - - return converted_da - - @alias_dims(["dim"]) - def convert( - self, - dim, - new_categorization=None, - conversion=None, - sum_rule: typing.Literal["intensive", "extensive"] | None = None, - input_weights: xr.DataArray | None = None, - output_weights: xr.DataArray | None = None, - auxiliary_dimensions: dict[str, str] | None = None, - ): - if not new_categorization and not conversion: - raise ValueError("conversion or new_categorization must be provided.") - - # TODO clean up algorithm + # user provides only conversion if conversion and not new_categorization: old_categorization = conversion.categorization_a new_categorization = conversion.categorization_b + # user provides only new categorisation elif new_categorization and not conversion: new_categorization = ensure_categorization_instance(new_categorization) dim_name, old_categorization = extract_categorization_from_dim(dim) old_categorization = ensure_categorization_instance(old_categorization) conversion = old_categorization.conversion_to(new_categorization) - elif new_categorization and conversion: + # user provides conversion AND new categorisation + # TODO: There is no additional value in providing both because all + # the information is already in conversion. Maybe we shouldn't allow this case? 
+ elif new_categorization: new_categorization = ensure_categorization_instance(new_categorization) if new_categorization != conversion.categorization_b: raise ValueError( @@ -164,11 +163,13 @@ def convert( ) old_categorization = conversion.categorization_a new_categorization = conversion.categorization_b + # User does not provide anything + else: + raise ValueError("conversion or new_categorization must be provided.") return self.convert_inner( dim, conversion=conversion, - old_categorization=old_categorization, new_categorization=new_categorization, sum_rule=sum_rule, input_weights=input_weights, From 7ffe50637ea5802c01b09507cf2d05d54c52310d Mon Sep 17 00:00:00 2001 From: Daniel Busch Date: Mon, 21 Oct 2024 14:59:08 +0200 Subject: [PATCH 25/36] more cleanup --- primap2/_convert.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/primap2/_convert.py b/primap2/_convert.py index e2b16a79..4a980c2a 100644 --- a/primap2/_convert.py +++ b/primap2/_convert.py @@ -153,17 +153,15 @@ def convert( old_categorization = ensure_categorization_instance(old_categorization) conversion = old_categorization.conversion_to(new_categorization) # user provides conversion AND new categorisation - # TODO: There is no additional value in providing both because all - # the information is already in conversion. Maybe we shouldn't allow this case? + # TODO: There is no additional value in providing conversion AND new_categorization because all + # information is already in conversion. Maybe we shouldn't allow this case? elif new_categorization: - new_categorization = ensure_categorization_instance(new_categorization) if new_categorization != conversion.categorization_b: raise ValueError( "New categorization is different to target categorisation in conversion." ) - old_categorization = conversion.categorization_a new_categorization = conversion.categorization_b - # User does not provide anything + # User provides neither conversion nor new categorization else: raise ValueError("conversion or new_categorization must be provided.") From 0db737363cdcd0e21f5ce081207aa52efca12784 Mon Sep 17 00:00:00 2001 From: Daniel Busch Date: Mon, 21 Oct 2024 15:33:14 +0200 Subject: [PATCH 26/36] docstring and argument passing from outer to inner convert function --- primap2/_convert.py | 114 ++++++++++++++++++++++++---------- primap2/tests/test_convert.py | 4 -- 2 files changed, 82 insertions(+), 36 deletions(-) diff --git a/primap2/_convert.py b/primap2/_convert.py index 4a980c2a..39d4bfd2 100644 --- a/primap2/_convert.py +++ b/primap2/_convert.py @@ -12,29 +12,82 @@ class DataArrayConversionAccessor(_accessor_base.BaseDataArrayAccessor): - def convert_inner( + def _convert_inner( self, dim: Hashable | str, + *, # TODO type will change to climate_categories.Conversion when # https://github.com/primap-community/climate_categories/pull/164 is merged - *, conversion: climate_categories._conversions.Conversion, - new_categorization: climate_categories.Categorization, sum_rule: typing.Literal["intensive", "extensive"] | None = None, input_weights: xr.DataArray | None = None, output_weights: xr.DataArray | None = None, auxiliary_dimensions: dict[str, str] | None = None, ) -> xr.DataArray: - """ - See docstring of `convert` for details on arguments and behavior. + """Convert the data along the given dimension into the new categorization. + + Maps the given dimension from one categorization (terminology) into another. 
+ Fetches the rules to do the mapping from the climate_categories package, and + therefore will only work if there are conversions rules to convert from the + current categorization to the new categorization. + + Parameters + ---------- + dim : str + Dimension to convert. Has to be a dimension from ``da.dims``. + conversion : climate_categories.Conversion + The conversion rules that describe the conversion from the old to the new + categorization. Contains ``climate_categories.Categorization`` + object for old and new categorization. + sum_rule : ``extensive``, ``intensive``, or None (default) + If data of categories has to be summed up or divided, we need information + whether the quantity measured is extensive (like, for example, total + emissions in a year subdivided into multiple sectoral categories) or + intensive (like, for example, average per-person emissions in a year + subdivided into different territorial entities). By default (None), a + warning is issued if data has to be summed up or divided. + input_weights : xr.DataArray, optional + If data in input categories has to be summed up and the sum_rule is + ``intensive``, weights for the input categories are required. + The weights can be given in any shape compatible with the DataArray that + is converted, e.g. to give different weights for industrial sectors by + country. However, at least the ``dim`` that is converted needs to be in + ``input_weights.dims``. + If no weights are specified but a rule requiring weights is specified + in the conversion rules, a warning is issued and the respective rule is + skipped (probably resulting in more NaNs in the output). + output_weights : xr.DataArray, optional + If data has to be divided into several output categories and the sum_rule is + ``extensive``, weights for the output categories are required. + The weights can be given in any shape compatible with the DataArray that + is converted, e.g. to give different weights for industrial sectors by + country. However, at least the ``dim`` that is converted needs to be in + ``output_weights.dims``. + If no weights are specified but a rule requiring weights is specified + in the conversion rules, a warning is issued and the respective rule is + skipped (probably resulting in more NaNs in the output). + auxiliary_dimensions : dict[str, str], optional + Mapping of auxiliary categorizations to dimension names used in this + DataArray. In conversions which contain rules which are valid only for + certain orthogonal dimensions (e.g. a conversion between different sectoral + terminologies, but some rules are only valid for specific countries), only + the categorization is specified. Therefore, in this case you have to specify + a mapping from categorization name to dimension name. + Example: {"ISO3": "area (ISO3)"}) . + + Returns + ------- + converted : xr.DataArray + A copy of the DataArray with the given dimension converted in the new + categorization. 
""" check_valid_sum_rule_types(sum_rule) auxiliary_dimensions = prepare_auxiliary_dimensions(conversion, auxiliary_dimensions) - # TODO maybe dim_name as argument from one level above dim_name, old_categorization = extract_categorization_from_dim(dim) + new_categorization = conversion.categorization_b new_dim = f"{dim_name} ({new_categorization.name})" converted_da = initialize_empty_converted_da( @@ -72,8 +125,8 @@ def convert_inner( def convert( self, dim: Hashable | str, - conversion: climate_categories._conversions.Conversion = None, - new_categorization: climate_categories.Categorization = None, + conversion: climate_categories._conversions.Conversion | None = None, + new_categorization: climate_categories.Categorization | str | None = None, sum_rule: typing.Literal["intensive", "extensive"] | None = None, input_weights: xr.DataArray | None = None, output_weights: xr.DataArray | None = None, @@ -81,7 +134,8 @@ def convert( ) -> xr.DataArray: """Convert the data along the given dimension into the new categorization. - Maps the given dimension from one categorization (terminology) into another. + Generates conversion and new categorization from given inputs. Maps the given dimension + from one categorization (terminology) into another. Fetches the rules to do the mapping from the climate_categories package, and therefore will only work if there are conversions rules to convert from the current categorization to the new categorization. @@ -92,13 +146,15 @@ def convert( Dimension to convert. Has to be a dimension from ``da.dims``. conversion : climate_categories.Conversion The conversion rules that describe the conversion from the old to the new categorization. - Contains ``climate_categories.Categorization`` object for old and new categorisation. + Contains ``climate_categories.Categorization`` object for old and new categorization. + Either conversion or new_categorization must be provided. new_categorization: str New categorization to convert the given dimension to. If the categorization is part of climate categories the title of the new categorization (like ``IPCC1996``) will work. A ``climate_categories.Categorization`` object can be used regardless - of wether it is part of climate_categories. When providing just the new categorization, + of whether it is part of climate_categories. When providing just the new categorization, the old categorization as well as the conversion must be part of climate_categories. + Either conversion or new_categorization must be provided. sum_rule : ``extensive``, ``intensive``, or None (default) If data of categories has to be summed up or divided, we need information whether the quantity measured is extensive (like, for example, total @@ -142,33 +198,27 @@ def convert( categorization. """ - # user provides only conversion - if conversion and not new_categorization: - old_categorization = conversion.categorization_a - new_categorization = conversion.categorization_b - # user provides only new categorisation - elif new_categorization and not conversion: - new_categorization = ensure_categorization_instance(new_categorization) - dim_name, old_categorization = extract_categorization_from_dim(dim) - old_categorization = ensure_categorization_instance(old_categorization) - conversion = old_categorization.conversion_to(new_categorization) - # user provides conversion AND new categorisation - # TODO: There is no additional value in providing conversion AND new_categorization because all - # information is already in conversion. 
Maybe we shouldn't allow this case? - elif new_categorization: - if new_categorization != conversion.categorization_b: + + # User provides neither conversion nor new categorization + if (not new_categorization and not conversion): + raise ValueError("conversion or new_categorization must be provided.") + if new_categorization: + # User provides only new_categorization + if not conversion: + new_categorization = ensure_categorization_instance(new_categorization) + dim_name, old_categorization = extract_categorization_from_dim(dim) + old_categorization = ensure_categorization_instance(old_categorization) + conversion = old_categorization.conversion_to(new_categorization) + # User provides new_categorizatiom amd conversion, but they don't match + # TODO: What's the use case of provoding both? Maybe remove + elif new_categorization != conversion.categorization_b: raise ValueError( "New categorization is different to target categorisation in conversion." ) - new_categorization = conversion.categorization_b - # User provides neither conversion nor new categorization - else: - raise ValueError("conversion or new_categorization must be provided.") - return self.convert_inner( + return self._convert_inner( dim, conversion=conversion, - new_categorization=new_categorization, sum_rule=sum_rule, input_weights=input_weights, output_weights=output_weights, diff --git a/primap2/tests/test_convert.py b/primap2/tests/test_convert.py index f2800172..2e23aab4 100644 --- a/primap2/tests/test_convert.py +++ b/primap2/tests/test_convert.py @@ -134,10 +134,6 @@ def test_convert_BURDI(empty_ds: xr.Dataset): # ) -# def test_with_custom_conversion_and_one_custom_categorisation(empty_ds): -# assert False - - # test with new conversion and new categorisations def test_custom_conversion_and_two_custom_categorisations(empty_ds): # make categorisation A from yaml From adb202c22a34963bf1a8a40233f61a3fdfc2ae19 Mon Sep 17 00:00:00 2001 From: Daniel Busch Date: Mon, 21 Oct 2024 15:41:57 +0200 Subject: [PATCH 27/36] ruff and docstring --- primap2/_convert.py | 15 +++++++-------- primap2/tests/test_convert.py | 10 ++++++++++ 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/primap2/_convert.py b/primap2/_convert.py index 39d4bfd2..56fc00f5 100644 --- a/primap2/_convert.py +++ b/primap2/_convert.py @@ -124,7 +124,7 @@ def _convert_inner( @alias_dims(["dim"]) def convert( self, - dim: Hashable | str, + dim: Hashable | str, conversion: climate_categories._conversions.Conversion | None = None, new_categorization: climate_categories.Categorization | str | None = None, sum_rule: typing.Literal["intensive", "extensive"] | None = None, @@ -151,10 +151,10 @@ def convert( new_categorization: str New categorization to convert the given dimension to. If the categorization is part of climate categories the title of the new categorization (like ``IPCC1996``) - will work. A ``climate_categories.Categorization`` object can be used regardless - of whether it is part of climate_categories. When providing just the new categorization, + will work. When providing just the new categorization, the old categorization as well as the conversion must be part of climate_categories. - Either conversion or new_categorization must be provided. + Either conversion or new_categorization must be provided. Note that if both + new_categorization and conversion are provided, conversion will be prioritized. 
sum_rule : ``extensive``, ``intensive``, or None (default) If data of categories has to be summed up or divided, we need information whether the quantity measured is extensive (like, for example, total @@ -198,9 +198,8 @@ def convert( categorization. """ - # User provides neither conversion nor new categorization - if (not new_categorization and not conversion): + if not new_categorization and not conversion: raise ValueError("conversion or new_categorization must be provided.") if new_categorization: # User provides only new_categorization @@ -209,8 +208,8 @@ def convert( dim_name, old_categorization = extract_categorization_from_dim(dim) old_categorization = ensure_categorization_instance(old_categorization) conversion = old_categorization.conversion_to(new_categorization) - # User provides new_categorizatiom amd conversion, but they don't match - # TODO: What's the use case of provoding both? Maybe remove + # User provides new_categorization and conversion, but they don't match + # TODO: What's the use case of providing both? Maybe remove elif new_categorization != conversion.categorization_b: raise ValueError( "New categorization is different to target categorisation in conversion." diff --git a/primap2/tests/test_convert.py b/primap2/tests/test_convert.py index 2e23aab4..854cbde7 100644 --- a/primap2/tests/test_convert.py +++ b/primap2/tests/test_convert.py @@ -11,6 +11,16 @@ import primap2 +def test_conversion_and_new_categorisation_missing(empty_ds: xr.Dataset): + da = empty_ds["CO2"] + da = da.expand_dims({"category (IPCC1996)": list(cc.IPCC1996.keys())}) + da = da.expand_dims({"source (gas)": list(cc.gas.keys())}) + with pytest.raises(ValueError, match="conversion or new_categorization must be provided."): + da.pr.convert( + dim="category", + ) + + # test with existing conversion and two existing categorisations def test_convert_ipcc(empty_ds: xr.Dataset): # build a DA categorized by IPCC1996 and with 1 everywhere so results are easy From b896f642d992fcac8202eef6be3aa74d1330f2c5 Mon Sep 17 00:00:00 2001 From: Daniel Busch Date: Tue, 22 Oct 2024 10:43:35 +0200 Subject: [PATCH 28/36] remove _convert_inner wrapper --- primap2/_convert.py | 116 ++++------------------------------ primap2/tests/test_convert.py | 30 ++++++--- 2 files changed, 34 insertions(+), 112 deletions(-) diff --git a/primap2/_convert.py b/primap2/_convert.py index 56fc00f5..295ac437 100644 --- a/primap2/_convert.py +++ b/primap2/_convert.py @@ -12,7 +12,8 @@ class DataArrayConversionAccessor(_accessor_base.BaseDataArrayAccessor): - def _convert_inner( + @alias_dims(["dim"]) + def convert( self, dim: Hashable | str, *, @@ -87,6 +88,16 @@ def _convert_inner( auxiliary_dimensions = prepare_auxiliary_dimensions(conversion, auxiliary_dimensions) dim_name, old_categorization = extract_categorization_from_dim(dim) + + if conversion.categorization_a_name != old_categorization: + msg = ( + "The source categorization in the conversion " + f"({conversion.categorization_a_name}) does " + "not match the categorization in the data set " + f"({old_categorization})." 
+ ) + raise ValueError(msg) + new_categorization = conversion.categorization_b new_dim = f"{dim_name} ({new_categorization.name})" @@ -121,109 +132,6 @@ def _convert_inner( return converted_da - @alias_dims(["dim"]) - def convert( - self, - dim: Hashable | str, - conversion: climate_categories._conversions.Conversion | None = None, - new_categorization: climate_categories.Categorization | str | None = None, - sum_rule: typing.Literal["intensive", "extensive"] | None = None, - input_weights: xr.DataArray | None = None, - output_weights: xr.DataArray | None = None, - auxiliary_dimensions: dict[str, str] | None = None, - ) -> xr.DataArray: - """Convert the data along the given dimension into the new categorization. - - Generates conversion and new categorization from given inputs. Maps the given dimension - from one categorization (terminology) into another. - Fetches the rules to do the mapping from the climate_categories package, and - therefore will only work if there are conversions rules to convert from the - current categorization to the new categorization. - - Parameters - ---------- - dim : str - Dimension to convert. Has to be a dimension from ``da.dims``. - conversion : climate_categories.Conversion - The conversion rules that describe the conversion from the old to the new categorization. - Contains ``climate_categories.Categorization`` object for old and new categorization. - Either conversion or new_categorization must be provided. - new_categorization: str - New categorization to convert the given dimension to. If the categorization - is part of climate categories the title of the new categorization (like ``IPCC1996``) - will work. When providing just the new categorization, - the old categorization as well as the conversion must be part of climate_categories. - Either conversion or new_categorization must be provided. Note that if both - new_categorization and conversion are provided, conversion will be prioritized. - sum_rule : ``extensive``, ``intensive``, or None (default) - If data of categories has to be summed up or divided, we need information - whether the quantity measured is extensive (like, for example, total - emissions in a year subdivided into multiple sectoral categories) or - intensive (like, for example, average per-person emissions in a year - subdivided into different territorial entities). By default (None), a - warning is issued if data has to be summed up or divided. - input_weights : xr.DataArray, optional - If data in input categories has to be summed up and the sum_rule is - ``intensive``, weights for the input categories are required. - The weights can be given in any shape compatible with the DataArray that - is converted, e.g. to give different weights for industrial sectors by - country. However, at least the ``dim`` that is converted needs to be in - ``input_weights.dims``. - If no weights are specified but a rule requiring weights is specified - in the conversion rules, a warning is issued and the respective rule is - skipped (probably resulting in more NaNs in the output). - output_weights : xr.DataArray, optional - If data has to be divided into several output categories and the sum_rule is - ``extensive``, weights for the output categories are required. - The weights can be given in any shape compatible with the DataArray that - is converted, e.g. to give different weights for industrial sectors by - country. However, at least the ``dim`` that is converted needs to be in - ``output_weights.dims``. 
- If no weights are specified but a rule requiring weights is specified - in the conversion rules, a warning is issued and the respective rule is - skipped (probably resulting in more NaNs in the output). - auxiliary_dimensions : dict[str, str], optional - Mapping of auxiliary categorizations to dimension names used in this - DataArray. In conversions which contain rules which are valid only for - certain orthogonal dimensions (e.g. a conversion between different sectoral - terminologies, but some rules are only valid for specific countries), only - the categorization is specified. Therefore, in this case you have to specify - a mapping from categorization name to dimension name. - Example: {"ISO3": "area (ISO3)"}) . - - Returns - ------- - converted : xr.DataArray - A copy of the DataArray with the given dimension converted in the new - categorization. - """ - - # User provides neither conversion nor new categorization - if not new_categorization and not conversion: - raise ValueError("conversion or new_categorization must be provided.") - if new_categorization: - # User provides only new_categorization - if not conversion: - new_categorization = ensure_categorization_instance(new_categorization) - dim_name, old_categorization = extract_categorization_from_dim(dim) - old_categorization = ensure_categorization_instance(old_categorization) - conversion = old_categorization.conversion_to(new_categorization) - # User provides new_categorization and conversion, but they don't match - # TODO: What's the use case of providing both? Maybe remove - elif new_categorization != conversion.categorization_b: - raise ValueError( - "New categorization is different to target categorisation in conversion." - ) - - return self._convert_inner( - dim, - conversion=conversion, - sum_rule=sum_rule, - input_weights=input_weights, - output_weights=output_weights, - auxiliary_dimensions=auxiliary_dimensions, - ) - def _fill_category( self, da: xr.DataArray, diff --git a/primap2/tests/test_convert.py b/primap2/tests/test_convert.py index 854cbde7..b8543a27 100644 --- a/primap2/tests/test_convert.py +++ b/primap2/tests/test_convert.py @@ -1,6 +1,7 @@ """Tests for _convert.py""" import pathlib +import re import climate_categories as cc import climate_categories._conversions as conversions @@ -11,17 +12,31 @@ import primap2 -def test_conversion_and_new_categorisation_missing(empty_ds: xr.Dataset): +def test_conversion_source_does_not_match_dataset_dimension(empty_ds): + # make a data set with IPCC1996 categories da = empty_ds["CO2"] da = da.expand_dims({"category (IPCC1996)": list(cc.IPCC1996.keys())}) da = da.expand_dims({"source (gas)": list(cc.gas.keys())}) - with pytest.raises(ValueError, match="conversion or new_categorization must be provided."): - da.pr.convert( + arr = da.data.copy() + arr[:] = 1 * primap2.ureg("Gg CO2 / year") + da.data = arr + + # load the BURDI to IPCC2006 category conversion + filepath = pathlib.Path("data/BURDI_conversion.csv") + conv = conversions.ConversionSpec.from_csv(filepath) + conv = conv.hydrate(cats=cc.cats) + + msg = ( + "The source categorization in the conversion (BURDI) " + "does not match the categorization in the data set (IPCC1996)." 
+ ) + with pytest.raises(ValueError, match=re.escape(msg)): + result = da.pr.convert( dim="category", + conversion=conv, ) -# test with existing conversion and two existing categorisations def test_convert_ipcc(empty_ds: xr.Dataset): # build a DA categorized by IPCC1996 and with 1 everywhere so results are easy # to see @@ -32,18 +47,18 @@ def test_convert_ipcc(empty_ds: xr.Dataset): arr[:] = 1 * primap2.ureg("Gg CO2 / year") da.data = arr - new_categorization_name = "IPCC2006" + conversion = cc.IPCC1996.conversion_to(cc.IPCC2006) with pytest.raises(ValueError, match="The conversion uses auxiliary categories"): da.pr.convert( dim="category", - new_categorization=new_categorization_name, + conversion=conversion, sum_rule="extensive", ) result = da.pr.convert( dim="category", - new_categorization=new_categorization_name, + conversion=conversion, sum_rule="extensive", auxiliary_dimensions={"gas": "source (gas)"}, ) @@ -172,7 +187,6 @@ def test_custom_conversion_and_two_custom_categorisations(empty_ds): result = da.pr.convert( dim="category", conversion=conv, - new_categorization=categorisation_b, sum_rule="extensive", ) From e62291d2b53c73678aaa5b41670252f6da87ec75 Mon Sep 17 00:00:00 2001 From: Daniel Busch Date: Thu, 24 Oct 2024 14:53:15 +0200 Subject: [PATCH 29/36] update climate categories --- primap2/tests/data/BURDI_conversion.csv | 3 ++- primap2/tests/test_convert.py | 21 +++++++++------------ setup.cfg | 2 +- 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/primap2/tests/data/BURDI_conversion.csv b/primap2/tests/data/BURDI_conversion.csv index 82e0ce50..028247c7 100644 --- a/primap2/tests/data/BURDI_conversion.csv +++ b/primap2/tests/data/BURDI_conversion.csv @@ -36,5 +36,6 @@ BURDI,IPCC2006_PRIMAP,comment 15163,M.0.EL 14637,M.BK 14424,M.BK.A -14423,M.BK.M, leaving 14638 --> M.BIO out for now, as it's not in climate categories +14423,M.BK.M, +14638, M.BIO 7,5, 5.A-D ignored as not fitting 2006 cats diff --git a/primap2/tests/test_convert.py b/primap2/tests/test_convert.py index b8543a27..20f750a6 100644 --- a/primap2/tests/test_convert.py +++ b/primap2/tests/test_convert.py @@ -4,7 +4,6 @@ import re import climate_categories as cc -import climate_categories._conversions as conversions import numpy as np import pytest import xarray as xr @@ -23,8 +22,8 @@ def test_conversion_source_does_not_match_dataset_dimension(empty_ds): # load the BURDI to IPCC2006 category conversion filepath = pathlib.Path("data/BURDI_conversion.csv") - conv = conversions.ConversionSpec.from_csv(filepath) - conv = conv.hydrate(cats=cc.cats) + + conv = cc.Conversion.from_csv(filepath) msg = ( "The source categorization in the conversion (BURDI) " @@ -71,8 +70,7 @@ def test_convert_ipcc(empty_ds: xr.Dataset): def test_convert_BURDI(empty_ds: xr.Dataset): # make a sample conversion object in climate categories filepath = pathlib.Path("data/BURDI_conversion.csv") - conv = conversions.ConversionSpec.from_csv(filepath) - conv = conv.hydrate(cats=cc.cats) + conv = cc.Conversion.from_csv(filepath) # taken from UNFCCC_non-AnnexI_data/src/unfccc_ghg_data/unfccc_di_reader/ # unfccc_di_reader_config.py @@ -153,10 +151,9 @@ def test_convert_BURDI(empty_ds: xr.Dataset): # 2.E + 2.B = 2.E, 2.E should not be part of new data set assert np.isnan(result.pr.loc[{"category": "2.E"}].values).all() # cat 14638 in BURDI equals cat M.BIO in IPCC2006_PRIMAP - # TODO: This will fail. 
M.BIO is currently not listed in climate categories - # assert ( - # (result.pr.loc[{"category": "M.BIO"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item() - # ) + assert ( + (result.pr.loc[{"category": "M.BIO"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item() + ) # test with new conversion and new categorisations @@ -167,14 +164,14 @@ def test_custom_conversion_and_two_custom_categorisations(empty_ds): # make categorisation B from yaml categorisation_b = cc.from_yaml("data/simple_categorisation_b.yaml") - # make conversion from csv - conv = conversions.ConversionSpec.from_csv("data/simple_conversion.csv") # categories not part of climate categories so we need to add them manually cats = { "A": categorisation_a, "B": categorisation_b, } - conv = conv.hydrate(cats=cats) + + # make conversion from csv + conv = cc.Conversion.from_csv("data/simple_conversion.csv", cats=cats) # make a dummy dataset based on A cats da = empty_ds["CO2"] diff --git a/setup.cfg b/setup.cfg index fd6b0e2a..cf8d3065 100644 --- a/setup.cfg +++ b/setup.cfg @@ -53,7 +53,7 @@ install_requires = openpyxl>=3.1 tqdm>=4.66 msgpack>=1 - climate_categories>=0.10.1 + climate_categories>=0.10.2 [options.extras_require] test = From bd9b91d1a47f0d9c9d803a95db29e44ba0d58491 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 24 Oct 2024 12:53:53 +0000 Subject: [PATCH 30/36] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- primap2/tests/data/simple_categorisation_a.yaml | 2 +- primap2/tests/data/simple_categorisation_b.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/primap2/tests/data/simple_categorisation_a.yaml b/primap2/tests/data/simple_categorisation_a.yaml index 1656c97b..beef5533 100644 --- a/primap2/tests/data/simple_categorisation_a.yaml +++ b/primap2/tests/data/simple_categorisation_a.yaml @@ -32,4 +32,4 @@ categories: - C - CatC unnumbered: - title: The unnumbered category \ No newline at end of file + title: The unnumbered category diff --git a/primap2/tests/data/simple_categorisation_b.yaml b/primap2/tests/data/simple_categorisation_b.yaml index 35751f9b..05e1dc07 100644 --- a/primap2/tests/data/simple_categorisation_b.yaml +++ b/primap2/tests/data/simple_categorisation_b.yaml @@ -24,4 +24,4 @@ categories: comment: The second category alternative_codes: - B - - CatB \ No newline at end of file + - CatB From 1b714fd5d4c12e32f9da811118d97a57a2ea54aa Mon Sep 17 00:00:00 2001 From: Daniel Busch Date: Thu, 24 Oct 2024 15:07:26 +0200 Subject: [PATCH 31/36] get test data with importlib --- primap2/tests/test_convert.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/primap2/tests/test_convert.py b/primap2/tests/test_convert.py index 20f750a6..79327765 100644 --- a/primap2/tests/test_convert.py +++ b/primap2/tests/test_convert.py @@ -1,6 +1,7 @@ """Tests for _convert.py""" -import pathlib +import importlib +import importlib.resources import re import climate_categories as cc @@ -11,6 +12,10 @@ import primap2 +def get_test_data_filepath(fname: str): + return importlib.resources.files("primap2.tests.data").joinpath(fname) + + def test_conversion_source_does_not_match_dataset_dimension(empty_ds): # make a data set with IPCC1996 categories da = empty_ds["CO2"] @@ -21,7 +26,7 @@ def test_conversion_source_does_not_match_dataset_dimension(empty_ds): da.data = arr # load the BURDI to IPCC2006 category conversion - filepath = 
pathlib.Path("data/BURDI_conversion.csv") + filepath = get_test_data_filepath("BURDI_conversion.csv") conv = cc.Conversion.from_csv(filepath) @@ -30,7 +35,7 @@ def test_conversion_source_does_not_match_dataset_dimension(empty_ds): "does not match the categorization in the data set (IPCC1996)." ) with pytest.raises(ValueError, match=re.escape(msg)): - result = da.pr.convert( + result = da.pr.convert( # noqa: F841 dim="category", conversion=conv, ) @@ -69,7 +74,7 @@ def test_convert_ipcc(empty_ds: xr.Dataset): # test with new conversion and two existing categorisations def test_convert_BURDI(empty_ds: xr.Dataset): # make a sample conversion object in climate categories - filepath = pathlib.Path("data/BURDI_conversion.csv") + filepath = get_test_data_filepath("BURDI_conversion.csv") conv = cc.Conversion.from_csv(filepath) # taken from UNFCCC_non-AnnexI_data/src/unfccc_ghg_data/unfccc_di_reader/ @@ -159,10 +164,10 @@ def test_convert_BURDI(empty_ds: xr.Dataset): # test with new conversion and new categorisations def test_custom_conversion_and_two_custom_categorisations(empty_ds): # make categorisation A from yaml - categorisation_a = cc.from_yaml("data/simple_categorisation_a.yaml") + categorisation_a = cc.from_yaml(get_test_data_filepath("simple_categorisation_a.yaml")) # make categorisation B from yaml - categorisation_b = cc.from_yaml("data/simple_categorisation_b.yaml") + categorisation_b = cc.from_yaml(get_test_data_filepath("simple_categorisation_b.yaml")) # categories not part of climate categories so we need to add them manually cats = { @@ -171,7 +176,7 @@ def test_custom_conversion_and_two_custom_categorisations(empty_ds): } # make conversion from csv - conv = cc.Conversion.from_csv("data/simple_conversion.csv", cats=cats) + conv = cc.Conversion.from_csv(get_test_data_filepath("simple_conversion.csv"), cats=cats) # make a dummy dataset based on A cats da = empty_ds["CO2"] From 73ce9599c33a29303cbada2de128f61f772a2a1c Mon Sep 17 00:00:00 2001 From: Daniel Busch Date: Thu, 24 Oct 2024 15:11:07 +0200 Subject: [PATCH 32/36] Revert "get test data with importlib" This reverts commit 1b714fd5d4c12e32f9da811118d97a57a2ea54aa. --- primap2/tests/test_convert.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/primap2/tests/test_convert.py b/primap2/tests/test_convert.py index 79327765..20f750a6 100644 --- a/primap2/tests/test_convert.py +++ b/primap2/tests/test_convert.py @@ -1,7 +1,6 @@ """Tests for _convert.py""" -import importlib -import importlib.resources +import pathlib import re import climate_categories as cc @@ -12,10 +11,6 @@ import primap2 -def get_test_data_filepath(fname: str): - return importlib.resources.files("primap2.tests.data").joinpath(fname) - - def test_conversion_source_does_not_match_dataset_dimension(empty_ds): # make a data set with IPCC1996 categories da = empty_ds["CO2"] @@ -26,7 +21,7 @@ def test_conversion_source_does_not_match_dataset_dimension(empty_ds): da.data = arr # load the BURDI to IPCC2006 category conversion - filepath = get_test_data_filepath("BURDI_conversion.csv") + filepath = pathlib.Path("data/BURDI_conversion.csv") conv = cc.Conversion.from_csv(filepath) @@ -35,7 +30,7 @@ def test_conversion_source_does_not_match_dataset_dimension(empty_ds): "does not match the categorization in the data set (IPCC1996)." 
) with pytest.raises(ValueError, match=re.escape(msg)): - result = da.pr.convert( # noqa: F841 + result = da.pr.convert( dim="category", conversion=conv, ) @@ -74,7 +69,7 @@ def test_convert_ipcc(empty_ds: xr.Dataset): # test with new conversion and two existing categorisations def test_convert_BURDI(empty_ds: xr.Dataset): # make a sample conversion object in climate categories - filepath = get_test_data_filepath("BURDI_conversion.csv") + filepath = pathlib.Path("data/BURDI_conversion.csv") conv = cc.Conversion.from_csv(filepath) # taken from UNFCCC_non-AnnexI_data/src/unfccc_ghg_data/unfccc_di_reader/ @@ -164,10 +159,10 @@ def test_convert_BURDI(empty_ds: xr.Dataset): # test with new conversion and new categorisations def test_custom_conversion_and_two_custom_categorisations(empty_ds): # make categorisation A from yaml - categorisation_a = cc.from_yaml(get_test_data_filepath("simple_categorisation_a.yaml")) + categorisation_a = cc.from_yaml("data/simple_categorisation_a.yaml") # make categorisation B from yaml - categorisation_b = cc.from_yaml(get_test_data_filepath("simple_categorisation_b.yaml")) + categorisation_b = cc.from_yaml("data/simple_categorisation_b.yaml") # categories not part of climate categories so we need to add them manually cats = { @@ -176,7 +171,7 @@ def test_custom_conversion_and_two_custom_categorisations(empty_ds): } # make conversion from csv - conv = cc.Conversion.from_csv(get_test_data_filepath("simple_conversion.csv"), cats=cats) + conv = cc.Conversion.from_csv("data/simple_conversion.csv", cats=cats) # make a dummy dataset based on A cats da = empty_ds["CO2"] From c90bb018d082d5347793e40e4e47f271af7a973b Mon Sep 17 00:00:00 2001 From: Daniel Busch Date: Thu, 24 Oct 2024 15:14:01 +0200 Subject: [PATCH 33/36] importlib --- primap2/tests/test_convert.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/primap2/tests/test_convert.py b/primap2/tests/test_convert.py index 20f750a6..79327765 100644 --- a/primap2/tests/test_convert.py +++ b/primap2/tests/test_convert.py @@ -1,6 +1,7 @@ """Tests for _convert.py""" -import pathlib +import importlib +import importlib.resources import re import climate_categories as cc @@ -11,6 +12,10 @@ import primap2 +def get_test_data_filepath(fname: str): + return importlib.resources.files("primap2.tests.data").joinpath(fname) + + def test_conversion_source_does_not_match_dataset_dimension(empty_ds): # make a data set with IPCC1996 categories da = empty_ds["CO2"] @@ -21,7 +26,7 @@ def test_conversion_source_does_not_match_dataset_dimension(empty_ds): da.data = arr # load the BURDI to IPCC2006 category conversion - filepath = pathlib.Path("data/BURDI_conversion.csv") + filepath = get_test_data_filepath("BURDI_conversion.csv") conv = cc.Conversion.from_csv(filepath) @@ -30,7 +35,7 @@ def test_conversion_source_does_not_match_dataset_dimension(empty_ds): "does not match the categorization in the data set (IPCC1996)." 
) with pytest.raises(ValueError, match=re.escape(msg)): - result = da.pr.convert( + result = da.pr.convert( # noqa: F841 dim="category", conversion=conv, ) @@ -69,7 +74,7 @@ def test_convert_ipcc(empty_ds: xr.Dataset): # test with new conversion and two existing categorisations def test_convert_BURDI(empty_ds: xr.Dataset): # make a sample conversion object in climate categories - filepath = pathlib.Path("data/BURDI_conversion.csv") + filepath = get_test_data_filepath("BURDI_conversion.csv") conv = cc.Conversion.from_csv(filepath) # taken from UNFCCC_non-AnnexI_data/src/unfccc_ghg_data/unfccc_di_reader/ @@ -159,10 +164,10 @@ def test_convert_BURDI(empty_ds: xr.Dataset): # test with new conversion and new categorisations def test_custom_conversion_and_two_custom_categorisations(empty_ds): # make categorisation A from yaml - categorisation_a = cc.from_yaml("data/simple_categorisation_a.yaml") + categorisation_a = cc.from_yaml(get_test_data_filepath("simple_categorisation_a.yaml")) # make categorisation B from yaml - categorisation_b = cc.from_yaml("data/simple_categorisation_b.yaml") + categorisation_b = cc.from_yaml(get_test_data_filepath("simple_categorisation_b.yaml")) # categories not part of climate categories so we need to add them manually cats = { @@ -171,7 +176,7 @@ def test_custom_conversion_and_two_custom_categorisations(empty_ds): } # make conversion from csv - conv = cc.Conversion.from_csv("data/simple_conversion.csv", cats=cats) + conv = cc.Conversion.from_csv(get_test_data_filepath("simple_conversion.csv"), cats=cats) # make a dummy dataset based on A cats da = empty_ds["CO2"] From fffb84af5d5c4b6faed8035776f656c4bf81afd4 Mon Sep 17 00:00:00 2001 From: Daniel Busch Date: Thu, 24 Oct 2024 15:21:20 +0200 Subject: [PATCH 34/36] clean up --- primap2/_convert.py | 4 +--- primap2/tests/test_convert.py | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/primap2/_convert.py b/primap2/_convert.py index 295ac437..36662e23 100644 --- a/primap2/_convert.py +++ b/primap2/_convert.py @@ -17,9 +17,7 @@ def convert( self, dim: Hashable | str, *, - # TODO type will change to climate_categories.Conversion when - # https://github.com/primap-community/climate_categories/pull/164 is merged - conversion: climate_categories._conversions.Conversion, + conversion: climate_categories.Conversion, sum_rule: typing.Literal["intensive", "extensive"] | None = None, input_weights: xr.DataArray | None = None, output_weights: xr.DataArray | None = None, diff --git a/primap2/tests/test_convert.py b/primap2/tests/test_convert.py index 79327765..2126b6cf 100644 --- a/primap2/tests/test_convert.py +++ b/primap2/tests/test_convert.py @@ -27,7 +27,6 @@ def test_conversion_source_does_not_match_dataset_dimension(empty_ds): # load the BURDI to IPCC2006 category conversion filepath = get_test_data_filepath("BURDI_conversion.csv") - conv = cc.Conversion.from_csv(filepath) msg = ( From 4b9bf2b7028a83b77a76e1174901025cdeaf683c Mon Sep 17 00:00:00 2001 From: Daniel Busch Date: Mon, 28 Oct 2024 10:04:32 +0100 Subject: [PATCH 35/36] test signed commit --- primap2/tests/test_convert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/primap2/tests/test_convert.py b/primap2/tests/test_convert.py index 2126b6cf..e91cd9a0 100644 --- a/primap2/tests/test_convert.py +++ b/primap2/tests/test_convert.py @@ -148,7 +148,7 @@ def test_convert_BURDI(empty_ds: xr.Dataset): .item() ) # 3.C.7 (converted from 4.C) should still be part of the data set, - # although it apprears in two conversion rules + # 
although it appears in two conversion rules assert ( (result.pr.loc[{"category": "3.C.7"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item() ) From 86aec445e80d16ca7ff9627108ef65f7d84e3423 Mon Sep 17 00:00:00 2001 From: Daniel Busch Date: Mon, 28 Oct 2024 10:08:36 +0100 Subject: [PATCH 36/36] update email for verified commits --- primap2/tests/test_convert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/primap2/tests/test_convert.py b/primap2/tests/test_convert.py index e91cd9a0..2edfe5af 100644 --- a/primap2/tests/test_convert.py +++ b/primap2/tests/test_convert.py @@ -121,7 +121,7 @@ def test_convert_BURDI(empty_ds: xr.Dataset): "7", ] - # build a DA categorized by BURDI and with 1 everywhere so results are easy + # build a DA categorised by BURDI and with 1 everywhere so results are easy # to see da = empty_ds["CO2"] da = da.expand_dims({"category (BURDI)": BURDI_categories})
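
A minimal usage sketch of the accessor API as it stands at the end of this series, modeled on the calls in primap2/tests/test_convert.py. ``ds`` is assumed to be an existing primap2 dataset whose "CO2" variable carries a "category (IPCC1996)" and a "source (gas)" dimension; it is an illustrative placeholder, not something defined in these patches:

    import climate_categories as cc

    import primap2  # noqa: F401  # registers the ``.pr`` accessor on xarray objects

    # Assumption: ``ds`` is an existing primap2 dataset with the dimensions
    # "category (IPCC1996)" and "source (gas)", like the test fixtures.
    da = ds["CO2"]

    # Conversion rules shipped with climate_categories (IPCC1996 -> IPCC2006).
    conversion = cc.IPCC1996.conversion_to(cc.IPCC2006)

    converted = da.pr.convert(
        dim="category",  # alias resolved to the full "category (IPCC1996)" dimension
        conversion=conversion,
        sum_rule="extensive",  # yearly emission totals may be summed over categories
        auxiliary_dimensions={"gas": "source (gas)"},  # needed for gas-specific rules
    )

Custom categorizations follow the same pattern: build the rules with ``cc.Conversion.from_csv(path, cats={...})``, as in test_custom_conversion_and_two_custom_categorisations, and pass the resulting object via the ``conversion`` argument.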