From 952d9aca23bb2b21e9ea91841c235e1c9b39ff18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Pfl=C3=BCger?= Date: Thu, 23 Sep 2021 19:55:50 +0200 Subject: [PATCH 01/36] Add initial support for converting DataArrays. --- primap2/_aggregate.py | 28 ++- primap2/_alias_selection.py | 28 ++- primap2/_convert.py | 309 ++++++++++++++++++++++++++++++++++ primap2/_downscale.py | 8 +- primap2/accessors.py | 2 + primap2/tests/test_convert.py | 20 +++ setup.cfg | 1 + 7 files changed, 369 insertions(+), 27 deletions(-) create mode 100644 primap2/_convert.py create mode 100644 primap2/tests/test_convert.py diff --git a/primap2/_aggregate.py b/primap2/_aggregate.py index b1e30afb..59539030 100644 --- a/primap2/_aggregate.py +++ b/primap2/_aggregate.py @@ -33,16 +33,15 @@ def select_no_scalar_dimension( """ if sel is None: return obj - else: - sele: DatasetOrDataArray = obj.loc[sel] - if dim_names(obj) != dim_names(sele): - raise ValueError( - "The dimension of the selection doesn't match the dimension of the " - "orginal dataset. Likely you used a selection casting to a scalar " - "dimension, like sel={'axis': 'value'}. Please use " - "sel={'axis': ['value']} instead." - ) - return sele + selection: DatasetOrDataArray = obj.loc[sel] + if dim_names(obj) != dim_names(selection): + raise ValueError( + "The dimension of the selection doesn't match the dimension of the " + "orginal dataset. Likely you used a selection casting to a scalar " + "dimension, like sel={'axis': 'value'}. Please use " + "sel={'axis': ['value']} instead." + ) + return selection class DataArrayAggregationAccessor(BaseDataArrayAccessor): @@ -54,11 +53,10 @@ def _reduce_dim( "Only one of 'dim' and 'reduce_to_dim' may be supplied, not both." ) - if dim is None: - if reduce_to_dim is not None: - if isinstance(reduce_to_dim, str): - reduce_to_dim = [reduce_to_dim] - dim = set(self._da.dims) - set(reduce_to_dim) + if dim is None and reduce_to_dim is not None: + if isinstance(reduce_to_dim, str): + reduce_to_dim = [reduce_to_dim] + dim = set(self._da.dims) - set(reduce_to_dim) return dim diff --git a/primap2/_alias_selection.py b/primap2/_alias_selection.py index 9b16749c..73f75335 100644 --- a/primap2/_alias_selection.py +++ b/primap2/_alias_selection.py @@ -16,6 +16,22 @@ def __init__(self, dim): def translate(item: KeyT, translations: typing.Mapping[typing.Hashable, str]) -> KeyT: + """Translates a single str key or the keys of a dict using the given translations. + + If a key is not found in the translations, return it untranslated. + + Parameters + ---------- + item : str or dict with str keys + The input to translate. Either a str or a dict with str keys. + translations : dict + The translations to apply. + + Returns + ------- + translated : str or dict with str keys + The same type as the input item, but translated. 
+ """ if isinstance(item, str): if item in translations: return translations[item] @@ -54,10 +70,9 @@ def translations_from_dims( ) -> typing.Dict[typing.Hashable, str]: ret: typing.Dict[typing.Hashable, str] = {} for dim in dims: - if isinstance(dim, str): - if " (" in dim: - key: str = dim.split("(")[0][:-1] - ret[key] = dim + if isinstance(dim, str) and " (" in dim: + key: str = dim.split("(")[0][:-1] + ret[key] = dim if "scenario" in ret: ret["scen"] = ret["scenario"] if "category" in ret: @@ -77,10 +92,7 @@ def alias( return dim else: try: - rdim = [] - for idim in dim: - rdim.append(alias(idim, translations, dims)) - return rdim + return [alias(idim, translations, dims) for idim in dim] except TypeError: # not iterable, so some other hashable like int if dim not in dims: raise DimensionNotExistingError(dim) diff --git a/primap2/_convert.py b/primap2/_convert.py new file mode 100644 index 00000000..96439f44 --- /dev/null +++ b/primap2/_convert.py @@ -0,0 +1,309 @@ +import typing +from typing import Hashable + +import climate_categories +import numpy as np +import xarray as xr +from loguru import logger + +from . import _accessor_base +from ._alias_selection import alias_dims + + +def extract_categorization_from_dim(dim: str) -> (str, str): + """Extract the pure dimension and the categorization from a composite dim. + + Parameters + ---------- + dim : str + Composite dim name like ``area (ISO3)`` where ``area`` is the pure dimension + name and ``ISO3`` is the used categorization. + + Examples + -------- + >>> extract_categorization_from_dim("area (ISO3)") + ('area', 'ISO3') + >>> extract_categorization_from_dim("area") + Traceback (most recent call last): + ... + ValueError: No categorization specified: 'area'. + + + Returns + ------- + pure_dim, categorization : str, str + The pure_dim without categorization information and the categorization. If the + input dim does not contain categorization information, a ValueError is raised. + """ + try: + pure, cat = dim.split("(", 1) + except ValueError: + raise ValueError(f"No categorization specified: {dim!r}.") + return pure[:-1], cat[:-1] + + +def applicable_rule(conversion, category): + """Choose the best rule to derive the given category using the given conversion. + + If there are multiple relevant rules, will prefer rules with: + 1. the given category as the only target category. + 2. only one source category + 3. rules defined earlier in the CSV. + + TODO: how to deal with restricted rules? 
+ """ + rules = conversion.relevant_rules({conversion.categorization_b[category]}) + # a + b = c - d can not be used to derive c nor d, only a and b + rules = [r for r in rules if all(f > 0 for f in r.factors_categories_b.values())] + # drop all restricted rules + # TODO do something smart with restricted rules + rules = [r for r in rules if not any(r.auxiliary_categories.values())] + + if not rules: + raise KeyError(category) + # narrow down rules until we have exactly one rule to apply + # prefer rules where the target category is the only summand + if len(rules) != 1: + cardinalities = [r.cardinality_b for r in rules] + if "one" in cardinalities: + for i in range(len(rules)): + if cardinalities[i] == "many": + rules.pop(i) + # prefer rules with exactly one source category + if len(rules) != 1: + cardinalities = [r.cardinality_a for r in rules] + if "one" in cardinalities: + for i in range(len(rules)): + if cardinalities[i] == "many": + rules.pop(i) + # if we still have multiple eligible rules, just use the first + if len(rules) != 1: + rule_str = str(rules[0]) + logger.info( + f"There are {len(rules)} rules to derive data for" + f" {category!r}, will" + f" use {rule_str!r} because it was defined earlier." + ) + + return rules[0] + + +class DataArrayConversionAccessor(_accessor_base.BaseDataArrayAccessor): + @alias_dims(["dim"]) + def convert( + self, + dim: typing.Union[Hashable, str], + categorization: typing.Union[climate_categories.Categorization, str], + *, + sum_rule: typing.Optional[str] = None, + input_weights: typing.Optional[xr.DataArray] = None, + output_weights: typing.Optional[xr.DataArray] = None, + ) -> xr.DataArray: + """Convert the data along the given dimension into the new categorization. + + Maps the given dimension from one categorization (terminology) into another. + Fetches the rules to do the mapping from the climate_categories package, and + therefore will only work if there are conversions rules to convert from the + current categorization to the new categorization. + + Parameters + ---------- + dim : str + Dimension to convert. Has to be a dimension from ``da.dims``. + categorization : climate_categories.Categorization or str + New categorization to convert the given dimension to. Either give the title + of the new categorization (like ``IPCC1996``) or a + ``climate_categories.Categorization`` object. + sum_rule : ``extensive``, ``intensive``, or None (default) + If data of categories has to be summed up or divided, we need information + whether the quantity measured is extensive (like, for example, total + emissions in a year subdivided into multiple sectoral categories) or + intensive (like, for example, average per-person emissions in a year + subdivided into different territorial entities). By default (None), a + warning is issued if data has to be summed up or divided. + input_weights : xr.DataArray, optional + If data in input categories has to be summed up and the sum_rule is + ``intensive``, weights for the input categories are required. + The weights can be given in any shape compatible with the DataArray that + is converted, e.g. to give different weights for industrial sectors by + country. However, at least the ``dim`` that is converted needs to be in + ``input_weights.dims``. + If no weights are specified but a rule requiring weights is specified + in the conversion rules, a warning is issued and the respective rule is + skipped (probably resulting in more NaNs in the output). 
+ output_weights : xr.DataArray, optional + If data has to be divided into several output categories and the sum_rule is + ``extensive``, weights for the output categories are required. + The weights can be given in any shape compatible with the DataArray that + is converted, e.g. to give different weights for industrial sectors by + country. However, at least the ``dim`` that is converted needs to be in + ``output_weights.dims``. + If no weights are specified but a rule requiring weights is specified + in the conversion rules, a warning is issued and the respective rule is + skipped (probably resulting in more NaNs in the output). + + Returns + ------- + converted : xr.DataArray + A copy of the DataArray with the given dimension converted in the new + categorization. + """ + if not isinstance(categorization, climate_categories.Categorization): + categorization = climate_categories.cats[categorization] + + if sum_rule not in (None, "extensive", "intensive"): + raise ValueError( + f"sum_rule must bei either 'extensive' or 'intensive', not {sum_rule}" + ) + + dim_name, old_categorization_name = extract_categorization_from_dim(dim) + old_categorization: climate_categories.Categorization = climate_categories.cats[ + old_categorization_name + ] + conversion = old_categorization.conversion_to(categorization) + new_dim = f"{dim_name} ({categorization.name})" + + new_dims = [] + new_shape = [] + for i, old_dim in enumerate(self._da.dims): + if old_dim == dim: + new_dims.append(new_dim) + new_shape.append(len(categorization)) + else: + new_dims.append(old_dim) + new_shape.append(self._da.shape[i]) + + new_coords = {} + for coord in self._da.coords: + if coord == dim: + new_coords[new_dim] = np.array(list(categorization.keys())) + elif dim in self._da.coords[coord].dims: + logger.info( + f"Additional coordinate {coord} can not be converted automatically" + f" and is skipped." + ) + continue + else: + new_coords[coord] = self._da.coords[coord] + + # initialize the converted array using all NA + all_na_array = np.empty(new_shape) + all_na_array[:] = np.nan + converted = xr.DataArray( + data=all_na_array, + dims=new_dims, + coords=new_coords, + name=self._da.name, + attrs=self._da.attrs, + ) + + converted_categories = [] + for category in converted[new_dim]: + category = category.item() + if category in converted_categories: + continue + try: + rule = applicable_rule(conversion, category) + except KeyError: + logger.debug(f"No rule to derive data for {category!r}, will be NaN.") + continue + + # convert rule into xarray objects that will cleanly multiply regardless + # of dimensionality + input_selection = { + dim: [cat.codes[0] for cat in rule.factors_categories_a.keys()] + } + input_factors = xr.DataArray( + data=list(rule.factors_categories_a.values()), + dims=[dim], + coords=input_selection, + ) + new_dim_values = [cat.codes[0] for cat in rule.factors_categories_b.keys()] + output_selection = {new_dim: new_dim_values} + output_factors = xr.DataArray( + data=list(rule.factors_categories_b.values()), + dims=[new_dim], + coords=output_selection, + ) + + # if the applicable rule is a multi-output rule, but some of the + # outputs are already converted, give up + already_converted = set(new_dim_values).intersection( + set(converted_categories) + ) + if already_converted: + # TODO: maybe we can do better? + logger.warning( + f"For category {category!r}, would want to use a " + "rule with multiple outputs, but the following outputs " + f"are already converted: {already_converted!r}. 
" + "Skipping this category and leaving it NaN." + ) + continue + + # derive input and output weights (maybe trivial) + if rule.cardinality_a == "one" or sum_rule == "extensive": + effective_input_weights = 1 + elif sum_rule == "intensive": + # summing intensive units requires weights + if input_weights is None: + logger.warning( + f"To derive data for {category!r}, we need to sum up" + " multiple input categories. For sum_rule='intensive'," + " this requires input_weights, but none are specified." + " Will continue with NaN, specify input_weights to avoid this." + ) + continue + effective_input_weights = input_weights.loc[input_selection] + # normalize so it is actually a weight, not a factor + effective_input_weights /= effective_input_weights.sum(dim=dim) + else: # no sum rule specified, but needed + logger.warning( + f"To derive data for {category!r}, we need to sum up" + " multiple input categories, but the sum_rule is" + " not specified. Will continue with NaN, specify the" + " sum_rule to avoid this." + ) + continue + + if rule.cardinality_b == "one" or sum_rule == "intensive": + effective_output_weights = 1 + elif sum_rule == "extensive": + # dividing extensive units requires weights + if output_weights is None: + logger.warning( + f"To derive data for {category!r}, we need to split up" + " multiple output categories. For sum_rule='extensive'," + " this requires output_weights, but none are specified." + " Will continue with NaN, specify output_weights to avoid this." + ) + continue + effective_output_weights = output_weights.loc[output_selection] + # normalize so it is actually a weight, not a factor + effective_output_weights /= effective_output_weights.sum(dim=dim) + else: # no sum rule specified, but needed + logger.warning( + f"To derive data for {category!r}, we need to split up" + " multiple output categories, but the sum_rule is" + " not specified. Will continue with NaN, specify the" + " sum_rule to avoid this." + ) + continue + + # the left-hand side of the conversion formula summed up + lhs = ( + input_factors * effective_input_weights * self._da.loc[input_selection] + ).sum(dim=dim) + # the right-hand side of the conversion formula split up + rhs = lhs / output_factors / effective_output_weights + # TODO: using pr.set here is not efficient because it makes copies + converted = converted.pr.set( + dim=new_dim, + key=new_dim_values, + value=rhs, + ) + + # mark all filled categories as converted + converted_categories += new_dim_values + + return converted diff --git a/primap2/_downscale.py b/primap2/_downscale.py index 16b09134..a9eaf172 100644 --- a/primap2/_downscale.py +++ b/primap2/_downscale.py @@ -36,13 +36,13 @@ def downscale_timeseries( ---------- dim: str The name of the dimension which contains the basket and its contents, has to - be one of the dimensions in ``ds.dims``. + be one of the dimensions in ``da.dims``. basket: str The name of the super-category for which values are known at higher temporal - resolution and/or for a wider range. A value from ``ds[dimension]``. + resolution and/or for a wider range. A value from ``da[dimension]``. basket_contents: list of str The name of the sub-categories. The sum of all sub-categories equals the - basket. Values from ``ds[dimension]``. + basket. Values from ``da[dimension]``. 
check_consistency: bool, default True If for all points where the basket and all basket_contents are defined, it should be checked if the sum of the basket_contents actually equals @@ -50,7 +50,7 @@ def downscale_timeseries( sel: Selection dict, optional If the downscaling should only be done on a subset of the Dataset while retaining all other values unchanged, give a selection dictionary. The - downscaling will be done on ``ds.loc[sel]``. + downscaling will be done on ``da.loc[sel]``. skipna_evaluation_dims: list of str, optional Dimensions which should be evaluated to determine if NA values should be skipped entirely if missing fully. By default, no NA values are skipped. diff --git a/primap2/accessors.py b/primap2/accessors.py index 1323b188..e177362e 100644 --- a/primap2/accessors.py +++ b/primap2/accessors.py @@ -7,6 +7,7 @@ DataArrayAliasSelectionAccessor, DatasetAliasSelectionAccessor, ) +from ._convert import DataArrayConversionAccessor from ._data_format import DatasetDataFormatAccessor from ._downscale import DataArrayDownscalingAccessor, DatasetDownscalingAccessor from ._metadata import DatasetMetadataAccessor @@ -33,6 +34,7 @@ class PRIMAP2DatasetAccessor( class PRIMAP2DataArrayAccessor( DataArrayAggregationAccessor, DataArrayAliasSelectionAccessor, + DataArrayConversionAccessor, DataArrayDownscalingAccessor, DataArrayOverviewAccessor, DataArraySettersAccessor, diff --git a/primap2/tests/test_convert.py b/primap2/tests/test_convert.py new file mode 100644 index 00000000..fbd7b9fb --- /dev/null +++ b/primap2/tests/test_convert.py @@ -0,0 +1,20 @@ +"""Tests for _convert.py""" + +import climate_categories as cc +import xarray as xr + +import primap2 + + +def test_convert_ipcc(empty_ds: xr.Dataset): + # build a DA categorized by IPCC1996 and with 1 everywhere so results are easy + # to see + da = empty_ds["CO2"] + da = da.expand_dims({"category (IPCC1996)": list(cc.IPCC1996.keys())}) + arr = da.data.copy() + arr[:] = 1 * primap2.ureg("Gg CO2 / year") + da.data = arr + + da.pr.convert("category", "IPCC2006", sum_rule="extensive") + + # TODO test that values actually make sense diff --git a/setup.cfg b/setup.cfg index 8668f10d..4269e994 100644 --- a/setup.cfg +++ b/setup.cfg @@ -47,6 +47,7 @@ install_requires = ruamel.yaml strictyaml openpyxl + climate_categories>=0.5.0 [options.extras_require] test = From 2e5bb708f9c36a03d34714c3ea11b322eec3e16f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Pfl=C3=BCger?= Date: Tue, 2 Nov 2021 16:15:54 +0100 Subject: [PATCH 02/36] Depend on version of climate_categories with conversion support. --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 4269e994..8acb6943 100644 --- a/setup.cfg +++ b/setup.cfg @@ -47,7 +47,7 @@ install_requires = ruamel.yaml strictyaml openpyxl - climate_categories>=0.5.0 + climate_categories>=0.6.0 [options.extras_require] test = From 2afb274364303c62f5e9fb0129dd7e1df521d99f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Pfl=C3=BCger?= Date: Tue, 2 Nov 2021 17:32:46 +0100 Subject: [PATCH 03/36] Conversions: refactor some things into own sub-functions, fix some oversights. 
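This patch splits the conversion logic into small helpers (ensure_categorization_instance, check_valid_sum_rule_types, initialize_empty_converted_da). A minimal sketch of how the first two behave, assuming primap2 from this branch is importable; it only uses the definitions visible in the diff below:

    from primap2._convert import (
        check_valid_sum_rule_types,
        ensure_categorization_instance,
    )

    # accepts either a categorization name or a Categorization object and
    # always returns the Categorization object
    cat = ensure_categorization_instance("IPCC2006")
    assert cat is ensure_categorization_instance(cat)

    # passes silently for None, "extensive" and "intensive";
    # raises ValueError for anything else
    check_valid_sum_rule_types("extensive")
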
--- primap2/_convert.py | 157 +++++++++++++++++++++++++++++++------------- 1 file changed, 112 insertions(+), 45 deletions(-) diff --git a/primap2/_convert.py b/primap2/_convert.py index 96439f44..3edb5dba 100644 --- a/primap2/_convert.py +++ b/primap2/_convert.py @@ -1,3 +1,4 @@ +import copy import typing from typing import Hashable @@ -88,6 +89,106 @@ def applicable_rule(conversion, category): return rules[0] +def ensure_categorization_instance( + cat: typing.Union[str, climate_categories.Categorization] +) -> climate_categories.Categorization: + """Takes a categorization name or object and returns the corresponding + categorization object.""" + if isinstance(cat, climate_categories.Categorization): + return cat + return climate_categories.cats[cat] + + +def check_valid_sum_rule_types(sum_rule: typing.Optional[str]): + """Checks if the sum_rule is either "intensive", "extensive", or None. + + Raises a ValueError if an invalid sum_rule is used.""" + if sum_rule not in (None, "extensive", "intensive"): + raise ValueError( + f"if defined, sum_rule must be either 'extensive' or 'intensive', not" + f" {sum_rule}" + ) + + +def initialize_empty_converted_da( + *, + old_da: xr.DataArray, + old_dim: typing.Union[Hashable, str], + new_dim: str, + new_categorization: climate_categories.Categorization, +) -> xr.DataArray: + """Build a DataArray which can hold the data after conversion to a new + categorization. + + Returns a new DataArray with the same dimensions and coordinates as the old + DataArray, but with the old_dim dimension replaced by new_dim using the + new_categorization. + The returned DataArray is filled with NaN. + + Parameters + ---------- + old_da: xr.DataArray + The unconverted array. + old_dim: str + The name of the dimension (including the categorization) which will be + converted. Example: "area (ISO3)" + new_dim: str + The name of the dimension (including the categorization) after conversion. + Example: "area (ISO2)" + new_categorization: climate_categories.Categorization + The new categorization object. + + Returns + ------- + new_da: xr.DataArray + An empty array with the right shape to hold the data after conversion. + """ + new_dims = [] + new_shape = [] + for i, idim in enumerate(old_da.dims): + if idim == old_dim: + new_dims.append(new_dim) + new_shape.append(len(new_categorization)) + else: + new_dims.append(idim) + new_shape.append(old_da.shape[i]) + + new_coords = {} + for coord in old_da.coords: + if coord == old_dim: + new_coords[new_dim] = np.array(list(new_categorization.keys())) + elif old_dim in old_da.coords[coord].dims: + # The additional coordinate has the old_dim as one dimension, but we + # won't be able to convert it + logger.info( + f"Additional coordinate {coord} can not be converted automatically" + f" and is skipped." 
+ ) + continue + else: + new_coords[coord] = old_da.coords[coord] + + new_attrs = copy.deepcopy(old_da.attrs) + for pdim in ("area", "cat", "scen"): + if pdim in new_attrs and new_attrs[pdim] == old_dim: + new_attrs[pdim] = new_dim + + if "sec cats" in new_attrs and old_dim in new_attrs["sec_cats"]: + new_attrs["sec_cats"].remove(old_dim) + new_attrs["sec_cats"].append(new_dim) + + # initialize the converted array using all NA + all_na_array = np.empty(new_shape) + all_na_array[:] = np.nan + return xr.DataArray( + data=all_na_array, + dims=new_dims, + coords=new_coords, + name=old_da.name, + attrs=new_attrs, + ) + + class DataArrayConversionAccessor(_accessor_base.BaseDataArrayAccessor): @alias_dims(["dim"]) def convert( @@ -148,53 +249,19 @@ def convert( A copy of the DataArray with the given dimension converted in the new categorization. """ - if not isinstance(categorization, climate_categories.Categorization): - categorization = climate_categories.cats[categorization] - - if sum_rule not in (None, "extensive", "intensive"): - raise ValueError( - f"sum_rule must bei either 'extensive' or 'intensive', not {sum_rule}" - ) + new_categorization = ensure_categorization_instance(categorization) + check_valid_sum_rule_types(sum_rule) dim_name, old_categorization_name = extract_categorization_from_dim(dim) - old_categorization: climate_categories.Categorization = climate_categories.cats[ - old_categorization_name - ] - conversion = old_categorization.conversion_to(categorization) - new_dim = f"{dim_name} ({categorization.name})" - - new_dims = [] - new_shape = [] - for i, old_dim in enumerate(self._da.dims): - if old_dim == dim: - new_dims.append(new_dim) - new_shape.append(len(categorization)) - else: - new_dims.append(old_dim) - new_shape.append(self._da.shape[i]) - - new_coords = {} - for coord in self._da.coords: - if coord == dim: - new_coords[new_dim] = np.array(list(categorization.keys())) - elif dim in self._da.coords[coord].dims: - logger.info( - f"Additional coordinate {coord} can not be converted automatically" - f" and is skipped." - ) - continue - else: - new_coords[coord] = self._da.coords[coord] - - # initialize the converted array using all NA - all_na_array = np.empty(new_shape) - all_na_array[:] = np.nan - converted = xr.DataArray( - data=all_na_array, - dims=new_dims, - coords=new_coords, - name=self._da.name, - attrs=self._da.attrs, + old_categorization = ensure_categorization_instance(old_categorization_name) + conversion = old_categorization.conversion_to(new_categorization) + new_dim = f"{dim_name} ({new_categorization.name})" + + converted = initialize_empty_converted_da( + old_da=self._da, + old_dim=dim, + new_dim=new_dim, + new_categorization=new_categorization, ) converted_categories = [] From 000dfc1d49204caf3a00a7b3f03e1db2018fc175 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Pfl=C3=BCger?= Date: Fri, 5 Nov 2021 18:58:21 +0100 Subject: [PATCH 04/36] Conversions: refactor, do something useful with rules restricted to specific categories. 
--- primap2/_convert.py | 624 +++++++++++++++++++++++----------- primap2/tests/test_convert.py | 12 +- setup.cfg | 2 +- 3 files changed, 429 insertions(+), 209 deletions(-) diff --git a/primap2/_convert.py b/primap2/_convert.py index 3edb5dba..5917190d 100644 --- a/primap2/_convert.py +++ b/primap2/_convert.py @@ -11,6 +11,246 @@ from ._alias_selection import alias_dims +class DataArrayConversionAccessor(_accessor_base.BaseDataArrayAccessor): + @alias_dims(["dim"]) + def convert( + self, + dim: typing.Union[Hashable, str], + categorization: typing.Union[climate_categories.Categorization, str], + *, + sum_rule: typing.Optional[str] = None, + input_weights: typing.Optional[xr.DataArray] = None, + output_weights: typing.Optional[xr.DataArray] = None, + auxiliary_dimensions: typing.Optional[typing.Dict[str, str]] = None, + ) -> xr.DataArray: + """Convert the data along the given dimension into the new categorization. + + Maps the given dimension from one categorization (terminology) into another. + Fetches the rules to do the mapping from the climate_categories package, and + therefore will only work if there are conversions rules to convert from the + current categorization to the new categorization. + + Parameters + ---------- + dim : str + Dimension to convert. Has to be a dimension from ``da.dims``. + categorization : climate_categories.Categorization or str + New categorization to convert the given dimension to. Either give the title + of the new categorization (like ``IPCC1996``) or a + ``climate_categories.Categorization`` object. + sum_rule : ``extensive``, ``intensive``, or None (default) + If data of categories has to be summed up or divided, we need information + whether the quantity measured is extensive (like, for example, total + emissions in a year subdivided into multiple sectoral categories) or + intensive (like, for example, average per-person emissions in a year + subdivided into different territorial entities). By default (None), a + warning is issued if data has to be summed up or divided. + input_weights : xr.DataArray, optional + If data in input categories has to be summed up and the sum_rule is + ``intensive``, weights for the input categories are required. + The weights can be given in any shape compatible with the DataArray that + is converted, e.g. to give different weights for industrial sectors by + country. However, at least the ``dim`` that is converted needs to be in + ``input_weights.dims``. + If no weights are specified but a rule requiring weights is specified + in the conversion rules, a warning is issued and the respective rule is + skipped (probably resulting in more NaNs in the output). + output_weights : xr.DataArray, optional + If data has to be divided into several output categories and the sum_rule is + ``extensive``, weights for the output categories are required. + The weights can be given in any shape compatible with the DataArray that + is converted, e.g. to give different weights for industrial sectors by + country. However, at least the ``dim`` that is converted needs to be in + ``output_weights.dims``. + If no weights are specified but a rule requiring weights is specified + in the conversion rules, a warning is issued and the respective rule is + skipped (probably resulting in more NaNs in the output). + auxiliary_dimensions : dict[str, str], optional + Mapping of auxiliary categorizations to dimension names used in this + DataArray. In conversions which contain rules which are valid only for + certain orthogonal dimensions (e.g. 
a conversion between different sectoral + terminologies, but some rules are only valid for specific countries), only + the categorization is specified. Therefore, in this case you have to specify + a mapping from categorization name to dimension name. + Example: {"ISO3": "area (ISO3)"}) . + + Returns + ------- + converted : xr.DataArray + A copy of the DataArray with the given dimension converted in the new + categorization. + """ + new_categorization = ensure_categorization_instance(categorization) + check_valid_sum_rule_types(sum_rule) + + dim_name, old_categorization_name = extract_categorization_from_dim(dim) + old_categorization = ensure_categorization_instance(old_categorization_name) + conversion = old_categorization.conversion_to(new_categorization) + auxiliary_dimensions = prepare_auxiliary_dimensions( + conversion, auxiliary_dimensions + ) + new_dim = f"{dim_name} ({new_categorization.name})" + + converted_da = initialize_empty_converted_da( + old_da=self._da, + old_dim=dim, + new_dim=new_dim, + new_categorization=new_categorization, + ) + + converted_categories = [] + for category in converted_da[new_dim]: + if category in converted_categories: + continue + newly_converted_categories, converted_da = self._fill_category( + da=converted_da, + dim=dim, + new_dim=new_dim, + already_converted_categories=converted_categories, + category=category.item(), + conversion=conversion, + sum_rule=sum_rule, + auxiliary_dimensions=auxiliary_dimensions, + input_weights=input_weights, + output_weights=output_weights, + ) + converted_categories += newly_converted_categories + + return converted_da + + def _fill_category( + self, + da: xr.DataArray, + dim: str, + new_dim: str, + already_converted_categories: typing.List[climate_categories.Category], + category: climate_categories.Category, + conversion: climate_categories.Conversion, + sum_rule: typing.Optional[str], + auxiliary_dimensions: typing.Optional[ + typing.Dict[climate_categories.Categorization, str] + ], + input_weights: typing.Optional[xr.DataArray] = None, + output_weights: typing.Optional[xr.DataArray] = None, + ) -> typing.Tuple[typing.List[climate_categories.Category], xr.DataArray]: + """Return a copy of da with the given category filled by values converted + using the given conversion. + + Parameters + ---------- + da: xr.DataArray + The array which should be filled with the newly converted values. + dim: str + The source dimension. + new_dim: str + The target dimension. + already_converted_categories: list of climate_categories.Category + Categories which are already converted and should not be overwritten. + This is important if the category that should be filled can be filled + using compound rules which fill additional categories. + category: climate_categories.Category + The category from the new dimension which should be filled. + conversion: climate_categories.Conversion + The conversion to use to compute the values for the given category. + sum_rule: str, optional + See docstring of `convert`. + auxiliary_dimensions: + See docstring of `convert`. + input_weights: xr.DataArray, optional + See docstring of `convert`. + output_weights: xr.DataArray, optional + See docstring of `convert`. + + Returns + ------- + filled_categories, filled: list of climate_categories.category, xr.DataArray + The categories that were filled and the new DataArray. 
+ """ + try: + rules = applicable_rules(conversion, category) + except KeyError: + logger.debug(f"No rule to derive data for {category!r}, will be NaN.") + return [], da + + for rule in rules: + logger.debug(f"Processing rule {rule}.") + # iterate until a non-restricted rule was applied or all rules are + # exhausted + input_selection, input_factors = factors_categories_to_xarray( + dim=dim, + factors_categories=rule.factors_categories_a, + auxiliary_categories=rule.auxiliary_categories, + auxiliary_dimensions=auxiliary_dimensions, + ) + output_selection, output_factors = factors_categories_to_xarray( + dim=new_dim, + factors_categories=rule.factors_categories_b, + auxiliary_categories=rule.auxiliary_categories, + auxiliary_dimensions=auxiliary_dimensions, + ) + + # if it is a multi-output rule, but some of the + # outputs are already converted, we can't use it + # TODO: instead, we could use the already converted output as + # *input*, which would probably be more correct, but also pretty + # difficult. + already_converted = set(output_selection[new_dim]).intersection( + set(already_converted_categories) + ) + if already_converted: + logger.warning( + f"For category {category!r}, would want to use a " + "rule with multiple outputs, but the following outputs " + f"are already converted: {already_converted!r}. " + "Skipping this rule." + ) + continue + + try: + effective_input_weights = weights( + dim=dim, + category=category, + rule=rule, + operation_type="input", + selection=input_selection, + sum_rule=sum_rule, + weights=input_weights, + ) + effective_output_weights = weights( + dim=new_dim, + category=category, + rule=rule, + operation_type="output", + selection=output_selection, + sum_rule=sum_rule, + weights=output_weights, + ) + except WeightingInfoMissing as err: + logger.warning(str(err)) + continue + + # the left-hand side of the conversion formula summed up + lhs = ( + input_factors * effective_input_weights * self._da.loc[input_selection] + ).sum(dim=dim) + # the right-hand side of the conversion formula split up + rhs = lhs / output_factors / effective_output_weights + + # TODO: this is slow because it makes copies + # fillna behaviour (only overwrites NaN in converted) + da = da.combine_first(rhs) + + if not rule.is_restricted: + # stop processing rules for this category + return output_selection[new_dim], da + + logger.debug( + f"No unrestricted rule to derive data for {category!r} applied, some or " + f"all data for the category will be NaN." + ) + return [], da + + def extract_categorization_from_dim(dim: str) -> (str, str): """Extract the pure dimension and the categorization from a composite dim. @@ -43,50 +283,17 @@ def extract_categorization_from_dim(dim: str) -> (str, str): return pure[:-1], cat[:-1] -def applicable_rule(conversion, category): - """Choose the best rule to derive the given category using the given conversion. - - If there are multiple relevant rules, will prefer rules with: - 1. the given category as the only target category. - 2. only one source category - 3. rules defined earlier in the CSV. - - TODO: how to deal with restricted rules? 
- """ +def applicable_rules( + conversion, category +) -> typing.List[climate_categories.ConversionRule]: + """Find the possible rules to derive the category using the given conversion.""" rules = conversion.relevant_rules({conversion.categorization_b[category]}) # a + b = c - d can not be used to derive c nor d, only a and b rules = [r for r in rules if all(f > 0 for f in r.factors_categories_b.values())] - # drop all restricted rules - # TODO do something smart with restricted rules - rules = [r for r in rules if not any(r.auxiliary_categories.values())] if not rules: raise KeyError(category) - # narrow down rules until we have exactly one rule to apply - # prefer rules where the target category is the only summand - if len(rules) != 1: - cardinalities = [r.cardinality_b for r in rules] - if "one" in cardinalities: - for i in range(len(rules)): - if cardinalities[i] == "many": - rules.pop(i) - # prefer rules with exactly one source category - if len(rules) != 1: - cardinalities = [r.cardinality_a for r in rules] - if "one" in cardinalities: - for i in range(len(rules)): - if cardinalities[i] == "many": - rules.pop(i) - # if we still have multiple eligible rules, just use the first - if len(rules) != 1: - rule_str = str(rules[0]) - logger.info( - f"There are {len(rules)} rules to derive data for" - f" {category!r}, will" - f" use {rule_str!r} because it was defined earlier." - ) - - return rules[0] + return rules def ensure_categorization_instance( @@ -189,188 +396,191 @@ def initialize_empty_converted_da( ) -class DataArrayConversionAccessor(_accessor_base.BaseDataArrayAccessor): - @alias_dims(["dim"]) - def convert( - self, - dim: typing.Union[Hashable, str], - categorization: typing.Union[climate_categories.Categorization, str], - *, - sum_rule: typing.Optional[str] = None, - input_weights: typing.Optional[xr.DataArray] = None, - output_weights: typing.Optional[xr.DataArray] = None, - ) -> xr.DataArray: - """Convert the data along the given dimension into the new categorization. +def factors_categories_to_xarray( + *, + dim: str, + factors_categories: typing.Dict[climate_categories.Category, int], + auxiliary_categories: typing.Dict[ + climate_categories.Categorization, typing.Set[climate_categories.Category] + ], + auxiliary_dimensions: typing.Dict[climate_categories.Categorization, str], +) -> typing.Tuple[typing.Dict[str, typing.List[str]], xr.DataArray]: + """Convert dictionary mapping categories to factors into xarray-compatible objects. + + Using the xarray objects ensures that in subsequent calculations, everything + will cleanly multiply reagardless of the dimensionality of the data. - Maps the given dimension from one categorization (terminology) into another. - Fetches the rules to do the mapping from the climate_categories package, and - therefore will only work if there are conversions rules to convert from the - current categorization to the new categorization. + Parameters + ---------- + dim: str + Dimension which contains the categories. + factors_categories: dict[climate_categories.Category, int] + Dictionary mapping categories to factors. + auxiliary_categories: dict + If the rule is limited to specific categories from other dimensions, + their categorizations and categories are given here. + auxiliary_dimensions: dict[climate_categories.Categorization, str] + If the rule is limited to specific categories from other dimensions, the mapping + from the used Categorizations to the dimension names used in the data to be + converted has to be given. 
- Parameters - ---------- - dim : str - Dimension to convert. Has to be a dimension from ``da.dims``. - categorization : climate_categories.Categorization or str - New categorization to convert the given dimension to. Either give the title - of the new categorization (like ``IPCC1996``) or a - ``climate_categories.Categorization`` object. - sum_rule : ``extensive``, ``intensive``, or None (default) - If data of categories has to be summed up or divided, we need information - whether the quantity measured is extensive (like, for example, total - emissions in a year subdivided into multiple sectoral categories) or - intensive (like, for example, average per-person emissions in a year - subdivided into different territorial entities). By default (None), a - warning is issued if data has to be summed up or divided. - input_weights : xr.DataArray, optional - If data in input categories has to be summed up and the sum_rule is - ``intensive``, weights for the input categories are required. - The weights can be given in any shape compatible with the DataArray that - is converted, e.g. to give different weights for industrial sectors by - country. However, at least the ``dim`` that is converted needs to be in - ``input_weights.dims``. - If no weights are specified but a rule requiring weights is specified - in the conversion rules, a warning is issued and the respective rule is - skipped (probably resulting in more NaNs in the output). - output_weights : xr.DataArray, optional - If data has to be divided into several output categories and the sum_rule is - ``extensive``, weights for the output categories are required. - The weights can be given in any shape compatible with the DataArray that - is converted, e.g. to give different weights for industrial sectors by - country. However, at least the ``dim`` that is converted needs to be in - ``output_weights.dims``. - If no weights are specified but a rule requiring weights is specified - in the conversion rules, a warning is issued and the respective rule is - skipped (probably resulting in more NaNs in the output). + Returns + ------- + selection, factors: dict[str, list[str]], xr.DataArray + selection is a dictionary which can be used as a selector to select the + appropriate categories from an xarray object. + factors is an xarray DataArray which can be multiplied with an xarray object + after applying the selection. + """ + selection = {dim: [cat.codes[0] for cat in factors_categories.keys()]} + factors = xr.DataArray( + data=list(factors_categories.values()), + dims=[dim], + coords=selection, + ) - Returns - ------- - converted : xr.DataArray - A copy of the DataArray with the given dimension converted in the new - categorization. 
- """ - new_categorization = ensure_categorization_instance(categorization) - check_valid_sum_rule_types(sum_rule) + for aux_categorization, aux_categories in auxiliary_categories.items(): + if aux_categories: + aux_dim = auxiliary_dimensions[aux_categorization] + selection[aux_dim] = [cat.codes[0] for cat in aux_categories] - dim_name, old_categorization_name = extract_categorization_from_dim(dim) - old_categorization = ensure_categorization_instance(old_categorization_name) - conversion = old_categorization.conversion_to(new_categorization) - new_dim = f"{dim_name} ({new_categorization.name})" + return selection, factors - converted = initialize_empty_converted_da( - old_da=self._da, - old_dim=dim, - new_dim=new_dim, - new_categorization=new_categorization, + +class WeightingInfoMissing(ValueError): + """Some information to derive weighting factors for a rule is missing.""" + + def __init__( + self, + category: climate_categories.Category, + rule: climate_categories.ConversionRule, + message: str, + ): + full_message = ( + f"Can not derive data for category {category!r} using rule" + f" '{rule}': {message} Skipping this rule." ) + ValueError.__init__(self, full_message) - converted_categories = [] - for category in converted[new_dim]: - category = category.item() - if category in converted_categories: - continue - try: - rule = applicable_rule(conversion, category) - except KeyError: - logger.debug(f"No rule to derive data for {category!r}, will be NaN.") - continue - # convert rule into xarray objects that will cleanly multiply regardless - # of dimensionality - input_selection = { - dim: [cat.codes[0] for cat in rule.factors_categories_a.keys()] - } - input_factors = xr.DataArray( - data=list(rule.factors_categories_a.values()), - dims=[dim], - coords=input_selection, - ) - new_dim_values = [cat.codes[0] for cat in rule.factors_categories_b.keys()] - output_selection = {new_dim: new_dim_values} - output_factors = xr.DataArray( - data=list(rule.factors_categories_b.values()), - dims=[new_dim], - coords=output_selection, - ) +def weights( + *, + dim: str, + category: climate_categories.Category, + rule: climate_categories.ConversionRule, + sum_rule: typing.Optional[str], + operation_type: str, + weights: typing.Optional[xr.DataArray], + selection: typing.Dict[str, typing.List[str]], +) -> typing.Union[xr.DataArray, float]: + """Derive the weights to use for applying a specific rule. - # if the applicable rule is a multi-output rule, but some of the - # outputs are already converted, give up - already_converted = set(new_dim_values).intersection( - set(converted_categories) + Parameters + ---------- + dim: str + Dimension which contains the categories. + category: climate_categories.Category + Category which should be derived. + rule: climate_categories.ConversionRule + Rule that should be used to derive the category. + sum_rule : ``extensive``, ``intensive``, or None (default) + If data of categories has to be summed up or divided, we need information + whether the quantity measured is extensive (like, for example, total + emissions in a year subdivided into multiple sectoral categories) or + intensive (like, for example, average per-person emissions in a year + subdivided into different territorial entities). By default (None), a + warning is issued if data has to be summed up or divided. + operation_type: ``input`` or ``output`` + If weights for the source data (input) or the result data (output) should + be derived. 
+ weights: xr.DataArray, optional + Weights for the individual categories. + selection: dict[str, list[str]] + Selection derived from the rule. + + Returns + ------- + factors: float or xr.DataArray + Object which can be multiplied with the input or output DataArray to apply + weights. + """ + if operation_type == "input": + operation_verb = "sum up" + trivial_sum_rule = "extensive" + nontrivial_sum_rule = "intensive" + rule_cardinality = rule.cardinality_a + else: + operation_verb = "split" + trivial_sum_rule = "intensive" + nontrivial_sum_rule = "extensive" + rule_cardinality = rule.cardinality_b + + # just one category or trivial sum rule, so no weights required + if rule_cardinality == "one" or sum_rule == trivial_sum_rule: + return 1.0 + if sum_rule == nontrivial_sum_rule: + if weights is None: + raise WeightingInfoMissing( + category=category, + rule=rule, + message=f"We need to {operation_verb} multiple categories with" + f" sum_rule={nontrivial_sum_rule}, but no {operation_type}_weights are" + f" specified.", ) - if already_converted: - # TODO: maybe we can do better? - logger.warning( - f"For category {category!r}, would want to use a " - "rule with multiple outputs, but the following outputs " - f"are already converted: {already_converted!r}. " - "Skipping this category and leaving it NaN." - ) - continue + effective_weights = weights.loc[selection] + # normalize so it is actually a weight, not a factor + return effective_weights / effective_weights.sum(dim=dim) + + raise WeightingInfoMissing( + category=category, + rule=rule, + message=f"We need to {operation_verb} multiple categories, but the sum_rule is" + f" not specified. Rule can only be used if sum_rule={trivial_sum_rule!r} or" + f" sum_rule={nontrivial_sum_rule} and {operation_type}_weights are" + f" specified.", + ) - # derive input and output weights (maybe trivial) - if rule.cardinality_a == "one" or sum_rule == "extensive": - effective_input_weights = 1 - elif sum_rule == "intensive": - # summing intensive units requires weights - if input_weights is None: - logger.warning( - f"To derive data for {category!r}, we need to sum up" - " multiple input categories. For sum_rule='intensive'," - " this requires input_weights, but none are specified." - " Will continue with NaN, specify input_weights to avoid this." - ) - continue - effective_input_weights = input_weights.loc[input_selection] - # normalize so it is actually a weight, not a factor - effective_input_weights /= effective_input_weights.sum(dim=dim) - else: # no sum rule specified, but needed - logger.warning( - f"To derive data for {category!r}, we need to sum up" - " multiple input categories, but the sum_rule is" - " not specified. Will continue with NaN, specify the" - " sum_rule to avoid this." - ) - continue - if rule.cardinality_b == "one" or sum_rule == "intensive": - effective_output_weights = 1 - elif sum_rule == "extensive": - # dividing extensive units requires weights - if output_weights is None: - logger.warning( - f"To derive data for {category!r}, we need to split up" - " multiple output categories. For sum_rule='extensive'," - " this requires output_weights, but none are specified." - " Will continue with NaN, specify output_weights to avoid this." 
- ) - continue - effective_output_weights = output_weights.loc[output_selection] - # normalize so it is actually a weight, not a factor - effective_output_weights /= effective_output_weights.sum(dim=dim) - else: # no sum rule specified, but needed - logger.warning( - f"To derive data for {category!r}, we need to split up" - " multiple output categories, but the sum_rule is" - " not specified. Will continue with NaN, specify the" - " sum_rule to avoid this." - ) - continue +def prepare_auxiliary_dimensions( + conversion: climate_categories.Conversion, + auxiliary_dimensions: typing.Optional[typing.Dict[str, str]], +) -> typing.Optional[typing.Dict[climate_categories.Categorization, str]]: + """Prepare and check the auxiliary dimension mapping. - # the left-hand side of the conversion formula summed up - lhs = ( - input_factors * effective_input_weights * self._da.loc[input_selection] - ).sum(dim=dim) - # the right-hand side of the conversion formula split up - rhs = lhs / output_factors / effective_output_weights - # TODO: using pr.set here is not efficient because it makes copies - converted = converted.pr.set( - dim=new_dim, - key=new_dim_values, - value=rhs, + Check if all auxiliary categorizations used in the conversion are matched in + auxiliary_dimensions. + + Raises a ValueError if any dimension is missing. + + Returns + ------- + auxiliary_dimensions: dict mapping Categorization -> str + the auxiliary dimensions, but using Categorization objects instead of their + names. + """ + if conversion.auxiliary_categorizations_names: + if auxiliary_dimensions is None: + raise ValueError( + "The conversion uses auxiliary categories, but a translation to" + " dimension names was not provided using the argument" + " auxiliary_dimensions. Please provide auxiliary_dimensions mapping" + f" {conversion.auxiliary_categorizations_names} to the dimension" + " names used in the data." + ) + missing = set(conversion.auxiliary_categorizations_names).difference( + auxiliary_dimensions.keys() + ) + if missing: + raise ValueError( + "A dimension name was not given for all auxiliary categories:" + f" {missing} are missing in the auxiliary_dimensions argument, please" + " provide translations to the dimension names used in the data." 
) - # mark all filled categories as converted - converted_categories += new_dim_values + if not auxiliary_dimensions: + return auxiliary_dimensions - return converted + return { + climate_categories.cats[name]: auxiliary_dimensions[name] + for name in auxiliary_dimensions + } diff --git a/primap2/tests/test_convert.py b/primap2/tests/test_convert.py index fbd7b9fb..32c3670d 100644 --- a/primap2/tests/test_convert.py +++ b/primap2/tests/test_convert.py @@ -1,6 +1,7 @@ """Tests for _convert.py""" import climate_categories as cc +import pytest import xarray as xr import primap2 @@ -11,10 +12,19 @@ def test_convert_ipcc(empty_ds: xr.Dataset): # to see da = empty_ds["CO2"] da = da.expand_dims({"category (IPCC1996)": list(cc.IPCC1996.keys())}) + da = da.expand_dims({"source (gas)": list(cc.gas.keys())}) arr = da.data.copy() arr[:] = 1 * primap2.ureg("Gg CO2 / year") da.data = arr - da.pr.convert("category", "IPCC2006", sum_rule="extensive") + with pytest.raises(ValueError, match="The conversion uses auxiliary categories"): + da.pr.convert("category", "IPCC2006", sum_rule="extensive") + + da.pr.convert( + "category", + "IPCC2006", + sum_rule="extensive", + auxiliary_dimensions={"gas": "source (gas)"}, + ) # TODO test that values actually make sense diff --git a/setup.cfg b/setup.cfg index 8acb6943..537fd060 100644 --- a/setup.cfg +++ b/setup.cfg @@ -47,7 +47,7 @@ install_requires = ruamel.yaml strictyaml openpyxl - climate_categories>=0.6.0 + climate_categories>=0.6.2 [options.extras_require] test = From 8b9b86f69f28420759923cab8248b033830523e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Pfl=C3=BCger?= Date: Fri, 5 Nov 2021 19:14:30 +0100 Subject: [PATCH 05/36] Avoid name clash between weight function and its own argument. --- primap2/_convert.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/primap2/_convert.py b/primap2/_convert.py index 5917190d..df0b0936 100644 --- a/primap2/_convert.py +++ b/primap2/_convert.py @@ -207,7 +207,7 @@ def _fill_category( continue try: - effective_input_weights = weights( + effective_input_weights = derive_weights( dim=dim, category=category, rule=rule, @@ -216,7 +216,7 @@ def _fill_category( sum_rule=sum_rule, weights=input_weights, ) - effective_output_weights = weights( + effective_output_weights = derive_weights( dim=new_dim, category=category, rule=rule, @@ -463,7 +463,7 @@ def __init__( ValueError.__init__(self, full_message) -def weights( +def derive_weights( *, dim: str, category: climate_categories.Category, From f4cc526d15ddaaf001e54e8e02beb7a634871446 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Pfl=C3=BCger?= Date: Fri, 5 Nov 2021 20:48:32 +0100 Subject: [PATCH 06/36] Require climate_categories >= 0.6.3, which introduces some API we need. 
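The conversion code added in the previous patches uses conversion and rule attributes such as conversion.auxiliary_categorizations_names and rule.is_restricted; the version bump below is assumed to be what makes this API available. A quick sketch to check against an installed version:

    import climate_categories

    conversion = climate_categories.IPCC1996.conversion_to(
        climate_categories.cats["IPCC2006"]
    )
    # for this conversion the auxiliary categorization is "gas", matching the
    # auxiliary_dimensions={"gas": "source (gas)"} mapping used in the tests
    print(conversion.auxiliary_categorizations_names)
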
--- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 537fd060..1c873c9d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -47,7 +47,7 @@ install_requires = ruamel.yaml strictyaml openpyxl - climate_categories>=0.6.2 + climate_categories>=0.6.3 [options.extras_require] test = From fdbec61cb5b8083d90fca4600b531fa54e1a3b1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Pfl=C3=BCger?= Date: Tue, 14 Nov 2023 16:50:15 +0100 Subject: [PATCH 07/36] style: ruff --- primap2/_convert.py | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/primap2/_convert.py b/primap2/_convert.py index df0b0936..3f803251 100644 --- a/primap2/_convert.py +++ b/primap2/_convert.py @@ -1,6 +1,6 @@ import copy import typing -from typing import Hashable +from collections.abc import Hashable import climate_categories import numpy as np @@ -21,7 +21,7 @@ def convert( sum_rule: typing.Optional[str] = None, input_weights: typing.Optional[xr.DataArray] = None, output_weights: typing.Optional[xr.DataArray] = None, - auxiliary_dimensions: typing.Optional[typing.Dict[str, str]] = None, + auxiliary_dimensions: typing.Optional[dict[str, str]] = None, ) -> xr.DataArray: """Convert the data along the given dimension into the new categorization. @@ -123,16 +123,16 @@ def _fill_category( da: xr.DataArray, dim: str, new_dim: str, - already_converted_categories: typing.List[climate_categories.Category], + already_converted_categories: list[climate_categories.Category], category: climate_categories.Category, conversion: climate_categories.Conversion, sum_rule: typing.Optional[str], auxiliary_dimensions: typing.Optional[ - typing.Dict[climate_categories.Categorization, str] + dict[climate_categories.Categorization, str] ], input_weights: typing.Optional[xr.DataArray] = None, output_weights: typing.Optional[xr.DataArray] = None, - ) -> typing.Tuple[typing.List[climate_categories.Category], xr.DataArray]: + ) -> tuple[list[climate_categories.Category], xr.DataArray]: """Return a copy of da with the given category filled by values converted using the given conversion. 
@@ -279,13 +279,11 @@ def extract_categorization_from_dim(dim: str) -> (str, str): try: pure, cat = dim.split("(", 1) except ValueError: - raise ValueError(f"No categorization specified: {dim!r}.") + raise ValueError(f"No categorization specified: {dim!r}.") from None return pure[:-1], cat[:-1] -def applicable_rules( - conversion, category -) -> typing.List[climate_categories.ConversionRule]: +def applicable_rules(conversion, category) -> list[climate_categories.ConversionRule]: """Find the possible rules to derive the category using the given conversion.""" rules = conversion.relevant_rules({conversion.categorization_b[category]}) # a + b = c - d can not be used to derive c nor d, only a and b @@ -399,12 +397,12 @@ def initialize_empty_converted_da( def factors_categories_to_xarray( *, dim: str, - factors_categories: typing.Dict[climate_categories.Category, int], - auxiliary_categories: typing.Dict[ - climate_categories.Categorization, typing.Set[climate_categories.Category] + factors_categories: dict[climate_categories.Category, int], + auxiliary_categories: dict[ + climate_categories.Categorization, set[climate_categories.Category] ], - auxiliary_dimensions: typing.Dict[climate_categories.Categorization, str], -) -> typing.Tuple[typing.Dict[str, typing.List[str]], xr.DataArray]: + auxiliary_dimensions: dict[climate_categories.Categorization, str], +) -> tuple[dict[str, list[str]], xr.DataArray]: """Convert dictionary mapping categories to factors into xarray-compatible objects. Using the xarray objects ensures that in subsequent calculations, everything @@ -471,7 +469,7 @@ def derive_weights( sum_rule: typing.Optional[str], operation_type: str, weights: typing.Optional[xr.DataArray], - selection: typing.Dict[str, typing.List[str]], + selection: dict[str, list[str]], ) -> typing.Union[xr.DataArray, float]: """Derive the weights to use for applying a specific rule. @@ -543,8 +541,8 @@ def derive_weights( def prepare_auxiliary_dimensions( conversion: climate_categories.Conversion, - auxiliary_dimensions: typing.Optional[typing.Dict[str, str]], -) -> typing.Optional[typing.Dict[climate_categories.Categorization, str]]: + auxiliary_dimensions: typing.Optional[dict[str, str]], +) -> typing.Optional[dict[climate_categories.Categorization, str]]: """Prepare and check the auxiliary dimension mapping. 
Check if all auxiliary categorizations used in the conversion are matched in From 1ecee182369312c497fac13f0fd5e710f4d0eee8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Pfl=C3=BCger?= Date: Wed, 15 Nov 2023 10:40:46 +0100 Subject: [PATCH 08/36] fix: stub file generation --- primap-stubs.patch | 54 +++++++++++++++++++++------------------------- 1 file changed, 24 insertions(+), 30 deletions(-) diff --git a/primap-stubs.patch b/primap-stubs.patch index 334b78ce..63e28fde 100644 --- a/primap-stubs.patch +++ b/primap-stubs.patch @@ -1,40 +1,34 @@ -diff '--color=auto' -r -u xarray-orig/core/dataarray.pyi xarray/core/dataarray.pyi ---- xarray-orig/core/dataarray.pyi 2022-05-09 19:44:00.059497745 +0200 -+++ xarray/core/dataarray.pyi 2022-05-09 19:45:46.028378457 +0200 -@@ -26,6 +26,7 @@ - from dask.delayed import Delayed - from iris.cube import Cube as iris_Cube - from typing import Any, Callable, Hashable, Iterable, Literal, Mapping, Sequence +diff '--color=auto' -ru xarray.orig/core/dataarray.pyi xarray/core/dataarray.pyi +--- xarray.orig/core/dataarray.pyi 2023-11-15 10:36:00.509607027 +0100 ++++ xarray/core/dataarray.pyi 2023-11-15 10:35:25.561354898 +0100 +@@ -1,3 +1,4 @@ +import primap2 + import datetime + import numpy as np + import pandas as pd +@@ -48,6 +49,8 @@ + def __setitem__(self, key, value) -> None: ... - class _LocIndexer: - data_array: Incomplete -@@ -36,6 +37,8 @@ - class DataArray(AbstractArray, DataWithCoords, DataArrayArithmetic): - dt: Incomplete - def __init__(self, data: Any = ..., coords: Union[Sequence[tuple], Mapping[Any, Any], None] = ..., dims: Union[Hashable, Sequence[Hashable], None] = ..., name: Hashable = ..., attrs: Mapping = ..., indexes: dict[Hashable, pd.Index] = ..., fastpath: bool = ...) -> None: ... + class DataArray(AbstractArray, DataWithCoords, DataArrayArithmetic, DataArrayAggregations): + @property + def pr(self) -> primap2.accessors.PRIMAP2DataArrayAccessor: ... + dt: Incomplete + def __init__(self, data: Any = ..., coords: Sequence[Sequence[Any] | pd.Index | DataArray] | Mapping[Any, Any] | None = ..., dims: Hashable | Sequence[Hashable] | None = ..., name: Hashable | None = ..., attrs: Mapping | None = ..., indexes: Mapping[Any, Index] | None = ..., fastpath: bool = ...) -> None: ... def to_dataset(self, dim: Hashable = ..., *, name: Hashable = ..., promote_attrs: bool = ...) -> Dataset: ... - @property - def name(self) -> Union[Hashable, None]: ... -diff '--color=auto' -r -u xarray-orig/core/dataset.pyi xarray/core/dataset.pyi ---- xarray-orig/core/dataset.pyi 2022-05-09 19:44:00.059497745 +0200 -+++ xarray/core/dataset.pyi 2022-05-09 19:45:37.980311296 +0200 -@@ -27,6 +27,7 @@ - from numbers import Number - from os import PathLike - from typing import Any, Callable, Collection, Hashable, Iterable, Iterator, Literal, Mapping, MutableMapping, Sequence, overload +diff '--color=auto' -ru xarray.orig/core/dataset.pyi xarray/core/dataset.pyi +--- xarray.orig/core/dataset.pyi 2023-11-15 10:36:00.513607056 +0100 ++++ xarray/core/dataset.pyi 2023-11-15 10:35:25.577355012 +0100 +@@ -1,3 +1,4 @@ +import primap2 + import datetime + import numpy as np + import pandas as pd +@@ -59,6 +60,8 @@ + def __setitem__(self, key, value) -> None: ... - def calculate_dimensions(variables: Mapping[Any, Variable]) -> dict[Hashable, int]: ... - def merge_indexes(indexes: Mapping[Any, Union[Hashable, Sequence[Hashable]]], variables: Mapping[Any, Variable], coord_names: set[Hashable], append: bool = ...) -> tuple[dict[Hashable, Variable], set[Hashable]]: ... 
-@@ -50,6 +51,8 @@ - - class Dataset(DataWithCoords, DatasetArithmetic, Mapping): - def __init__(self, data_vars: Mapping[Any, Any] = ..., coords: Mapping[Any, Any] = ..., attrs: Mapping[Any, Any] = ...) -> None: ... + class Dataset(DataWithCoords, DatasetAggregations, DatasetArithmetic, Mapping[Hashable, 'DataArray']): + @property + def pr(self) -> primap2.accessors.PRIMAP2DatasetAccessor: ... + def __init__(self, data_vars: DataVars | None = ..., coords: Mapping[Any, Any] | None = ..., attrs: Mapping[Any, Any] | None = ...) -> None: ... + def __eq__(self, other: DsCompatible) -> Self: ... @classmethod - def load_store(cls, store, decoder: Incomplete | None = ...) -> Dataset: ... - @property From a7a1d9c54364ee3a927b7c3b1e951197bcd92824 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Pfl=C3=BCger?= Date: Wed, 15 Nov 2023 11:10:06 +0100 Subject: [PATCH 09/36] types: better typing for sum_rule --- primap2/_convert.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/primap2/_convert.py b/primap2/_convert.py index 3f803251..c38e33fd 100644 --- a/primap2/_convert.py +++ b/primap2/_convert.py @@ -18,7 +18,9 @@ def convert( dim: typing.Union[Hashable, str], categorization: typing.Union[climate_categories.Categorization, str], *, - sum_rule: typing.Optional[str] = None, + sum_rule: typing.Literal["intensive"] + | typing.Literal["extensive"] + | None = None, input_weights: typing.Optional[xr.DataArray] = None, output_weights: typing.Optional[xr.DataArray] = None, auxiliary_dimensions: typing.Optional[dict[str, str]] = None, From 1a2f2ba8909236df9d2ebe9f22f9e975cbb9be4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Pfl=C3=BCger?= Date: Wed, 15 Nov 2023 12:26:11 +0100 Subject: [PATCH 10/36] test: some tests for correct results of convert() --- primap2/tests/test_convert.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/primap2/tests/test_convert.py b/primap2/tests/test_convert.py index 32c3670d..5cdf3092 100644 --- a/primap2/tests/test_convert.py +++ b/primap2/tests/test_convert.py @@ -20,11 +20,20 @@ def test_convert_ipcc(empty_ds: xr.Dataset): with pytest.raises(ValueError, match="The conversion uses auxiliary categories"): da.pr.convert("category", "IPCC2006", sum_rule="extensive") - da.pr.convert( + result = da.pr.convert( "category", "IPCC2006", sum_rule="extensive", auxiliary_dimensions={"gas": "source (gas)"}, ) - # TODO test that values actually make sense + assert ( + (result.pr.loc[{"category": "1"}] == 1.0 * primap2.ureg("Gg CO2 / year")) + .all() + .item() + ) + assert ( + (result.pr.loc[{"category": "2"}] == 2.0 * primap2.ureg("Gg CO2 / year")) + .all() + .item() + ) From 7e66953e5e66251b83bdf45fbe3a909fe7284046 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Pfl=C3=BCger?= Date: Wed, 15 Nov 2023 12:27:42 +0100 Subject: [PATCH 11/36] perf: convert in-place --- primap2/_convert.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/primap2/_convert.py b/primap2/_convert.py index c38e33fd..5ce8f719 100644 --- a/primap2/_convert.py +++ b/primap2/_convert.py @@ -238,9 +238,7 @@ def _fill_category( # the right-hand side of the conversion formula split up rhs = lhs / output_factors / effective_output_weights - # TODO: this is slow because it makes copies - # fillna behaviour (only overwrites NaN in converted) - da = da.combine_first(rhs) + da.loc[output_selection] = rhs if not rule.is_restricted: # stop processing rules for this category @@ -387,6 +385,7 @@ def initialize_empty_converted_da( # initialize the converted 
array using all NA all_na_array = np.empty(new_shape) all_na_array[:] = np.nan + all_na_array = all_na_array * old_da.pint.units return xr.DataArray( data=all_na_array, dims=new_dims, From a76feea442c7b5834153a572806e279afc137c80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Pfl=C3=BCger?= Date: Wed, 15 Nov 2023 12:34:07 +0100 Subject: [PATCH 12/36] fix: types for 3.9 --- primap2/_convert.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/primap2/_convert.py b/primap2/_convert.py index 5ce8f719..64f6f35a 100644 --- a/primap2/_convert.py +++ b/primap2/_convert.py @@ -18,9 +18,9 @@ def convert( dim: typing.Union[Hashable, str], categorization: typing.Union[climate_categories.Categorization, str], *, - sum_rule: typing.Literal["intensive"] - | typing.Literal["extensive"] - | None = None, + sum_rule: typing.Optional[ + typing.Union[typing.Literal["intensive"], typing.Literal["extensive"]] + ] = None, input_weights: typing.Optional[xr.DataArray] = None, output_weights: typing.Optional[xr.DataArray] = None, auxiliary_dimensions: typing.Optional[dict[str, str]] = None, From 91f9b76919b5ad706739b0e19ea2a45e6ad7ffa9 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 7 Oct 2024 12:48:54 +0000 Subject: [PATCH 13/36] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- primap2/_convert.py | 60 +++++++++++++++-------------------- primap2/tests/test_convert.py | 12 ++----- 2 files changed, 27 insertions(+), 45 deletions(-) diff --git a/primap2/_convert.py b/primap2/_convert.py index 64f6f35a..d755d69a 100644 --- a/primap2/_convert.py +++ b/primap2/_convert.py @@ -15,15 +15,13 @@ class DataArrayConversionAccessor(_accessor_base.BaseDataArrayAccessor): @alias_dims(["dim"]) def convert( self, - dim: typing.Union[Hashable, str], - categorization: typing.Union[climate_categories.Categorization, str], + dim: Hashable | str, + categorization: climate_categories.Categorization | str, *, - sum_rule: typing.Optional[ - typing.Union[typing.Literal["intensive"], typing.Literal["extensive"]] - ] = None, - input_weights: typing.Optional[xr.DataArray] = None, - output_weights: typing.Optional[xr.DataArray] = None, - auxiliary_dimensions: typing.Optional[dict[str, str]] = None, + sum_rule: typing.Literal["intensive", "extensive"] | None = None, + input_weights: xr.DataArray | None = None, + output_weights: xr.DataArray | None = None, + auxiliary_dimensions: dict[str, str] | None = None, ) -> xr.DataArray: """Convert the data along the given dimension into the new categorization. 
@@ -88,9 +86,7 @@ def convert( dim_name, old_categorization_name = extract_categorization_from_dim(dim) old_categorization = ensure_categorization_instance(old_categorization_name) conversion = old_categorization.conversion_to(new_categorization) - auxiliary_dimensions = prepare_auxiliary_dimensions( - conversion, auxiliary_dimensions - ) + auxiliary_dimensions = prepare_auxiliary_dimensions(conversion, auxiliary_dimensions) new_dim = f"{dim_name} ({new_categorization.name})" converted_da = initialize_empty_converted_da( @@ -128,12 +124,10 @@ def _fill_category( already_converted_categories: list[climate_categories.Category], category: climate_categories.Category, conversion: climate_categories.Conversion, - sum_rule: typing.Optional[str], - auxiliary_dimensions: typing.Optional[ - dict[climate_categories.Categorization, str] - ], - input_weights: typing.Optional[xr.DataArray] = None, - output_weights: typing.Optional[xr.DataArray] = None, + sum_rule: str | None, + auxiliary_dimensions: dict[climate_categories.Categorization, str] | None, + input_weights: xr.DataArray | None = None, + output_weights: xr.DataArray | None = None, ) -> tuple[list[climate_categories.Category], xr.DataArray]: """Return a copy of da with the given category filled by values converted using the given conversion. @@ -232,9 +226,9 @@ def _fill_category( continue # the left-hand side of the conversion formula summed up - lhs = ( - input_factors * effective_input_weights * self._da.loc[input_selection] - ).sum(dim=dim) + lhs = (input_factors * effective_input_weights * self._da.loc[input_selection]).sum( + dim=dim + ) # the right-hand side of the conversion formula split up rhs = lhs / output_factors / effective_output_weights @@ -295,7 +289,7 @@ def applicable_rules(conversion, category) -> list[climate_categories.Conversion def ensure_categorization_instance( - cat: typing.Union[str, climate_categories.Categorization] + cat: str | climate_categories.Categorization, ) -> climate_categories.Categorization: """Takes a categorization name or object and returns the corresponding categorization object.""" @@ -304,21 +298,20 @@ def ensure_categorization_instance( return climate_categories.cats[cat] -def check_valid_sum_rule_types(sum_rule: typing.Optional[str]): +def check_valid_sum_rule_types(sum_rule: str | None): """Checks if the sum_rule is either "intensive", "extensive", or None. Raises a ValueError if an invalid sum_rule is used.""" if sum_rule not in (None, "extensive", "intensive"): raise ValueError( - f"if defined, sum_rule must be either 'extensive' or 'intensive', not" - f" {sum_rule}" + f"if defined, sum_rule must be either 'extensive' or 'intensive', not" f" {sum_rule}" ) def initialize_empty_converted_da( *, old_da: xr.DataArray, - old_dim: typing.Union[Hashable, str], + old_dim: Hashable | str, new_dim: str, new_categorization: climate_categories.Categorization, ) -> xr.DataArray: @@ -399,9 +392,7 @@ def factors_categories_to_xarray( *, dim: str, factors_categories: dict[climate_categories.Category, int], - auxiliary_categories: dict[ - climate_categories.Categorization, set[climate_categories.Category] - ], + auxiliary_categories: dict[climate_categories.Categorization, set[climate_categories.Category]], auxiliary_dimensions: dict[climate_categories.Categorization, str], ) -> tuple[dict[str, list[str]], xr.DataArray]: """Convert dictionary mapping categories to factors into xarray-compatible objects. 
@@ -467,11 +458,11 @@ def derive_weights( dim: str, category: climate_categories.Category, rule: climate_categories.ConversionRule, - sum_rule: typing.Optional[str], + sum_rule: str | None, operation_type: str, - weights: typing.Optional[xr.DataArray], + weights: xr.DataArray | None, selection: dict[str, list[str]], -) -> typing.Union[xr.DataArray, float]: +) -> xr.DataArray | float: """Derive the weights to use for applying a specific rule. Parameters @@ -542,8 +533,8 @@ def derive_weights( def prepare_auxiliary_dimensions( conversion: climate_categories.Conversion, - auxiliary_dimensions: typing.Optional[dict[str, str]], -) -> typing.Optional[dict[climate_categories.Categorization, str]]: + auxiliary_dimensions: dict[str, str] | None, +) -> dict[climate_categories.Categorization, str] | None: """Prepare and check the auxiliary dimension mapping. Check if all auxiliary categorizations used in the conversion are matched in @@ -580,6 +571,5 @@ def prepare_auxiliary_dimensions( return auxiliary_dimensions return { - climate_categories.cats[name]: auxiliary_dimensions[name] - for name in auxiliary_dimensions + climate_categories.cats[name]: auxiliary_dimensions[name] for name in auxiliary_dimensions } diff --git a/primap2/tests/test_convert.py b/primap2/tests/test_convert.py index 5cdf3092..81dfb8a1 100644 --- a/primap2/tests/test_convert.py +++ b/primap2/tests/test_convert.py @@ -27,13 +27,5 @@ def test_convert_ipcc(empty_ds: xr.Dataset): auxiliary_dimensions={"gas": "source (gas)"}, ) - assert ( - (result.pr.loc[{"category": "1"}] == 1.0 * primap2.ureg("Gg CO2 / year")) - .all() - .item() - ) - assert ( - (result.pr.loc[{"category": "2"}] == 2.0 * primap2.ureg("Gg CO2 / year")) - .all() - .item() - ) + assert (result.pr.loc[{"category": "1"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item() + assert (result.pr.loc[{"category": "2"}] == 2.0 * primap2.ureg("Gg CO2 / year")).all().item() From 084980512be0c58c189022593e38a17c33f79a7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Pfl=C3=BCger?= Date: Mon, 7 Oct 2024 14:51:24 +0200 Subject: [PATCH 14/36] fix: _alias_selection is called _selection now --- primap2/_convert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/primap2/_convert.py b/primap2/_convert.py index d755d69a..ff70aedd 100644 --- a/primap2/_convert.py +++ b/primap2/_convert.py @@ -8,7 +8,7 @@ from loguru import logger from . 
import _accessor_base -from ._alias_selection import alias_dims +from ._selection import alias_dims class DataArrayConversionAccessor(_accessor_base.BaseDataArrayAccessor): From e88f760d7a8b2822a19ee252a3d9fcf3cc9b90c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Pfl=C3=BCger?= Date: Mon, 7 Oct 2024 14:54:48 +0200 Subject: [PATCH 15/36] fix: bump minimum required version of climate_categories to something non-ancient --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index d09c22c9..b6dd2b41 100644 --- a/setup.cfg +++ b/setup.cfg @@ -53,7 +53,7 @@ install_requires = openpyxl>=3.1 tqdm>=4.66 msgpack>=1 - climate_categories>=0.6.3 + climate_categories>=0.10.1 [options.extras_require] test = From 57c8a5f753c47e699511192eaccf4024d71cee33 Mon Sep 17 00:00:00 2001 From: Daniel Busch Date: Thu, 10 Oct 2024 09:33:03 +0200 Subject: [PATCH 16/36] BURDI test draft --- primap2/tests/test_convert.py | 64 +++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/primap2/tests/test_convert.py b/primap2/tests/test_convert.py index 81dfb8a1..a4f66791 100644 --- a/primap2/tests/test_convert.py +++ b/primap2/tests/test_convert.py @@ -29,3 +29,67 @@ def test_convert_ipcc(empty_ds: xr.Dataset): assert (result.pr.loc[{"category": "1"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item() assert (result.pr.loc[{"category": "2"}] == 2.0 * primap2.ureg("Gg CO2 / year")).all().item() + +def test_convert_BURDI(empty_ds: xr.Dataset): + # build a DA categorized by BURDI and with 1 everywhere so results are easy + # to see + + # TODO this should come from climate categories + mapping_BURDI_to_IPCC2006_PRIMAP = { + "1" : "1", + "1.A" : "1.A", + "1.A.1" : "1.A.1", + "1.A.2" : "1.A.2", + "1.A.3" : "1.A.3", + "1.A.4" : "1.A.4", + "1.A.5" : "1.A.5", + "1.B" : "1.B", + "1.B.1" : "1.B.1", + "1.B.2" : "1.B.2", + "2" : "M.2.BURDI", + "2.A" : "2.A", + "2.B" : "M.2.B_2.B", + "2.C" : "2.C", + "2.D" : "M.2.H.1_2", + "2.E" : "M.2.B_2.E", + "2.F" : "2.F", + "2.G" : "2.H.3", + "3" : "2.D", + "4" : "M.AG", + "4.A" : "3.A.1", + "4.B" : "3.A.2", + "4.C" : "3.C.7", + "4.D" : "M.3.C.45.AG", + "4.E" : "3.C.1.c", + "4.F" : "3.C.1.b", + "4.G" : "3.C.8", + "5" : "M.LULUCF", + "6" : "4", + "6.A" : "4.A", + "6.B" : "4.D", + "6.C" : "4.C", + "6.D" : "4.E", + "24540" : "0", + "15163" : "M.0.EL", + "14637" : "M.BK", + "14424" : "M.BK.A", + "14423" : "M.BK.M", + "14638" : "M.BIO", + "7" : "5", + } # 5.A-D ignored as not fitting 2006 cats + + + da = empty_ds["CO2"] + da = da.expand_dims({"category (IPCC1996)": list(mapping_BURDI_to_IPCC2006_PRIMAP.keys())}) + da = da.expand_dims({"source (gas)": list(cc.gas.keys())}) + arr = da.data.copy() + + result = da.pr.convert( + "category", + "IPCC2006", + sum_rule="extensive", + auxiliary_dimensions={"gas" : "source (gas)"}, + ) + + # TODO + assert False From b9973cdc8783464f75d1a29d734c303f9df55887 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Pfl=C3=BCger?= Date: Thu, 10 Oct 2024 15:06:35 +0200 Subject: [PATCH 17/36] docs: add some algorithm notes --- primap2/_convert.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/primap2/_convert.py b/primap2/_convert.py index ff70aedd..b908b32e 100644 --- a/primap2/_convert.py +++ b/primap2/_convert.py @@ -96,6 +96,10 @@ def convert( new_categorization=new_categorization, ) + # idea: convert 1-to-1 mappings first, should be easy in a single xarray + # operation + # note: if you have multiple rules to fill a single category, we should + # use something like fillna converted_categories 
= [] for category in converted_da[new_dim]: if category in converted_categories: From 920999c53c072d79016e08dacd182465fd193ba0 Mon Sep 17 00:00:00 2001 From: Daniel Busch Date: Mon, 14 Oct 2024 15:42:10 +0200 Subject: [PATCH 18/36] test for BURDI conversion --- primap2/_convert.py | 222 +++++++++++++----------- primap2/tests/data/BURDI_conversion.csv | 40 +++++ primap2/tests/test_convert.py | 117 +++++++------ 3 files changed, 225 insertions(+), 154 deletions(-) create mode 100644 primap2/tests/data/BURDI_conversion.csv diff --git a/primap2/_convert.py b/primap2/_convert.py index b908b32e..99c5b4da 100644 --- a/primap2/_convert.py +++ b/primap2/_convert.py @@ -11,18 +11,18 @@ from ._selection import alias_dims -class DataArrayConversionAccessor(_accessor_base.BaseDataArrayAccessor): +class DataArrayConversionAccessor(_accessor_base.BaseDataArrayAccessor) : @alias_dims(["dim"]) def convert( - self, - dim: Hashable | str, - categorization: climate_categories.Categorization | str, - *, - sum_rule: typing.Literal["intensive", "extensive"] | None = None, - input_weights: xr.DataArray | None = None, - output_weights: xr.DataArray | None = None, - auxiliary_dimensions: dict[str, str] | None = None, - ) -> xr.DataArray: + self, + dim: Hashable | str, + categorization: climate_categories.Categorization | climate_categories._conversions.Conversion | str, + *, + sum_rule: typing.Literal["intensive", "extensive"] | None = None, + input_weights: xr.DataArray | None = None, + output_weights: xr.DataArray | None = None, + auxiliary_dimensions: dict[str, str] | None = None, + ) -> xr.DataArray : """Convert the data along the given dimension into the new categorization. Maps the given dimension from one categorization (terminology) into another. @@ -37,7 +37,8 @@ def convert( categorization : climate_categories.Categorization or str New categorization to convert the given dimension to. Either give the title of the new categorization (like ``IPCC1996``) or a - ``climate_categories.Categorization`` object. + ``climate_categories.Categorization`` object or a + ``climate_categories._conversions.Conversion`` object. sum_rule : ``extensive``, ``intensive``, or None (default) If data of categories has to be summed up or divided, we need information whether the quantity measured is extensive (like, for example, total @@ -80,12 +81,25 @@ def convert( A copy of the DataArray with the given dimension converted in the new categorization. """ - new_categorization = ensure_categorization_instance(categorization) + dim_name, old_categorization_name = extract_categorization_from_dim(dim) + + # user put in str of new category or categorisation object + if isinstance(categorization, (climate_categories.Categorization, str)) : + new_categorization = ensure_categorization_instance(categorization) + old_categorization = ensure_categorization_instance(old_categorization_name) + conversion = old_categorization.conversion_to(new_categorization) + elif isinstance(categorization, climate_categories._conversions.Conversion): + new_categorization = ensure_categorization_instance(categorization.categorization_b_name) + conversion = categorization + else: + raise ValueError( + f"categorization must be of instance climate_categories.Categorization " + f"or climate_categories._conversions.Conversion. 
Got {type(categorization)}" + ) + check_valid_sum_rule_types(sum_rule) - dim_name, old_categorization_name = extract_categorization_from_dim(dim) - old_categorization = ensure_categorization_instance(old_categorization_name) - conversion = old_categorization.conversion_to(new_categorization) + auxiliary_dimensions = prepare_auxiliary_dimensions(conversion, auxiliary_dimensions) new_dim = f"{dim_name} ({new_categorization.name})" @@ -101,8 +115,8 @@ def convert( # note: if you have multiple rules to fill a single category, we should # use something like fillna converted_categories = [] - for category in converted_da[new_dim]: - if category in converted_categories: + for category in converted_da[new_dim] : + if category in converted_categories : continue newly_converted_categories, converted_da = self._fill_category( da=converted_da, @@ -121,18 +135,18 @@ def convert( return converted_da def _fill_category( - self, - da: xr.DataArray, - dim: str, - new_dim: str, - already_converted_categories: list[climate_categories.Category], - category: climate_categories.Category, - conversion: climate_categories.Conversion, - sum_rule: str | None, - auxiliary_dimensions: dict[climate_categories.Categorization, str] | None, - input_weights: xr.DataArray | None = None, - output_weights: xr.DataArray | None = None, - ) -> tuple[list[climate_categories.Category], xr.DataArray]: + self, + da: xr.DataArray, + dim: str, + new_dim: str, + already_converted_categories: list[climate_categories.Category], + category: climate_categories.Category, + conversion: climate_categories.Conversion, + sum_rule: str | None, + auxiliary_dimensions: dict[climate_categories.Categorization, str] | None, + input_weights: xr.DataArray | None = None, + output_weights: xr.DataArray | None = None, + ) -> tuple[list[climate_categories.Category], xr.DataArray] : """Return a copy of da with the given category filled by values converted using the given conversion. @@ -166,13 +180,13 @@ def _fill_category( filled_categories, filled: list of climate_categories.category, xr.DataArray The categories that were filled and the new DataArray. 
""" - try: + try : rules = applicable_rules(conversion, category) - except KeyError: + except KeyError : logger.debug(f"No rule to derive data for {category!r}, will be NaN.") return [], da - for rule in rules: + for rule in rules : logger.debug(f"Processing rule {rule}.") # iterate until a non-restricted rule was applied or all rules are # exhausted @@ -197,7 +211,7 @@ def _fill_category( already_converted = set(output_selection[new_dim]).intersection( set(already_converted_categories) ) - if already_converted: + if already_converted : logger.warning( f"For category {category!r}, would want to use a " "rule with multiple outputs, but the following outputs " @@ -206,7 +220,7 @@ def _fill_category( ) continue - try: + try : effective_input_weights = derive_weights( dim=dim, category=category, @@ -225,7 +239,7 @@ def _fill_category( sum_rule=sum_rule, weights=output_weights, ) - except WeightingInfoMissing as err: + except WeightingInfoMissing as err : logger.warning(str(err)) continue @@ -238,7 +252,7 @@ def _fill_category( da.loc[output_selection] = rhs - if not rule.is_restricted: + if not rule.is_restricted : # stop processing rules for this category return output_selection[new_dim], da @@ -249,7 +263,7 @@ def _fill_category( return [], da -def extract_categorization_from_dim(dim: str) -> (str, str): +def extract_categorization_from_dim(dim: str) -> (str, str) : """Extract the pure dimension and the categorization from a composite dim. Parameters @@ -274,51 +288,51 @@ def extract_categorization_from_dim(dim: str) -> (str, str): The pure_dim without categorization information and the categorization. If the input dim does not contain categorization information, a ValueError is raised. """ - try: + try : pure, cat = dim.split("(", 1) - except ValueError: + except ValueError : raise ValueError(f"No categorization specified: {dim!r}.") from None return pure[:-1], cat[:-1] -def applicable_rules(conversion, category) -> list[climate_categories.ConversionRule]: +def applicable_rules(conversion, category) -> list[climate_categories.ConversionRule] : """Find the possible rules to derive the category using the given conversion.""" rules = conversion.relevant_rules({conversion.categorization_b[category]}) # a + b = c - d can not be used to derive c nor d, only a and b rules = [r for r in rules if all(f > 0 for f in r.factors_categories_b.values())] - if not rules: + if not rules : raise KeyError(category) return rules def ensure_categorization_instance( - cat: str | climate_categories.Categorization, -) -> climate_categories.Categorization: + cat: str | climate_categories.Categorization, +) -> climate_categories.Categorization : """Takes a categorization name or object and returns the corresponding categorization object.""" - if isinstance(cat, climate_categories.Categorization): + if isinstance(cat, climate_categories.Categorization) : return cat return climate_categories.cats[cat] -def check_valid_sum_rule_types(sum_rule: str | None): +def check_valid_sum_rule_types(sum_rule: str | None) : """Checks if the sum_rule is either "intensive", "extensive", or None. 
Raises a ValueError if an invalid sum_rule is used.""" - if sum_rule not in (None, "extensive", "intensive"): + if sum_rule not in (None, "extensive", "intensive") : raise ValueError( f"if defined, sum_rule must be either 'extensive' or 'intensive', not" f" {sum_rule}" ) def initialize_empty_converted_da( - *, - old_da: xr.DataArray, - old_dim: Hashable | str, - new_dim: str, - new_categorization: climate_categories.Categorization, -) -> xr.DataArray: + *, + old_da: xr.DataArray, + old_dim: Hashable | str, + new_dim: str, + new_categorization: climate_categories.Categorization, +) -> xr.DataArray : """Build a DataArray which can hold the data after conversion to a new categorization. @@ -347,19 +361,19 @@ def initialize_empty_converted_da( """ new_dims = [] new_shape = [] - for i, idim in enumerate(old_da.dims): - if idim == old_dim: + for i, idim in enumerate(old_da.dims) : + if idim == old_dim : new_dims.append(new_dim) new_shape.append(len(new_categorization)) - else: + else : new_dims.append(idim) new_shape.append(old_da.shape[i]) new_coords = {} - for coord in old_da.coords: - if coord == old_dim: + for coord in old_da.coords : + if coord == old_dim : new_coords[new_dim] = np.array(list(new_categorization.keys())) - elif old_dim in old_da.coords[coord].dims: + elif old_dim in old_da.coords[coord].dims : # The additional coordinate has the old_dim as one dimension, but we # won't be able to convert it logger.info( @@ -367,15 +381,15 @@ def initialize_empty_converted_da( f" and is skipped." ) continue - else: + else : new_coords[coord] = old_da.coords[coord] new_attrs = copy.deepcopy(old_da.attrs) - for pdim in ("area", "cat", "scen"): - if pdim in new_attrs and new_attrs[pdim] == old_dim: + for pdim in ("area", "cat", "scen") : + if pdim in new_attrs and new_attrs[pdim] == old_dim : new_attrs[pdim] = new_dim - if "sec cats" in new_attrs and old_dim in new_attrs["sec_cats"]: + if "sec cats" in new_attrs and old_dim in new_attrs["sec_cats"] : new_attrs["sec_cats"].remove(old_dim) new_attrs["sec_cats"].append(new_dim) @@ -393,12 +407,12 @@ def initialize_empty_converted_da( def factors_categories_to_xarray( - *, - dim: str, - factors_categories: dict[climate_categories.Category, int], - auxiliary_categories: dict[climate_categories.Categorization, set[climate_categories.Category]], - auxiliary_dimensions: dict[climate_categories.Categorization, str], -) -> tuple[dict[str, list[str]], xr.DataArray]: + *, + dim: str, + factors_categories: dict[climate_categories.Category, int], + auxiliary_categories: dict[climate_categories.Categorization, set[climate_categories.Category]], + auxiliary_dimensions: dict[climate_categories.Categorization, str], +) -> tuple[dict[str, list[str]], xr.DataArray] : """Convert dictionary mapping categories to factors into xarray-compatible objects. Using the xarray objects ensures that in subsequent calculations, everything @@ -426,30 +440,30 @@ def factors_categories_to_xarray( factors is an xarray DataArray which can be multiplied with an xarray object after applying the selection. 
""" - selection = {dim: [cat.codes[0] for cat in factors_categories.keys()]} + selection = {dim : [cat.codes[0] for cat in factors_categories.keys()]} factors = xr.DataArray( data=list(factors_categories.values()), dims=[dim], coords=selection, ) - for aux_categorization, aux_categories in auxiliary_categories.items(): - if aux_categories: + for aux_categorization, aux_categories in auxiliary_categories.items() : + if aux_categories : aux_dim = auxiliary_dimensions[aux_categorization] selection[aux_dim] = [cat.codes[0] for cat in aux_categories] return selection, factors -class WeightingInfoMissing(ValueError): +class WeightingInfoMissing(ValueError) : """Some information to derive weighting factors for a rule is missing.""" def __init__( - self, - category: climate_categories.Category, - rule: climate_categories.ConversionRule, - message: str, - ): + self, + category: climate_categories.Category, + rule: climate_categories.ConversionRule, + message: str, + ) : full_message = ( f"Can not derive data for category {category!r} using rule" f" '{rule}': {message} Skipping this rule." @@ -458,15 +472,15 @@ def __init__( def derive_weights( - *, - dim: str, - category: climate_categories.Category, - rule: climate_categories.ConversionRule, - sum_rule: str | None, - operation_type: str, - weights: xr.DataArray | None, - selection: dict[str, list[str]], -) -> xr.DataArray | float: + *, + dim: str, + category: climate_categories.Category, + rule: climate_categories.ConversionRule, + sum_rule: str | None, + operation_type: str, + weights: xr.DataArray | None, + selection: dict[str, list[str]], +) -> xr.DataArray | float : """Derive the weights to use for applying a specific rule. Parameters @@ -498,28 +512,28 @@ def derive_weights( Object which can be multiplied with the input or output DataArray to apply weights. """ - if operation_type == "input": + if operation_type == "input" : operation_verb = "sum up" trivial_sum_rule = "extensive" nontrivial_sum_rule = "intensive" rule_cardinality = rule.cardinality_a - else: + else : operation_verb = "split" trivial_sum_rule = "intensive" nontrivial_sum_rule = "extensive" rule_cardinality = rule.cardinality_b # just one category or trivial sum rule, so no weights required - if rule_cardinality == "one" or sum_rule == trivial_sum_rule: + if rule_cardinality == "one" or sum_rule == trivial_sum_rule : return 1.0 - if sum_rule == nontrivial_sum_rule: - if weights is None: + if sum_rule == nontrivial_sum_rule : + if weights is None : raise WeightingInfoMissing( category=category, rule=rule, message=f"We need to {operation_verb} multiple categories with" - f" sum_rule={nontrivial_sum_rule}, but no {operation_type}_weights are" - f" specified.", + f" sum_rule={nontrivial_sum_rule}, but no {operation_type}_weights are" + f" specified.", ) effective_weights = weights.loc[selection] # normalize so it is actually a weight, not a factor @@ -529,16 +543,16 @@ def derive_weights( category=category, rule=rule, message=f"We need to {operation_verb} multiple categories, but the sum_rule is" - f" not specified. Rule can only be used if sum_rule={trivial_sum_rule!r} or" - f" sum_rule={nontrivial_sum_rule} and {operation_type}_weights are" - f" specified.", + f" not specified. 
Rule can only be used if sum_rule={trivial_sum_rule!r} or" + f" sum_rule={nontrivial_sum_rule} and {operation_type}_weights are" + f" specified.", ) def prepare_auxiliary_dimensions( - conversion: climate_categories.Conversion, - auxiliary_dimensions: dict[str, str] | None, -) -> dict[climate_categories.Categorization, str] | None: + conversion: climate_categories.Conversion, + auxiliary_dimensions: dict[str, str] | None, +) -> dict[climate_categories.Categorization, str] | None : """Prepare and check the auxiliary dimension mapping. Check if all auxiliary categorizations used in the conversion are matched in @@ -552,8 +566,8 @@ def prepare_auxiliary_dimensions( the auxiliary dimensions, but using Categorization objects instead of their names. """ - if conversion.auxiliary_categorizations_names: - if auxiliary_dimensions is None: + if conversion.auxiliary_categorizations_names : + if auxiliary_dimensions is None : raise ValueError( "The conversion uses auxiliary categories, but a translation to" " dimension names was not provided using the argument" @@ -564,16 +578,16 @@ def prepare_auxiliary_dimensions( missing = set(conversion.auxiliary_categorizations_names).difference( auxiliary_dimensions.keys() ) - if missing: + if missing : raise ValueError( "A dimension name was not given for all auxiliary categories:" f" {missing} are missing in the auxiliary_dimensions argument, please" " provide translations to the dimension names used in the data." ) - if not auxiliary_dimensions: + if not auxiliary_dimensions : return auxiliary_dimensions return { - climate_categories.cats[name]: auxiliary_dimensions[name] for name in auxiliary_dimensions + climate_categories.cats[name] : auxiliary_dimensions[name] for name in auxiliary_dimensions } diff --git a/primap2/tests/data/BURDI_conversion.csv b/primap2/tests/data/BURDI_conversion.csv new file mode 100644 index 00000000..82e0ce50 --- /dev/null +++ b/primap2/tests/data/BURDI_conversion.csv @@ -0,0 +1,40 @@ +# references: non_annex1_data repo +# last_update: 2024-10-14 +BURDI,IPCC2006_PRIMAP,comment +1,1 +1.A,1.A +1.A.1,1.A.1 +1.A.2,1.A.2 +1.A.3,1.A.3 +1.A.4,1.A.4 +1.A.5,1.A.5 +1.B,1.B +1.B.1,1.B.1 +1.B.2,1.B.2 +2 + 3,2 +2.A,2.A +2.B + 2.E,2.B +2.C,2.C +2.F,2.F +2.G + 2.D, 2.H +3,2.D +4,M.AG +4.A,3.A.1 +4.B,3.A.2 +4.C,3.C.7 +4.D + 4.C + 4.E + 4.F + 4.G,3.C +4.E,3.C.1.c +4.F,3.C.1.b +4.G,3.C.8 +5,M.LULUCF +6,4 +6.A,4.A +6.B,4.D +6.C,4.C +6.D,4.E +24540,0 +15163,M.0.EL +14637,M.BK +14424,M.BK.A +14423,M.BK.M, leaving 14638 --> M.BIO out for now, as it's not in climate categories +7,5, 5.A-D ignored as not fitting 2006 cats diff --git a/primap2/tests/test_convert.py b/primap2/tests/test_convert.py index a4f66791..8202d43a 100644 --- a/primap2/tests/test_convert.py +++ b/primap2/tests/test_convert.py @@ -1,10 +1,12 @@ """Tests for _convert.py""" import climate_categories as cc +import climate_categories._conversions as conversions import pytest import xarray as xr - +import pathlib import primap2 +import numpy as np def test_convert_ipcc(empty_ds: xr.Dataset): @@ -31,65 +33,80 @@ def test_convert_ipcc(empty_ds: xr.Dataset): assert (result.pr.loc[{"category": "2"}] == 2.0 * primap2.ureg("Gg CO2 / year")).all().item() def test_convert_BURDI(empty_ds: xr.Dataset): - # build a DA categorized by BURDI and with 1 everywhere so results are easy - # to see + # make a sample conversion object in climate categories + filepath = pathlib.Path("data/BURDI_conversion.csv") + conv = conversions.ConversionSpec.from_csv(filepath) + conv = 
conv.hydrate(cats=cc.cats["BURDI"]._cats) - # TODO this should come from climate categories - mapping_BURDI_to_IPCC2006_PRIMAP = { - "1" : "1", - "1.A" : "1.A", - "1.A.1" : "1.A.1", - "1.A.2" : "1.A.2", - "1.A.3" : "1.A.3", - "1.A.4" : "1.A.4", - "1.A.5" : "1.A.5", - "1.B" : "1.B", - "1.B.1" : "1.B.1", - "1.B.2" : "1.B.2", - "2" : "M.2.BURDI", - "2.A" : "2.A", - "2.B" : "M.2.B_2.B", - "2.C" : "2.C", - "2.D" : "M.2.H.1_2", - "2.E" : "M.2.B_2.E", - "2.F" : "2.F", - "2.G" : "2.H.3", - "3" : "2.D", - "4" : "M.AG", - "4.A" : "3.A.1", - "4.B" : "3.A.2", - "4.C" : "3.C.7", - "4.D" : "M.3.C.45.AG", - "4.E" : "3.C.1.c", - "4.F" : "3.C.1.b", - "4.G" : "3.C.8", - "5" : "M.LULUCF", - "6" : "4", - "6.A" : "4.A", - "6.B" : "4.D", - "6.C" : "4.C", - "6.D" : "4.E", - "24540" : "0", - "15163" : "M.0.EL", - "14637" : "M.BK", - "14424" : "M.BK.A", - "14423" : "M.BK.M", - "14638" : "M.BIO", - "7" : "5", - } # 5.A-D ignored as not fitting 2006 cats + # taken from UNFCCC_non-AnnexI_data/src/unfccc_ghg_data/unfccc_di_reader/ + # unfccc_di_reader_config.py + BURDI_categories = [ + "1", + "1.A", + "1.A.1", + "1.A.2", + "1.A.3", + "1.A.4", + "1.A.5", + "1.B", + "1.B.1", + "1.B.2", + "2", + "2.A", + "2.B", + "2.C", + "2.D", + "2.E", + "2.F", + "2.G", + "3", + "4", + "4.A", + "4.B", + "4.C", + "4.D", + "4.E", + "4.F", + "4.G", + "5", + "6", + "6.A", + "6.B", + "6.C", + "6.D", + "24540", + "15163", + "14637", + "14424", + "14423", + "14638", + "7"] + # build a DA categorized by BURDI and with 1 everywhere so results are easy + # to see da = empty_ds["CO2"] - da = da.expand_dims({"category (IPCC1996)": list(mapping_BURDI_to_IPCC2006_PRIMAP.keys())}) + da = da.expand_dims({"category (BURDI)": BURDI_categories}) da = da.expand_dims({"source (gas)": list(cc.gas.keys())}) arr = da.data.copy() + arr[:] = 1 * primap2.ureg("Gg CO2 / year") + da.data = arr result = da.pr.convert( "category", - "IPCC2006", + conv, sum_rule="extensive", auxiliary_dimensions={"gas" : "source (gas)"}, ) - # TODO - assert False + # cat 2 + 3 in BURDI equals cat 2 in IPCC2006_PRIMAP + assert (result.pr.loc[{"category": "2"}] == 2.0 * primap2.ureg("Gg CO2 / year")).all().item() + # cat 4.D + 4.C + 4.E + 4.F + 4.G in BURDI equals cat 3.C in IPCC2006_PRIMAP + assert (result.pr.loc[{"category" : "3.C"}] == 5.0 * primap2.ureg("Gg CO2 / year")).all().item() + # cat 5 in BURDI equals cat M.LULUCF in IPCC2006_PRIMAP + assert (result.pr.loc[{"category" : "M.LULUCF"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item() + # 2.E + 2.B = 2.E, 2.E should not be part of new data set + assert np.isnan(result.pr.loc[{"category" : "2.E"}].values).all() + # cat 14638 in BURDI equals cat M.BIO in IPCC2006_PRIMAP + # TODO: This will fail. 
M.BIO is currently not listed in climate categories + assert (result.pr.loc[{"category" : "M.BIO"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item() \ No newline at end of file From 69c637bdcd7194cf1e9a234230b3c02af56fcaaa Mon Sep 17 00:00:00 2001 From: Daniel Busch Date: Mon, 14 Oct 2024 15:44:44 +0200 Subject: [PATCH 19/36] ruff --- primap2/_convert.py | 209 +++++++++++++++++----------------- primap2/tests/test_convert.py | 105 +++++++++-------- 2 files changed, 163 insertions(+), 151 deletions(-) diff --git a/primap2/_convert.py b/primap2/_convert.py index 99c5b4da..f13c8ac1 100644 --- a/primap2/_convert.py +++ b/primap2/_convert.py @@ -11,18 +11,20 @@ from ._selection import alias_dims -class DataArrayConversionAccessor(_accessor_base.BaseDataArrayAccessor) : +class DataArrayConversionAccessor(_accessor_base.BaseDataArrayAccessor): @alias_dims(["dim"]) def convert( - self, - dim: Hashable | str, - categorization: climate_categories.Categorization | climate_categories._conversions.Conversion | str, - *, - sum_rule: typing.Literal["intensive", "extensive"] | None = None, - input_weights: xr.DataArray | None = None, - output_weights: xr.DataArray | None = None, - auxiliary_dimensions: dict[str, str] | None = None, - ) -> xr.DataArray : + self, + dim: Hashable | str, + categorization: climate_categories.Categorization + | climate_categories._conversions.Conversion + | str, + *, + sum_rule: typing.Literal["intensive", "extensive"] | None = None, + input_weights: xr.DataArray | None = None, + output_weights: xr.DataArray | None = None, + auxiliary_dimensions: dict[str, str] | None = None, + ) -> xr.DataArray: """Convert the data along the given dimension into the new categorization. Maps the given dimension from one categorization (terminology) into another. @@ -84,22 +86,23 @@ def convert( dim_name, old_categorization_name = extract_categorization_from_dim(dim) # user put in str of new category or categorisation object - if isinstance(categorization, (climate_categories.Categorization, str)) : + if isinstance(categorization, (climate_categories.Categorization, str)): new_categorization = ensure_categorization_instance(categorization) old_categorization = ensure_categorization_instance(old_categorization_name) conversion = old_categorization.conversion_to(new_categorization) elif isinstance(categorization, climate_categories._conversions.Conversion): - new_categorization = ensure_categorization_instance(categorization.categorization_b_name) + new_categorization = ensure_categorization_instance( + categorization.categorization_b_name + ) conversion = categorization else: raise ValueError( f"categorization must be of instance climate_categories.Categorization " f"or climate_categories._conversions.Conversion. 
Got {type(categorization)}" - ) + ) check_valid_sum_rule_types(sum_rule) - auxiliary_dimensions = prepare_auxiliary_dimensions(conversion, auxiliary_dimensions) new_dim = f"{dim_name} ({new_categorization.name})" @@ -115,8 +118,8 @@ def convert( # note: if you have multiple rules to fill a single category, we should # use something like fillna converted_categories = [] - for category in converted_da[new_dim] : - if category in converted_categories : + for category in converted_da[new_dim]: + if category in converted_categories: continue newly_converted_categories, converted_da = self._fill_category( da=converted_da, @@ -135,18 +138,18 @@ def convert( return converted_da def _fill_category( - self, - da: xr.DataArray, - dim: str, - new_dim: str, - already_converted_categories: list[climate_categories.Category], - category: climate_categories.Category, - conversion: climate_categories.Conversion, - sum_rule: str | None, - auxiliary_dimensions: dict[climate_categories.Categorization, str] | None, - input_weights: xr.DataArray | None = None, - output_weights: xr.DataArray | None = None, - ) -> tuple[list[climate_categories.Category], xr.DataArray] : + self, + da: xr.DataArray, + dim: str, + new_dim: str, + already_converted_categories: list[climate_categories.Category], + category: climate_categories.Category, + conversion: climate_categories.Conversion, + sum_rule: str | None, + auxiliary_dimensions: dict[climate_categories.Categorization, str] | None, + input_weights: xr.DataArray | None = None, + output_weights: xr.DataArray | None = None, + ) -> tuple[list[climate_categories.Category], xr.DataArray]: """Return a copy of da with the given category filled by values converted using the given conversion. @@ -180,13 +183,13 @@ def _fill_category( filled_categories, filled: list of climate_categories.category, xr.DataArray The categories that were filled and the new DataArray. """ - try : + try: rules = applicable_rules(conversion, category) - except KeyError : + except KeyError: logger.debug(f"No rule to derive data for {category!r}, will be NaN.") return [], da - for rule in rules : + for rule in rules: logger.debug(f"Processing rule {rule}.") # iterate until a non-restricted rule was applied or all rules are # exhausted @@ -211,7 +214,7 @@ def _fill_category( already_converted = set(output_selection[new_dim]).intersection( set(already_converted_categories) ) - if already_converted : + if already_converted: logger.warning( f"For category {category!r}, would want to use a " "rule with multiple outputs, but the following outputs " @@ -220,7 +223,7 @@ def _fill_category( ) continue - try : + try: effective_input_weights = derive_weights( dim=dim, category=category, @@ -239,7 +242,7 @@ def _fill_category( sum_rule=sum_rule, weights=output_weights, ) - except WeightingInfoMissing as err : + except WeightingInfoMissing as err: logger.warning(str(err)) continue @@ -252,7 +255,7 @@ def _fill_category( da.loc[output_selection] = rhs - if not rule.is_restricted : + if not rule.is_restricted: # stop processing rules for this category return output_selection[new_dim], da @@ -263,7 +266,7 @@ def _fill_category( return [], da -def extract_categorization_from_dim(dim: str) -> (str, str) : +def extract_categorization_from_dim(dim: str) -> (str, str): """Extract the pure dimension and the categorization from a composite dim. Parameters @@ -288,51 +291,51 @@ def extract_categorization_from_dim(dim: str) -> (str, str) : The pure_dim without categorization information and the categorization. 
If the input dim does not contain categorization information, a ValueError is raised. """ - try : + try: pure, cat = dim.split("(", 1) - except ValueError : + except ValueError: raise ValueError(f"No categorization specified: {dim!r}.") from None return pure[:-1], cat[:-1] -def applicable_rules(conversion, category) -> list[climate_categories.ConversionRule] : +def applicable_rules(conversion, category) -> list[climate_categories.ConversionRule]: """Find the possible rules to derive the category using the given conversion.""" rules = conversion.relevant_rules({conversion.categorization_b[category]}) # a + b = c - d can not be used to derive c nor d, only a and b rules = [r for r in rules if all(f > 0 for f in r.factors_categories_b.values())] - if not rules : + if not rules: raise KeyError(category) return rules def ensure_categorization_instance( - cat: str | climate_categories.Categorization, -) -> climate_categories.Categorization : + cat: str | climate_categories.Categorization, +) -> climate_categories.Categorization: """Takes a categorization name or object and returns the corresponding categorization object.""" - if isinstance(cat, climate_categories.Categorization) : + if isinstance(cat, climate_categories.Categorization): return cat return climate_categories.cats[cat] -def check_valid_sum_rule_types(sum_rule: str | None) : +def check_valid_sum_rule_types(sum_rule: str | None): """Checks if the sum_rule is either "intensive", "extensive", or None. Raises a ValueError if an invalid sum_rule is used.""" - if sum_rule not in (None, "extensive", "intensive") : + if sum_rule not in (None, "extensive", "intensive"): raise ValueError( f"if defined, sum_rule must be either 'extensive' or 'intensive', not" f" {sum_rule}" ) def initialize_empty_converted_da( - *, - old_da: xr.DataArray, - old_dim: Hashable | str, - new_dim: str, - new_categorization: climate_categories.Categorization, -) -> xr.DataArray : + *, + old_da: xr.DataArray, + old_dim: Hashable | str, + new_dim: str, + new_categorization: climate_categories.Categorization, +) -> xr.DataArray: """Build a DataArray which can hold the data after conversion to a new categorization. @@ -361,19 +364,19 @@ def initialize_empty_converted_da( """ new_dims = [] new_shape = [] - for i, idim in enumerate(old_da.dims) : - if idim == old_dim : + for i, idim in enumerate(old_da.dims): + if idim == old_dim: new_dims.append(new_dim) new_shape.append(len(new_categorization)) - else : + else: new_dims.append(idim) new_shape.append(old_da.shape[i]) new_coords = {} - for coord in old_da.coords : - if coord == old_dim : + for coord in old_da.coords: + if coord == old_dim: new_coords[new_dim] = np.array(list(new_categorization.keys())) - elif old_dim in old_da.coords[coord].dims : + elif old_dim in old_da.coords[coord].dims: # The additional coordinate has the old_dim as one dimension, but we # won't be able to convert it logger.info( @@ -381,15 +384,15 @@ def initialize_empty_converted_da( f" and is skipped." 
) continue - else : + else: new_coords[coord] = old_da.coords[coord] new_attrs = copy.deepcopy(old_da.attrs) - for pdim in ("area", "cat", "scen") : - if pdim in new_attrs and new_attrs[pdim] == old_dim : + for pdim in ("area", "cat", "scen"): + if pdim in new_attrs and new_attrs[pdim] == old_dim: new_attrs[pdim] = new_dim - if "sec cats" in new_attrs and old_dim in new_attrs["sec_cats"] : + if "sec cats" in new_attrs and old_dim in new_attrs["sec_cats"]: new_attrs["sec_cats"].remove(old_dim) new_attrs["sec_cats"].append(new_dim) @@ -407,12 +410,12 @@ def initialize_empty_converted_da( def factors_categories_to_xarray( - *, - dim: str, - factors_categories: dict[climate_categories.Category, int], - auxiliary_categories: dict[climate_categories.Categorization, set[climate_categories.Category]], - auxiliary_dimensions: dict[climate_categories.Categorization, str], -) -> tuple[dict[str, list[str]], xr.DataArray] : + *, + dim: str, + factors_categories: dict[climate_categories.Category, int], + auxiliary_categories: dict[climate_categories.Categorization, set[climate_categories.Category]], + auxiliary_dimensions: dict[climate_categories.Categorization, str], +) -> tuple[dict[str, list[str]], xr.DataArray]: """Convert dictionary mapping categories to factors into xarray-compatible objects. Using the xarray objects ensures that in subsequent calculations, everything @@ -440,30 +443,30 @@ def factors_categories_to_xarray( factors is an xarray DataArray which can be multiplied with an xarray object after applying the selection. """ - selection = {dim : [cat.codes[0] for cat in factors_categories.keys()]} + selection = {dim: [cat.codes[0] for cat in factors_categories.keys()]} factors = xr.DataArray( data=list(factors_categories.values()), dims=[dim], coords=selection, ) - for aux_categorization, aux_categories in auxiliary_categories.items() : - if aux_categories : + for aux_categorization, aux_categories in auxiliary_categories.items(): + if aux_categories: aux_dim = auxiliary_dimensions[aux_categorization] selection[aux_dim] = [cat.codes[0] for cat in aux_categories] return selection, factors -class WeightingInfoMissing(ValueError) : +class WeightingInfoMissing(ValueError): """Some information to derive weighting factors for a rule is missing.""" def __init__( - self, - category: climate_categories.Category, - rule: climate_categories.ConversionRule, - message: str, - ) : + self, + category: climate_categories.Category, + rule: climate_categories.ConversionRule, + message: str, + ): full_message = ( f"Can not derive data for category {category!r} using rule" f" '{rule}': {message} Skipping this rule." @@ -472,15 +475,15 @@ def __init__( def derive_weights( - *, - dim: str, - category: climate_categories.Category, - rule: climate_categories.ConversionRule, - sum_rule: str | None, - operation_type: str, - weights: xr.DataArray | None, - selection: dict[str, list[str]], -) -> xr.DataArray | float : + *, + dim: str, + category: climate_categories.Category, + rule: climate_categories.ConversionRule, + sum_rule: str | None, + operation_type: str, + weights: xr.DataArray | None, + selection: dict[str, list[str]], +) -> xr.DataArray | float: """Derive the weights to use for applying a specific rule. Parameters @@ -512,28 +515,28 @@ def derive_weights( Object which can be multiplied with the input or output DataArray to apply weights. 
""" - if operation_type == "input" : + if operation_type == "input": operation_verb = "sum up" trivial_sum_rule = "extensive" nontrivial_sum_rule = "intensive" rule_cardinality = rule.cardinality_a - else : + else: operation_verb = "split" trivial_sum_rule = "intensive" nontrivial_sum_rule = "extensive" rule_cardinality = rule.cardinality_b # just one category or trivial sum rule, so no weights required - if rule_cardinality == "one" or sum_rule == trivial_sum_rule : + if rule_cardinality == "one" or sum_rule == trivial_sum_rule: return 1.0 - if sum_rule == nontrivial_sum_rule : - if weights is None : + if sum_rule == nontrivial_sum_rule: + if weights is None: raise WeightingInfoMissing( category=category, rule=rule, message=f"We need to {operation_verb} multiple categories with" - f" sum_rule={nontrivial_sum_rule}, but no {operation_type}_weights are" - f" specified.", + f" sum_rule={nontrivial_sum_rule}, but no {operation_type}_weights are" + f" specified.", ) effective_weights = weights.loc[selection] # normalize so it is actually a weight, not a factor @@ -543,16 +546,16 @@ def derive_weights( category=category, rule=rule, message=f"We need to {operation_verb} multiple categories, but the sum_rule is" - f" not specified. Rule can only be used if sum_rule={trivial_sum_rule!r} or" - f" sum_rule={nontrivial_sum_rule} and {operation_type}_weights are" - f" specified.", + f" not specified. Rule can only be used if sum_rule={trivial_sum_rule!r} or" + f" sum_rule={nontrivial_sum_rule} and {operation_type}_weights are" + f" specified.", ) def prepare_auxiliary_dimensions( - conversion: climate_categories.Conversion, - auxiliary_dimensions: dict[str, str] | None, -) -> dict[climate_categories.Categorization, str] | None : + conversion: climate_categories.Conversion, + auxiliary_dimensions: dict[str, str] | None, +) -> dict[climate_categories.Categorization, str] | None: """Prepare and check the auxiliary dimension mapping. Check if all auxiliary categorizations used in the conversion are matched in @@ -566,8 +569,8 @@ def prepare_auxiliary_dimensions( the auxiliary dimensions, but using Categorization objects instead of their names. """ - if conversion.auxiliary_categorizations_names : - if auxiliary_dimensions is None : + if conversion.auxiliary_categorizations_names: + if auxiliary_dimensions is None: raise ValueError( "The conversion uses auxiliary categories, but a translation to" " dimension names was not provided using the argument" @@ -578,16 +581,16 @@ def prepare_auxiliary_dimensions( missing = set(conversion.auxiliary_categorizations_names).difference( auxiliary_dimensions.keys() ) - if missing : + if missing: raise ValueError( "A dimension name was not given for all auxiliary categories:" f" {missing} are missing in the auxiliary_dimensions argument, please" " provide translations to the dimension names used in the data." 
) - if not auxiliary_dimensions : + if not auxiliary_dimensions: return auxiliary_dimensions return { - climate_categories.cats[name] : auxiliary_dimensions[name] for name in auxiliary_dimensions + climate_categories.cats[name]: auxiliary_dimensions[name] for name in auxiliary_dimensions } diff --git a/primap2/tests/test_convert.py b/primap2/tests/test_convert.py index 8202d43a..0cdaa97b 100644 --- a/primap2/tests/test_convert.py +++ b/primap2/tests/test_convert.py @@ -1,12 +1,14 @@ """Tests for _convert.py""" +import pathlib + import climate_categories as cc import climate_categories._conversions as conversions +import numpy as np import pytest import xarray as xr -import pathlib + import primap2 -import numpy as np def test_convert_ipcc(empty_ds: xr.Dataset): @@ -32,6 +34,7 @@ def test_convert_ipcc(empty_ds: xr.Dataset): assert (result.pr.loc[{"category": "1"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item() assert (result.pr.loc[{"category": "2"}] == 2.0 * primap2.ureg("Gg CO2 / year")).all().item() + def test_convert_BURDI(empty_ds: xr.Dataset): # make a sample conversion object in climate categories filepath = pathlib.Path("data/BURDI_conversion.csv") @@ -41,47 +44,47 @@ def test_convert_BURDI(empty_ds: xr.Dataset): # taken from UNFCCC_non-AnnexI_data/src/unfccc_ghg_data/unfccc_di_reader/ # unfccc_di_reader_config.py BURDI_categories = [ - "1", - "1.A", - "1.A.1", - "1.A.2", - "1.A.3", - "1.A.4", - "1.A.5", - "1.B", - "1.B.1", - "1.B.2", - "2", - "2.A", - "2.B", - "2.C", - "2.D", - "2.E", - "2.F", - "2.G", - "3", - "4", - "4.A", - "4.B", - "4.C", - "4.D", - "4.E", - "4.F", - "4.G", - "5", - "6", - "6.A", - "6.B", - "6.C", - "6.D", - "24540", - "15163", - "14637", - "14424", - "14423", - "14638", - "7"] - + "1", + "1.A", + "1.A.1", + "1.A.2", + "1.A.3", + "1.A.4", + "1.A.5", + "1.B", + "1.B.1", + "1.B.2", + "2", + "2.A", + "2.B", + "2.C", + "2.D", + "2.E", + "2.F", + "2.G", + "3", + "4", + "4.A", + "4.B", + "4.C", + "4.D", + "4.E", + "4.F", + "4.G", + "5", + "6", + "6.A", + "6.B", + "6.C", + "6.D", + "24540", + "15163", + "14637", + "14424", + "14423", + "14638", + "7", + ] # build a DA categorized by BURDI and with 1 everywhere so results are easy # to see @@ -96,17 +99,23 @@ def test_convert_BURDI(empty_ds: xr.Dataset): "category", conv, sum_rule="extensive", - auxiliary_dimensions={"gas" : "source (gas)"}, + auxiliary_dimensions={"gas": "source (gas)"}, ) # cat 2 + 3 in BURDI equals cat 2 in IPCC2006_PRIMAP assert (result.pr.loc[{"category": "2"}] == 2.0 * primap2.ureg("Gg CO2 / year")).all().item() # cat 4.D + 4.C + 4.E + 4.F + 4.G in BURDI equals cat 3.C in IPCC2006_PRIMAP - assert (result.pr.loc[{"category" : "3.C"}] == 5.0 * primap2.ureg("Gg CO2 / year")).all().item() + assert (result.pr.loc[{"category": "3.C"}] == 5.0 * primap2.ureg("Gg CO2 / year")).all().item() # cat 5 in BURDI equals cat M.LULUCF in IPCC2006_PRIMAP - assert (result.pr.loc[{"category" : "M.LULUCF"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item() + assert ( + (result.pr.loc[{"category": "M.LULUCF"}] == 1.0 * primap2.ureg("Gg CO2 / year")) + .all() + .item() + ) # 2.E + 2.B = 2.E, 2.E should not be part of new data set - assert np.isnan(result.pr.loc[{"category" : "2.E"}].values).all() + assert np.isnan(result.pr.loc[{"category": "2.E"}].values).all() # cat 14638 in BURDI equals cat M.BIO in IPCC2006_PRIMAP # TODO: This will fail. 
M.BIO is currently not listed in climate categories - assert (result.pr.loc[{"category" : "M.BIO"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item() \ No newline at end of file + assert ( + (result.pr.loc[{"category": "M.BIO"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item() + ) From 113abda5e8d674a8d1a25172d03ea3f3eed67d90 Mon Sep 17 00:00:00 2001 From: Daniel Busch Date: Mon, 14 Oct 2024 15:47:19 +0200 Subject: [PATCH 20/36] comments --- primap2/_convert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/primap2/_convert.py b/primap2/_convert.py index f13c8ac1..4577cc4e 100644 --- a/primap2/_convert.py +++ b/primap2/_convert.py @@ -85,11 +85,11 @@ def convert( """ dim_name, old_categorization_name = extract_categorization_from_dim(dim) - # user put in str of new category or categorisation object if isinstance(categorization, (climate_categories.Categorization, str)): new_categorization = ensure_categorization_instance(categorization) old_categorization = ensure_categorization_instance(old_categorization_name) conversion = old_categorization.conversion_to(new_categorization) + # TODO: Refactor or change variable name for categorization. Conversion is not really the same elif isinstance(categorization, climate_categories._conversions.Conversion): new_categorization = ensure_categorization_instance( categorization.categorization_b_name From 7428d51d291694c16841aec5c3eed8528617f81a Mon Sep 17 00:00:00 2001 From: Daniel Busch Date: Thu, 17 Oct 2024 09:51:58 +0200 Subject: [PATCH 21/36] add test for custom categorisations and custom conversion --- primap2/_convert.py | 22 +++++--- .../tests/data/simple_categorisation_a.yaml | 35 ++++++++++++ .../tests/data/simple_categorisation_b.yaml | 27 +++++++++ primap2/tests/data/simple_conversion.csv | 5 ++ primap2/tests/test_convert.py | 56 ++++++++++++++++++- 5 files changed, 137 insertions(+), 8 deletions(-) create mode 100644 primap2/tests/data/simple_categorisation_a.yaml create mode 100644 primap2/tests/data/simple_categorisation_b.yaml create mode 100644 primap2/tests/data/simple_conversion.csv diff --git a/primap2/_convert.py b/primap2/_convert.py index 4577cc4e..05c90416 100644 --- a/primap2/_convert.py +++ b/primap2/_convert.py @@ -16,10 +16,11 @@ class DataArrayConversionAccessor(_accessor_base.BaseDataArrayAccessor): def convert( self, dim: Hashable | str, - categorization: climate_categories.Categorization - | climate_categories._conversions.Conversion - | str, + # TODO naming + categorization: climate_categories.Categorization | str, *, + custom_categorisation_a : climate_categories.Categorization | None = None, + custom_categorisation_b : climate_categories.Categorization | None = None, sum_rule: typing.Literal["intensive", "extensive"] | None = None, input_weights: xr.DataArray | None = None, output_weights: xr.DataArray | None = None, @@ -34,6 +35,7 @@ def convert( Parameters ---------- + # TODO dim : str Dimension to convert. Has to be a dimension from ``da.dims``. categorization : climate_categories.Categorization or str @@ -85,16 +87,22 @@ def convert( """ dim_name, old_categorization_name = extract_categorization_from_dim(dim) + # TODO find better logic for all this if isinstance(categorization, (climate_categories.Categorization, str)): new_categorization = ensure_categorization_instance(categorization) old_categorization = ensure_categorization_instance(old_categorization_name) conversion = old_categorization.conversion_to(new_categorization) # TODO: Refactor or change variable name for categorization. 
Conversion is not really the same elif isinstance(categorization, climate_categories._conversions.Conversion): - new_categorization = ensure_categorization_instance( - categorization.categorization_b_name - ) - conversion = categorization + if custom_categorisation_a and custom_categorisation_b: + old_categorization = ensure_categorization_instance(custom_categorisation_a) + new_categorization = ensure_categorization_instance(custom_categorisation_b) + conversion = categorization + else: + new_categorization = ensure_categorization_instance( + categorization.categorization_b_name + ) + conversion = categorization else: raise ValueError( f"categorization must be of instance climate_categories.Categorization " diff --git a/primap2/tests/data/simple_categorisation_a.yaml b/primap2/tests/data/simple_categorisation_a.yaml new file mode 100644 index 00000000..1656c97b --- /dev/null +++ b/primap2/tests/data/simple_categorisation_a.yaml @@ -0,0 +1,35 @@ +name: A +title: Simple Categorization +comment: A simple example categorization without relationships between categories +references: doi:00000/00000 +institution: PIK +last_update: 2021-02-23 +hierarchical: no +version: 1 +categories: + 1: + title: Category 1 + comment: The first category + alternative_codes: + - A + - CatA + info: + important_data: + - A + - B + - C + other_important_thing: ABC + 2: + title: Category 2 + comment: The second category + alternative_codes: + - B + - CatB + 3: + title: Category 3 + comment: The third category + alternative_codes: + - C + - CatC + unnumbered: + title: The unnumbered category \ No newline at end of file diff --git a/primap2/tests/data/simple_categorisation_b.yaml b/primap2/tests/data/simple_categorisation_b.yaml new file mode 100644 index 00000000..35751f9b --- /dev/null +++ b/primap2/tests/data/simple_categorisation_b.yaml @@ -0,0 +1,27 @@ +name: B +title: Simple Categorization +comment: A simple example categorization without relationships between categories +references: doi:00000/00000 +institution: PIK +last_update: 2021-02-23 +hierarchical: no +version: 1 +categories: + 1: + title: Category 1 + comment: The first category + alternative_codes: + - A + - CatA + info: + important_data: + - A + - B + - C + other_important_thing: ABC + 2: + title: Category 2 + comment: The second category + alternative_codes: + - B + - CatB \ No newline at end of file diff --git a/primap2/tests/data/simple_conversion.csv b/primap2/tests/data/simple_conversion.csv new file mode 100644 index 00000000..724f62d9 --- /dev/null +++ b/primap2/tests/data/simple_conversion.csv @@ -0,0 +1,5 @@ +# references: test +# last_update: 2024-10-14 +A,B,comment +1,1, no comment +2+3,2 diff --git a/primap2/tests/test_convert.py b/primap2/tests/test_convert.py index 0cdaa97b..30644ac3 100644 --- a/primap2/tests/test_convert.py +++ b/primap2/tests/test_convert.py @@ -11,6 +11,7 @@ import primap2 +# test with existing conversion and two existing categorisations def test_convert_ipcc(empty_ds: xr.Dataset): # build a DA categorized by IPCC1996 and with 1 everywhere so results are easy # to see @@ -35,11 +36,12 @@ def test_convert_ipcc(empty_ds: xr.Dataset): assert (result.pr.loc[{"category": "2"}] == 2.0 * primap2.ureg("Gg CO2 / year")).all().item() +# test with new conversion and two existing categorisations def test_convert_BURDI(empty_ds: xr.Dataset): # make a sample conversion object in climate categories filepath = pathlib.Path("data/BURDI_conversion.csv") conv = conversions.ConversionSpec.from_csv(filepath) - conv = 
conv.hydrate(cats=cc.cats["BURDI"]._cats) + conv = conv.hydrate(cats=cc.cats) # taken from UNFCCC_non-AnnexI_data/src/unfccc_ghg_data/unfccc_di_reader/ # unfccc_di_reader_config.py @@ -112,6 +114,11 @@ def test_convert_BURDI(empty_ds: xr.Dataset): .all() .item() ) + # 3.C.7 (converted from 4.C) should still be part of the data set, + # although it apprears in two conversion rules + assert ( + (result.pr.loc[{"category": "3.C.7"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item() + ) # 2.E + 2.B = 2.E, 2.E should not be part of new data set assert np.isnan(result.pr.loc[{"category": "2.E"}].values).all() # cat 14638 in BURDI equals cat M.BIO in IPCC2006_PRIMAP @@ -119,3 +126,50 @@ def test_convert_BURDI(empty_ds: xr.Dataset): assert ( (result.pr.loc[{"category": "M.BIO"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item() ) + + +# test with new conversion and new categorisations +def test_simple__custom_conversion_and_categorisation(empty_ds): + # make categorisation A from yaml + categorisation_a = cc.from_yaml("data/simple_categorisation_a.yaml") + + # make categorisation B from yaml + categorisation_b = cc.from_yaml("data/simple_categorisation_b.yaml") + + # make conversion from csv + conv = conversions.ConversionSpec.from_csv("data/simple_conversion.csv") + # categories not part of climate categories so we need to add them manually + cats = { + "A": categorisation_a, + "B": categorisation_b, + } + conv = conv.hydrate(cats=cats) + + # make a dummy dataset based on A cats + da = empty_ds["CO2"] + da = da.expand_dims({"category (A)": list(categorisation_a.keys())}) + arr = da.data.copy() + arr[:] = 1 * primap2.ureg("Gg CO2 / year") + da.data = arr + + # convert to categorisation B + result = da.pr.convert( + "category", + categorization=conv, + custom_categorisation_a=categorisation_a, + custom_categorisation_b=categorisation_b, + sum_rule="extensive", + ) + + # category name includes B - the target categorisation + assert sorted(result.coords) == ["area (ISO3)", "category (B)", "source", "time"] + + # check 1 -> 1 + assert (result.pr.loc[{"category": "1"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item() + + # check 2 + 3 -> 2 + assert (result.pr.loc[{"category": "2"}] == 2.0 * primap2.ureg("Gg CO2 / year")).all().item() + + # check result has 2 categories (input categorisation had 3) + # TODO this is ambiguous when order changes + assert result.shape == (2, 21, 4, 1) From 868afbc3c0269dba64cd4894c08f1d8385fb7d45 Mon Sep 17 00:00:00 2001 From: Daniel Busch Date: Mon, 21 Oct 2024 12:50:24 +0200 Subject: [PATCH 22/36] refactor convert --- primap2/_convert.py | 302 ++++++++++++++++++---------------- primap2/tests/test_convert.py | 35 ++-- 2 files changed, 186 insertions(+), 151 deletions(-) diff --git a/primap2/_convert.py b/primap2/_convert.py index 05c90416..1284740a 100644 --- a/primap2/_convert.py +++ b/primap2/_convert.py @@ -11,21 +11,22 @@ from ._selection import alias_dims -class DataArrayConversionAccessor(_accessor_base.BaseDataArrayAccessor): - @alias_dims(["dim"]) - def convert( - self, - dim: Hashable | str, - # TODO naming - categorization: climate_categories.Categorization | str, - *, - custom_categorisation_a : climate_categories.Categorization | None = None, - custom_categorisation_b : climate_categories.Categorization | None = None, - sum_rule: typing.Literal["intensive", "extensive"] | None = None, - input_weights: xr.DataArray | None = None, - output_weights: xr.DataArray | None = None, - auxiliary_dimensions: dict[str, str] | None = None, - ) -> xr.DataArray: 
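The custom-categorisation test above pulls the new pieces together. A minimal sketch of the same workflow, assuming an `empty_ds` dataset like the test fixture and the YAML/CSV files added in this patch, and using the call signature from this patch (the argument names change again in the refactor that follows):

    import climate_categories as cc
    import climate_categories._conversions as conversions

    # load the two custom categorisations and the conversion between them
    cat_a = cc.from_yaml("data/simple_categorisation_a.yaml")
    cat_b = cc.from_yaml("data/simple_categorisation_b.yaml")
    conv = conversions.ConversionSpec.from_csv("data/simple_conversion.csv")
    conv = conv.hydrate(cats={"A": cat_a, "B": cat_b})

    # build a DataArray on the source categorisation and convert it to B
    da = empty_ds["CO2"].expand_dims({"category (A)": list(cat_a.keys())})
    result = da.pr.convert(
        "category",
        categorization=conv,
        custom_categorisation_a=cat_a,
        custom_categorisation_b=cat_b,
        sum_rule="extensive",
    )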
+class DataArrayConversionAccessor(_accessor_base.BaseDataArrayAccessor) : + + def convert_inner( + self, + dim: Hashable | str, + # TODO type will change to climate_categories.Conversion when + # https://github.com/primap-community/climate_categories/pull/164 is merged + *, + conversion: climate_categories._conversions.Conversion, + old_categorization: climate_categories.Categorization, + new_categorization: climate_categories.Categorization, + sum_rule: typing.Literal["intensive", "extensive"] | None = None, + input_weights: xr.DataArray | None = None, + output_weights: xr.DataArray | None = None, + auxiliary_dimensions: dict[str, str] | None = None, + ) -> xr.DataArray : """Convert the data along the given dimension into the new categorization. Maps the given dimension from one categorization (terminology) into another. @@ -35,14 +36,22 @@ def convert( Parameters ---------- - # TODO dim : str Dimension to convert. Has to be a dimension from ``da.dims``. - categorization : climate_categories.Categorization or str + # TODO type will change to climate_categories.Conversion when climate_categories/pull/164 is merged + conversion : climate_categories.Categorization or str or climate_categories._conversions.Conversion New categorization to convert the given dimension to. Either give the title of the new categorization (like ``IPCC1996``) or a ``climate_categories.Categorization`` object or a ``climate_categories._conversions.Conversion`` object. + custom_categorization_input + A custom categorization for the input data. Must be provided if conversion uses + input categorisation that is not in ``climate_categories``. + Overwrites categorisation in conversion if both are provided. + custom_categorization_output + A custom categorization for the output data. Must be provided if conversion uses + output categorisation that is not in ``climate_categories``. + Overwrites categorisation in conversion if both are provided. sum_rule : ``extensive``, ``intensive``, or None (default) If data of categories has to be summed up or divided, we need information whether the quantity measured is extensive (like, for example, total @@ -85,33 +94,13 @@ def convert( A copy of the DataArray with the given dimension converted in the new categorization. """ - dim_name, old_categorization_name = extract_categorization_from_dim(dim) - - # TODO find better logic for all this - if isinstance(categorization, (climate_categories.Categorization, str)): - new_categorization = ensure_categorization_instance(categorization) - old_categorization = ensure_categorization_instance(old_categorization_name) - conversion = old_categorization.conversion_to(new_categorization) - # TODO: Refactor or change variable name for categorization. Conversion is not really the same - elif isinstance(categorization, climate_categories._conversions.Conversion): - if custom_categorisation_a and custom_categorisation_b: - old_categorization = ensure_categorization_instance(custom_categorisation_a) - new_categorization = ensure_categorization_instance(custom_categorisation_b) - conversion = categorization - else: - new_categorization = ensure_categorization_instance( - categorization.categorization_b_name - ) - conversion = categorization - else: - raise ValueError( - f"categorization must be of instance climate_categories.Categorization " - f"or climate_categories._conversions.Conversion. 
Got {type(categorization)}" - ) check_valid_sum_rule_types(sum_rule) auxiliary_dimensions = prepare_auxiliary_dimensions(conversion, auxiliary_dimensions) + + # TODO maybe dim_name as argument from one level above + dim_name, old_categorization = extract_categorization_from_dim(dim) new_dim = f"{dim_name} ({new_categorization.name})" converted_da = initialize_empty_converted_da( @@ -126,8 +115,8 @@ def convert( # note: if you have multiple rules to fill a single category, we should # use something like fillna converted_categories = [] - for category in converted_da[new_dim]: - if category in converted_categories: + for category in converted_da[new_dim] : + if category in converted_categories : continue newly_converted_categories, converted_da = self._fill_category( da=converted_da, @@ -145,19 +134,65 @@ def convert( return converted_da + @alias_dims(["dim"]) + def convert(self, + dim, + new_categorization = None, + conversion = None, + sum_rule: typing.Literal["intensive", "extensive"] | None = None, + input_weights: xr.DataArray | None = None, + output_weights: xr.DataArray | None = None, + auxiliary_dimensions: dict[str, str] | None = None, + ): + + if (not new_categorization and not conversion): + raise ValueError( + "conversion or new_categorization must be provided." + ) + + # TODO clean up algorithm + if (conversion and not new_categorization): + old_categorization = conversion.categorization_a + new_categorization = conversion.categorization_b + elif (new_categorization and not conversion): + new_categorization = ensure_categorization_instance(new_categorization) + dim_name, old_categorization = extract_categorization_from_dim(dim) + old_categorization = ensure_categorization_instance(old_categorization) + conversion = old_categorization.conversion_to(new_categorization) + elif (new_categorization and conversion): + new_categorization = ensure_categorization_instance(new_categorization) + if new_categorization != conversion.categorization_b: + raise ValueError( + "New categorization is different to target categorisation in conversion." + ) + old_categorization = conversion.categorization_a + new_categorization = conversion.categorization_b + + + return self.convert_inner( + dim, + conversion = conversion, + old_categorization = old_categorization, + new_categorization = new_categorization, + sum_rule=sum_rule, + input_weights=input_weights, + output_weights=output_weights, + auxiliary_dimensions=auxiliary_dimensions, + ) + def _fill_category( - self, - da: xr.DataArray, - dim: str, - new_dim: str, - already_converted_categories: list[climate_categories.Category], - category: climate_categories.Category, - conversion: climate_categories.Conversion, - sum_rule: str | None, - auxiliary_dimensions: dict[climate_categories.Categorization, str] | None, - input_weights: xr.DataArray | None = None, - output_weights: xr.DataArray | None = None, - ) -> tuple[list[climate_categories.Category], xr.DataArray]: + self, + da: xr.DataArray, + dim: str, + new_dim: str, + already_converted_categories: list[climate_categories.Category], + category: climate_categories.Category, + conversion: climate_categories.Conversion, + sum_rule: str | None, + auxiliary_dimensions: dict[climate_categories.Categorization, str] | None, + input_weights: xr.DataArray | None = None, + output_weights: xr.DataArray | None = None, + ) -> tuple[list[climate_categories.Category], xr.DataArray] : """Return a copy of da with the given category filled by values converted using the given conversion. 
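At the usage level, the dispatch above gives two entry points. A minimal sketch, assuming `da` is a DataArray with a "category (IPCC1996)" dimension and a "source (gas)" dimension as in the tests, and `conv` is a hydrated climate_categories Conversion object:

    # 1) only the target categorization: the conversion is looked up in
    #    climate_categories, so both terminologies must be registered there
    result = da.pr.convert(
        dim="category",
        new_categorization="IPCC2006",
        sum_rule="extensive",
        auxiliary_dimensions={"gas": "source (gas)"},
    )

    # 2) only an explicit Conversion object: source and target categorizations
    #    are taken from conversion.categorization_a / categorization_b
    result = da.pr.convert(
        dim="category",
        conversion=conv,
        sum_rule="extensive",
    )

    # passing neither argument raises
    # ValueError("conversion or new_categorization must be provided.")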
@@ -191,13 +226,13 @@ def _fill_category( filled_categories, filled: list of climate_categories.category, xr.DataArray The categories that were filled and the new DataArray. """ - try: + try : rules = applicable_rules(conversion, category) - except KeyError: + except KeyError : logger.debug(f"No rule to derive data for {category!r}, will be NaN.") return [], da - for rule in rules: + for rule in rules : logger.debug(f"Processing rule {rule}.") # iterate until a non-restricted rule was applied or all rules are # exhausted @@ -222,7 +257,7 @@ def _fill_category( already_converted = set(output_selection[new_dim]).intersection( set(already_converted_categories) ) - if already_converted: + if already_converted : logger.warning( f"For category {category!r}, would want to use a " "rule with multiple outputs, but the following outputs " @@ -231,7 +266,7 @@ def _fill_category( ) continue - try: + try : effective_input_weights = derive_weights( dim=dim, category=category, @@ -250,7 +285,7 @@ def _fill_category( sum_rule=sum_rule, weights=output_weights, ) - except WeightingInfoMissing as err: + except WeightingInfoMissing as err : logger.warning(str(err)) continue @@ -263,7 +298,7 @@ def _fill_category( da.loc[output_selection] = rhs - if not rule.is_restricted: + if not rule.is_restricted : # stop processing rules for this category return output_selection[new_dim], da @@ -273,8 +308,7 @@ def _fill_category( ) return [], da - -def extract_categorization_from_dim(dim: str) -> (str, str): +def extract_categorization_from_dim(dim: str) -> (str, str) : """Extract the pure dimension and the categorization from a composite dim. Parameters @@ -299,51 +333,47 @@ def extract_categorization_from_dim(dim: str) -> (str, str): The pure_dim without categorization information and the categorization. If the input dim does not contain categorization information, a ValueError is raised. """ - try: + try : pure, cat = dim.split("(", 1) - except ValueError: + except ValueError : raise ValueError(f"No categorization specified: {dim!r}.") from None return pure[:-1], cat[:-1] - -def applicable_rules(conversion, category) -> list[climate_categories.ConversionRule]: +def applicable_rules(conversion, category) -> list[climate_categories.ConversionRule] : """Find the possible rules to derive the category using the given conversion.""" rules = conversion.relevant_rules({conversion.categorization_b[category]}) # a + b = c - d can not be used to derive c nor d, only a and b rules = [r for r in rules if all(f > 0 for f in r.factors_categories_b.values())] - if not rules: + if not rules : raise KeyError(category) return rules - def ensure_categorization_instance( - cat: str | climate_categories.Categorization, -) -> climate_categories.Categorization: + cat: str | climate_categories.Categorization, +) -> climate_categories.Categorization : """Takes a categorization name or object and returns the corresponding categorization object.""" - if isinstance(cat, climate_categories.Categorization): + if isinstance(cat, climate_categories.Categorization) : return cat return climate_categories.cats[cat] - -def check_valid_sum_rule_types(sum_rule: str | None): +def check_valid_sum_rule_types(sum_rule: str | None) : """Checks if the sum_rule is either "intensive", "extensive", or None. 
Raises a ValueError if an invalid sum_rule is used.""" - if sum_rule not in (None, "extensive", "intensive"): + if sum_rule not in (None, "extensive", "intensive") : raise ValueError( f"if defined, sum_rule must be either 'extensive' or 'intensive', not" f" {sum_rule}" ) - def initialize_empty_converted_da( - *, - old_da: xr.DataArray, - old_dim: Hashable | str, - new_dim: str, - new_categorization: climate_categories.Categorization, -) -> xr.DataArray: + *, + old_da: xr.DataArray, + old_dim: Hashable | str, + new_dim: str, + new_categorization: climate_categories.Categorization, +) -> xr.DataArray : """Build a DataArray which can hold the data after conversion to a new categorization. @@ -372,19 +402,19 @@ def initialize_empty_converted_da( """ new_dims = [] new_shape = [] - for i, idim in enumerate(old_da.dims): - if idim == old_dim: + for i, idim in enumerate(old_da.dims) : + if idim == old_dim : new_dims.append(new_dim) new_shape.append(len(new_categorization)) - else: + else : new_dims.append(idim) new_shape.append(old_da.shape[i]) new_coords = {} - for coord in old_da.coords: - if coord == old_dim: + for coord in old_da.coords : + if coord == old_dim : new_coords[new_dim] = np.array(list(new_categorization.keys())) - elif old_dim in old_da.coords[coord].dims: + elif old_dim in old_da.coords[coord].dims : # The additional coordinate has the old_dim as one dimension, but we # won't be able to convert it logger.info( @@ -392,15 +422,15 @@ def initialize_empty_converted_da( f" and is skipped." ) continue - else: + else : new_coords[coord] = old_da.coords[coord] new_attrs = copy.deepcopy(old_da.attrs) - for pdim in ("area", "cat", "scen"): - if pdim in new_attrs and new_attrs[pdim] == old_dim: + for pdim in ("area", "cat", "scen") : + if pdim in new_attrs and new_attrs[pdim] == old_dim : new_attrs[pdim] = new_dim - if "sec cats" in new_attrs and old_dim in new_attrs["sec_cats"]: + if "sec cats" in new_attrs and old_dim in new_attrs["sec_cats"] : new_attrs["sec_cats"].remove(old_dim) new_attrs["sec_cats"].append(new_dim) @@ -416,14 +446,13 @@ def initialize_empty_converted_da( attrs=new_attrs, ) - def factors_categories_to_xarray( - *, - dim: str, - factors_categories: dict[climate_categories.Category, int], - auxiliary_categories: dict[climate_categories.Categorization, set[climate_categories.Category]], - auxiliary_dimensions: dict[climate_categories.Categorization, str], -) -> tuple[dict[str, list[str]], xr.DataArray]: + *, + dim: str, + factors_categories: dict[climate_categories.Category, int], + auxiliary_categories: dict[climate_categories.Categorization, set[climate_categories.Category]], + auxiliary_dimensions: dict[climate_categories.Categorization, str], +) -> tuple[dict[str, list[str]], xr.DataArray] : """Convert dictionary mapping categories to factors into xarray-compatible objects. Using the xarray objects ensures that in subsequent calculations, everything @@ -451,47 +480,45 @@ def factors_categories_to_xarray( factors is an xarray DataArray which can be multiplied with an xarray object after applying the selection. 
""" - selection = {dim: [cat.codes[0] for cat in factors_categories.keys()]} + selection = {dim : [cat.codes[0] for cat in factors_categories.keys()]} factors = xr.DataArray( data=list(factors_categories.values()), dims=[dim], coords=selection, ) - for aux_categorization, aux_categories in auxiliary_categories.items(): - if aux_categories: + for aux_categorization, aux_categories in auxiliary_categories.items() : + if aux_categories : aux_dim = auxiliary_dimensions[aux_categorization] selection[aux_dim] = [cat.codes[0] for cat in aux_categories] return selection, factors - -class WeightingInfoMissing(ValueError): +class WeightingInfoMissing(ValueError) : """Some information to derive weighting factors for a rule is missing.""" def __init__( - self, - category: climate_categories.Category, - rule: climate_categories.ConversionRule, - message: str, - ): + self, + category: climate_categories.Category, + rule: climate_categories.ConversionRule, + message: str, + ) : full_message = ( f"Can not derive data for category {category!r} using rule" f" '{rule}': {message} Skipping this rule." ) ValueError.__init__(self, full_message) - def derive_weights( - *, - dim: str, - category: climate_categories.Category, - rule: climate_categories.ConversionRule, - sum_rule: str | None, - operation_type: str, - weights: xr.DataArray | None, - selection: dict[str, list[str]], -) -> xr.DataArray | float: + *, + dim: str, + category: climate_categories.Category, + rule: climate_categories.ConversionRule, + sum_rule: str | None, + operation_type: str, + weights: xr.DataArray | None, + selection: dict[str, list[str]], +) -> xr.DataArray | float : """Derive the weights to use for applying a specific rule. Parameters @@ -523,28 +550,28 @@ def derive_weights( Object which can be multiplied with the input or output DataArray to apply weights. """ - if operation_type == "input": + if operation_type == "input" : operation_verb = "sum up" trivial_sum_rule = "extensive" nontrivial_sum_rule = "intensive" rule_cardinality = rule.cardinality_a - else: + else : operation_verb = "split" trivial_sum_rule = "intensive" nontrivial_sum_rule = "extensive" rule_cardinality = rule.cardinality_b # just one category or trivial sum rule, so no weights required - if rule_cardinality == "one" or sum_rule == trivial_sum_rule: + if rule_cardinality == "one" or sum_rule == trivial_sum_rule : return 1.0 - if sum_rule == nontrivial_sum_rule: - if weights is None: + if sum_rule == nontrivial_sum_rule : + if weights is None : raise WeightingInfoMissing( category=category, rule=rule, message=f"We need to {operation_verb} multiple categories with" - f" sum_rule={nontrivial_sum_rule}, but no {operation_type}_weights are" - f" specified.", + f" sum_rule={nontrivial_sum_rule}, but no {operation_type}_weights are" + f" specified.", ) effective_weights = weights.loc[selection] # normalize so it is actually a weight, not a factor @@ -554,16 +581,15 @@ def derive_weights( category=category, rule=rule, message=f"We need to {operation_verb} multiple categories, but the sum_rule is" - f" not specified. Rule can only be used if sum_rule={trivial_sum_rule!r} or" - f" sum_rule={nontrivial_sum_rule} and {operation_type}_weights are" - f" specified.", + f" not specified. 
Rule can only be used if sum_rule={trivial_sum_rule!r} or" + f" sum_rule={nontrivial_sum_rule} and {operation_type}_weights are" + f" specified.", ) - def prepare_auxiliary_dimensions( - conversion: climate_categories.Conversion, - auxiliary_dimensions: dict[str, str] | None, -) -> dict[climate_categories.Categorization, str] | None: + conversion: climate_categories.Conversion, + auxiliary_dimensions: dict[str, str] | None, +) -> dict[climate_categories.Categorization, str] | None : """Prepare and check the auxiliary dimension mapping. Check if all auxiliary categorizations used in the conversion are matched in @@ -577,8 +603,8 @@ def prepare_auxiliary_dimensions( the auxiliary dimensions, but using Categorization objects instead of their names. """ - if conversion.auxiliary_categorizations_names: - if auxiliary_dimensions is None: + if conversion.auxiliary_categorizations_names : + if auxiliary_dimensions is None : raise ValueError( "The conversion uses auxiliary categories, but a translation to" " dimension names was not provided using the argument" @@ -589,16 +615,16 @@ def prepare_auxiliary_dimensions( missing = set(conversion.auxiliary_categorizations_names).difference( auxiliary_dimensions.keys() ) - if missing: + if missing : raise ValueError( "A dimension name was not given for all auxiliary categories:" f" {missing} are missing in the auxiliary_dimensions argument, please" " provide translations to the dimension names used in the data." ) - if not auxiliary_dimensions: + if not auxiliary_dimensions : return auxiliary_dimensions return { - climate_categories.cats[name]: auxiliary_dimensions[name] for name in auxiliary_dimensions + climate_categories.cats[name] : auxiliary_dimensions[name] for name in auxiliary_dimensions } diff --git a/primap2/tests/test_convert.py b/primap2/tests/test_convert.py index 30644ac3..f2800172 100644 --- a/primap2/tests/test_convert.py +++ b/primap2/tests/test_convert.py @@ -22,12 +22,18 @@ def test_convert_ipcc(empty_ds: xr.Dataset): arr[:] = 1 * primap2.ureg("Gg CO2 / year") da.data = arr + new_categorization_name = "IPCC2006" + with pytest.raises(ValueError, match="The conversion uses auxiliary categories"): - da.pr.convert("category", "IPCC2006", sum_rule="extensive") + da.pr.convert( + dim="category", + new_categorization=new_categorization_name, + sum_rule="extensive", + ) result = da.pr.convert( - "category", - "IPCC2006", + dim="category", + new_categorization=new_categorization_name, sum_rule="extensive", auxiliary_dimensions={"gas": "source (gas)"}, ) @@ -98,8 +104,8 @@ def test_convert_BURDI(empty_ds: xr.Dataset): da.data = arr result = da.pr.convert( - "category", - conv, + dim="category", + conversion=conv, sum_rule="extensive", auxiliary_dimensions={"gas": "source (gas)"}, ) @@ -123,13 +129,17 @@ def test_convert_BURDI(empty_ds: xr.Dataset): assert np.isnan(result.pr.loc[{"category": "2.E"}].values).all() # cat 14638 in BURDI equals cat M.BIO in IPCC2006_PRIMAP # TODO: This will fail. 
M.BIO is currently not listed in climate categories - assert ( - (result.pr.loc[{"category": "M.BIO"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item() - ) + # assert ( + # (result.pr.loc[{"category": "M.BIO"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item() + # ) + + +# def test_with_custom_conversion_and_one_custom_categorisation(empty_ds): +# assert False # test with new conversion and new categorisations -def test_simple__custom_conversion_and_categorisation(empty_ds): +def test_custom_conversion_and_two_custom_categorisations(empty_ds): # make categorisation A from yaml categorisation_a = cc.from_yaml("data/simple_categorisation_a.yaml") @@ -154,10 +164,9 @@ def test_simple__custom_conversion_and_categorisation(empty_ds): # convert to categorisation B result = da.pr.convert( - "category", - categorization=conv, - custom_categorisation_a=categorisation_a, - custom_categorisation_b=categorisation_b, + dim="category", + conversion=conv, + new_categorization=categorisation_b, sum_rule="extensive", ) From 4ea5398ca7f6f9e2e750302fc510475eba9a021d Mon Sep 17 00:00:00 2001 From: Daniel Busch Date: Mon, 21 Oct 2024 12:52:14 +0200 Subject: [PATCH 23/36] ruff --- primap2/_convert.py | 255 ++++++++++++++++++++++---------------------- 1 file changed, 130 insertions(+), 125 deletions(-) diff --git a/primap2/_convert.py b/primap2/_convert.py index 1284740a..c4fa9faf 100644 --- a/primap2/_convert.py +++ b/primap2/_convert.py @@ -11,22 +11,21 @@ from ._selection import alias_dims -class DataArrayConversionAccessor(_accessor_base.BaseDataArrayAccessor) : - +class DataArrayConversionAccessor(_accessor_base.BaseDataArrayAccessor): def convert_inner( - self, - dim: Hashable | str, - # TODO type will change to climate_categories.Conversion when - # https://github.com/primap-community/climate_categories/pull/164 is merged - *, - conversion: climate_categories._conversions.Conversion, - old_categorization: climate_categories.Categorization, - new_categorization: climate_categories.Categorization, - sum_rule: typing.Literal["intensive", "extensive"] | None = None, - input_weights: xr.DataArray | None = None, - output_weights: xr.DataArray | None = None, - auxiliary_dimensions: dict[str, str] | None = None, - ) -> xr.DataArray : + self, + dim: Hashable | str, + # TODO type will change to climate_categories.Conversion when + # https://github.com/primap-community/climate_categories/pull/164 is merged + *, + conversion: climate_categories._conversions.Conversion, + old_categorization: climate_categories.Categorization, + new_categorization: climate_categories.Categorization, + sum_rule: typing.Literal["intensive", "extensive"] | None = None, + input_weights: xr.DataArray | None = None, + output_weights: xr.DataArray | None = None, + auxiliary_dimensions: dict[str, str] | None = None, + ) -> xr.DataArray: """Convert the data along the given dimension into the new categorization. Maps the given dimension from one categorization (terminology) into another. 
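The sum_rule and weights arguments exercised in these tests interact as implemented in derive_weights: summing source categories of extensive data needs no weights, while intensive data needs input_weights (and, symmetrically, splitting extensive data over several target categories needs output_weights). A minimal sketch; population_weights is a hypothetical DataArray aligned on the source category dimension, not defined anywhere in this patch series:

    # extensive quantity (e.g. total emissions): summing input categories is
    # the trivial direction, no weights required
    converted = da.pr.convert(
        dim="category",
        new_categorization="IPCC2006",
        sum_rule="extensive",
        auxiliary_dimensions={"gas": "source (gas)"},
    )

    # intensive quantity (e.g. per-capita emissions): summing input categories
    # needs input_weights, otherwise the rule is skipped with a warning
    converted = da.pr.convert(
        dim="category",
        new_categorization="IPCC2006",
        sum_rule="intensive",
        input_weights=population_weights,  # hypothetical weights DataArray
        auxiliary_dimensions={"gas": "source (gas)"},
    )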
@@ -115,8 +114,8 @@ def convert_inner( # note: if you have multiple rules to fill a single category, we should # use something like fillna converted_categories = [] - for category in converted_da[new_dim] : - if category in converted_categories : + for category in converted_da[new_dim]: + if category in converted_categories: continue newly_converted_categories, converted_da = self._fill_category( da=converted_da, @@ -135,31 +134,29 @@ def convert_inner( return converted_da @alias_dims(["dim"]) - def convert(self, - dim, - new_categorization = None, - conversion = None, - sum_rule: typing.Literal["intensive", "extensive"] | None = None, - input_weights: xr.DataArray | None = None, - output_weights: xr.DataArray | None = None, - auxiliary_dimensions: dict[str, str] | None = None, - ): - - if (not new_categorization and not conversion): - raise ValueError( - "conversion or new_categorization must be provided." - ) + def convert( + self, + dim, + new_categorization=None, + conversion=None, + sum_rule: typing.Literal["intensive", "extensive"] | None = None, + input_weights: xr.DataArray | None = None, + output_weights: xr.DataArray | None = None, + auxiliary_dimensions: dict[str, str] | None = None, + ): + if not new_categorization and not conversion: + raise ValueError("conversion or new_categorization must be provided.") # TODO clean up algorithm - if (conversion and not new_categorization): + if conversion and not new_categorization: old_categorization = conversion.categorization_a new_categorization = conversion.categorization_b - elif (new_categorization and not conversion): + elif new_categorization and not conversion: new_categorization = ensure_categorization_instance(new_categorization) dim_name, old_categorization = extract_categorization_from_dim(dim) old_categorization = ensure_categorization_instance(old_categorization) conversion = old_categorization.conversion_to(new_categorization) - elif (new_categorization and conversion): + elif new_categorization and conversion: new_categorization = ensure_categorization_instance(new_categorization) if new_categorization != conversion.categorization_b: raise ValueError( @@ -168,12 +165,11 @@ def convert(self, old_categorization = conversion.categorization_a new_categorization = conversion.categorization_b - return self.convert_inner( dim, - conversion = conversion, - old_categorization = old_categorization, - new_categorization = new_categorization, + conversion=conversion, + old_categorization=old_categorization, + new_categorization=new_categorization, sum_rule=sum_rule, input_weights=input_weights, output_weights=output_weights, @@ -181,18 +177,18 @@ def convert(self, ) def _fill_category( - self, - da: xr.DataArray, - dim: str, - new_dim: str, - already_converted_categories: list[climate_categories.Category], - category: climate_categories.Category, - conversion: climate_categories.Conversion, - sum_rule: str | None, - auxiliary_dimensions: dict[climate_categories.Categorization, str] | None, - input_weights: xr.DataArray | None = None, - output_weights: xr.DataArray | None = None, - ) -> tuple[list[climate_categories.Category], xr.DataArray] : + self, + da: xr.DataArray, + dim: str, + new_dim: str, + already_converted_categories: list[climate_categories.Category], + category: climate_categories.Category, + conversion: climate_categories.Conversion, + sum_rule: str | None, + auxiliary_dimensions: dict[climate_categories.Categorization, str] | None, + input_weights: xr.DataArray | None = None, + output_weights: xr.DataArray | None = None, 
+ ) -> tuple[list[climate_categories.Category], xr.DataArray]: """Return a copy of da with the given category filled by values converted using the given conversion. @@ -226,13 +222,13 @@ def _fill_category( filled_categories, filled: list of climate_categories.category, xr.DataArray The categories that were filled and the new DataArray. """ - try : + try: rules = applicable_rules(conversion, category) - except KeyError : + except KeyError: logger.debug(f"No rule to derive data for {category!r}, will be NaN.") return [], da - for rule in rules : + for rule in rules: logger.debug(f"Processing rule {rule}.") # iterate until a non-restricted rule was applied or all rules are # exhausted @@ -257,7 +253,7 @@ def _fill_category( already_converted = set(output_selection[new_dim]).intersection( set(already_converted_categories) ) - if already_converted : + if already_converted: logger.warning( f"For category {category!r}, would want to use a " "rule with multiple outputs, but the following outputs " @@ -266,7 +262,7 @@ def _fill_category( ) continue - try : + try: effective_input_weights = derive_weights( dim=dim, category=category, @@ -285,7 +281,7 @@ def _fill_category( sum_rule=sum_rule, weights=output_weights, ) - except WeightingInfoMissing as err : + except WeightingInfoMissing as err: logger.warning(str(err)) continue @@ -298,7 +294,7 @@ def _fill_category( da.loc[output_selection] = rhs - if not rule.is_restricted : + if not rule.is_restricted: # stop processing rules for this category return output_selection[new_dim], da @@ -308,7 +304,8 @@ def _fill_category( ) return [], da -def extract_categorization_from_dim(dim: str) -> (str, str) : + +def extract_categorization_from_dim(dim: str) -> (str, str): """Extract the pure dimension and the categorization from a composite dim. Parameters @@ -333,47 +330,51 @@ def extract_categorization_from_dim(dim: str) -> (str, str) : The pure_dim without categorization information and the categorization. If the input dim does not contain categorization information, a ValueError is raised. """ - try : + try: pure, cat = dim.split("(", 1) - except ValueError : + except ValueError: raise ValueError(f"No categorization specified: {dim!r}.") from None return pure[:-1], cat[:-1] -def applicable_rules(conversion, category) -> list[climate_categories.ConversionRule] : + +def applicable_rules(conversion, category) -> list[climate_categories.ConversionRule]: """Find the possible rules to derive the category using the given conversion.""" rules = conversion.relevant_rules({conversion.categorization_b[category]}) # a + b = c - d can not be used to derive c nor d, only a and b rules = [r for r in rules if all(f > 0 for f in r.factors_categories_b.values())] - if not rules : + if not rules: raise KeyError(category) return rules + def ensure_categorization_instance( - cat: str | climate_categories.Categorization, -) -> climate_categories.Categorization : + cat: str | climate_categories.Categorization, +) -> climate_categories.Categorization: """Takes a categorization name or object and returns the corresponding categorization object.""" - if isinstance(cat, climate_categories.Categorization) : + if isinstance(cat, climate_categories.Categorization): return cat return climate_categories.cats[cat] -def check_valid_sum_rule_types(sum_rule: str | None) : + +def check_valid_sum_rule_types(sum_rule: str | None): """Checks if the sum_rule is either "intensive", "extensive", or None. 
Raises a ValueError if an invalid sum_rule is used.""" - if sum_rule not in (None, "extensive", "intensive") : + if sum_rule not in (None, "extensive", "intensive"): raise ValueError( f"if defined, sum_rule must be either 'extensive' or 'intensive', not" f" {sum_rule}" ) + def initialize_empty_converted_da( - *, - old_da: xr.DataArray, - old_dim: Hashable | str, - new_dim: str, - new_categorization: climate_categories.Categorization, -) -> xr.DataArray : + *, + old_da: xr.DataArray, + old_dim: Hashable | str, + new_dim: str, + new_categorization: climate_categories.Categorization, +) -> xr.DataArray: """Build a DataArray which can hold the data after conversion to a new categorization. @@ -402,19 +403,19 @@ def initialize_empty_converted_da( """ new_dims = [] new_shape = [] - for i, idim in enumerate(old_da.dims) : - if idim == old_dim : + for i, idim in enumerate(old_da.dims): + if idim == old_dim: new_dims.append(new_dim) new_shape.append(len(new_categorization)) - else : + else: new_dims.append(idim) new_shape.append(old_da.shape[i]) new_coords = {} - for coord in old_da.coords : - if coord == old_dim : + for coord in old_da.coords: + if coord == old_dim: new_coords[new_dim] = np.array(list(new_categorization.keys())) - elif old_dim in old_da.coords[coord].dims : + elif old_dim in old_da.coords[coord].dims: # The additional coordinate has the old_dim as one dimension, but we # won't be able to convert it logger.info( @@ -422,15 +423,15 @@ def initialize_empty_converted_da( f" and is skipped." ) continue - else : + else: new_coords[coord] = old_da.coords[coord] new_attrs = copy.deepcopy(old_da.attrs) - for pdim in ("area", "cat", "scen") : - if pdim in new_attrs and new_attrs[pdim] == old_dim : + for pdim in ("area", "cat", "scen"): + if pdim in new_attrs and new_attrs[pdim] == old_dim: new_attrs[pdim] = new_dim - if "sec cats" in new_attrs and old_dim in new_attrs["sec_cats"] : + if "sec cats" in new_attrs and old_dim in new_attrs["sec_cats"]: new_attrs["sec_cats"].remove(old_dim) new_attrs["sec_cats"].append(new_dim) @@ -446,13 +447,14 @@ def initialize_empty_converted_da( attrs=new_attrs, ) + def factors_categories_to_xarray( - *, - dim: str, - factors_categories: dict[climate_categories.Category, int], - auxiliary_categories: dict[climate_categories.Categorization, set[climate_categories.Category]], - auxiliary_dimensions: dict[climate_categories.Categorization, str], -) -> tuple[dict[str, list[str]], xr.DataArray] : + *, + dim: str, + factors_categories: dict[climate_categories.Category, int], + auxiliary_categories: dict[climate_categories.Categorization, set[climate_categories.Category]], + auxiliary_dimensions: dict[climate_categories.Categorization, str], +) -> tuple[dict[str, list[str]], xr.DataArray]: """Convert dictionary mapping categories to factors into xarray-compatible objects. Using the xarray objects ensures that in subsequent calculations, everything @@ -480,45 +482,47 @@ def factors_categories_to_xarray( factors is an xarray DataArray which can be multiplied with an xarray object after applying the selection. 
""" - selection = {dim : [cat.codes[0] for cat in factors_categories.keys()]} + selection = {dim: [cat.codes[0] for cat in factors_categories.keys()]} factors = xr.DataArray( data=list(factors_categories.values()), dims=[dim], coords=selection, ) - for aux_categorization, aux_categories in auxiliary_categories.items() : - if aux_categories : + for aux_categorization, aux_categories in auxiliary_categories.items(): + if aux_categories: aux_dim = auxiliary_dimensions[aux_categorization] selection[aux_dim] = [cat.codes[0] for cat in aux_categories] return selection, factors -class WeightingInfoMissing(ValueError) : + +class WeightingInfoMissing(ValueError): """Some information to derive weighting factors for a rule is missing.""" def __init__( - self, - category: climate_categories.Category, - rule: climate_categories.ConversionRule, - message: str, - ) : + self, + category: climate_categories.Category, + rule: climate_categories.ConversionRule, + message: str, + ): full_message = ( f"Can not derive data for category {category!r} using rule" f" '{rule}': {message} Skipping this rule." ) ValueError.__init__(self, full_message) + def derive_weights( - *, - dim: str, - category: climate_categories.Category, - rule: climate_categories.ConversionRule, - sum_rule: str | None, - operation_type: str, - weights: xr.DataArray | None, - selection: dict[str, list[str]], -) -> xr.DataArray | float : + *, + dim: str, + category: climate_categories.Category, + rule: climate_categories.ConversionRule, + sum_rule: str | None, + operation_type: str, + weights: xr.DataArray | None, + selection: dict[str, list[str]], +) -> xr.DataArray | float: """Derive the weights to use for applying a specific rule. Parameters @@ -550,28 +554,28 @@ def derive_weights( Object which can be multiplied with the input or output DataArray to apply weights. """ - if operation_type == "input" : + if operation_type == "input": operation_verb = "sum up" trivial_sum_rule = "extensive" nontrivial_sum_rule = "intensive" rule_cardinality = rule.cardinality_a - else : + else: operation_verb = "split" trivial_sum_rule = "intensive" nontrivial_sum_rule = "extensive" rule_cardinality = rule.cardinality_b # just one category or trivial sum rule, so no weights required - if rule_cardinality == "one" or sum_rule == trivial_sum_rule : + if rule_cardinality == "one" or sum_rule == trivial_sum_rule: return 1.0 - if sum_rule == nontrivial_sum_rule : - if weights is None : + if sum_rule == nontrivial_sum_rule: + if weights is None: raise WeightingInfoMissing( category=category, rule=rule, message=f"We need to {operation_verb} multiple categories with" - f" sum_rule={nontrivial_sum_rule}, but no {operation_type}_weights are" - f" specified.", + f" sum_rule={nontrivial_sum_rule}, but no {operation_type}_weights are" + f" specified.", ) effective_weights = weights.loc[selection] # normalize so it is actually a weight, not a factor @@ -581,15 +585,16 @@ def derive_weights( category=category, rule=rule, message=f"We need to {operation_verb} multiple categories, but the sum_rule is" - f" not specified. Rule can only be used if sum_rule={trivial_sum_rule!r} or" - f" sum_rule={nontrivial_sum_rule} and {operation_type}_weights are" - f" specified.", + f" not specified. 
Rule can only be used if sum_rule={trivial_sum_rule!r} or" + f" sum_rule={nontrivial_sum_rule} and {operation_type}_weights are" + f" specified.", ) + def prepare_auxiliary_dimensions( - conversion: climate_categories.Conversion, - auxiliary_dimensions: dict[str, str] | None, -) -> dict[climate_categories.Categorization, str] | None : + conversion: climate_categories.Conversion, + auxiliary_dimensions: dict[str, str] | None, +) -> dict[climate_categories.Categorization, str] | None: """Prepare and check the auxiliary dimension mapping. Check if all auxiliary categorizations used in the conversion are matched in @@ -603,8 +608,8 @@ def prepare_auxiliary_dimensions( the auxiliary dimensions, but using Categorization objects instead of their names. """ - if conversion.auxiliary_categorizations_names : - if auxiliary_dimensions is None : + if conversion.auxiliary_categorizations_names: + if auxiliary_dimensions is None: raise ValueError( "The conversion uses auxiliary categories, but a translation to" " dimension names was not provided using the argument" @@ -615,16 +620,16 @@ def prepare_auxiliary_dimensions( missing = set(conversion.auxiliary_categorizations_names).difference( auxiliary_dimensions.keys() ) - if missing : + if missing: raise ValueError( "A dimension name was not given for all auxiliary categories:" f" {missing} are missing in the auxiliary_dimensions argument, please" " provide translations to the dimension names used in the data." ) - if not auxiliary_dimensions : + if not auxiliary_dimensions: return auxiliary_dimensions return { - climate_categories.cats[name] : auxiliary_dimensions[name] for name in auxiliary_dimensions + climate_categories.cats[name]: auxiliary_dimensions[name] for name in auxiliary_dimensions } From 44c2caef61f893797ac0ae2ffe9943e26d9458fb Mon Sep 17 00:00:00 2001 From: Daniel Busch Date: Mon, 21 Oct 2024 14:56:13 +0200 Subject: [PATCH 24/36] clean up --- primap2/_convert.py | 143 ++++++++++++++++++++++---------------------- 1 file changed, 72 insertions(+), 71 deletions(-) diff --git a/primap2/_convert.py b/primap2/_convert.py index c4fa9faf..e2b16a79 100644 --- a/primap2/_convert.py +++ b/primap2/_convert.py @@ -19,12 +19,65 @@ def convert_inner( # https://github.com/primap-community/climate_categories/pull/164 is merged *, conversion: climate_categories._conversions.Conversion, - old_categorization: climate_categories.Categorization, new_categorization: climate_categories.Categorization, sum_rule: typing.Literal["intensive", "extensive"] | None = None, input_weights: xr.DataArray | None = None, output_weights: xr.DataArray | None = None, auxiliary_dimensions: dict[str, str] | None = None, + ) -> xr.DataArray: + """ + See docstring of `convert` for details on arguments and behavior. 
+ """ + + check_valid_sum_rule_types(sum_rule) + + auxiliary_dimensions = prepare_auxiliary_dimensions(conversion, auxiliary_dimensions) + + # TODO maybe dim_name as argument from one level above + dim_name, old_categorization = extract_categorization_from_dim(dim) + new_dim = f"{dim_name} ({new_categorization.name})" + + converted_da = initialize_empty_converted_da( + old_da=self._da, + old_dim=dim, + new_dim=new_dim, + new_categorization=new_categorization, + ) + + # idea: convert 1-to-1 mappings first, should be easy in a single xarray + # operation + # note: if you have multiple rules to fill a single category, we should + # use something like fillna + converted_categories = [] + for category in converted_da[new_dim]: + if category in converted_categories: + continue + newly_converted_categories, converted_da = self._fill_category( + da=converted_da, + dim=dim, + new_dim=new_dim, + already_converted_categories=converted_categories, + category=category.item(), + conversion=conversion, + sum_rule=sum_rule, + auxiliary_dimensions=auxiliary_dimensions, + input_weights=input_weights, + output_weights=output_weights, + ) + converted_categories += newly_converted_categories + + return converted_da + + @alias_dims(["dim"]) + def convert( + self, + dim: Hashable | str, + conversion: climate_categories._conversions.Conversion = None, + new_categorization: climate_categories.Categorization = None, + sum_rule: typing.Literal["intensive", "extensive"] | None = None, + input_weights: xr.DataArray | None = None, + output_weights: xr.DataArray | None = None, + auxiliary_dimensions: dict[str, str] | None = None, ) -> xr.DataArray: """Convert the data along the given dimension into the new categorization. @@ -37,20 +90,15 @@ def convert_inner( ---------- dim : str Dimension to convert. Has to be a dimension from ``da.dims``. - # TODO type will change to climate_categories.Conversion when climate_categories/pull/164 is merged - conversion : climate_categories.Categorization or str or climate_categories._conversions.Conversion - New categorization to convert the given dimension to. Either give the title - of the new categorization (like ``IPCC1996``) or a - ``climate_categories.Categorization`` object or a - ``climate_categories._conversions.Conversion`` object. - custom_categorization_input - A custom categorization for the input data. Must be provided if conversion uses - input categorisation that is not in ``climate_categories``. - Overwrites categorisation in conversion if both are provided. - custom_categorization_output - A custom categorization for the output data. Must be provided if conversion uses - output categorisation that is not in ``climate_categories``. - Overwrites categorisation in conversion if both are provided. + conversion : climate_categories.Conversion + The conversion rules that describe the conversion from the old to the new categorization. + Contains ``climate_categories.Categorization`` object for old and new categorisation. + new_categorization: str + New categorization to convert the given dimension to. If the categorization + is part of climate categories the title of the new categorization (like ``IPCC1996``) + will work. A ``climate_categories.Categorization`` object can be used regardless + of wether it is part of climate_categories. When providing just the new categorization, + the old categorization as well as the conversion must be part of climate_categories. 
sum_rule : ``extensive``, ``intensive``, or None (default) If data of categories has to be summed up or divided, we need information whether the quantity measured is extensive (like, for example, total @@ -94,69 +142,20 @@ def convert_inner( categorization. """ - check_valid_sum_rule_types(sum_rule) - - auxiliary_dimensions = prepare_auxiliary_dimensions(conversion, auxiliary_dimensions) - - # TODO maybe dim_name as argument from one level above - dim_name, old_categorization = extract_categorization_from_dim(dim) - new_dim = f"{dim_name} ({new_categorization.name})" - - converted_da = initialize_empty_converted_da( - old_da=self._da, - old_dim=dim, - new_dim=new_dim, - new_categorization=new_categorization, - ) - - # idea: convert 1-to-1 mappings first, should be easy in a single xarray - # operation - # note: if you have multiple rules to fill a single category, we should - # use something like fillna - converted_categories = [] - for category in converted_da[new_dim]: - if category in converted_categories: - continue - newly_converted_categories, converted_da = self._fill_category( - da=converted_da, - dim=dim, - new_dim=new_dim, - already_converted_categories=converted_categories, - category=category.item(), - conversion=conversion, - sum_rule=sum_rule, - auxiliary_dimensions=auxiliary_dimensions, - input_weights=input_weights, - output_weights=output_weights, - ) - converted_categories += newly_converted_categories - - return converted_da - - @alias_dims(["dim"]) - def convert( - self, - dim, - new_categorization=None, - conversion=None, - sum_rule: typing.Literal["intensive", "extensive"] | None = None, - input_weights: xr.DataArray | None = None, - output_weights: xr.DataArray | None = None, - auxiliary_dimensions: dict[str, str] | None = None, - ): - if not new_categorization and not conversion: - raise ValueError("conversion or new_categorization must be provided.") - - # TODO clean up algorithm + # user provides only conversion if conversion and not new_categorization: old_categorization = conversion.categorization_a new_categorization = conversion.categorization_b + # user provides only new categorisation elif new_categorization and not conversion: new_categorization = ensure_categorization_instance(new_categorization) dim_name, old_categorization = extract_categorization_from_dim(dim) old_categorization = ensure_categorization_instance(old_categorization) conversion = old_categorization.conversion_to(new_categorization) - elif new_categorization and conversion: + # user provides conversion AND new categorisation + # TODO: There is no additional value in providing both because all + # the information is already in conversion. Maybe we shouldn't allow this case? 
+ elif new_categorization: new_categorization = ensure_categorization_instance(new_categorization) if new_categorization != conversion.categorization_b: raise ValueError( @@ -164,11 +163,13 @@ def convert( ) old_categorization = conversion.categorization_a new_categorization = conversion.categorization_b + # User does not provide anything + else: + raise ValueError("conversion or new_categorization must be provided.") return self.convert_inner( dim, conversion=conversion, - old_categorization=old_categorization, new_categorization=new_categorization, sum_rule=sum_rule, input_weights=input_weights, From 7ffe50637ea5802c01b09507cf2d05d54c52310d Mon Sep 17 00:00:00 2001 From: Daniel Busch Date: Mon, 21 Oct 2024 14:59:08 +0200 Subject: [PATCH 25/36] more cleanup --- primap2/_convert.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/primap2/_convert.py b/primap2/_convert.py index e2b16a79..4a980c2a 100644 --- a/primap2/_convert.py +++ b/primap2/_convert.py @@ -153,17 +153,15 @@ def convert( old_categorization = ensure_categorization_instance(old_categorization) conversion = old_categorization.conversion_to(new_categorization) # user provides conversion AND new categorisation - # TODO: There is no additional value in providing both because all - # the information is already in conversion. Maybe we shouldn't allow this case? + # TODO: There is no additional value in providing conversion AND new_categorization because all + # information is already in conversion. Maybe we shouldn't allow this case? elif new_categorization: - new_categorization = ensure_categorization_instance(new_categorization) if new_categorization != conversion.categorization_b: raise ValueError( "New categorization is different to target categorisation in conversion." ) - old_categorization = conversion.categorization_a new_categorization = conversion.categorization_b - # User does not provide anything + # User provides neither conversion nor new categorization else: raise ValueError("conversion or new_categorization must be provided.") From 0db737363cdcd0e21f5ce081207aa52efca12784 Mon Sep 17 00:00:00 2001 From: Daniel Busch Date: Mon, 21 Oct 2024 15:33:14 +0200 Subject: [PATCH 26/36] docstring and argument passing from outer to inner convert function --- primap2/_convert.py | 114 ++++++++++++++++++++++++---------- primap2/tests/test_convert.py | 4 -- 2 files changed, 82 insertions(+), 36 deletions(-) diff --git a/primap2/_convert.py b/primap2/_convert.py index 4a980c2a..39d4bfd2 100644 --- a/primap2/_convert.py +++ b/primap2/_convert.py @@ -12,29 +12,82 @@ class DataArrayConversionAccessor(_accessor_base.BaseDataArrayAccessor): - def convert_inner( + def _convert_inner( self, dim: Hashable | str, + *, # TODO type will change to climate_categories.Conversion when # https://github.com/primap-community/climate_categories/pull/164 is merged - *, conversion: climate_categories._conversions.Conversion, - new_categorization: climate_categories.Categorization, sum_rule: typing.Literal["intensive", "extensive"] | None = None, input_weights: xr.DataArray | None = None, output_weights: xr.DataArray | None = None, auxiliary_dimensions: dict[str, str] | None = None, ) -> xr.DataArray: - """ - See docstring of `convert` for details on arguments and behavior. + """Convert the data along the given dimension into the new categorization. + + Maps the given dimension from one categorization (terminology) into another. 
+ Fetches the rules to do the mapping from the climate_categories package, and + therefore will only work if there are conversions rules to convert from the + current categorization to the new categorization. + + Parameters + ---------- + dim : str + Dimension to convert. Has to be a dimension from ``da.dims``. + conversion : climate_categories.Conversion + The conversion rules that describe the conversion from the old to the new + categorization. Contains ``climate_categories.Categorization`` + object for old and new categorization. + sum_rule : ``extensive``, ``intensive``, or None (default) + If data of categories has to be summed up or divided, we need information + whether the quantity measured is extensive (like, for example, total + emissions in a year subdivided into multiple sectoral categories) or + intensive (like, for example, average per-person emissions in a year + subdivided into different territorial entities). By default (None), a + warning is issued if data has to be summed up or divided. + input_weights : xr.DataArray, optional + If data in input categories has to be summed up and the sum_rule is + ``intensive``, weights for the input categories are required. + The weights can be given in any shape compatible with the DataArray that + is converted, e.g. to give different weights for industrial sectors by + country. However, at least the ``dim`` that is converted needs to be in + ``input_weights.dims``. + If no weights are specified but a rule requiring weights is specified + in the conversion rules, a warning is issued and the respective rule is + skipped (probably resulting in more NaNs in the output). + output_weights : xr.DataArray, optional + If data has to be divided into several output categories and the sum_rule is + ``extensive``, weights for the output categories are required. + The weights can be given in any shape compatible with the DataArray that + is converted, e.g. to give different weights for industrial sectors by + country. However, at least the ``dim`` that is converted needs to be in + ``output_weights.dims``. + If no weights are specified but a rule requiring weights is specified + in the conversion rules, a warning is issued and the respective rule is + skipped (probably resulting in more NaNs in the output). + auxiliary_dimensions : dict[str, str], optional + Mapping of auxiliary categorizations to dimension names used in this + DataArray. In conversions which contain rules which are valid only for + certain orthogonal dimensions (e.g. a conversion between different sectoral + terminologies, but some rules are only valid for specific countries), only + the categorization is specified. Therefore, in this case you have to specify + a mapping from categorization name to dimension name. + Example: {"ISO3": "area (ISO3)"}) . + + Returns + ------- + converted : xr.DataArray + A copy of the DataArray with the given dimension converted in the new + categorization. 
""" check_valid_sum_rule_types(sum_rule) auxiliary_dimensions = prepare_auxiliary_dimensions(conversion, auxiliary_dimensions) - # TODO maybe dim_name as argument from one level above dim_name, old_categorization = extract_categorization_from_dim(dim) + new_categorization = conversion.categorization_b new_dim = f"{dim_name} ({new_categorization.name})" converted_da = initialize_empty_converted_da( @@ -72,8 +125,8 @@ def convert_inner( def convert( self, dim: Hashable | str, - conversion: climate_categories._conversions.Conversion = None, - new_categorization: climate_categories.Categorization = None, + conversion: climate_categories._conversions.Conversion | None = None, + new_categorization: climate_categories.Categorization | str | None = None, sum_rule: typing.Literal["intensive", "extensive"] | None = None, input_weights: xr.DataArray | None = None, output_weights: xr.DataArray | None = None, @@ -81,7 +134,8 @@ def convert( ) -> xr.DataArray: """Convert the data along the given dimension into the new categorization. - Maps the given dimension from one categorization (terminology) into another. + Generates conversion and new categorization from given inputs. Maps the given dimension + from one categorization (terminology) into another. Fetches the rules to do the mapping from the climate_categories package, and therefore will only work if there are conversions rules to convert from the current categorization to the new categorization. @@ -92,13 +146,15 @@ def convert( Dimension to convert. Has to be a dimension from ``da.dims``. conversion : climate_categories.Conversion The conversion rules that describe the conversion from the old to the new categorization. - Contains ``climate_categories.Categorization`` object for old and new categorisation. + Contains ``climate_categories.Categorization`` object for old and new categorization. + Either conversion or new_categorization must be provided. new_categorization: str New categorization to convert the given dimension to. If the categorization is part of climate categories the title of the new categorization (like ``IPCC1996``) will work. A ``climate_categories.Categorization`` object can be used regardless - of wether it is part of climate_categories. When providing just the new categorization, + of whether it is part of climate_categories. When providing just the new categorization, the old categorization as well as the conversion must be part of climate_categories. + Either conversion or new_categorization must be provided. sum_rule : ``extensive``, ``intensive``, or None (default) If data of categories has to be summed up or divided, we need information whether the quantity measured is extensive (like, for example, total @@ -142,33 +198,27 @@ def convert( categorization. """ - # user provides only conversion - if conversion and not new_categorization: - old_categorization = conversion.categorization_a - new_categorization = conversion.categorization_b - # user provides only new categorisation - elif new_categorization and not conversion: - new_categorization = ensure_categorization_instance(new_categorization) - dim_name, old_categorization = extract_categorization_from_dim(dim) - old_categorization = ensure_categorization_instance(old_categorization) - conversion = old_categorization.conversion_to(new_categorization) - # user provides conversion AND new categorisation - # TODO: There is no additional value in providing conversion AND new_categorization because all - # information is already in conversion. 
Maybe we shouldn't allow this case? - elif new_categorization: - if new_categorization != conversion.categorization_b: + + # User provides neither conversion nor new categorization + if (not new_categorization and not conversion): + raise ValueError("conversion or new_categorization must be provided.") + if new_categorization: + # User provides only new_categorization + if not conversion: + new_categorization = ensure_categorization_instance(new_categorization) + dim_name, old_categorization = extract_categorization_from_dim(dim) + old_categorization = ensure_categorization_instance(old_categorization) + conversion = old_categorization.conversion_to(new_categorization) + # User provides new_categorizatiom amd conversion, but they don't match + # TODO: What's the use case of provoding both? Maybe remove + elif new_categorization != conversion.categorization_b: raise ValueError( "New categorization is different to target categorisation in conversion." ) - new_categorization = conversion.categorization_b - # User provides neither conversion nor new categorization - else: - raise ValueError("conversion or new_categorization must be provided.") - return self.convert_inner( + return self._convert_inner( dim, conversion=conversion, - new_categorization=new_categorization, sum_rule=sum_rule, input_weights=input_weights, output_weights=output_weights, diff --git a/primap2/tests/test_convert.py b/primap2/tests/test_convert.py index f2800172..2e23aab4 100644 --- a/primap2/tests/test_convert.py +++ b/primap2/tests/test_convert.py @@ -134,10 +134,6 @@ def test_convert_BURDI(empty_ds: xr.Dataset): # ) -# def test_with_custom_conversion_and_one_custom_categorisation(empty_ds): -# assert False - - # test with new conversion and new categorisations def test_custom_conversion_and_two_custom_categorisations(empty_ds): # make categorisation A from yaml From adb202c22a34963bf1a8a40233f61a3fdfc2ae19 Mon Sep 17 00:00:00 2001 From: Daniel Busch Date: Mon, 21 Oct 2024 15:41:57 +0200 Subject: [PATCH 27/36] ruff and docstring --- primap2/_convert.py | 15 +++++++-------- primap2/tests/test_convert.py | 10 ++++++++++ 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/primap2/_convert.py b/primap2/_convert.py index 39d4bfd2..56fc00f5 100644 --- a/primap2/_convert.py +++ b/primap2/_convert.py @@ -124,7 +124,7 @@ def _convert_inner( @alias_dims(["dim"]) def convert( self, - dim: Hashable | str, + dim: Hashable | str, conversion: climate_categories._conversions.Conversion | None = None, new_categorization: climate_categories.Categorization | str | None = None, sum_rule: typing.Literal["intensive", "extensive"] | None = None, @@ -151,10 +151,10 @@ def convert( new_categorization: str New categorization to convert the given dimension to. If the categorization is part of climate categories the title of the new categorization (like ``IPCC1996``) - will work. A ``climate_categories.Categorization`` object can be used regardless - of whether it is part of climate_categories. When providing just the new categorization, + will work. When providing just the new categorization, the old categorization as well as the conversion must be part of climate_categories. - Either conversion or new_categorization must be provided. + Either conversion or new_categorization must be provided. Note that if both + new_categorization and conversion are provided, conversion will be prioritized. 
sum_rule : ``extensive``, ``intensive``, or None (default) If data of categories has to be summed up or divided, we need information whether the quantity measured is extensive (like, for example, total @@ -198,9 +198,8 @@ def convert( categorization. """ - # User provides neither conversion nor new categorization - if (not new_categorization and not conversion): + if not new_categorization and not conversion: raise ValueError("conversion or new_categorization must be provided.") if new_categorization: # User provides only new_categorization @@ -209,8 +208,8 @@ def convert( dim_name, old_categorization = extract_categorization_from_dim(dim) old_categorization = ensure_categorization_instance(old_categorization) conversion = old_categorization.conversion_to(new_categorization) - # User provides new_categorizatiom amd conversion, but they don't match - # TODO: What's the use case of provoding both? Maybe remove + # User provides new_categorization and conversion, but they don't match + # TODO: What's the use case of providing both? Maybe remove elif new_categorization != conversion.categorization_b: raise ValueError( "New categorization is different to target categorisation in conversion." diff --git a/primap2/tests/test_convert.py b/primap2/tests/test_convert.py index 2e23aab4..854cbde7 100644 --- a/primap2/tests/test_convert.py +++ b/primap2/tests/test_convert.py @@ -11,6 +11,16 @@ import primap2 +def test_conversion_and_new_categorisation_missing(empty_ds: xr.Dataset): + da = empty_ds["CO2"] + da = da.expand_dims({"category (IPCC1996)": list(cc.IPCC1996.keys())}) + da = da.expand_dims({"source (gas)": list(cc.gas.keys())}) + with pytest.raises(ValueError, match="conversion or new_categorization must be provided."): + da.pr.convert( + dim="category", + ) + + # test with existing conversion and two existing categorisations def test_convert_ipcc(empty_ds: xr.Dataset): # build a DA categorized by IPCC1996 and with 1 everywhere so results are easy From b896f642d992fcac8202eef6be3aa74d1330f2c5 Mon Sep 17 00:00:00 2001 From: Daniel Busch Date: Tue, 22 Oct 2024 10:43:35 +0200 Subject: [PATCH 28/36] remove _convert_inner wrapper --- primap2/_convert.py | 116 ++++------------------------------ primap2/tests/test_convert.py | 30 ++++++--- 2 files changed, 34 insertions(+), 112 deletions(-) diff --git a/primap2/_convert.py b/primap2/_convert.py index 56fc00f5..295ac437 100644 --- a/primap2/_convert.py +++ b/primap2/_convert.py @@ -12,7 +12,8 @@ class DataArrayConversionAccessor(_accessor_base.BaseDataArrayAccessor): - def _convert_inner( + @alias_dims(["dim"]) + def convert( self, dim: Hashable | str, *, @@ -87,6 +88,16 @@ def _convert_inner( auxiliary_dimensions = prepare_auxiliary_dimensions(conversion, auxiliary_dimensions) dim_name, old_categorization = extract_categorization_from_dim(dim) + + if conversion.categorization_a_name != old_categorization: + msg = ( + "The source categorization in the conversion " + f"({conversion.categorization_a_name}) does " + "not match the categorization in the data set " + f"({old_categorization})." 
+ ) + raise ValueError(msg) + new_categorization = conversion.categorization_b new_dim = f"{dim_name} ({new_categorization.name})" @@ -121,109 +132,6 @@ def _convert_inner( return converted_da - @alias_dims(["dim"]) - def convert( - self, - dim: Hashable | str, - conversion: climate_categories._conversions.Conversion | None = None, - new_categorization: climate_categories.Categorization | str | None = None, - sum_rule: typing.Literal["intensive", "extensive"] | None = None, - input_weights: xr.DataArray | None = None, - output_weights: xr.DataArray | None = None, - auxiliary_dimensions: dict[str, str] | None = None, - ) -> xr.DataArray: - """Convert the data along the given dimension into the new categorization. - - Generates conversion and new categorization from given inputs. Maps the given dimension - from one categorization (terminology) into another. - Fetches the rules to do the mapping from the climate_categories package, and - therefore will only work if there are conversions rules to convert from the - current categorization to the new categorization. - - Parameters - ---------- - dim : str - Dimension to convert. Has to be a dimension from ``da.dims``. - conversion : climate_categories.Conversion - The conversion rules that describe the conversion from the old to the new categorization. - Contains ``climate_categories.Categorization`` object for old and new categorization. - Either conversion or new_categorization must be provided. - new_categorization: str - New categorization to convert the given dimension to. If the categorization - is part of climate categories the title of the new categorization (like ``IPCC1996``) - will work. When providing just the new categorization, - the old categorization as well as the conversion must be part of climate_categories. - Either conversion or new_categorization must be provided. Note that if both - new_categorization and conversion are provided, conversion will be prioritized. - sum_rule : ``extensive``, ``intensive``, or None (default) - If data of categories has to be summed up or divided, we need information - whether the quantity measured is extensive (like, for example, total - emissions in a year subdivided into multiple sectoral categories) or - intensive (like, for example, average per-person emissions in a year - subdivided into different territorial entities). By default (None), a - warning is issued if data has to be summed up or divided. - input_weights : xr.DataArray, optional - If data in input categories has to be summed up and the sum_rule is - ``intensive``, weights for the input categories are required. - The weights can be given in any shape compatible with the DataArray that - is converted, e.g. to give different weights for industrial sectors by - country. However, at least the ``dim`` that is converted needs to be in - ``input_weights.dims``. - If no weights are specified but a rule requiring weights is specified - in the conversion rules, a warning is issued and the respective rule is - skipped (probably resulting in more NaNs in the output). - output_weights : xr.DataArray, optional - If data has to be divided into several output categories and the sum_rule is - ``extensive``, weights for the output categories are required. - The weights can be given in any shape compatible with the DataArray that - is converted, e.g. to give different weights for industrial sectors by - country. However, at least the ``dim`` that is converted needs to be in - ``output_weights.dims``. 
- If no weights are specified but a rule requiring weights is specified - in the conversion rules, a warning is issued and the respective rule is - skipped (probably resulting in more NaNs in the output). - auxiliary_dimensions : dict[str, str], optional - Mapping of auxiliary categorizations to dimension names used in this - DataArray. In conversions which contain rules which are valid only for - certain orthogonal dimensions (e.g. a conversion between different sectoral - terminologies, but some rules are only valid for specific countries), only - the categorization is specified. Therefore, in this case you have to specify - a mapping from categorization name to dimension name. - Example: {"ISO3": "area (ISO3)"}) . - - Returns - ------- - converted : xr.DataArray - A copy of the DataArray with the given dimension converted in the new - categorization. - """ - - # User provides neither conversion nor new categorization - if not new_categorization and not conversion: - raise ValueError("conversion or new_categorization must be provided.") - if new_categorization: - # User provides only new_categorization - if not conversion: - new_categorization = ensure_categorization_instance(new_categorization) - dim_name, old_categorization = extract_categorization_from_dim(dim) - old_categorization = ensure_categorization_instance(old_categorization) - conversion = old_categorization.conversion_to(new_categorization) - # User provides new_categorization and conversion, but they don't match - # TODO: What's the use case of providing both? Maybe remove - elif new_categorization != conversion.categorization_b: - raise ValueError( - "New categorization is different to target categorisation in conversion." - ) - - return self._convert_inner( - dim, - conversion=conversion, - sum_rule=sum_rule, - input_weights=input_weights, - output_weights=output_weights, - auxiliary_dimensions=auxiliary_dimensions, - ) - def _fill_category( self, da: xr.DataArray, diff --git a/primap2/tests/test_convert.py b/primap2/tests/test_convert.py index 854cbde7..b8543a27 100644 --- a/primap2/tests/test_convert.py +++ b/primap2/tests/test_convert.py @@ -1,6 +1,7 @@ """Tests for _convert.py""" import pathlib +import re import climate_categories as cc import climate_categories._conversions as conversions @@ -11,17 +12,31 @@ import primap2 -def test_conversion_and_new_categorisation_missing(empty_ds: xr.Dataset): +def test_conversion_source_does_not_match_dataset_dimension(empty_ds): + # make a data set with IPCC1996 categories da = empty_ds["CO2"] da = da.expand_dims({"category (IPCC1996)": list(cc.IPCC1996.keys())}) da = da.expand_dims({"source (gas)": list(cc.gas.keys())}) - with pytest.raises(ValueError, match="conversion or new_categorization must be provided."): - da.pr.convert( + arr = da.data.copy() + arr[:] = 1 * primap2.ureg("Gg CO2 / year") + da.data = arr + + # load the BURDI to IPCC2006 category conversion + filepath = pathlib.Path("data/BURDI_conversion.csv") + conv = conversions.ConversionSpec.from_csv(filepath) + conv = conv.hydrate(cats=cc.cats) + + msg = ( + "The source categorization in the conversion (BURDI) " + "does not match the categorization in the data set (IPCC1996)." 
+ ) + with pytest.raises(ValueError, match=re.escape(msg)): + result = da.pr.convert( dim="category", + conversion=conv, ) -# test with existing conversion and two existing categorisations def test_convert_ipcc(empty_ds: xr.Dataset): # build a DA categorized by IPCC1996 and with 1 everywhere so results are easy # to see @@ -32,18 +47,18 @@ def test_convert_ipcc(empty_ds: xr.Dataset): arr[:] = 1 * primap2.ureg("Gg CO2 / year") da.data = arr - new_categorization_name = "IPCC2006" + conversion = cc.IPCC1996.conversion_to(cc.IPCC2006) with pytest.raises(ValueError, match="The conversion uses auxiliary categories"): da.pr.convert( dim="category", - new_categorization=new_categorization_name, + conversion=conversion, sum_rule="extensive", ) result = da.pr.convert( dim="category", - new_categorization=new_categorization_name, + conversion=conversion, sum_rule="extensive", auxiliary_dimensions={"gas": "source (gas)"}, ) @@ -172,7 +187,6 @@ def test_custom_conversion_and_two_custom_categorisations(empty_ds): result = da.pr.convert( dim="category", conversion=conv, - new_categorization=categorisation_b, sum_rule="extensive", ) From e62291d2b53c73678aaa5b41670252f6da87ec75 Mon Sep 17 00:00:00 2001 From: Daniel Busch Date: Thu, 24 Oct 2024 14:53:15 +0200 Subject: [PATCH 29/36] update climate categories --- primap2/tests/data/BURDI_conversion.csv | 3 ++- primap2/tests/test_convert.py | 21 +++++++++------------ setup.cfg | 2 +- 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/primap2/tests/data/BURDI_conversion.csv b/primap2/tests/data/BURDI_conversion.csv index 82e0ce50..028247c7 100644 --- a/primap2/tests/data/BURDI_conversion.csv +++ b/primap2/tests/data/BURDI_conversion.csv @@ -36,5 +36,6 @@ BURDI,IPCC2006_PRIMAP,comment 15163,M.0.EL 14637,M.BK 14424,M.BK.A -14423,M.BK.M, leaving 14638 --> M.BIO out for now, as it's not in climate categories +14423,M.BK.M, +14638, M.BIO 7,5, 5.A-D ignored as not fitting 2006 cats diff --git a/primap2/tests/test_convert.py b/primap2/tests/test_convert.py index b8543a27..20f750a6 100644 --- a/primap2/tests/test_convert.py +++ b/primap2/tests/test_convert.py @@ -4,7 +4,6 @@ import re import climate_categories as cc -import climate_categories._conversions as conversions import numpy as np import pytest import xarray as xr @@ -23,8 +22,8 @@ def test_conversion_source_does_not_match_dataset_dimension(empty_ds): # load the BURDI to IPCC2006 category conversion filepath = pathlib.Path("data/BURDI_conversion.csv") - conv = conversions.ConversionSpec.from_csv(filepath) - conv = conv.hydrate(cats=cc.cats) + + conv = cc.Conversion.from_csv(filepath) msg = ( "The source categorization in the conversion (BURDI) " @@ -71,8 +70,7 @@ def test_convert_ipcc(empty_ds: xr.Dataset): def test_convert_BURDI(empty_ds: xr.Dataset): # make a sample conversion object in climate categories filepath = pathlib.Path("data/BURDI_conversion.csv") - conv = conversions.ConversionSpec.from_csv(filepath) - conv = conv.hydrate(cats=cc.cats) + conv = cc.Conversion.from_csv(filepath) # taken from UNFCCC_non-AnnexI_data/src/unfccc_ghg_data/unfccc_di_reader/ # unfccc_di_reader_config.py @@ -153,10 +151,9 @@ def test_convert_BURDI(empty_ds: xr.Dataset): # 2.E + 2.B = 2.E, 2.E should not be part of new data set assert np.isnan(result.pr.loc[{"category": "2.E"}].values).all() # cat 14638 in BURDI equals cat M.BIO in IPCC2006_PRIMAP - # TODO: This will fail. 
M.BIO is currently not listed in climate categories - # assert ( - # (result.pr.loc[{"category": "M.BIO"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item() - # ) + assert ( + (result.pr.loc[{"category": "M.BIO"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item() + ) # test with new conversion and new categorisations @@ -167,14 +164,14 @@ def test_custom_conversion_and_two_custom_categorisations(empty_ds): # make categorisation B from yaml categorisation_b = cc.from_yaml("data/simple_categorisation_b.yaml") - # make conversion from csv - conv = conversions.ConversionSpec.from_csv("data/simple_conversion.csv") # categories not part of climate categories so we need to add them manually cats = { "A": categorisation_a, "B": categorisation_b, } - conv = conv.hydrate(cats=cats) + + # make conversion from csv + conv = cc.Conversion.from_csv("data/simple_conversion.csv", cats=cats) # make a dummy dataset based on A cats da = empty_ds["CO2"] diff --git a/setup.cfg b/setup.cfg index fd6b0e2a..cf8d3065 100644 --- a/setup.cfg +++ b/setup.cfg @@ -53,7 +53,7 @@ install_requires = openpyxl>=3.1 tqdm>=4.66 msgpack>=1 - climate_categories>=0.10.1 + climate_categories>=0.10.2 [options.extras_require] test = From bd9b91d1a47f0d9c9d803a95db29e44ba0d58491 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 24 Oct 2024 12:53:53 +0000 Subject: [PATCH 30/36] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- primap2/tests/data/simple_categorisation_a.yaml | 2 +- primap2/tests/data/simple_categorisation_b.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/primap2/tests/data/simple_categorisation_a.yaml b/primap2/tests/data/simple_categorisation_a.yaml index 1656c97b..beef5533 100644 --- a/primap2/tests/data/simple_categorisation_a.yaml +++ b/primap2/tests/data/simple_categorisation_a.yaml @@ -32,4 +32,4 @@ categories: - C - CatC unnumbered: - title: The unnumbered category \ No newline at end of file + title: The unnumbered category diff --git a/primap2/tests/data/simple_categorisation_b.yaml b/primap2/tests/data/simple_categorisation_b.yaml index 35751f9b..05e1dc07 100644 --- a/primap2/tests/data/simple_categorisation_b.yaml +++ b/primap2/tests/data/simple_categorisation_b.yaml @@ -24,4 +24,4 @@ categories: comment: The second category alternative_codes: - B - - CatB \ No newline at end of file + - CatB From 1b714fd5d4c12e32f9da811118d97a57a2ea54aa Mon Sep 17 00:00:00 2001 From: Daniel Busch Date: Thu, 24 Oct 2024 15:07:26 +0200 Subject: [PATCH 31/36] get test data with importlib --- primap2/tests/test_convert.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/primap2/tests/test_convert.py b/primap2/tests/test_convert.py index 20f750a6..79327765 100644 --- a/primap2/tests/test_convert.py +++ b/primap2/tests/test_convert.py @@ -1,6 +1,7 @@ """Tests for _convert.py""" -import pathlib +import importlib +import importlib.resources import re import climate_categories as cc @@ -11,6 +12,10 @@ import primap2 +def get_test_data_filepath(fname: str): + return importlib.resources.files("primap2.tests.data").joinpath(fname) + + def test_conversion_source_does_not_match_dataset_dimension(empty_ds): # make a data set with IPCC1996 categories da = empty_ds["CO2"] @@ -21,7 +26,7 @@ def test_conversion_source_does_not_match_dataset_dimension(empty_ds): da.data = arr # load the BURDI to IPCC2006 category conversion - filepath = 
pathlib.Path("data/BURDI_conversion.csv") + filepath = get_test_data_filepath("BURDI_conversion.csv") conv = cc.Conversion.from_csv(filepath) @@ -30,7 +35,7 @@ def test_conversion_source_does_not_match_dataset_dimension(empty_ds): "does not match the categorization in the data set (IPCC1996)." ) with pytest.raises(ValueError, match=re.escape(msg)): - result = da.pr.convert( + result = da.pr.convert( # noqa: F841 dim="category", conversion=conv, ) @@ -69,7 +74,7 @@ def test_convert_ipcc(empty_ds: xr.Dataset): # test with new conversion and two existing categorisations def test_convert_BURDI(empty_ds: xr.Dataset): # make a sample conversion object in climate categories - filepath = pathlib.Path("data/BURDI_conversion.csv") + filepath = get_test_data_filepath("BURDI_conversion.csv") conv = cc.Conversion.from_csv(filepath) # taken from UNFCCC_non-AnnexI_data/src/unfccc_ghg_data/unfccc_di_reader/ @@ -159,10 +164,10 @@ def test_convert_BURDI(empty_ds: xr.Dataset): # test with new conversion and new categorisations def test_custom_conversion_and_two_custom_categorisations(empty_ds): # make categorisation A from yaml - categorisation_a = cc.from_yaml("data/simple_categorisation_a.yaml") + categorisation_a = cc.from_yaml(get_test_data_filepath("simple_categorisation_a.yaml")) # make categorisation B from yaml - categorisation_b = cc.from_yaml("data/simple_categorisation_b.yaml") + categorisation_b = cc.from_yaml(get_test_data_filepath("simple_categorisation_b.yaml")) # categories not part of climate categories so we need to add them manually cats = { @@ -171,7 +176,7 @@ def test_custom_conversion_and_two_custom_categorisations(empty_ds): } # make conversion from csv - conv = cc.Conversion.from_csv("data/simple_conversion.csv", cats=cats) + conv = cc.Conversion.from_csv(get_test_data_filepath("simple_conversion.csv"), cats=cats) # make a dummy dataset based on A cats da = empty_ds["CO2"] From 73ce9599c33a29303cbada2de128f61f772a2a1c Mon Sep 17 00:00:00 2001 From: Daniel Busch Date: Thu, 24 Oct 2024 15:11:07 +0200 Subject: [PATCH 32/36] Revert "get test data with importlib" This reverts commit 1b714fd5d4c12e32f9da811118d97a57a2ea54aa. --- primap2/tests/test_convert.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/primap2/tests/test_convert.py b/primap2/tests/test_convert.py index 79327765..20f750a6 100644 --- a/primap2/tests/test_convert.py +++ b/primap2/tests/test_convert.py @@ -1,7 +1,6 @@ """Tests for _convert.py""" -import importlib -import importlib.resources +import pathlib import re import climate_categories as cc @@ -12,10 +11,6 @@ import primap2 -def get_test_data_filepath(fname: str): - return importlib.resources.files("primap2.tests.data").joinpath(fname) - - def test_conversion_source_does_not_match_dataset_dimension(empty_ds): # make a data set with IPCC1996 categories da = empty_ds["CO2"] @@ -26,7 +21,7 @@ def test_conversion_source_does_not_match_dataset_dimension(empty_ds): da.data = arr # load the BURDI to IPCC2006 category conversion - filepath = get_test_data_filepath("BURDI_conversion.csv") + filepath = pathlib.Path("data/BURDI_conversion.csv") conv = cc.Conversion.from_csv(filepath) @@ -35,7 +30,7 @@ def test_conversion_source_does_not_match_dataset_dimension(empty_ds): "does not match the categorization in the data set (IPCC1996)." 
) with pytest.raises(ValueError, match=re.escape(msg)): - result = da.pr.convert( # noqa: F841 + result = da.pr.convert( dim="category", conversion=conv, ) @@ -74,7 +69,7 @@ def test_convert_ipcc(empty_ds: xr.Dataset): # test with new conversion and two existing categorisations def test_convert_BURDI(empty_ds: xr.Dataset): # make a sample conversion object in climate categories - filepath = get_test_data_filepath("BURDI_conversion.csv") + filepath = pathlib.Path("data/BURDI_conversion.csv") conv = cc.Conversion.from_csv(filepath) # taken from UNFCCC_non-AnnexI_data/src/unfccc_ghg_data/unfccc_di_reader/ @@ -164,10 +159,10 @@ def test_convert_BURDI(empty_ds: xr.Dataset): # test with new conversion and new categorisations def test_custom_conversion_and_two_custom_categorisations(empty_ds): # make categorisation A from yaml - categorisation_a = cc.from_yaml(get_test_data_filepath("simple_categorisation_a.yaml")) + categorisation_a = cc.from_yaml("data/simple_categorisation_a.yaml") # make categorisation B from yaml - categorisation_b = cc.from_yaml(get_test_data_filepath("simple_categorisation_b.yaml")) + categorisation_b = cc.from_yaml("data/simple_categorisation_b.yaml") # categories not part of climate categories so we need to add them manually cats = { @@ -176,7 +171,7 @@ def test_custom_conversion_and_two_custom_categorisations(empty_ds): } # make conversion from csv - conv = cc.Conversion.from_csv(get_test_data_filepath("simple_conversion.csv"), cats=cats) + conv = cc.Conversion.from_csv("data/simple_conversion.csv", cats=cats) # make a dummy dataset based on A cats da = empty_ds["CO2"] From c90bb018d082d5347793e40e4e47f271af7a973b Mon Sep 17 00:00:00 2001 From: Daniel Busch Date: Thu, 24 Oct 2024 15:14:01 +0200 Subject: [PATCH 33/36] importlib --- primap2/tests/test_convert.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/primap2/tests/test_convert.py b/primap2/tests/test_convert.py index 20f750a6..79327765 100644 --- a/primap2/tests/test_convert.py +++ b/primap2/tests/test_convert.py @@ -1,6 +1,7 @@ """Tests for _convert.py""" -import pathlib +import importlib +import importlib.resources import re import climate_categories as cc @@ -11,6 +12,10 @@ import primap2 +def get_test_data_filepath(fname: str): + return importlib.resources.files("primap2.tests.data").joinpath(fname) + + def test_conversion_source_does_not_match_dataset_dimension(empty_ds): # make a data set with IPCC1996 categories da = empty_ds["CO2"] @@ -21,7 +26,7 @@ def test_conversion_source_does_not_match_dataset_dimension(empty_ds): da.data = arr # load the BURDI to IPCC2006 category conversion - filepath = pathlib.Path("data/BURDI_conversion.csv") + filepath = get_test_data_filepath("BURDI_conversion.csv") conv = cc.Conversion.from_csv(filepath) @@ -30,7 +35,7 @@ def test_conversion_source_does_not_match_dataset_dimension(empty_ds): "does not match the categorization in the data set (IPCC1996)." 
) with pytest.raises(ValueError, match=re.escape(msg)): - result = da.pr.convert( + result = da.pr.convert( # noqa: F841 dim="category", conversion=conv, ) @@ -69,7 +74,7 @@ def test_convert_ipcc(empty_ds: xr.Dataset): # test with new conversion and two existing categorisations def test_convert_BURDI(empty_ds: xr.Dataset): # make a sample conversion object in climate categories - filepath = pathlib.Path("data/BURDI_conversion.csv") + filepath = get_test_data_filepath("BURDI_conversion.csv") conv = cc.Conversion.from_csv(filepath) # taken from UNFCCC_non-AnnexI_data/src/unfccc_ghg_data/unfccc_di_reader/ @@ -159,10 +164,10 @@ def test_convert_BURDI(empty_ds: xr.Dataset): # test with new conversion and new categorisations def test_custom_conversion_and_two_custom_categorisations(empty_ds): # make categorisation A from yaml - categorisation_a = cc.from_yaml("data/simple_categorisation_a.yaml") + categorisation_a = cc.from_yaml(get_test_data_filepath("simple_categorisation_a.yaml")) # make categorisation B from yaml - categorisation_b = cc.from_yaml("data/simple_categorisation_b.yaml") + categorisation_b = cc.from_yaml(get_test_data_filepath("simple_categorisation_b.yaml")) # categories not part of climate categories so we need to add them manually cats = { @@ -171,7 +176,7 @@ def test_custom_conversion_and_two_custom_categorisations(empty_ds): } # make conversion from csv - conv = cc.Conversion.from_csv("data/simple_conversion.csv", cats=cats) + conv = cc.Conversion.from_csv(get_test_data_filepath("simple_conversion.csv"), cats=cats) # make a dummy dataset based on A cats da = empty_ds["CO2"] From fffb84af5d5c4b6faed8035776f656c4bf81afd4 Mon Sep 17 00:00:00 2001 From: Daniel Busch Date: Thu, 24 Oct 2024 15:21:20 +0200 Subject: [PATCH 34/36] clean up --- primap2/_convert.py | 4 +--- primap2/tests/test_convert.py | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/primap2/_convert.py b/primap2/_convert.py index 295ac437..36662e23 100644 --- a/primap2/_convert.py +++ b/primap2/_convert.py @@ -17,9 +17,7 @@ def convert( self, dim: Hashable | str, *, - # TODO type will change to climate_categories.Conversion when - # https://github.com/primap-community/climate_categories/pull/164 is merged - conversion: climate_categories._conversions.Conversion, + conversion: climate_categories.Conversion, sum_rule: typing.Literal["intensive", "extensive"] | None = None, input_weights: xr.DataArray | None = None, output_weights: xr.DataArray | None = None, diff --git a/primap2/tests/test_convert.py b/primap2/tests/test_convert.py index 79327765..2126b6cf 100644 --- a/primap2/tests/test_convert.py +++ b/primap2/tests/test_convert.py @@ -27,7 +27,6 @@ def test_conversion_source_does_not_match_dataset_dimension(empty_ds): # load the BURDI to IPCC2006 category conversion filepath = get_test_data_filepath("BURDI_conversion.csv") - conv = cc.Conversion.from_csv(filepath) msg = ( From 4b9bf2b7028a83b77a76e1174901025cdeaf683c Mon Sep 17 00:00:00 2001 From: Daniel Busch Date: Mon, 28 Oct 2024 10:04:32 +0100 Subject: [PATCH 35/36] test signed commit --- primap2/tests/test_convert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/primap2/tests/test_convert.py b/primap2/tests/test_convert.py index 2126b6cf..e91cd9a0 100644 --- a/primap2/tests/test_convert.py +++ b/primap2/tests/test_convert.py @@ -148,7 +148,7 @@ def test_convert_BURDI(empty_ds: xr.Dataset): .item() ) # 3.C.7 (converted from 4.C) should still be part of the data set, - # although it apprears in two conversion rules + # 
although it appears in two conversion rules assert ( (result.pr.loc[{"category": "3.C.7"}] == 1.0 * primap2.ureg("Gg CO2 / year")).all().item() ) From 86aec445e80d16ca7ff9627108ef65f7d84e3423 Mon Sep 17 00:00:00 2001 From: Daniel Busch Date: Mon, 28 Oct 2024 10:08:36 +0100 Subject: [PATCH 36/36] update email for verified commits --- primap2/tests/test_convert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/primap2/tests/test_convert.py b/primap2/tests/test_convert.py index e91cd9a0..2edfe5af 100644 --- a/primap2/tests/test_convert.py +++ b/primap2/tests/test_convert.py @@ -121,7 +121,7 @@ def test_convert_BURDI(empty_ds: xr.Dataset): "7", ] - # build a DA categorized by BURDI and with 1 everywhere so results are easy + # build a DA categorised by BURDI and with 1 everywhere so results are easy # to see da = empty_ds["CO2"] da = da.expand_dims({"category (BURDI)": BURDI_categories})
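
A minimal usage sketch of the accessor API as it stands at the end of this series, modeled on the calls in primap2/tests/test_convert.py. ``ds`` is assumed to be an existing primap2 dataset whose "CO2" variable carries a "category (IPCC1996)" and a "source (gas)" dimension; it is an illustrative placeholder, not something defined in these patches:

    import climate_categories as cc

    import primap2  # noqa: F401  # registers the ``.pr`` accessor on xarray objects

    # Assumption: ``ds`` is an existing primap2 dataset with the dimensions
    # "category (IPCC1996)" and "source (gas)", like the test fixtures.
    da = ds["CO2"]

    # Conversion rules shipped with climate_categories (IPCC1996 -> IPCC2006).
    conversion = cc.IPCC1996.conversion_to(cc.IPCC2006)

    converted = da.pr.convert(
        dim="category",  # alias resolved to the full "category (IPCC1996)" dimension
        conversion=conversion,
        sum_rule="extensive",  # yearly emission totals may be summed over categories
        auxiliary_dimensions={"gas": "source (gas)"},  # needed for gas-specific rules
    )

Custom categorizations follow the same pattern: build the rules with ``cc.Conversion.from_csv(path, cats={...})``, as in test_custom_conversion_and_two_custom_categorisations, and pass the resulting object via the ``conversion`` argument.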