Merge pull request #277 from primap-community/rm-sec-cats
mikapfl authored Oct 25, 2024
2 parents 693a1ce + 3930b2b commit ea9c42a
Showing 13 changed files with 30 additions and 99 deletions.
6 changes: 6 additions & 0 deletions changelog/277.breaking.md
@@ -0,0 +1,6 @@
+We removed the `sec_cats` entry from the metadata in a dataset's `attrs` in the native format
+as well as the interchange format. It did not add much value, but maintaining it was work, so on balance
+we decided to remove it.
+When reading datasets from disk (from the interchange format or netcdf files), `sec_cats` will be ignored
+so that datasets written with earlier versions of primap2 can still be read and produce valid in-memory
+datasets.
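A minimal sketch of the backward-compatible reading behaviour this changelog entry describes, using plain dicts and a hypothetical helper name (not the actual primap2 API): a legacy `sec_cats` entry is silently dropped so older files still load.

```python
def strip_legacy_sec_cats(attrs: dict) -> dict:
    """Drop the removed 'sec_cats' metadata entry if an old file still has it."""
    cleaned = dict(attrs)  # do not mutate the caller's dict
    cleaned.pop("sec_cats", None)  # ignored since primap2 removed sec_cats
    return cleaned

# attrs as an older primap2 version might have written them (illustrative values)
legacy_attrs = {
    "area": "area (ISO3)",
    "cat": "category (IPCC 2006)",
    "sec_cats": ["animal (FAOSTAT)", "product (FAOSTAT)"],
}
print(strip_legacy_sec_cats(legacy_attrs))
```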
20 changes: 10 additions & 10 deletions docs/source/data_format/data_format_details.md
@@ -12,16 +12,16 @@ For all dimensions, defined names have to be used and additional metadata about
 dimensions is stored in the datasets `attrs`.
 The dimensions are:
 
-| dimension | dimension key | req. | notes | attrs |
-|-----------------|-----------------------|------|---------------------------|---------------------------------------|
-| time | time | ✓ | for periods, the start | |
-| area | area (<category-set>) | ✓ | pre-defined category sets | `'area': 'area (<category set>)'` |
-| category | category (<c-set>) | | primary category | `'cat': 'category (<c-set>)'` |
-| sec. categories | <type> (<c-set>) | | there can be multiple | `'sec_cats': ['<type> (<c-set>)', …]` |
-| scenario | scenario (<c-set>) | | | `'scen': 'scenario (<c-set>)'` |
-| provenance | provenance | | values from fixed set | |
-| model | model | | | |
-| source | source | ✓ | a short source identifier | |
+| dimension | dimension key | req. | notes | attrs |
+|-----------------|-------------------------|------|---------------------------|-----------------------------------|
+| `time` | `time` | ✓ | for periods, the start | |
+| `area` | `area (<category-set>)` | ✓ | pre-defined category sets | `'area': 'area (<category set>)'` |
+| `category` | `category (<c-set>)` | | primary category | `'cat': 'category (<c-set>)'` |
+| sec. categories | `<type> (<c-set>)` | | there can be multiple | |
+| `scenario` | `scenario (<c-set>)` | | | `'scen': 'scenario (<c-set>)'` |
+| `provenance` | `provenance` | | values from fixed set | |
+| `model` | `model` | | | |
+| `source` | `source` | ✓ | a short source identifier | |
 
 For some dimensions, the meaning of the data is directly visible from the data type
 (`time` uses an xarray datetime data type) or the values come from a pre-defined list
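The attrs conventions from the updated table can be sketched as plain Python (values are illustrative; after this change, secondary category dimensions get no attrs entry at all):

```python
# attrs map abbreviated names to full dimension keys, which carry the
# terminology in parentheses. Note: no 'sec_cats' entry anymore --
# secondary category dimensions are ordinary dimensions without attrs metadata.
attrs = {
    "area": "area (ISO3)",
    "cat": "category (IPCC 2006)",
    "scen": "scenario (FAOSTAT)",
}
dims = [
    "time",
    "area (ISO3)",
    "category (IPCC 2006)",
    "animal (FAOSTAT)",  # a secondary category: just a normal dimension
    "scenario (FAOSTAT)",
    "provenance",
    "model",
    "source",
]
assert attrs["area"] in dims
```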
10 changes: 3 additions & 7 deletions docs/source/data_format/data_format_examples.md
@@ -138,7 +138,6 @@ opulent = xr.Dataset(
 "entity_terminology": "primap2",
 "area": "area (ISO3)",
 "cat": "category (IPCC 2006)",
-"sec_cats": ["animal (FAOSTAT)", "product (FAOSTAT)"],
 "scen": "scenario (FAOSTAT)",
 "references": "doi:10.1012",
 "rights": "Use however you want.",
@@ -205,11 +204,8 @@ Compared to the minimal example, this data set has a lot more to unpack:
 specific set of
 categories given in parentheses and with appropriate metadata in the `attrs`.
 The `scenario` is a standard dimension, and the metadata in `attrs` is given using
-the `scen` key. The `animal` and `product` dimensions are nonstandard, and are
-included in the
-secondary categories at `attrs['sec_cats']`. Note that `sec_cats` contains a list, so
-that multiple nonstandard dimensions can be included if needed.
-* There is also s coordinate which is not defining a dimensions, `category names`. It
+the `scen` key. The `animal` and `product` dimensions are nonstandard.
+* There is also a coordinate which is not defining a dimension, `category names`. It
 gives additional information about categories, which can be helpful for humans
 trying to make sense of the category codes without looking them up. Note that
 because this coordinate is not used as an index for a dimension, the category
@@ -218,7 +214,7 @@ Compared to the minimal example, this data set has a lot more to unpack:
 the population data does not use all dimensions. For each data variable, only the
 dimensions which make sense have to be used.
 * In the `attrs`, the terminology for the entities is explicitly defined, so that the
-meaning of the entity attributes is unambigously defined.
+meaning of the entity attributes is unambiguously defined.
 * In the `attrs`, additional metadata useful for humans is included: citable
 `references`, usage `rights`, a descriptive `title`, a long-form `comment`,
 an email address to `contact` for questions about the data set, and the `publication_date`.
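With `sec_cats` gone, nonstandard dimensions like `animal (FAOSTAT)` are identified from the dimension keys themselves rather than from an attrs list. A sketch (plain Python, hypothetical function name) of deriving short aliases from dimension keys, in the spirit of the `translations_from_dims` approach this PR switches to:

```python
def aliases_from_dim_keys(dim_keys):
    """Map short names to full dimension keys, e.g. 'animal' -> 'animal (FAOSTAT)'."""
    aliases = {}
    for key in dim_keys:
        if "(" in key:
            short = key.split("(")[0].strip()  # part before the terminology
            aliases[short] = key
    return aliases

dims = ["time", "area (ISO3)", "animal (FAOSTAT)", "product (FAOSTAT)", "source"]
print(aliases_from_dim_keys(dims))
# {'area': 'area (ISO3)', 'animal': 'animal (FAOSTAT)', 'product': 'product (FAOSTAT)'}
```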
3 changes: 0 additions & 3 deletions docs/source/data_format/interchange_format_examples.md
@@ -52,9 +52,6 @@ They are listed as secondary columns in the metadata dict.
 Column names correspond to the dimension key of the xarray format, i.e. they contain
 the terminology in parentheses (e.g. `area (ISO3)`).
 
-Additional columns are currently not possible, but the option will be added
-in a future release ([#25](https://github.com/pik-primap/primap2/issues/25)).
-
 The metadata dict has an `attrs` entry, which corresponds to the `attrs` dict of the
 xarray format (see [Data format details](data_format_details.md)).
 Additionally, the metadata dict contains information on the `dimensions` of the
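A sketch of such an interchange-format metadata dict as a plain Python structure (field values illustrative, not taken from a real file); on reading, a legacy `sec_cats` entry inside `attrs` is simply dropped, mirroring the change to `read_interchange_format` below:

```python
metadata = {
    "attrs": {
        "area": "area (ISO3)",
        "cat": "category (IPCC2006)",
        "sec_cats": ["Class (class)"],  # may appear in files written by old versions
    },
    "time_format": "%Y",
    "dimensions": {"*": ["area (ISO3)", "category (IPCC2006)"]},
}

# backwards compatibility on read: ignore sec_cats
metadata["attrs"].pop("sec_cats", None)
assert "sec_cats" not in metadata["attrs"]
```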
37 changes: 4 additions & 33 deletions primap2/_data_format.py
@@ -13,10 +13,8 @@
 from attr import define
 from loguru import logger
 
-from primap2._selection import translations_from_dims
-
 from . import _accessor_base, pm2io
-from ._dim_names import dim_names
+from ._selection import translations_from_dims
 from ._units import ureg
 
 
@@ -79,8 +77,9 @@ def open_dataset(
 backend_kwargs=backend_kwargs,
 engine="h5netcdf",
 ).pint.quantify(unit_registry=ureg)
+# backwards compatibility: ignore sec_cats
 if "sec_cats" in ds.attrs:
-ds.attrs["sec_cats"] = list(ds.attrs["sec_cats"])
+del ds.attrs["sec_cats"]
 if "publication_date" in ds.attrs:
 ds.attrs["publication_date"] = datetime.date.fromisoformat(ds.attrs["publication_date"])
 for entity in ds:
@@ -389,7 +388,6 @@ def ensure_valid_attributes(ds: xr.Dataset):
 "institution",
 "area",
 "cat",
-"sec_cats",
 "scen",
 "entity_terminology",
 "publication_date",
@@ -527,8 +525,7 @@ def ensure_valid_coordinate_values(ds: xr.Dataset):
 def ensure_valid_dimensions(ds: xr.Dataset):
 required_direct_dims = {"time", "source"}
 required_indirect_dims = {"area"}
 optional_direct_dims = {"provenance", "model"}
-optional_indirect_dims = {"cat", "scen"}  # sec_cats is special
+optional_indirect_dims = {"cat", "scen"}
 
 for req_dim in required_direct_dims:
 if req_dim not in ds.dims:
@@ -573,32 +570,6 @@ def ensure_valid_dimensions(ds: xr.Dataset):
 logger.error(f"{long_name!r} defined as {opt_dim}, but not found in dims.")
 raise ValueError(f"{opt_dim!r} not in dims")
 
-if "sec_cats" in ds.attrs:
-for sec_cat in ds.attrs["sec_cats"]:
-included_optional_dims.append(sec_cat)
-if sec_cat not in ds.dims:
-logger.error(
-f"Secondary category {sec_cat!r} defined, but not found in dims: " f"{ds.dims}."
-)
-raise ValueError(f"Secondary category {sec_cat!r} not in dims")
-
-if "sec_cats" in ds.attrs and "cat" not in ds.attrs:
-logger.warning("Secondary category defined, but no primary category defined, weird.")
-
-all_dims = set(dim_names(ds))
-unknown_dims = (
-all_dims
-- required_direct_dims
-- set(required_indirect_dims_long)
-- optional_direct_dims
-- set(included_optional_dims)
-)
-
-if unknown_dims:
-logger.warning(
-f"Dimension(s) {unknown_dims} unknown, likely a typo or missing in" f" sec_cats."
-)
-
 for dim in required_indirect_dims.union(optional_indirect_dims):
 if dim in ds.attrs:
 split_dim_name(ds.attrs[dim])
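After this change, validation no longer special-cases `sec_cats`, and unknown dimensions are no longer warned about. A simplified, self-contained sketch (plain Python, hypothetical names, not the actual primap2 implementation) of the remaining optional-dimension check:

```python
def check_optional_dims(attrs: dict, dims: set) -> list:
    """Return error messages for optional dims declared in attrs but absent from dims."""
    optional_indirect_dims = {"cat": "category", "scen": "scenario"}
    errors = []
    for abbrev in optional_indirect_dims:
        if abbrev in attrs and attrs[abbrev] not in dims:
            errors.append(f"{attrs[abbrev]!r} defined as {abbrev}, but not found in dims.")
    return errors

dims = {"time", "area (ISO3)", "source"}
attrs = {"area": "area (ISO3)", "scen": "scenario (general)"}
print(check_optional_dims(attrs, dims))
# ["'scenario (general)' defined as scen, but not found in dims."]
```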
4 changes: 0 additions & 4 deletions primap2/_selection.py
@@ -77,10 +77,6 @@ def translations_from_attrs(
 if abbrev in attrs:
 ret[key] = attrs[abbrev]
 ret[abbrev] = attrs[abbrev]
-if "sec_cats" in attrs:
-for full_name in attrs["sec_cats"]:
-key = full_name.split("(")[0][:-1]
-ret[key] = full_name
 
 if include_entity and "entity_terminology" in attrs:
 ret["entity"] = f"entity ({attrs['entity_terminology']})"
5 changes: 0 additions & 5 deletions primap2/csg/_compose.py
@@ -1,6 +1,5 @@
"""Compose a harmonized dataset from multiple input datasets."""

import contextlib
import math
import typing
from collections.abc import Hashable
@@ -163,10 +162,6 @@ def compose(
 del result_ds.attrs["cat"]
 elif dim == "scenario":
 del result_ds.attrs["scen"]
-elif dim not in ("provenance", "model", "source"):
-# remove from sec_cats if it is in sec_cats
-with contextlib.suppress(ValueError):
-result_ds.attrs["sec_cats"].remove(dim_key)
 return result_ds
 
 
11 changes: 3 additions & 8 deletions primap2/pm2io/_data_reading.py
@@ -1122,7 +1122,7 @@ def fill_from_other_col(
 -------
 pd.DataFrame
 """
-dim_aliases = _selection.translations_from_attrs(attrs, include_entity=True)
+dim_aliases = _selection.translations_from_dims(df.columns)
 
 # loop over target columns in value mapping
 for target_col in coords_value_filling:
@@ -1186,7 +1186,7 @@ def map_metadata_unordered(
 attrs: dict[str, Any],
 ):
 """Map the metadata according to specifications given in meta_mapping."""
-dim_aliases = _selection.translations_from_attrs(attrs, include_entity=True)
+dim_aliases = _selection.translations_from_dims(data)
 
 # TODO: add additional mapping functions here
 # values: (function, additional arguments)
@@ -1260,7 +1260,6 @@ def rename_columns(
 attr_names = {"category": "cat", "scenario": "scen", "area": "area"}
 
 attrs = {}
-sec_cats = []
 coord_renaming = {}
 
 for coord in itertools.chain(coords_cols, coords_defaults):
@@ -1273,7 +1272,6 @@
 
 if coord.startswith(SEC_CATS_PREFIX):
 name = name[len(SEC_CATS_PREFIX) :]
-sec_cats.append(name)
 elif coord in attr_names:
 attrs[attr_names[coord]] = name
 
@@ -1284,9 +1282,6 @@
 
 data.rename(columns=coord_renaming, inplace=True)
 
-if sec_cats:
-attrs["sec_cats"] = sec_cats
-
 return attrs
 
 
@@ -1504,7 +1499,7 @@ def harmonize_units(
 data_cols = list(set(data.columns.values) - set(dimensions))
 
 if attrs is not None:
-dim_aliases = _selection.translations_from_attrs(attrs, include_entity=True)
+dim_aliases = _selection.translations_from_dims(data.columns)
 entity_col = dim_aliases.get("entity", "entity")
 else:
 entity_col = "entity"
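Secondary-category columns are still recognised by a prefix when reading CSVs; they just no longer produce a `sec_cats` attrs entry. A sketch of that renaming step, where the prefix value `"SEC_CATS__"` is an assumption for illustration and not verified against primap2's actual `SEC_CATS_PREFIX` constant:

```python
SEC_CATS_PREFIX = "SEC_CATS__"  # assumed prefix value, for illustration only

def strip_sec_cats_prefix(coord: str) -> str:
    """Return the plain column name for a prefixed secondary-category coordinate."""
    if coord.startswith(SEC_CATS_PREFIX):
        return coord[len(SEC_CATS_PREFIX):]
    return coord

print(strip_sec_cats_prefix("SEC_CATS__Class (class)"))
# Class (class)
```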
3 changes: 3 additions & 0 deletions primap2/pm2io/_interchange_format.py
@@ -36,6 +36,7 @@
 {
 "area": sy.Str(),
 sy.Optional("cat"): sy.Str(),
+# sec_cats is only for backward compatibility, will be ignored
 sy.Optional("sec_cats"): sy.Seq(sy.Str()),
 sy.Optional("scen"): sy.Str(),
 sy.Optional("references"): sy.Str(),
@@ -219,6 +220,8 @@ def read_interchange_format(
 
 data = pd.read_csv(data_file, dtype=object)
 data.attrs = meta_data
+if "sec_cats" in data.attrs["attrs"]:
+del data.attrs["attrs"]["sec_cats"]
 
 # strictyaml parses a datetime, we only want a date
 if "publication_date" in data.attrs["attrs"]:
3 changes: 1 addition & 2 deletions primap2/tests/csg/test_compose.py
@@ -538,11 +538,10 @@ def test_compose_pbar(opulent_ds):
 
 
 def test_compose_sec_cats_missing(opulent_ds):
-"""Compose should also work when a dimensions is missing in `sec_cats`."""
+"""Compose should also work when a secondary category dimension is missing."""
 input_data = opulent_ds.drop_vars(["population", "SF6 (SARGWP100)"]).pr.loc[
 {"animal": ["cow"], "category": ["0", "1"]}
 ]
-input_data.attrs["sec_cats"].remove("product (FAOSTAT)")
 priority_definition = primap2.csg.PriorityDefinition(
 priority_dimensions=["source", "scenario (FAOSTAT)", "product (FAOSTAT)"],
 priorities=[
1 change: 0 additions & 1 deletion primap2/tests/examples.py
@@ -105,7 +105,6 @@ def opulent_ds() -> xr.Dataset:
 "entity_terminology": "primap2",
 "area": "area (ISO3)",
 "cat": "category (IPCC 2006)",
-"sec_cats": ["animal (FAOSTAT)", "product (FAOSTAT)"],
 "scen": "scenario (FAOSTAT)",
 "references": "doi:10.1012",
 "rights": "Use however you want.",
23 changes: 0 additions & 23 deletions primap2/tests/test_data_format.py
@@ -122,15 +122,6 @@ def test_wrong_provenance_value(self, opulent_ds, caplog):
 assert "ERROR" in caplog.text
 assert "provenance contains invalid values: {'asdf'}" in caplog.text
 
-def test_additional_dimension(self, minimal_ds: xr.Dataset, caplog):
-ds = minimal_ds.expand_dims({"addl_dim": ["a", "b", "c"]})  # type: ignore
-ds.pr.ensure_valid()
-assert "WARNING" in caplog.text
-assert (
-"Dimension(s) {'addl_dim'} unknown, likely a typo or missing in sec_cats."
-in caplog.text
-)
-
 def test_wrong_dimension_key(self, minimal_ds, caplog):
 ds = minimal_ds.rename_dims({"area (ISO3)": "asdf"})
 ds.attrs["area"] = "asdf"
@@ -139,27 +130,13 @@ def test_wrong_dimension_key(self, minimal_ds, caplog):
 assert "ERROR" in caplog.text
 assert "'asdf' not in the format 'dim (category_set)'." in caplog.text
 
-def test_missing_sec_cat(self, minimal_ds, caplog):
-minimal_ds.attrs["sec_cats"] = ["missing"]
-with pytest.raises(ValueError, match="Secondary category 'missing' not in dims"):
-minimal_ds.pr.ensure_valid()
-assert "ERROR" in caplog.text
-assert "Secondary category 'missing' defined, but not found in dims:" in caplog.text
-
 def test_missing_optional_dim(self, minimal_ds, caplog):
 minimal_ds.attrs["scen"] = "missing"
 with pytest.raises(ValueError, match="'scen' not in dims"):
 minimal_ds.pr.ensure_valid()
 assert "ERROR" in caplog.text
 assert "'missing' defined as scen, but not found in dims." in caplog.text
 
-def test_sec_cat_without_primary_cat(self, minimal_ds, caplog):
-ds = minimal_ds.expand_dims({"something (cset)": ["a", "b", "c"]})
-ds.attrs["sec_cats"] = ["something (cset)"]
-ds.pr.ensure_valid()
-assert "WARNING" in caplog.text
-assert "Secondary category defined, but no primary category defined, weird." in caplog.text
-
 def test_additional_coordinate_space(self, opulent_ds: xr.Dataset, caplog):
 ds = opulent_ds.rename({"category_names": "category names"})
 with pytest.raises(ValueError, match=r"Coord key 'category names' contains a space"):
3 changes: 0 additions & 3 deletions primap2/tests/test_data_reading.py
@@ -232,7 +232,6 @@ def test_output(
 attrs_expected = {
 "attrs": {
 "references": "Just ask around.",
-"sec_cats": ["Class (class)", "Type (type)"],
 "scen": "scenario (general)",
 "area": "area (ISO3)",
 "cat": "category (IPCC2006)",
@@ -831,7 +830,6 @@ def test_from(self):
 "area": "area (ISO3)",
 "cat": "category (IPCC2006)",
 "scen": "scenario (general)",
-"sec_cats": ["Class (class)", "Type (type)"],
 },
 "time_format": "%Y",
 "dimensions": {"*": dims},
@@ -907,7 +905,6 @@ def test_roundtrip(self, tmp_path):
 "area": "area (ISO3)",
 "cat": "category (IPCC2006)",
 "scen": "scenario (general)",
-"sec_cats": ["Class (class)", "Type (type)"],
 },
 "time_format": "%Y",
 "dimensions": {"CO2": ["area (ISO3)"]},
