From ed0505312f386f80a54abd781d830e57b3203545 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Pfl=C3=BCger?= Date: Tue, 22 Oct 2024 14:29:08 +0200 Subject: [PATCH 1/4] refactor: remove sec_cats from native format --- primap2/_data_format.py | 33 ++-------------------------- primap2/_selection.py | 2 +- primap2/csg/_compose.py | 5 ----- primap2/pm2io/_data_reading.py | 5 ----- primap2/pm2io/_interchange_format.py | 4 ++++ primap2/tests/csg/test_compose.py | 3 +-- primap2/tests/examples.py | 1 - primap2/tests/test_data_format.py | 23 ------------------- primap2/tests/test_data_reading.py | 1 - 9 files changed, 8 insertions(+), 69 deletions(-) diff --git a/primap2/_data_format.py b/primap2/_data_format.py index 0f3c661b..5c7600a2 100644 --- a/primap2/_data_format.py +++ b/primap2/_data_format.py @@ -16,7 +16,6 @@ from primap2._selection import translations_from_dims from . import _accessor_base, pm2io -from ._dim_names import dim_names from ._units import ureg @@ -80,7 +79,7 @@ def open_dataset( engine="h5netcdf", ).pint.quantify(unit_registry=ureg) if "sec_cats" in ds.attrs: - ds.attrs["sec_cats"] = list(ds.attrs["sec_cats"]) + del ds.attrs["sec_cats"] if "publication_date" in ds.attrs: ds.attrs["publication_date"] = datetime.date.fromisoformat(ds.attrs["publication_date"]) for entity in ds: @@ -389,7 +388,6 @@ def ensure_valid_attributes(ds: xr.Dataset): "institution", "area", "cat", - "sec_cats", "scen", "entity_terminology", "publication_date", @@ -527,8 +525,7 @@ def ensure_valid_coordinate_values(ds: xr.Dataset): def ensure_valid_dimensions(ds: xr.Dataset): required_direct_dims = {"time", "source"} required_indirect_dims = {"area"} - optional_direct_dims = {"provenance", "model"} - optional_indirect_dims = {"cat", "scen"} # sec_cats is special + optional_indirect_dims = {"cat", "scen"} for req_dim in required_direct_dims: if req_dim not in ds.dims: @@ -573,32 +570,6 @@ def ensure_valid_dimensions(ds: xr.Dataset): logger.error(f"{long_name!r} defined as {opt_dim}, but not found in dims.") raise ValueError(f"{opt_dim!r} not in dims") - if "sec_cats" in ds.attrs: - for sec_cat in ds.attrs["sec_cats"]: - included_optional_dims.append(sec_cat) - if sec_cat not in ds.dims: - logger.error( - f"Secondary category {sec_cat!r} defined, but not found in dims: " f"{ds.dims}." - ) - raise ValueError(f"Secondary category {sec_cat!r} not in dims") - - if "sec_cats" in ds.attrs and "cat" not in ds.attrs: - logger.warning("Secondary category defined, but no primary category defined, weird.") - - all_dims = set(dim_names(ds)) - unknown_dims = ( - all_dims - - required_direct_dims - - set(required_indirect_dims_long) - - optional_direct_dims - - set(included_optional_dims) - ) - - if unknown_dims: - logger.warning( - f"Dimension(s) {unknown_dims} unknown, likely a typo or missing in" f" sec_cats." - ) - for dim in required_indirect_dims.union(optional_indirect_dims): if dim in ds.attrs: split_dim_name(ds.attrs[dim]) diff --git a/primap2/_selection.py b/primap2/_selection.py index c6e533f8..d47a20c3 100644 --- a/primap2/_selection.py +++ b/primap2/_selection.py @@ -77,7 +77,7 @@ def translations_from_attrs( if abbrev in attrs: ret[key] = attrs[abbrev] ret[abbrev] = attrs[abbrev] - if "sec_cats" in attrs: + if "sec_cats" in attrs: # relevant in interchange format, where sec_cats in attrs for full_name in attrs["sec_cats"]: key = full_name.split("(")[0][:-1] ret[key] = full_name diff --git a/primap2/csg/_compose.py b/primap2/csg/_compose.py index b54a9a44..3f72111c 100644 --- a/primap2/csg/_compose.py +++ b/primap2/csg/_compose.py @@ -1,6 +1,5 @@ """Compose a harmonized dataset from multiple input datasets.""" -import contextlib import math import typing from collections.abc import Hashable @@ -163,10 +162,6 @@ def compose( del result_ds.attrs["cat"] elif dim == "scenario": del result_ds.attrs["scen"] - elif dim not in ("provenance", "model", "source"): - # remove from sec_cats if it is in sec_cats - with contextlib.suppress(ValueError): - result_ds.attrs["sec_cats"].remove(dim_key) return result_ds diff --git a/primap2/pm2io/_data_reading.py b/primap2/pm2io/_data_reading.py index 497c36e1..4efa6369 100644 --- a/primap2/pm2io/_data_reading.py +++ b/primap2/pm2io/_data_reading.py @@ -1260,7 +1260,6 @@ def rename_columns( attr_names = {"category": "cat", "scenario": "scen", "area": "area"} attrs = {} - sec_cats = [] coord_renaming = {} for coord in itertools.chain(coords_cols, coords_defaults): @@ -1273,7 +1272,6 @@ def rename_columns( if coord.startswith(SEC_CATS_PREFIX): name = name[len(SEC_CATS_PREFIX) :] - sec_cats.append(name) elif coord in attr_names: attrs[attr_names[coord]] = name @@ -1284,9 +1282,6 @@ def rename_columns( data.rename(columns=coord_renaming, inplace=True) - if sec_cats: - attrs["sec_cats"] = sec_cats - return attrs diff --git a/primap2/pm2io/_interchange_format.py b/primap2/pm2io/_interchange_format.py index 402a6dcb..5da0c540 100644 --- a/primap2/pm2io/_interchange_format.py +++ b/primap2/pm2io/_interchange_format.py @@ -391,6 +391,10 @@ def from_interchange_format( # add the dataset wide attributes data_xr.attrs = attrs["attrs"] + if "sec_cats" in data_xr.attrs: + # only needed in interchange format + del data_xr.attrs["sec_cats"] + data_xr = data_xr.pr.quantify() data_xr.pr.ensure_valid() diff --git a/primap2/tests/csg/test_compose.py b/primap2/tests/csg/test_compose.py index 920f7bb0..9aff0625 100644 --- a/primap2/tests/csg/test_compose.py +++ b/primap2/tests/csg/test_compose.py @@ -538,11 +538,10 @@ def test_compose_pbar(opulent_ds): def test_compose_sec_cats_missing(opulent_ds): - """Compose should also work when a dimensions is missing in `sec_cats`.""" + """Compose should also work when a secondary category dimension is missing.""" input_data = opulent_ds.drop_vars(["population", "SF6 (SARGWP100)"]).pr.loc[ {"animal": ["cow"], "category": ["0", "1"]} ] - input_data.attrs["sec_cats"].remove("product (FAOSTAT)") priority_definition = primap2.csg.PriorityDefinition( priority_dimensions=["source", "scenario (FAOSTAT)", "product (FAOSTAT)"], priorities=[ diff --git a/primap2/tests/examples.py b/primap2/tests/examples.py index c11bf58e..5adbdd26 100644 --- a/primap2/tests/examples.py +++ b/primap2/tests/examples.py @@ -105,7 +105,6 @@ def opulent_ds() -> xr.Dataset: "entity_terminology": "primap2", "area": "area (ISO3)", "cat": "category (IPCC 2006)", - "sec_cats": ["animal (FAOSTAT)", "product (FAOSTAT)"], "scen": "scenario (FAOSTAT)", "references": "doi:10.1012", "rights": "Use however you want.", diff --git a/primap2/tests/test_data_format.py b/primap2/tests/test_data_format.py index 3d9148db..272ecd0a 100644 --- a/primap2/tests/test_data_format.py +++ b/primap2/tests/test_data_format.py @@ -122,15 +122,6 @@ def test_wrong_provenance_value(self, opulent_ds, caplog): assert "ERROR" in caplog.text assert "provenance contains invalid values: {'asdf'}" in caplog.text - def test_additional_dimension(self, minimal_ds: xr.Dataset, caplog): - ds = minimal_ds.expand_dims({"addl_dim": ["a", "b", "c"]}) # type: ignore - ds.pr.ensure_valid() - assert "WARNING" in caplog.text - assert ( - "Dimension(s) {'addl_dim'} unknown, likely a typo or missing in sec_cats." - in caplog.text - ) - def test_wrong_dimension_key(self, minimal_ds, caplog): ds = minimal_ds.rename_dims({"area (ISO3)": "asdf"}) ds.attrs["area"] = "asdf" @@ -139,13 +130,6 @@ def test_wrong_dimension_key(self, minimal_ds, caplog): assert "ERROR" in caplog.text assert "'asdf' not in the format 'dim (category_set)'." in caplog.text - def test_missing_sec_cat(self, minimal_ds, caplog): - minimal_ds.attrs["sec_cats"] = ["missing"] - with pytest.raises(ValueError, match="Secondary category 'missing' not in dims"): - minimal_ds.pr.ensure_valid() - assert "ERROR" in caplog.text - assert "Secondary category 'missing' defined, but not found in dims:" in caplog.text - def test_missing_optional_dim(self, minimal_ds, caplog): minimal_ds.attrs["scen"] = "missing" with pytest.raises(ValueError, match="'scen' not in dims"): @@ -153,13 +137,6 @@ def test_missing_optional_dim(self, minimal_ds, caplog): assert "ERROR" in caplog.text assert "'missing' defined as scen, but not found in dims." in caplog.text - def test_sec_cat_without_primary_cat(self, minimal_ds, caplog): - ds = minimal_ds.expand_dims({"something (cset)": ["a", "b", "c"]}) - ds.attrs["sec_cats"] = ["something (cset)"] - ds.pr.ensure_valid() - assert "WARNING" in caplog.text - assert "Secondary category defined, but no primary category defined, weird." in caplog.text - def test_additional_coordinate_space(self, opulent_ds: xr.Dataset, caplog): ds = opulent_ds.rename({"category_names": "category names"}) with pytest.raises(ValueError, match=r"Coord key 'category names' contains a space"): diff --git a/primap2/tests/test_data_reading.py b/primap2/tests/test_data_reading.py index 6f445225..65cf9624 100644 --- a/primap2/tests/test_data_reading.py +++ b/primap2/tests/test_data_reading.py @@ -232,7 +232,6 @@ def test_output( attrs_expected = { "attrs": { "references": "Just ask around.", - "sec_cats": ["Class (class)", "Type (type)"], "scen": "scenario (general)", "area": "area (ISO3)", "cat": "category (IPCC2006)", From 61ea33ec85daf29fa3971d33c17861f1234d0bbc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Pfl=C3=BCger?= Date: Tue, 22 Oct 2024 15:15:06 +0200 Subject: [PATCH 2/4] refactor: remove sec_cats from interchange format --- primap2/_data_format.py | 4 ++-- primap2/_selection.py | 4 ---- primap2/pm2io/_data_reading.py | 6 +++--- primap2/pm2io/_interchange_format.py | 7 +++---- primap2/tests/test_data_reading.py | 2 -- 5 files changed, 8 insertions(+), 15 deletions(-) diff --git a/primap2/_data_format.py b/primap2/_data_format.py index 5c7600a2..cb7517eb 100644 --- a/primap2/_data_format.py +++ b/primap2/_data_format.py @@ -13,9 +13,8 @@ from attr import define from loguru import logger -from primap2._selection import translations_from_dims - from . import _accessor_base, pm2io +from ._selection import translations_from_dims from ._units import ureg @@ -78,6 +77,7 @@ def open_dataset( backend_kwargs=backend_kwargs, engine="h5netcdf", ).pint.quantify(unit_registry=ureg) + # backwards compatibility: ignore sec_cats if "sec_cats" in ds.attrs: del ds.attrs["sec_cats"] if "publication_date" in ds.attrs: diff --git a/primap2/_selection.py b/primap2/_selection.py index d47a20c3..3db9edc7 100644 --- a/primap2/_selection.py +++ b/primap2/_selection.py @@ -77,10 +77,6 @@ def translations_from_attrs( if abbrev in attrs: ret[key] = attrs[abbrev] ret[abbrev] = attrs[abbrev] - if "sec_cats" in attrs: # relevant in interchange format, where sec_cats in attrs - for full_name in attrs["sec_cats"]: - key = full_name.split("(")[0][:-1] - ret[key] = full_name if include_entity and "entity_terminology" in attrs: ret["entity"] = f"entity ({attrs['entity_terminology']})" diff --git a/primap2/pm2io/_data_reading.py b/primap2/pm2io/_data_reading.py index 4efa6369..d263e9fa 100644 --- a/primap2/pm2io/_data_reading.py +++ b/primap2/pm2io/_data_reading.py @@ -1122,7 +1122,7 @@ def fill_from_other_col( ------- pd.DataFrame """ - dim_aliases = _selection.translations_from_attrs(attrs, include_entity=True) + dim_aliases = _selection.translations_from_dims(df.columns) # loop over target columns in value mapping for target_col in coords_value_filling: @@ -1186,7 +1186,7 @@ def map_metadata_unordered( attrs: dict[str, Any], ): """Map the metadata according to specifications given in meta_mapping.""" - dim_aliases = _selection.translations_from_attrs(attrs, include_entity=True) + dim_aliases = _selection.translations_from_dims(data) # TODO: add additional mapping functions here # values: (function, additional arguments) @@ -1499,7 +1499,7 @@ def harmonize_units( data_cols = list(set(data.columns.values) - set(dimensions)) if attrs is not None: - dim_aliases = _selection.translations_from_attrs(attrs, include_entity=True) + dim_aliases = _selection.translations_from_dims(data.columns) entity_col = dim_aliases.get("entity", "entity") else: entity_col = "entity" diff --git a/primap2/pm2io/_interchange_format.py b/primap2/pm2io/_interchange_format.py index 5da0c540..ef554415 100644 --- a/primap2/pm2io/_interchange_format.py +++ b/primap2/pm2io/_interchange_format.py @@ -36,6 +36,7 @@ { "area": sy.Str(), sy.Optional("cat"): sy.Str(), + # sec_cats is only for backward compatibility, will be ignored sy.Optional("sec_cats"): sy.Seq(sy.Str()), sy.Optional("scen"): sy.Str(), sy.Optional("references"): sy.Str(), @@ -219,6 +220,8 @@ def read_interchange_format( data = pd.read_csv(data_file, dtype=object) data.attrs = meta_data + if "sec_cats" in data.attrs["attrs"]: + del data.attrs["attrs"]["sec_cats"] # strictyaml parses a datetime, we only want a date if "publication_date" in data.attrs["attrs"]: @@ -391,10 +394,6 @@ def from_interchange_format( # add the dataset wide attributes data_xr.attrs = attrs["attrs"] - if "sec_cats" in data_xr.attrs: - # only needed in interchange format - del data_xr.attrs["sec_cats"] - data_xr = data_xr.pr.quantify() data_xr.pr.ensure_valid() diff --git a/primap2/tests/test_data_reading.py b/primap2/tests/test_data_reading.py index 65cf9624..e8ea4d15 100644 --- a/primap2/tests/test_data_reading.py +++ b/primap2/tests/test_data_reading.py @@ -830,7 +830,6 @@ def test_from(self): "area": "area (ISO3)", "cat": "category (IPCC2006)", "scen": "scenario (general)", - "sec_cats": ["Class (class)", "Type (type)"], }, "time_format": "%Y", "dimensions": {"*": dims}, @@ -906,7 +905,6 @@ def test_roundtrip(self, tmp_path): "area": "area (ISO3)", "cat": "category (IPCC2006)", "scen": "scenario (general)", - "sec_cats": ["Class (class)", "Type (type)"], }, "time_format": "%Y", "dimensions": {"CO2": ["area (ISO3)"]}, From fcd7d63cf1681b7690e1c5da6a7270545e86e069 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Pfl=C3=BCger?= Date: Tue, 22 Oct 2024 15:23:07 +0200 Subject: [PATCH 3/4] docs: remove sec_cats from documentation --- .../source/data_format/data_format_details.md | 20 +++++++++---------- .../data_format/data_format_examples.md | 10 +++------- .../interchange_format_examples.md | 3 --- 3 files changed, 13 insertions(+), 20 deletions(-) diff --git a/docs/source/data_format/data_format_details.md b/docs/source/data_format/data_format_details.md index cf73ef2b..597a650c 100644 --- a/docs/source/data_format/data_format_details.md +++ b/docs/source/data_format/data_format_details.md @@ -12,16 +12,16 @@ For all dimensions, defined names have to be used and additional metadata about dimensions is stored in the datasets `attrs`. The dimensions are: -| dimension | dimension key | req. | notes | attrs | -|-----------------|-----------------------|------|---------------------------|---------------------------------------| -| time | time | ✗ | for periods, the start | | -| area | area () | ✗ | pre-defined category sets | `'area': 'area ()'` | -| category | category () | | primary category | `'cat': 'category ()'` | -| sec. categories | () | | there can be multiple | `'sec_cats': [' ()', …]` | -| scenario | scenario () | | | `'scen': 'scenario ()'` | -| provenance | provenance | | values from fixed set | | -| model | model | | | | -| source | source | ✗ | a short source identifier | | +| dimension | dimension key | req. | notes | attrs | +|-----------------|-------------------------|------|---------------------------|-----------------------------------| +| `time` | `time` | ✗ | for periods, the start | | +| `area` | `area ()` | ✗ | pre-defined category sets | `'area': 'area ()'` | +| `category` | `category ()` | | primary category | `'cat': 'category ()'` | +| sec. categories | ` ()` | | there can be multiple | | +| `scenario` | `scenario ()` | | | `'scen': 'scenario ()'` | +| `provenance` | `provenance` | | values from fixed set | | +| `model` | `model` | | | | +| `source` | `source` | ✗ | a short source identifier | | For some dimensions, the meaning of the data is directly visible from the data type (`time` uses an xarray datetime data type) or the values come from a pre-defined list diff --git a/docs/source/data_format/data_format_examples.md b/docs/source/data_format/data_format_examples.md index 5295c8ad..02757953 100644 --- a/docs/source/data_format/data_format_examples.md +++ b/docs/source/data_format/data_format_examples.md @@ -138,7 +138,6 @@ opulent = xr.Dataset( "entity_terminology": "primap2", "area": "area (ISO3)", "cat": "category (IPCC 2006)", - "sec_cats": ["animal (FAOSTAT)", "product (FAOSTAT)"], "scen": "scenario (FAOSTAT)", "references": "doi:10.1012", "rights": "Use however you want.", @@ -205,11 +204,8 @@ Compared to the minimal example, this data set has a lot more to unpack: specific set of categories given in parentheses and with appropriate metadata in the `attrs`. The `scenario` is a standard dimension, and the metadata in `attrs` is given using - the `scen` key. The `animal` and `product` dimensions are nonstandard, and are - included in the - secondary categories at `attrs['sec_cats']`. Note that `sec_cats` contains a list, so - that multiple nonstandard dimensions can be included if needed. -* There is also s coordinate which is not defining a dimensions, `category names`. It + the `scen` key. The `animal` and `product` dimensions are nonstandard. +* There is also a coordinate which is not defining a dimension, `category names`. It gives additional information about categories, which can be helpful for humans trying to make sense of the category codes without looking them up. Note that because this coordinate is not used as an index for a dimension, the category @@ -218,7 +214,7 @@ Compared to the minimal example, this data set has a lot more to unpack: the population data does not use all dimensions. For each data variable, only the dimensions which make sense have to be used. * In the `attrs`, the terminology for the entities is explicitly defined, so that the - meaning of the entity attributes is unambigously defined. + meaning of the entity attributes is unambiguously defined. * In the `attrs`, additional metadata useful for humans is included: citable `references`, usage `rights`, a descriptive `title`, a long-form `comment`, an email address to `contact` for questions about the data set, and the `publication_date`. diff --git a/docs/source/data_format/interchange_format_examples.md b/docs/source/data_format/interchange_format_examples.md index 54ac9ae8..2afd8fcc 100644 --- a/docs/source/data_format/interchange_format_examples.md +++ b/docs/source/data_format/interchange_format_examples.md @@ -52,9 +52,6 @@ They are listed as secondary columns in the metadata dict. Column names correspond to the dimension key of the xarray format, i.e. they contain the terminology in parentheses (e.g. `area (ISO3)`). -Additional columns are currently not possible, but the option will be added -in a future release ([#25](https://github.com/pik-primap/primap2/issues/25)). - The metadata dict has an `attrs` entry, which corresponds to the `attrs` dict of the xarray format (see [Data format details](data_format_details.md)). Additionally, the metadata dict contains information on the `dimensions` of the From 3930b2b5983671b0c01275ef85c9bad043f10b83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Pfl=C3=BCger?= Date: Tue, 22 Oct 2024 15:39:38 +0200 Subject: [PATCH 4/4] docs: add changelog --- changelog/277.breaking.md | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 changelog/277.breaking.md diff --git a/changelog/277.breaking.md b/changelog/277.breaking.md new file mode 100644 index 00000000..14660cf6 --- /dev/null +++ b/changelog/277.breaking.md @@ -0,0 +1,6 @@ +We removed the `sec_cats` entry from the metadata in a dataset's `attrs` in the native format +as well as the interchange format. It did not add much value, but maintaining it was work, so on balance +we decided to remove it. +When reading datasets from disk (from the interchange format or netcdf files), `sec_cats` will be ignored +so that datasets written with earlier versions of primap2 can still be read and produce valid in-memory +datasets.