From 3d2d564fd019376e0a5e0817f03cedf108bc8fd5 Mon Sep 17 00:00:00 2001 From: Daniel Krebs Date: Wed, 2 Oct 2024 14:45:44 +0200 Subject: [PATCH] Raise error when resolving variable UUIDs to display names with duplicates. (#34) --- docs/errors.rst | 3 +++ src/enlyze/errors.py | 10 ++++++++++ src/enlyze/models.py | 25 ++++++++++++++++++++++--- tests/enlyze/test_models.py | 24 +++++++++++++++++++++++- 4 files changed, 58 insertions(+), 4 deletions(-) diff --git a/docs/errors.rst b/docs/errors.rst index d3a999e..c3b49c4 100644 --- a/docs/errors.rst +++ b/docs/errors.rst @@ -10,3 +10,6 @@ Errors .. autoclass:: ResamplingValidationError :show-inheritance: + +.. autoclass:: DuplicateDisplayNameError + :show-inheritance: diff --git a/src/enlyze/errors.py b/src/enlyze/errors.py index 97f57ec..2cd3cc9 100644 --- a/src/enlyze/errors.py +++ b/src/enlyze/errors.py @@ -28,3 +28,13 @@ class ResamplingValidationError(EnlyzeError): resampling interval is specified. """ + + +class DuplicateDisplayNameError(EnlyzeError): + """Variables with duplicate display names + + Resolving variable UUIDs to display names would result in ambiguity because + multiple variables have the same display name. You should either fix the + duplicate variable display names via the ENLYZE App or avoid requesting the + affected variables at the same time. 
+ """ diff --git a/src/enlyze/models.py b/src/enlyze/models.py index a16328d..62bc5fd 100644 --- a/src/enlyze/models.py +++ b/src/enlyze/models.py @@ -7,6 +7,7 @@ import pandas +from enlyze.errors import DuplicateDisplayNameError from enlyze.schema import dataframe_ensure_schema @@ -143,6 +144,19 @@ def _display_names_as_column_names(self, columns: list[str]) -> list[str]: if var.display_name } + distinct_display_names = set(uuid_to_display_name.values()) + if len(uuid_to_display_name) != len(distinct_display_names): + maybe_duplicate_display_names = list(uuid_to_display_name.values()) + for name in distinct_display_names: + maybe_duplicate_display_names.remove(name) + + raise DuplicateDisplayNameError( + ", ".join( + f"'{duplicate_display_name}'" + for duplicate_display_name in set(maybe_duplicate_display_names) + ) + ) + return [uuid_to_display_name.get(var_uuid, var_uuid) for var_uuid in columns] def to_dicts(self, use_display_names: bool = False) -> Iterator[dict[str, Any]]: @@ -153,12 +167,14 @@ def to_dicts(self, use_display_names: bool = False) -> Iterator[dict[str, Any]]: ` :py:class:`datetime.datetime` localized in UTC. :param use_display_names: Whether to return display names instead of variable - UUIDs. If there is no display name fall back to UUID. + UUIDs. If there is no display name, fall back to UUID. + + :raises: :exc:`~enlyze.errors.DuplicateDisplayNameError` when duplicate + display names would be returned instead of UUIDs. :returns: Iterator over rows """ - time_column, *variable_columns = self._columns if use_display_names: @@ -181,7 +197,10 @@ def to_dataframe(self, use_display_names: bool = False) -> pandas.DataFrame: represented as a column named by its UUID. :param use_display_names: Whether to return display names instead of variable - UUIDs. If there is no display name fall back to UUID. + UUIDs. If there is no display name, fall back to UUID. 
+ + :raises: :exc:`~enlyze.errors.DuplicateDisplayNameError` when duplicate + display names would be returned instead of UUIDs. :returns: DataFrame with timeseries data indexed by time diff --git a/tests/enlyze/test_models.py b/tests/enlyze/test_models.py index 3bba305..4b80123 100644 --- a/tests/enlyze/test_models.py +++ b/tests/enlyze/test_models.py @@ -1,9 +1,13 @@ from dataclasses import replace +from datetime import datetime +from uuid import uuid4 import hypothesis.strategies as st +import pytest from hypothesis import given -from enlyze.models import ProductionRun, ProductionRuns +from enlyze.errors import DuplicateDisplayNameError +from enlyze.models import ProductionRun, ProductionRuns, TimeseriesData, Variable @given(runs=st.lists(st.from_type(ProductionRun), max_size=10)) @@ -28,3 +32,21 @@ def test_production_runs_to_dataframe_no_empty_columns_for_optional_dataclasses( assert "quantity_total" not in df.columns assert "average_throughput" in df.columns + + +@given(variable=st.builds(Variable, display_name=st.text(min_size=1))) +def test_timeseries_data_duplicate_display_names(variable): + + variable_duplicate = replace(variable, uuid=uuid4()) + variables = [variable, variable_duplicate] + + data = TimeseriesData( + start=datetime.now(), + end=datetime.now(), + variables=variables, + _columns=["time", *[str(v.uuid) for v in variables]], + _records=[], + ) + + with pytest.raises(DuplicateDisplayNameError): + data.to_dataframe(use_display_names=True)