Raise error when resolving variable UUIDs to display names with duplicates. (#34)
enlyze · Oct 2, 2024 · 3d2d564 · 3d2d564
1 parent f37ff83
commit 3d2d564
Show file tree

Hide file tree

Showing 4 changed files with 58 additions and 4 deletions.
diff --git a/docs/errors.rst b/docs/errors.rst
@@ -10,3 +10,6 @@ Errors
 
 .. autoclass:: ResamplingValidationError
     :show-inheritance:
+
+.. autoclass:: DuplicateDisplayNameError
+    :show-inheritance:
diff --git a/src/enlyze/errors.py b/src/enlyze/errors.py
@@ -28,3 +28,13 @@ class ResamplingValidationError(EnlyzeError):
     resampling interval is specified.
 
     """
+
+
+class DuplicateDisplayNameError(EnlyzeError):
+    """Variables with duplicate display names.
+
+    Resolving variable UUIDs to display names would result in ambiguity because
+    multiple variables have the same display name. Either fix the duplicate
+    variable display names via the ENLYZE App or avoid requesting those
+    variables at the same time.
+    """
diff --git a/src/enlyze/models.py b/src/enlyze/models.py
@@ -7,6 +7,7 @@
 
 import pandas
 
+from enlyze.errors import DuplicateDisplayNameError
 from enlyze.schema import dataframe_ensure_schema
 
 
@@ -143,6 +144,19 @@ def _display_names_as_column_names(self, columns: list[str]) -> list[str]:
             if var.display_name
         }
 
+        distinct_display_names = set(uuid_to_display_name.values())
+        if len(uuid_to_display_name) != len(distinct_display_names):
+            maybe_duplicate_display_names = list(uuid_to_display_name.values())
+            for name in distinct_display_names:
+                maybe_duplicate_display_names.remove(name)
+
+            raise DuplicateDisplayNameError(
+                ", ".join(
+                    f"'{duplicate_display_name}'"
+                    for duplicate_display_name in set(maybe_duplicate_display_names)
+                )
+            )
+
         return [uuid_to_display_name.get(var_uuid, var_uuid) for var_uuid in columns]
 
     def to_dicts(self, use_display_names: bool = False) -> Iterator[dict[str, Any]]:
@@ -153,12 +167,14 @@ def to_dicts(self, use_display_names: bool = False) -> Iterator[dict[str, Any]]:
         <python:datetime-naive-aware>` :py:class:`datetime.datetime` localized in UTC.
 
         :param use_display_names: Whether to return display names instead of variable
-            UUIDs. If there is no display name fall back to UUID.
+            UUIDs. If there is no display name, fall back to UUID.
+
+        :raises: :exc:`~enlyze.errors.DuplicateDisplayNameError` when duplicate
+            display names would be returned instead of UUIDs.
 
         :returns: Iterator over rows
 
         """
-
         time_column, *variable_columns = self._columns
 
         if use_display_names:
@@ -181,7 +197,10 @@ def to_dataframe(self, use_display_names: bool = False) -> pandas.DataFrame:
         represented as a column named by its UUID.
 
         :param use_display_names: Whether to return display names instead of variable
-            UUIDs. If there is no display name fall back to UUID.
+            UUIDs. If there is no display name, fall back to UUID.
+
+        :raises: :exc:`~enlyze.errors.DuplicateDisplayNameError` when duplicate
+            display names would be returned instead of UUIDs.
 
         :returns: DataFrame with timeseries data indexed by time
 

diff --git a/tests/enlyze/test_models.py b/tests/enlyze/test_models.py
@@ -1,9 +1,13 @@
 from dataclasses import replace
+from datetime import datetime
+from uuid import uuid4
 
 import hypothesis.strategies as st
+import pytest
 from hypothesis import given
 
-from enlyze.models import ProductionRun, ProductionRuns
+from enlyze.errors import DuplicateDisplayNameError
+from enlyze.models import ProductionRun, ProductionRuns, TimeseriesData, Variable
 
 
 @given(runs=st.lists(st.from_type(ProductionRun), max_size=10))
@@ -28,3 +32,21 @@ def test_production_runs_to_dataframe_no_empty_columns_for_optional_dataclasses(
 
     assert "quantity_total" not in df.columns
     assert "average_throughput" in df.columns
+
+
+@given(variable=st.builds(Variable, display_name=st.text(min_size=1)))
+def test_timeseries_data_duplicate_display_names(variable):
+    """Two variables sharing a display name make ``to_dataframe`` raise."""
+    variable_duplicate = replace(variable, uuid=uuid4())  # same name, new UUID
+    variables = [variable, variable_duplicate]
+
+    data = TimeseriesData(
+        start=datetime.now(),
+        end=datetime.now(),
+        variables=variables,
+        _columns=["time", *[str(v.uuid) for v in variables]],
+        _records=[],  # no rows are needed to exercise the column renaming
+    )
+
+    with pytest.raises(DuplicateDisplayNameError):
+        data.to_dataframe(use_display_names=True)