Skip to content

Commit

Permalink
Merge pull request PowerGridModel#758 from PowerGridModel/feature/des…
Browse files Browse the repository at this point in the history
…erialization-filter-relevant-logic-implementation

Feature / deserialization data filter relevant logic in place
  • Loading branch information
nitbharambe authored Oct 3, 2024
2 parents 2487f49 + da1a0a2 commit 2d74e82
Show file tree
Hide file tree
Showing 12 changed files with 135 additions and 39 deletions.
2 changes: 2 additions & 0 deletions docs/api_reference/python-api-reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ SPDX-License-Identifier: MPL-2.0
.. autoclass:: power_grid_model.data_types.SparseBatchArray
.. autoclass:: power_grid_model.dataset_definitions.DatasetType
.. autoclass:: power_grid_model.dataset_definitions.ComponentType
.. autodata:: power_grid_model.typing.ComponentAttributeMapping
:annotation: ComponentAttributeMapping
```

## error types
Expand Down
2 changes: 1 addition & 1 deletion docs/user_manual/serialization.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ The format consists of a [`PowerGridModelRoot`](#json-schema-root-object) JSON o

- [`PowerGridModelRoot`](#json-schema-root-object): `Object`
- `version`: `string` containing the schema version (required, current version is `"1.0"`)
- `type`: `string` containing the dataset type, e.g. `"input"`, `"update"`, ...
- `type`: `string` containing the dataset type, e.g. `"input"`, `"update"`, etc.
- `is_batch`: `boolean` flag that describes whether the dataset is a batch or not.
- `attributes`: [`Attributes`](#json-schema-attributes-object) containing specified attributes per component type (e.g.: `"node"`).
- `data`: [`Dataset`](#json-schema-dataset-object) containing the actual dataset.
Expand Down
3 changes: 2 additions & 1 deletion src/power_grid_model/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
BranchSide,
CalculationMethod,
CalculationType,
ComponentAttributeFilterOptions,
FaultPhase,
FaultType,
LoadGenType,
Expand All @@ -20,4 +21,4 @@
TapChangingStrategy,
WindingType,
)
from power_grid_model.typing import ComponentAttributeFilterOptions, ComponentAttributeMapping
from power_grid_model.typing import ComponentAttributeMapping
47 changes: 34 additions & 13 deletions src/power_grid_model/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,8 @@
SinglePythonDataset,
SparseBatchData,
)
from power_grid_model.typing import (
ComponentAttributeFilterOptions,
ComponentAttributeMapping,
_ComponentAttributeMappingDict,
)
from power_grid_model.enum import ComponentAttributeFilterOptions
from power_grid_model.typing import ComponentAttributeMapping, _ComponentAttributeMappingDict


def is_nan(data) -> bool:
Expand Down Expand Up @@ -495,37 +492,61 @@ def is_columnar(component_data: ComponentData) -> bool:
return not isinstance(component_data, np.ndarray)


def component_data_checks(component_data: ComponentData, component=None) -> None:
"""Checks if component_data is of ComponentData and raises ValueError if its not"""
component_name = f"'{component}'" if component is not None else ""
err_msg = f"Invalid data for {component_name} component. " "{0}"
err_msg_suffixed = err_msg + "Expecting a 1D/2D Numpy structured array or a dictionary of such."
def is_nan_or_equivalent(array):
    """Return True when *array* is a NumPy array holding only NaN-like values.

    For ``float64`` arrays the marker is ``np.nan``; for ``int32``/``int8``
    arrays the NaN-equivalent sentinel is the minimum value of the dtype.
    Anything that is not a numpy array, or has any other dtype, yields False.

    Args:
        array: The candidate array (any object is accepted).

    Returns:
        bool: True if every element is NaN or the dtype's NaN-sentinel.
    """
    if not isinstance(array, np.ndarray):
        return False
    if array.dtype == np.float64:
        return np.isnan(array).all()
    if array.dtype in (np.int32, np.int8):
        return np.all(array == np.iinfo(array.dtype).min)
    return False


def _check_sparse_dense(component_data: ComponentData, err_msg_suffixed: str) -> ComponentData:
    """Validate sparse/dense packaging of *component_data* and return the payload.

    For sparse batch data the ``indptr`` entry is type-checked and the ``data``
    entry is returned; dense data is returned as-is. A dict that has only one
    of ``indptr``/``data`` is treated as malformed sparse data.

    Args:
        component_data: The component data to inspect.
        err_msg_suffixed: Error-message template with a ``{0}`` placeholder.

    Returns:
        The inner data buffer (``component_data["data"]`` for sparse input).

    Raises:
        TypeError: if the sparse ``indptr`` is not a numpy array.
        KeyError: if exactly one of ``indptr``/``data`` is missing.
    """
    if is_sparse(component_data):
        index_pointer = component_data["indptr"]
        if not isinstance(index_pointer, np.ndarray):
            raise TypeError(err_msg_suffixed.format(f"Invalid indptr type {type(index_pointer).__name__}. "))
        return component_data["data"]

    partially_sparse = isinstance(component_data, dict) and ("indptr" in component_data or "data" in component_data)
    if partially_sparse:
        missing_element = "indptr" if "indptr" not in component_data else "data"
        raise KeyError(err_msg_suffixed.format(f"Missing '{missing_element}' in sparse batch data. "))
    return component_data

if is_columnar(component_data):

def _check_columnar_row(sub_data: ComponentData, err_msg_suffixed: str) -> None:
    """Validate that *sub_data* is well-formed columnar or row-based data.

    Columnar data must be a dict mapping attribute names to numpy arrays with
    1-3 dimensions; row-based data must be a 1D/2D numpy (structured) array.

    Args:
        sub_data: The component data buffer to validate.
        err_msg_suffixed: Error-message template with a ``{0}`` placeholder.

    Raises:
        TypeError: on any type or dimensionality violation.
    """
    if is_columnar(sub_data):
        if not isinstance(sub_data, dict):
            raise TypeError(err_msg_suffixed.format(""))
        for attribute, attribute_array in sub_data.items():
            if not isinstance(attribute_array, np.ndarray):
                raise TypeError(err_msg_suffixed.format(f"'{attribute}' attribute. "))
            if attribute_array.ndim not in (1, 2, 3):
                raise TypeError(err_msg_suffixed.format(f"Invalid dimension: {attribute_array.ndim}"))
        return
    if not isinstance(sub_data, np.ndarray):
        raise TypeError(err_msg_suffixed.format(f"Invalid data type {type(sub_data).__name__} "))
    if sub_data.ndim not in (1, 2):
        raise TypeError(err_msg_suffixed.format(f"Invalid dimension: {sub_data.ndim}. "))


def component_data_checks(component_data: ComponentData, component=None) -> None:
    """Validate that *component_data* conforms to the ``ComponentData`` contract.

    Delegates to the sparse/dense structure check and then to the
    columnar/row-based shape check; any violation raises from the helpers.

    Args:
        component_data: The data to validate.
        component: Optional component name, only used to enrich error messages.

    Raises:
        TypeError/KeyError: if the data is not a 1D/2D numpy structured array
            or a dictionary of such (raised by the helper checks).
    """
    component_name = "" if component is None else f"'{component}'"
    base_message = f"Invalid data for {component_name} component. " + "{0}"
    suffixed_message = base_message + "Expecting a 1D/2D Numpy structured array or a dictionary of such."

    payload = _check_sparse_dense(component_data, suffixed_message)
    _check_columnar_row(payload, suffixed_message)


def _extract_indptr(data: ComponentData) -> IndexPointer: # pragma: no cover
"""returns indptr and checks if its valid
Expand Down
40 changes: 33 additions & 7 deletions src/power_grid_model/core/power_grid_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from typing import Any, Mapping, Optional

from power_grid_model._utils import is_columnar, is_sparse, process_data_filter
from power_grid_model._utils import is_columnar, is_nan_or_equivalent, is_sparse, process_data_filter
from power_grid_model.core.buffer_handling import (
BufferProperties,
CAttributeBuffer,
Expand All @@ -28,12 +28,9 @@
)
from power_grid_model.core.power_grid_meta import ComponentMetaData, DatasetMetaData, power_grid_meta_data
from power_grid_model.data_types import AttributeType, ComponentData, Dataset
from power_grid_model.enum import ComponentAttributeFilterOptions
from power_grid_model.errors import PowerGridError
from power_grid_model.typing import (
ComponentAttributeFilterOptions,
ComponentAttributeMapping,
_ComponentAttributeMappingDict,
)
from power_grid_model.typing import ComponentAttributeMapping, _ComponentAttributeMappingDict


class CDatasetInfo: # pylint: disable=too-few-public-methods
Expand Down Expand Up @@ -422,8 +419,9 @@ def get_data(self) -> Dataset:
The Power Grid Model may write to these buffers at a later point in time.
Returns:
The full dataset.
The full dataset with filters applied.
"""
self._post_filtering()
return self._data

def get_component_data(self, component: ComponentType) -> ComponentData:
Expand Down Expand Up @@ -504,6 +502,34 @@ def _get_buffer_properties(self, info: CDatasetInfo) -> Mapping[ComponentType, B
if component in self._data_filter
}

def _filter_attributes(self, attributes):
    """Remove attribute arrays that hold only NaN (or NaN-sentinel) values.

    Entries for which ``is_columnar`` is true are skipped; *attributes* is
    modified in place. Deletion is deferred until after iteration so the
    dict is not mutated while being traversed.
    """
    nan_only_attrs = [
        attr
        for attr, array in attributes.items()
        if not is_columnar(array) and is_nan_or_equivalent(array)
    ]
    for attr in nan_only_attrs:
        del attributes[attr]

def _filter_with_option(self):
    """Apply the dataset-wide filter option to every component's attributes.

    Only ``ComponentAttributeFilterOptions.RELEVANT`` triggers filtering;
    any other option leaves the data untouched.
    """
    if self._data_filter is not ComponentAttributeFilterOptions.RELEVANT:
        return
    for attributes in self._data.values():
        self._filter_attributes(attributes)

def _filter_with_mapping(self):
    """Apply per-component filter options from the data-filter mapping.

    Components absent from the mapping, or mapped to anything other than
    ``ComponentAttributeFilterOptions.RELEVANT``, are left untouched.
    """
    for component_type, attributes in self._data.items():
        option = self._data_filter.get(component_type)
        if option is ComponentAttributeFilterOptions.RELEVANT:
            self._filter_attributes(attributes)

def _post_filtering(self):
    """Dispatch post-deserialization filtering based on the filter's type.

    A single ``ComponentAttributeFilterOptions`` value applies globally; a
    dict applies per component. Any other filter type is a no-op here.
    """
    data_filter = self._data_filter
    if isinstance(data_filter, ComponentAttributeFilterOptions):
        self._filter_with_option()
    elif isinstance(data_filter, dict):
        self._filter_with_mapping()


def _get_filtered_attributes(
schema: ComponentMetaData,
Expand Down
2 changes: 2 additions & 0 deletions src/power_grid_model/core/serialization.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ class Deserializer:
_deserializer: DeserializerPtr
_dataset_ptr: WritableDatasetPtr
_dataset: CWritableDataset
_data_filter: ComponentAttributeMapping

def __new__(
cls,
Expand All @@ -59,6 +60,7 @@ def __new__(
instance._dataset_ptr = pgc.deserializer_get_dataset(instance._deserializer)
assert_no_error()

instance._data_filter = data_filter
instance._dataset = CWritableDataset(instance._dataset_ptr, data_filter=data_filter)
assert_no_error()

Expand Down
9 changes: 9 additions & 0 deletions src/power_grid_model/enum.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,3 +203,12 @@ class _ExperimentalFeatures(IntEnum):

disabled = 0
enabled = 1


class ComponentAttributeFilterOptions(IntEnum):
    """Options controlling component/attribute filtering of a dataset."""

    ALL = 0
    """Keep all components/attributes."""
    RELEVANT = 1
    """Keep only non-empty components/attributes, i.e. those containing non-NaN values."""
28 changes: 16 additions & 12 deletions src/power_grid_model/typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,8 @@
Type hints for PGM. This includes all miscellaneous type hints not under dataset or dataset_definitions categories
"""

from enum import IntEnum

from power_grid_model.core.dataset_definitions import ComponentType, ComponentTypeVar


class ComponentAttributeFilterOptions(IntEnum):
"""Filter option component or attribute"""

ALL = 0
"""Filter all components/attributes"""
RELEVANT = 1
"""Filter only non-empty components/attributes that contain non-NaN values"""

from power_grid_model.enum import ComponentAttributeFilterOptions

_ComponentAttributeMappingDict = dict[ComponentType, set[str] | list[str] | None | ComponentAttributeFilterOptions]

Expand All @@ -29,3 +18,18 @@ class ComponentAttributeFilterOptions(IntEnum):
| None
| _ComponentAttributeMappingDict
)
"""
Type hint for mapping component attributes.
`ComponentAttributeMapping` can be one of the following:
- A set of `ComponentTypeVar`
- A list of `ComponentTypeVar`
- A `ComponentAttributeFilterOptions` value
- `None`
- A dictionary mapping `ComponentType` to a set, list, `None`, or `ComponentAttributeFilterOptions`
"""
2 changes: 1 addition & 1 deletion tests/unit/test_internal_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
)
from power_grid_model.core.dataset_definitions import ComponentType as CT, DatasetType as DT
from power_grid_model.data_types import BatchDataset, BatchList
from power_grid_model.typing import ComponentAttributeFilterOptions
from power_grid_model.enum import ComponentAttributeFilterOptions

from .utils import convert_python_to_numpy

Expand Down
34 changes: 33 additions & 1 deletion tests/unit/test_serialization.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from power_grid_model.core.dataset_definitions import ComponentType
from power_grid_model.core.power_grid_dataset import get_dataset_type
from power_grid_model.data_types import BatchDataset, Dataset, SingleDataset
from power_grid_model.typing import ComponentAttributeFilterOptions
from power_grid_model.enum import ComponentAttributeFilterOptions
from power_grid_model.utils import json_deserialize, json_serialize, msgpack_deserialize, msgpack_serialize


Expand Down Expand Up @@ -384,6 +384,15 @@ def serialized_data(request):
pytest.param({"node": ["id"], "sym_load": ["id"]}, id="columnar filter"),
pytest.param({"node": ["id"], "sym_load": None}, id="mixed columnar/row filter"),
pytest.param({"node": ["id"], "shunt": None}, id="unused component filter"),
pytest.param(
{
"node": ["id"],
"line": ComponentAttributeFilterOptions.ALL,
"sym_load": None,
"asym_load": ComponentAttributeFilterOptions.RELEVANT,
},
id="mixed filter",
),
]
)
def data_filters(request):
Expand Down Expand Up @@ -610,6 +619,27 @@ def assert_serialization_correct(deserialized_dataset: Dataset, serialized_datas
)


def _check_only_relevant_attributes_present(component_values) -> bool:
for array in component_values.values():
if not isinstance(array, np.ndarray):
continue
if (array.dtype == np.float64 and np.isnan(array).all()) or (
array.dtype in (np.int32, np.int8) and np.all(array == np.iinfo(array.dtype).min)
):
return False
return True


def assert_deserialization_filtering_correct(deserialized_dataset: Dataset, data_filter) -> bool:
    """Check that *deserialized_dataset* is consistent with *data_filter*.

    For ``ALL`` nothing may have been filtered, so the dataset is always
    accepted. For ``RELEVANT`` every component must be free of all-NaN
    (or NaN-sentinel) attribute arrays. Any other filter value (e.g. a
    per-component mapping) is accepted without further checks.
    """
    if data_filter is ComponentAttributeFilterOptions.ALL:
        return True
    if data_filter is ComponentAttributeFilterOptions.RELEVANT:
        return all(
            _check_only_relevant_attributes_present(component_values)
            for component_values in deserialized_dataset.values()
        )
    return True


@pytest.mark.parametrize("raw_buffer", (True, False))
def test_json_deserialize_data(serialized_data, data_filters, raw_buffer: bool):
data = to_json(serialized_data, raw_buffer=raw_buffer)
Expand Down Expand Up @@ -723,6 +753,8 @@ def test_serialize_deserialize_double_round_trip(deserialize, serialize, seriali

assert serialized_result_a == serialized_result_b
assert list(deserialized_result_b) == list(deserialized_result_a)
assert assert_deserialization_filtering_correct(deserialized_result_a, data_filters)
assert assert_deserialization_filtering_correct(deserialized_result_b, data_filters)

for (component_a, component_result_a), component_result_b in zip(
deserialized_result_a.items(), deserialized_result_b.values()
Expand Down
2 changes: 1 addition & 1 deletion tests/unit/validation/test_batch_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from power_grid_model import DatasetType, LoadGenType, initialize_array
from power_grid_model._utils import compatibility_convert_row_columnar_dataset
from power_grid_model.typing import ComponentAttributeFilterOptions
from power_grid_model.enum import ComponentAttributeFilterOptions
from power_grid_model.validation import validate_batch_data
from power_grid_model.validation.errors import MultiComponentNotUniqueError, NotBooleanError

Expand Down
3 changes: 1 addition & 2 deletions tests/unit/validation/test_input_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,7 @@
initialize_array,
)
from power_grid_model._utils import compatibility_convert_row_columnar_dataset
from power_grid_model.enum import CalculationType, FaultPhase, FaultType
from power_grid_model.typing import ComponentAttributeFilterOptions
from power_grid_model.enum import CalculationType, ComponentAttributeFilterOptions, FaultPhase, FaultType
from power_grid_model.validation import validate_input_data
from power_grid_model.validation.errors import (
FaultPhaseError,
Expand Down

0 comments on commit 2d74e82

Please sign in to comment.