Skip to content

Commit

Permalink
refactor(datasets): deprecate "DataSet" type names (tracking)
Browse files Browse the repository at this point in the history
Signed-off-by: Deepyaman Datta <[email protected]>
  • Loading branch information
deepyaman committed Sep 18, 2023
1 parent 188a4b6 commit 45be7f0
Show file tree
Hide file tree
Showing 6 changed files with 145 additions and 78 deletions.
2 changes: 2 additions & 0 deletions kedro-datasets/docs/source/kedro_datasets.rst
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,8 @@ kedro_datasets
kedro_datasets.text.TextDataSet
kedro_datasets.text.TextDataset
kedro_datasets.tracking.JSONDataSet
kedro_datasets.tracking.JSONDataset
kedro_datasets.tracking.MetricsDataSet
kedro_datasets.tracking.MetricsDataset
kedro_datasets.video.VideoDataSet
kedro_datasets.yaml.YAMLDataSet
14 changes: 9 additions & 5 deletions kedro-datasets/kedro_datasets/tracking/__init__.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,20 @@
"""Dataset implementations to save data for Kedro Experiment Tracking"""
"""Dataset implementations to save data for Kedro Experiment Tracking."""
from __future__ import annotations

from typing import Any

import lazy_loader as lazy

# https://github.com/pylint-dev/pylint/issues/4300#issuecomment-1043601901
JSONDataSet: Any
MetricsDataSet: Any
JSONDataSet: type[JSONDataset]
JSONDataset: Any
MetricsDataSet: type[MetricsDataset]
MetricsDataset: Any

__getattr__, __dir__, __all__ = lazy.attach(
__name__,
submod_attrs={
"json_dataset": ["JSONDataSet"],
"metrics_dataset": ["MetricsDataSet"],
"json_dataset": ["JSONDataSet", "JSONDataset"],
"metrics_dataset": ["MetricsDataSet", "MetricsDataset"],
},
)
41 changes: 30 additions & 11 deletions kedro-datasets/kedro_datasets/tracking/json_dataset.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,19 @@
"""``JSONDataSet`` saves data to a JSON file using an underlying
"""``JSONDataset`` saves data to a JSON file using an underlying
filesystem (e.g.: local, S3, GCS). It uses native json to handle the JSON file.
The ``JSONDataSet`` is part of Kedro Experiment Tracking. The dataset is versioned by default.
The ``JSONDataset`` is part of Kedro Experiment Tracking. The dataset is versioned by default.
"""
import warnings
from typing import NoReturn

from kedro.io.core import DataSetError
from kedro.io.core import DatasetError

from kedro_datasets.json import json_dataset


class JSONDataSet(json_dataset.JSONDataSet):
"""``JSONDataSet`` saves data to a JSON file using an underlying
class JSONDataset(json_dataset.JSONDataset):
"""``JSONDataset`` saves data to a JSON file using an underlying
filesystem (e.g.: local, S3, GCS). It uses native json to handle the JSON file.
The ``JSONDataSet`` is part of Kedro Experiment Tracking.
The ``JSONDataset`` is part of Kedro Experiment Tracking.
The dataset is write-only and it is versioned by default.
Example usage for the
Expand All @@ -22,24 +23,42 @@ class JSONDataSet(json_dataset.JSONDataSet):
.. code-block:: yaml
cars:
type: tracking.JSONDataSet
type: tracking.JSONDataset
filepath: data/09_tracking/cars.json
Example usage for the
`Python API <https://kedro.readthedocs.io/en/stable/data/\
advanced_data_catalog_usage.html>`_:
::
>>> from kedro_datasets.tracking import JSONDataSet
>>> from kedro_datasets.tracking import JSONDataset
>>>
>>> data = {'col1': 1, 'col2': 0.23, 'col3': 0.002}
>>>
>>> data_set = JSONDataSet(filepath="test.json")
>>> data_set.save(data)
>>> dataset = JSONDataset(filepath="test.json")
>>> dataset.save(data)
"""

versioned = True

def _load(self) -> NoReturn:
raise DataSetError(f"Loading not supported for '{self.__class__.__name__}'")
raise DatasetError(f"Loading not supported for '{self.__class__.__name__}'")


# Maps each deprecated class name to the class that replaces it; consulted by
# the module-level ``__getattr__`` defined right after it.
_DEPRECATED_CLASSES = {
    "JSONDataSet": JSONDataset,
}


def __getattr__(name):
    """Resolve deprecated class names looked up on this module.

    Args:
        name: Attribute name requested from the module.

    Returns:
        The class registered for ``name`` in ``_DEPRECATED_CLASSES``.

    Raises:
        AttributeError: If ``name`` is not a key of ``_DEPRECATED_CLASSES``.
    """
    # Guard clause: anything that is not a known alias is a plain missing attribute.
    if name not in _DEPRECATED_CLASSES:
        raise AttributeError(f"module {repr(__name__)} has no attribute {repr(name)}")

    replacement = _DEPRECATED_CLASSES[name]
    message = (
        f"{repr(name)} has been renamed to {repr(replacement.__name__)}, "
        f"and the alias will be removed in Kedro-Datasets 2.0.0"
    )
    # stacklevel=2 points the warning at the caller of this function rather
    # than at this line.
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return replacement
47 changes: 33 additions & 14 deletions kedro-datasets/kedro_datasets/tracking/metrics_dataset.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,21 @@
"""``MetricsDataSet`` saves data to a JSON file using an underlying
"""``MetricsDataset`` saves data to a JSON file using an underlying
filesystem (e.g.: local, S3, GCS). It uses native json to handle the JSON file.
The ``MetricsDataSet`` is part of Kedro Experiment Tracking. The dataset is versioned by default
The ``MetricsDataset`` is part of Kedro Experiment Tracking. The dataset is versioned by default
and only takes metrics of numeric values.
"""
import json
import warnings
from typing import Dict, NoReturn

from kedro.io.core import DataSetError, get_filepath_str
from kedro.io.core import DatasetError, get_filepath_str

from kedro_datasets.json import json_dataset


class MetricsDataSet(json_dataset.JSONDataSet):
"""``MetricsDataSet`` saves data to a JSON file using an underlying
class MetricsDataset(json_dataset.JSONDataset):
"""``MetricsDataset`` saves data to a JSON file using an underlying
filesystem (e.g.: local, S3, GCS). It uses native json to handle the JSON file. The
``MetricsDataSet`` is part of Kedro Experiment Tracking. The dataset is write-only,
``MetricsDataset`` is part of Kedro Experiment Tracking. The dataset is write-only,
it is versioned by default and only takes metrics of numeric values.
Example usage for the
Expand All @@ -24,38 +25,38 @@ class MetricsDataSet(json_dataset.JSONDataSet):
.. code-block:: yaml
cars:
type: tracking.MetricsDataSet
type: tracking.MetricsDataset
filepath: data/09_tracking/cars.json
Example usage for the
`Python API <https://kedro.readthedocs.io/en/stable/data/\
advanced_data_catalog_usage.html>`_:
::
>>> from kedro_datasets.tracking import MetricsDataSet
>>> from kedro_datasets.tracking import MetricsDataset
>>>
>>> data = {'col1': 1, 'col2': 0.23, 'col3': 0.002}
>>>
>>> data_set = MetricsDataSet(filepath="test.json")
>>> data_set.save(data)
>>> dataset = MetricsDataset(filepath="test.json")
>>> dataset.save(data)
"""

versioned = True

def _load(self) -> NoReturn:
raise DataSetError(f"Loading not supported for '{self.__class__.__name__}'")
raise DatasetError(f"Loading not supported for '{self.__class__.__name__}'")

def _save(self, data: Dict[str, float]) -> None:
"""Converts all values in the data from a ``MetricsDataSet`` to float to make sure
"""Converts all values in the data from a ``MetricsDataset`` to float to make sure
they are numeric values which can be displayed in Kedro Viz and then saves the dataset.
"""
try:
for key, value in data.items():
data[key] = float(value)
except ValueError as exc:
raise DataSetError(
f"The MetricsDataSet expects only numeric values. {exc}"
raise DatasetError(
f"The MetricsDataset expects only numeric values. {exc}"
) from exc

save_path = get_filepath_str(self._get_save_path(), self._protocol)
Expand All @@ -64,3 +65,21 @@ def _save(self, data: Dict[str, float]) -> None:
json.dump(data, fs_file, **self._save_args)

self._invalidate_cache()


# Maps each deprecated class name to the class that replaces it; consulted by
# the module-level ``__getattr__`` defined right after it.
_DEPRECATED_CLASSES = {
    "MetricsDataSet": MetricsDataset,
}


def __getattr__(name):
    """Resolve deprecated class names looked up on this module.

    Args:
        name: Attribute name requested from the module.

    Returns:
        The class registered for ``name`` in ``_DEPRECATED_CLASSES``.

    Raises:
        AttributeError: If ``name`` is not a key of ``_DEPRECATED_CLASSES``.
    """
    # Guard clause: anything that is not a known alias is a plain missing attribute.
    if name not in _DEPRECATED_CLASSES:
        raise AttributeError(f"module {repr(__name__)} has no attribute {repr(name)}")

    replacement = _DEPRECATED_CLASSES[name]
    message = (
        f"{repr(name)} has been renamed to {repr(replacement.__name__)}, "
        f"and the alias will be removed in Kedro-Datasets 2.0.0"
    )
    # stacklevel=2 points the warning at the caller of this function rather
    # than at this line.
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return replacement
57 changes: 34 additions & 23 deletions kedro-datasets/tests/tracking/test_json_dataset.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
import importlib
import json
from pathlib import Path, PurePosixPath

import pytest
from fsspec.implementations.local import LocalFileSystem
from gcsfs import GCSFileSystem
from kedro.io import DataSetError
from kedro.io.core import PROTOCOL_DELIMITER, Version
from s3fs.core import S3FileSystem

from kedro_datasets.tracking import JSONDataSet
from kedro_datasets._io import DatasetError
from kedro_datasets.tracking import JSONDataset
from kedro_datasets.tracking.json_dataset import _DEPRECATED_CLASSES


@pytest.fixture
Expand All @@ -18,12 +20,12 @@ def filepath_json(tmp_path):

@pytest.fixture
def json_dataset(filepath_json, save_args, fs_args):
return JSONDataSet(filepath=filepath_json, save_args=save_args, fs_args=fs_args)
return JSONDataset(filepath=filepath_json, save_args=save_args, fs_args=fs_args)


@pytest.fixture
def explicit_versioned_json_dataset(filepath_json, load_version, save_version):
return JSONDataSet(
return JSONDataset(
filepath=filepath_json, version=Version(load_version, save_version)
)

Expand All @@ -33,10 +35,19 @@ def dummy_data():
return {"col1": 1, "col2": 2, "col3": "mystring"}


class TestJSONDataSet:
@pytest.mark.parametrize(
    "module_name", ["kedro_datasets.tracking", "kedro_datasets.tracking.json_dataset"]
)
@pytest.mark.parametrize("class_name", _DEPRECATED_CLASSES)
def test_deprecation(module_name, class_name):
    """Check that fetching a deprecated class name from either module warns.

    Every key of ``_DEPRECATED_CLASSES`` is looked up on each parametrized
    module, and the lookup must emit a ``DeprecationWarning`` whose message
    names the deprecated class.
    """
    expected_message = f"{repr(class_name)} has been renamed"
    with pytest.warns(DeprecationWarning, match=expected_message):
        # The import stays inside the context manager, as in the original,
        # so the same set of warnings is captured.
        module = importlib.import_module(module_name)
        getattr(module, class_name)


class TestJSONDataset:
def test_save(self, filepath_json, dummy_data, tmp_path, save_version):
"""Test saving and reloading the data set."""
json_dataset = JSONDataSet(
json_dataset = JSONDataset(
filepath=filepath_json, version=Version(None, save_version)
)
json_dataset.save(dummy_data)
Expand All @@ -62,8 +73,8 @@ def test_save(self, filepath_json, dummy_data, tmp_path, save_version):

def test_load_fail(self, json_dataset, dummy_data):
json_dataset.save(dummy_data)
pattern = r"Loading not supported for 'JSONDataSet'"
with pytest.raises(DataSetError, match=pattern):
pattern = r"Loading not supported for 'JSONDataset'"
with pytest.raises(DatasetError, match=pattern):
json_dataset.load()

def test_exists(self, json_dataset, dummy_data):
Expand Down Expand Up @@ -100,44 +111,44 @@ def test_open_extra_args(self, json_dataset, fs_args):
],
)
def test_protocol_usage(self, filepath, instance_type):
data_set = JSONDataSet(filepath=filepath)
assert isinstance(data_set._fs, instance_type)
dataset = JSONDataset(filepath=filepath)
assert isinstance(dataset._fs, instance_type)

path = filepath.split(PROTOCOL_DELIMITER, 1)[-1]

assert str(data_set._filepath) == path
assert isinstance(data_set._filepath, PurePosixPath)
assert str(dataset._filepath) == path
assert isinstance(dataset._filepath, PurePosixPath)

def test_catalog_release(self, mocker):
fs_mock = mocker.patch("fsspec.filesystem").return_value
filepath = "test.json"
data_set = JSONDataSet(filepath=filepath)
data_set.release()
dataset = JSONDataset(filepath=filepath)
dataset.release()
fs_mock.invalidate_cache.assert_called_once_with(filepath)

def test_not_version_str_repr(self):
"""Test that version is not in string representation of the class instance."""
filepath = "test.json"
ds = JSONDataSet(filepath=filepath)
ds = JSONDataset(filepath=filepath)

assert filepath in str(ds)
assert "version" not in str(ds)
assert "JSONDataSet" in str(ds)
assert "JSONDataset" in str(ds)
assert "protocol" in str(ds)
# Default save_args
assert "save_args={'indent': 2}" in str(ds)

def test_version_str_repr(self, load_version, save_version):
"""Test that version is in string representation of the class instance."""
filepath = "test.json"
ds_versioned = JSONDataSet(
ds_versioned = JSONDataset(
filepath=filepath, version=Version(load_version, save_version)
)

assert filepath in str(ds_versioned)
ver_str = f"version=Version(load={load_version}, save='{save_version}')"
assert ver_str in str(ds_versioned)
assert "JSONDataSet" in str(ds_versioned)
assert "JSONDataset" in str(ds_versioned)
assert "protocol" in str(ds_versioned)
# Default save_args
assert "save_args={'indent': 2}" in str(ds_versioned)
Expand All @@ -147,10 +158,10 @@ def test_prevent_overwrite(self, explicit_versioned_json_dataset, dummy_data):
corresponding json file for a given save version already exists."""
explicit_versioned_json_dataset.save(dummy_data)
pattern = (
r"Save path \'.+\' for JSONDataSet\(.+\) must "
r"Save path \'.+\' for JSONDataset\(.+\) must "
r"not exist if versioning is enabled\."
)
with pytest.raises(DataSetError, match=pattern):
with pytest.raises(DatasetError, match=pattern):
explicit_versioned_json_dataset.save(dummy_data)

@pytest.mark.parametrize(
Expand All @@ -171,15 +182,15 @@ def test_save_version_warning(
pattern = (
f"Save version '{save_version}' did not match "
f"load version '{load_version}' for "
r"JSONDataSet\(.+\)"
r"JSONDataset\(.+\)"
)
with pytest.warns(UserWarning, match=pattern):
explicit_versioned_json_dataset.save(dummy_data)

def test_http_filesystem_no_versioning(self):
pattern = "Versioning is not supported for HTTP protocols."

with pytest.raises(DataSetError, match=pattern):
JSONDataSet(
with pytest.raises(DatasetError, match=pattern):
JSONDataset(
filepath="https://example.com/file.json", version=Version(None, None)
)
Loading

0 comments on commit 45be7f0

Please sign in to comment.