From 159e0a3e45ac81e6465c6bb010492f33f7e98064 Mon Sep 17 00:00:00 2001 From: Ravi Kumar Pilla Date: Tue, 7 Jan 2025 18:48:56 -0600 Subject: [PATCH] chore(datasets): Remove tracking datasets which are used in Kedro Viz Experiment Tracking (#969) * remove et related kedro datasets * update release note and static json schema * temporary doc fix --- kedro-datasets/RELEASE.md | 4 + .../docs/source/api/kedro_datasets.rst | 2 - kedro-datasets/kedro_datasets/_typing.py | 5 - .../kedro_datasets/dask/csv_dataset.py | 4 +- .../kedro_datasets/dask/parquet_dataset.py | 4 +- .../kedro_datasets/tracking/__init__.py | 26 --- .../kedro_datasets/tracking/json_dataset.py | 56 ----- .../tracking/metrics_dataset.py | 76 ------- kedro-datasets/pyproject.toml | 4 - .../static/jsonschema/kedro-catalog-0.18.json | 72 ------- .../static/jsonschema/kedro-catalog-0.19.json | 72 ------- kedro-datasets/tests/tracking/__init__.py | 0 .../tests/tracking/test_json_dataset.py | 195 ----------------- .../tests/tracking/test_metrics_dataset.py | 204 ------------------ 14 files changed, 8 insertions(+), 716 deletions(-) delete mode 100644 kedro-datasets/kedro_datasets/tracking/__init__.py delete mode 100644 kedro-datasets/kedro_datasets/tracking/json_dataset.py delete mode 100644 kedro-datasets/kedro_datasets/tracking/metrics_dataset.py delete mode 100644 kedro-datasets/tests/tracking/__init__.py delete mode 100644 kedro-datasets/tests/tracking/test_json_dataset.py delete mode 100644 kedro-datasets/tests/tracking/test_metrics_dataset.py diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index a477dca5e..16fa5b18a 100755 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -1,7 +1,11 @@ # Upcoming Release ## Major features and improvements ## Bug fixes and other changes + ## Breaking Changes + +- Removed `tracking.MetricsDataset` and `tracking.JSONDataset` + ## Community contributions # Release 6.0.0 diff --git a/kedro-datasets/docs/source/api/kedro_datasets.rst b/kedro-datasets/docs/source/api/kedro_datasets.rst index 0cbd3bc4e..63142220a 100644 --- a/kedro-datasets/docs/source/api/kedro_datasets.rst +++ b/kedro-datasets/docs/source/api/kedro_datasets.rst @@ -62,6 +62,4 @@ kedro_datasets svmlight.SVMLightDataset tensorflow.TensorFlowModelDataset text.TextDataset - tracking.JSONDataset - tracking.MetricsDataset yaml.YAMLDataset diff --git a/kedro-datasets/kedro_datasets/_typing.py b/kedro-datasets/kedro_datasets/_typing.py index feb6d91b7..aa083f514 100644 --- a/kedro-datasets/kedro_datasets/_typing.py +++ b/kedro-datasets/kedro_datasets/_typing.py @@ -9,8 +9,3 @@ ImagePreview = NewType("ImagePreview", str) PlotlyPreview = NewType("PlotlyPreview", dict) JSONPreview = NewType("JSONPreview", str) - - -# experiment tracking datasets types -MetricsTrackingPreview = NewType("MetricsTrackingPreview", dict) -JSONTrackingPreview = NewType("JSONTrackingPreview", dict) diff --git a/kedro-datasets/kedro_datasets/dask/csv_dataset.py b/kedro-datasets/kedro_datasets/dask/csv_dataset.py index 053da6b00..bc5b5764b 100644 --- a/kedro-datasets/kedro_datasets/dask/csv_dataset.py +++ b/kedro-datasets/kedro_datasets/dask/csv_dataset.py @@ -67,9 +67,9 @@ def __init__( # noqa: PLR0913 filepath: Filepath in POSIX format to a CSV file CSV collection or the directory of a multipart CSV. load_args: Additional loading options `dask.dataframe.read_csv`: - https://docs.dask.org/en/latest/generated/dask.dataframe.read_csv.html + https://docs.dask.org/en/stable/generated/dask.dataframe.read_csv.html save_args: Additional saving options for `dask.dataframe.to_csv`: - https://docs.dask.org/en/latest/generated/dask.dataframe.to_csv.html + https://docs.dask.org/en/stable/generated/dask.dataframe.to_csv.html credentials: Credentials required to get access to the underlying filesystem. E.g. for ``GCSFileSystem`` it should look like `{"token": None}`. fs_args: Optional parameters to the backend file system driver: diff --git a/kedro-datasets/kedro_datasets/dask/parquet_dataset.py b/kedro-datasets/kedro_datasets/dask/parquet_dataset.py index 1acfe7cda..3b2dff73e 100644 --- a/kedro-datasets/kedro_datasets/dask/parquet_dataset.py +++ b/kedro-datasets/kedro_datasets/dask/parquet_dataset.py @@ -97,9 +97,9 @@ def __init__( # noqa: PLR0913 filepath: Filepath in POSIX format to a parquet file parquet collection or the directory of a multipart parquet. load_args: Additional loading options `dask.dataframe.read_parquet`: - https://docs.dask.org/en/latest/generated/dask.dataframe.read_parquet.html + https://docs.dask.org/en/stable/generated/dask.dataframe.read_parquet.html save_args: Additional saving options for `dask.dataframe.to_parquet`: - https://docs.dask.org/en/latest/generated/dask.dataframe.to_parquet.html + https://docs.dask.org/en/stable/generated/dask.dataframe.to_parquet.html credentials: Credentials required to get access to the underlying filesystem. E.g. for ``GCSFileSystem`` it should look like `{"token": None}`. fs_args: Optional parameters to the backend file system driver: diff --git a/kedro-datasets/kedro_datasets/tracking/__init__.py b/kedro-datasets/kedro_datasets/tracking/__init__.py deleted file mode 100644 index 1b1a5c70d..000000000 --- a/kedro-datasets/kedro_datasets/tracking/__init__.py +++ /dev/null @@ -1,26 +0,0 @@ -"""Dataset implementations to save data for Kedro Experiment Tracking.""" - -import warnings -from typing import Any - -import lazy_loader as lazy - -from kedro_datasets import KedroDeprecationWarning - -# https://github.com/pylint-dev/pylint/issues/4300#issuecomment-1043601901 -JSONDataset: Any -MetricsDataset: Any - -__getattr__, __dir__, __all__ = lazy.attach( - __name__, - submod_attrs={ - "json_dataset": ["JSONDataset"], - "metrics_dataset": ["MetricsDataset"], - }, -) - -warnings.warn( - "`tracking.JSONDataset` and `tracking.MetricsDataset` are deprecated. These datasets will be removed in kedro-datasets 7.0.0", - KedroDeprecationWarning, - stacklevel=2, -) diff --git a/kedro-datasets/kedro_datasets/tracking/json_dataset.py b/kedro-datasets/kedro_datasets/tracking/json_dataset.py deleted file mode 100644 index d73df1b10..000000000 --- a/kedro-datasets/kedro_datasets/tracking/json_dataset.py +++ /dev/null @@ -1,56 +0,0 @@ -"""``JSONDataset`` saves data to a JSON file using an underlying -filesystem (e.g.: local, S3, GCS). It uses native json to handle the JSON file. -The ``JSONDataset`` is part of Kedro Experiment Tracking. The dataset is versioned by default. -""" - -import json -from typing import NoReturn - -from kedro.io.core import DatasetError, get_filepath_str - -from kedro_datasets._typing import JSONTrackingPreview -from kedro_datasets.json import json_dataset - - -class JSONDataset(json_dataset.JSONDataset): - """``JSONDataset`` saves data to a JSON file using an underlying - filesystem (e.g.: local, S3, GCS). It uses native json to handle the JSON file. - The ``JSONDataset`` is part of Kedro Experiment Tracking. - The dataset is write-only and it is versioned by default. - - Example usage for the - `YAML API `_: - - .. code-block:: yaml - - cars: - type: tracking.JSONDataset - filepath: data/09_tracking/cars.json - - Example usage for the - `Python API `_: - - .. code-block:: pycon - - >>> from kedro_datasets.tracking import JSONDataset - >>> - >>> data = {"col1": 1, "col2": 0.23, "col3": 0.002} - >>> - >>> dataset = JSONDataset(filepath=tmp_path / "test.json") - >>> dataset.save(data) - - """ - - versioned = True - - def load(self) -> NoReturn: - raise DatasetError(f"Loading not supported for '{self.__class__.__name__}'") - - def preview(self) -> JSONTrackingPreview: # type: ignore[override] - "Load the JSON tracking dataset used in Kedro-viz experiment tracking." - load_path = get_filepath_str(self._get_load_path(), self._protocol) - - with self._fs.open(load_path, **self._fs_open_args_load) as fs_file: - return JSONTrackingPreview(json.load(fs_file)) diff --git a/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py b/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py deleted file mode 100644 index 6202acf34..000000000 --- a/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py +++ /dev/null @@ -1,76 +0,0 @@ -"""``MetricsDataset`` saves data to a JSON file using an underlying -filesystem (e.g.: local, S3, GCS). It uses native json to handle the JSON file. -The ``MetricsDataset`` is part of Kedro Experiment Tracking. The dataset is versioned by default -and only takes metrics of numeric values. -""" - -import json -from typing import NoReturn - -from kedro.io.core import DatasetError, get_filepath_str - -from kedro_datasets._typing import MetricsTrackingPreview -from kedro_datasets.json import json_dataset - - -class MetricsDataset(json_dataset.JSONDataset): - """``MetricsDataset`` saves data to a JSON file using an underlying - filesystem (e.g.: local, S3, GCS). It uses native json to handle the JSON file. The - ``MetricsDataset`` is part of Kedro Experiment Tracking. The dataset is write-only, - it is versioned by default and only takes metrics of numeric values. - - Example usage for the - `YAML API `_: - - .. code-block:: yaml - - cars: - type: tracking.MetricsDataset - filepath: data/09_tracking/cars.json - - Example usage for the - `Python API `_: - - .. code-block:: pycon - - >>> from kedro_datasets.tracking import MetricsDataset - >>> - >>> data = {"col1": 1, "col2": 0.23, "col3": 0.002} - >>> - >>> dataset = MetricsDataset(filepath=tmp_path / "test.json") - >>> dataset.save(data) - - """ - - versioned = True - - def load(self) -> NoReturn: - raise DatasetError(f"Loading not supported for '{self.__class__.__name__}'") - - def save(self, data: dict[str, float]) -> None: - """Converts all values in the data from a ``MetricsDataset`` to float to make sure - they are numeric values which can be displayed in Kedro Viz and then saves the dataset. - """ - try: - for key, value in data.items(): - data[key] = float(value) - except ValueError as exc: - raise DatasetError( - f"The MetricsDataset expects only numeric values. {exc}" - ) from exc - - save_path = get_filepath_str(self._get_save_path(), self._protocol) - - with self._fs.open(save_path, **self._fs_open_args_save) as fs_file: - json.dump(data, fs_file, **self._save_args) - - self._invalidate_cache() - - def preview(self) -> MetricsTrackingPreview: # type: ignore[override] - "Load the Metrics tracking dataset used in Kedro-viz experiment tracking" - load_path = get_filepath_str(self._get_load_path(), self._protocol) - - with self._fs.open(load_path, **self._fs_open_args_load) as fs_file: - return json.load(fs_file) diff --git a/kedro-datasets/pyproject.toml b/kedro-datasets/pyproject.toml index 91b938c19..3ee8eb9e9 100644 --- a/kedro-datasets/pyproject.toml +++ b/kedro-datasets/pyproject.toml @@ -163,10 +163,6 @@ tensorflow = ["kedro-datasets[tensorflow-tensorflowmodeldataset]"] text-textdataset = [] text = ["kedro-datasets[text-textdataset]"] -tracking-jsondataset = [] -tracking-metricsdataset = [] -tracking = ["kedro-datasets[tracking-jsondataset, tracking-metricsdataset]"] - yaml-yamldataset = ["kedro-datasets[pandas-base]", "PyYAML>=4.2, <7.0"] yaml = ["kedro-datasets[yaml-yamldataset]"] diff --git a/kedro-datasets/static/jsonschema/kedro-catalog-0.18.json b/kedro-datasets/static/jsonschema/kedro-catalog-0.18.json index 195f0234a..b9fa61d14 100644 --- a/kedro-datasets/static/jsonschema/kedro-catalog-0.18.json +++ b/kedro-datasets/static/jsonschema/kedro-catalog-0.18.json @@ -42,8 +42,6 @@ "spark.SparkJDBCDataSet", "tensorflow.TensorFlowModelDataset", "text.TextDataSet", - "tracking.JSONDataSet", - "tracking.MetricsDataSet", "yaml.YAMLDataSet" ] } @@ -1312,76 +1310,6 @@ } } }, - { - "if": { - "properties": { - "type": { - "const": "tracking.JSONDataSet" - } - } - }, - "then": { - "required": [ - "filepath" - ], - "properties": { - "filepath": { - "type": "string", - "description": "Filepath in POSIX format to a text file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." - }, - "save_args": { - "type": "object", - "description": "json options for saving JSON files (arguments passed\ninto ```json.dump``). Here you can find all available arguments:\nhttps://docs.python.org/3/library/json.html\nAll defaults are preserved, but \"default_flow_style\", which is set to False." - }, - "credentials": { - "type": [ - "object", - "string" - ], - "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." - }, - "fs_args": { - "type": "object", - "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." - } - } - } - }, - { - "if": { - "properties": { - "type": { - "const": "tracking.MetricsDataSet" - } - } - }, - "then": { - "required": [ - "filepath" - ], - "properties": { - "filepath": { - "type": "string", - "description": "Filepath in POSIX format to a text file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." - }, - "save_args": { - "type": "object", - "description": "json options for saving JSON files (arguments passed\ninto ```json.dump``). Here you can find all available arguments:\nhttps://docs.python.org/3/library/json.html\nAll defaults are preserved, but \"default_flow_style\", which is set to False." - }, - "credentials": { - "type": [ - "object", - "string" - ], - "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." - }, - "fs_args": { - "type": "object", - "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." - } - } - } - }, { "if": { "properties": { diff --git a/kedro-datasets/static/jsonschema/kedro-catalog-0.19.json b/kedro-datasets/static/jsonschema/kedro-catalog-0.19.json index f19266812..087725710 100644 --- a/kedro-datasets/static/jsonschema/kedro-catalog-0.19.json +++ b/kedro-datasets/static/jsonschema/kedro-catalog-0.19.json @@ -41,8 +41,6 @@ "spark.SparkJDBCDataset", "tensorflow.TensorFlowModelDataset", "text.TextDataset", - "tracking.JSONDataset", - "tracking.MetricsDataset", "yaml.YAMLDataset" ] } @@ -1277,76 +1275,6 @@ } } }, - { - "if": { - "properties": { - "type": { - "const": "tracking.JSONDataset" - } - } - }, - "then": { - "required": [ - "filepath" - ], - "properties": { - "filepath": { - "type": "string", - "description": "Filepath in POSIX format to a text file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." - }, - "save_args": { - "type": "object", - "description": "json options for saving JSON files (arguments passed\ninto ```json.dump``). Here you can find all available arguments:\nhttps://docs.python.org/3/library/json.html\nAll defaults are preserved, but \"default_flow_style\", which is set to False." - }, - "credentials": { - "type": [ - "object", - "string" - ], - "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." - }, - "fs_args": { - "type": "object", - "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." - } - } - } - }, - { - "if": { - "properties": { - "type": { - "const": "tracking.MetricsDataset" - } - } - }, - "then": { - "required": [ - "filepath" - ], - "properties": { - "filepath": { - "type": "string", - "description": "Filepath in POSIX format to a text file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." - }, - "save_args": { - "type": "object", - "description": "json options for saving JSON files (arguments passed\ninto ```json.dump``). Here you can find all available arguments:\nhttps://docs.python.org/3/library/json.html\nAll defaults are preserved, but \"default_flow_style\", which is set to False." - }, - "credentials": { - "type": [ - "object", - "string" - ], - "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." - }, - "fs_args": { - "type": "object", - "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." - } - } - } - }, { "if": { "properties": { diff --git a/kedro-datasets/tests/tracking/__init__.py b/kedro-datasets/tests/tracking/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/kedro-datasets/tests/tracking/test_json_dataset.py b/kedro-datasets/tests/tracking/test_json_dataset.py deleted file mode 100644 index de24ba9b9..000000000 --- a/kedro-datasets/tests/tracking/test_json_dataset.py +++ /dev/null @@ -1,195 +0,0 @@ -import inspect -import json -from pathlib import Path, PurePosixPath - -import pytest -from fsspec.implementations.local import LocalFileSystem -from gcsfs import GCSFileSystem -from kedro.io.core import PROTOCOL_DELIMITER, DatasetError, Version -from s3fs.core import S3FileSystem - -from kedro_datasets.tracking import JSONDataset - - -@pytest.fixture -def filepath_json(tmp_path): - return (tmp_path / "test.json").as_posix() - - -@pytest.fixture -def json_dataset(filepath_json, save_args, fs_args): - return JSONDataset(filepath=filepath_json, save_args=save_args, fs_args=fs_args) - - -@pytest.fixture -def explicit_versioned_json_dataset(filepath_json, load_version, save_version): - return JSONDataset( - filepath=filepath_json, version=Version(load_version, save_version) - ) - - -@pytest.fixture -def dummy_data(): - return {"col1": 1, "col2": 2, "col3": "mystring"} - - -class TestJSONDataset: - def test_save(self, filepath_json, dummy_data, tmp_path, save_version): - """Test saving and reloading the dataset.""" - json_dataset = JSONDataset( - filepath=filepath_json, version=Version(None, save_version) - ) - json_dataset.save(dummy_data) - - actual_filepath = Path(json_dataset._filepath.as_posix()) - test_filepath = tmp_path / "locally_saved.json" - - test_filepath.parent.mkdir(parents=True, exist_ok=True) - with open(test_filepath, "w", encoding="utf-8") as file: - json.dump(dummy_data, file) - - with open(test_filepath, encoding="utf-8") as file: - test_data = json.load(file) - - with open( - (actual_filepath / save_version / "test.json"), encoding="utf-8" - ) as actual_file: - actual_data = json.load(actual_file) - - assert actual_data == test_data - assert json_dataset._fs_open_args_load == {} - assert json_dataset._fs_open_args_save == {"mode": "w"} - - def test_load_fail(self, json_dataset, dummy_data): - json_dataset.save(dummy_data) - pattern = r"Loading not supported for 'JSONDataset'" - with pytest.raises(DatasetError, match=pattern): - json_dataset.load() - - def test_exists(self, json_dataset, dummy_data): - """Test `exists` method invocation for both existing and - nonexistent dataset.""" - assert not json_dataset.exists() - json_dataset.save(dummy_data) - assert json_dataset.exists() - - @pytest.mark.parametrize( - "save_args", [{"k1": "v1", "index": "value"}], indirect=True - ) - def test_save_extra_params(self, json_dataset, save_args): - """Test overriding the default save arguments.""" - for key, value in save_args.items(): - assert json_dataset._save_args[key] == value - - @pytest.mark.parametrize( - "fs_args", - [{"open_args_load": {"mode": "rb", "compression": "gzip"}}], - indirect=True, - ) - def test_open_extra_args(self, json_dataset, fs_args): - assert json_dataset._fs_open_args_load == fs_args["open_args_load"] - assert json_dataset._fs_open_args_save == {"mode": "w"} # default unchanged - - @pytest.mark.parametrize( - "filepath,instance_type", - [ - ("s3://bucket/file.json", S3FileSystem), - ("file:///tmp/test.json", LocalFileSystem), - ("/tmp/test.json", LocalFileSystem), - ("gcs://bucket/file.json", GCSFileSystem), - ], - ) - def test_protocol_usage(self, filepath, instance_type): - dataset = JSONDataset(filepath=filepath) - assert isinstance(dataset._fs, instance_type) - - path = filepath.split(PROTOCOL_DELIMITER, 1)[-1] - - assert str(dataset._filepath) == path - assert isinstance(dataset._filepath, PurePosixPath) - - def test_catalog_release(self, mocker): - fs_mock = mocker.patch("fsspec.filesystem").return_value - filepath = "test.json" - dataset = JSONDataset(filepath=filepath) - dataset.release() - fs_mock.invalidate_cache.assert_called_once_with(filepath) - - def test_not_version_str_repr(self): - """Test that version is not in string representation of the class instance.""" - filepath = "test.json" - ds = JSONDataset(filepath=filepath) - - assert filepath in str(ds) - assert "version" not in str(ds) - assert "JSONDataset" in str(ds) - assert "protocol" in str(ds) - # Default save_args - assert "save_args={'indent': 2}" in str(ds) - - def test_version_str_repr(self, load_version, save_version): - """Test that version is in string representation of the class instance.""" - filepath = "test.json" - ds_versioned = JSONDataset( - filepath=filepath, version=Version(load_version, save_version) - ) - - assert filepath in str(ds_versioned) - ver_str = f"version=Version(load={load_version}, save='{save_version}')" - assert ver_str in str(ds_versioned) - assert "JSONDataset" in str(ds_versioned) - assert "protocol" in str(ds_versioned) - # Default save_args - assert "save_args={'indent': 2}" in str(ds_versioned) - - def test_prevent_overwrite(self, explicit_versioned_json_dataset, dummy_data): - """Check the error when attempting to override the dataset if the - corresponding json file for a given save version already exists.""" - explicit_versioned_json_dataset.save(dummy_data) - pattern = ( - r"Save path \'.+\' for JSONDataset\(.+\) must " - r"not exist if versioning is enabled\." - ) - with pytest.raises(DatasetError, match=pattern): - explicit_versioned_json_dataset.save(dummy_data) - - @pytest.mark.parametrize( - "load_version", ["2019-01-01T23.59.59.999Z"], indirect=True - ) - @pytest.mark.parametrize( - "save_version", ["2019-01-02T00.00.00.000Z"], indirect=True - ) - def test_save_version_warning( - self, - explicit_versioned_json_dataset, - load_version, - save_version, - dummy_data, - ): - """Check the warning when saving to the path that differs from - the subsequent load path.""" - pattern = ( - f"Save version '{save_version}' did not match " - f"load version '{load_version}' for " - r"JSONDataset\(.+\)" - ) - with pytest.warns(UserWarning, match=pattern): - explicit_versioned_json_dataset.save(dummy_data) - - def test_http_filesystem_no_versioning(self): - pattern = "Versioning is not supported for HTTP protocols." - - with pytest.raises(DatasetError, match=pattern): - JSONDataset( - filepath="https://example.com/file.json", version=Version(None, None) - ) - - def test_preview(self, json_dataset, dummy_data): - expected_preview = {"col1": 1, "col2": 2, "col3": "mystring"} - json_dataset.save(dummy_data) - preview = json_dataset.preview() - assert preview == expected_preview - assert ( - inspect.signature(json_dataset.preview).return_annotation.__name__ - == "JSONTrackingPreview" - ) diff --git a/kedro-datasets/tests/tracking/test_metrics_dataset.py b/kedro-datasets/tests/tracking/test_metrics_dataset.py deleted file mode 100644 index b638fcdfd..000000000 --- a/kedro-datasets/tests/tracking/test_metrics_dataset.py +++ /dev/null @@ -1,204 +0,0 @@ -import inspect -import json -from pathlib import Path, PurePosixPath - -import pytest -from fsspec.implementations.local import LocalFileSystem -from gcsfs import GCSFileSystem -from kedro.io.core import PROTOCOL_DELIMITER, DatasetError, Version -from s3fs.core import S3FileSystem - -from kedro_datasets.tracking import MetricsDataset - - -@pytest.fixture -def filepath_json(tmp_path): - return (tmp_path / "test.json").as_posix() - - -@pytest.fixture -def metrics_dataset(filepath_json, save_args, fs_args): - return MetricsDataset(filepath=filepath_json, save_args=save_args, fs_args=fs_args) - - -@pytest.fixture -def explicit_versioned_metrics_dataset(filepath_json, load_version, save_version): - return MetricsDataset( - filepath=filepath_json, version=Version(load_version, save_version) - ) - - -@pytest.fixture -def dummy_data(): - return {"col1": 1, "col2": 2, "col3": 3} - - -class TestMetricsDataset: - def test_save_data( - self, - dummy_data, - tmp_path, - filepath_json, - save_version, - ): - """Test saving and reloading the dataset.""" - metrics_dataset = MetricsDataset( - filepath=filepath_json, version=Version(None, save_version) - ) - metrics_dataset.save(dummy_data) - - actual_filepath = Path(metrics_dataset._filepath.as_posix()) - test_filepath = tmp_path / "locally_saved.json" - - test_filepath.parent.mkdir(parents=True, exist_ok=True) - with open(test_filepath, "w", encoding="utf-8") as file: - json.dump(dummy_data, file) - - with open(test_filepath, encoding="utf-8") as file: - test_data = json.load(file) - - with open( - (actual_filepath / save_version / "test.json"), encoding="utf-8" - ) as actual_file: - actual_data = json.load(actual_file) - - assert actual_data == test_data - assert metrics_dataset._fs_open_args_load == {} - assert metrics_dataset._fs_open_args_save == {"mode": "w"} - - def test_load_fail(self, metrics_dataset, dummy_data): - metrics_dataset.save(dummy_data) - pattern = r"Loading not supported for 'MetricsDataset'" - with pytest.raises(DatasetError, match=pattern): - metrics_dataset.load() - - def test_exists(self, metrics_dataset, dummy_data): - """Test `exists` method invocation for both existing and - nonexistent dataset.""" - assert not metrics_dataset.exists() - metrics_dataset.save(dummy_data) - assert metrics_dataset.exists() - - @pytest.mark.parametrize( - "save_args", [{"k1": "v1", "index": "value"}], indirect=True - ) - def test_save_extra_params(self, metrics_dataset, save_args): - """Test overriding the default save arguments.""" - for key, value in save_args.items(): - assert metrics_dataset._save_args[key] == value - - @pytest.mark.parametrize( - "fs_args", - [{"open_args_load": {"mode": "rb", "compression": "gzip"}}], - indirect=True, - ) - def test_open_extra_args(self, metrics_dataset, fs_args): - assert metrics_dataset._fs_open_args_load == fs_args["open_args_load"] - assert metrics_dataset._fs_open_args_save == {"mode": "w"} # default unchanged - - @pytest.mark.parametrize( - "filepath,instance_type", - [ - ("s3://bucket/file.json", S3FileSystem), - ("file:///tmp/test.json", LocalFileSystem), - ("/tmp/test.json", LocalFileSystem), - ("gcs://bucket/file.json", GCSFileSystem), - ], - ) - def test_protocol_usage(self, filepath, instance_type): - dataset = MetricsDataset(filepath=filepath) - assert isinstance(dataset._fs, instance_type) - - path = filepath.split(PROTOCOL_DELIMITER, 1)[-1] - - assert str(dataset._filepath) == path - assert isinstance(dataset._filepath, PurePosixPath) - - def test_catalog_release(self, mocker): - fs_mock = mocker.patch("fsspec.filesystem").return_value - filepath = "test.json" - dataset = MetricsDataset(filepath=filepath) - dataset.release() - fs_mock.invalidate_cache.assert_called_once_with(filepath) - - def test_fail_on_saving_non_numeric_value(self, metrics_dataset): - data = {"col1": 1, "col2": 2, "col3": "hello"} - - pattern = "The MetricsDataset expects only numeric values." - with pytest.raises(DatasetError, match=pattern): - metrics_dataset.save(data) - - def test_not_version_str_repr(self): - """Test that version is not in string representation of the class instance.""" - filepath = "test.json" - ds = MetricsDataset(filepath=filepath) - - assert filepath in str(ds) - assert "version" not in str(ds) - assert "MetricsDataset" in str(ds) - assert "protocol" in str(ds) - # Default save_args - assert "save_args={'indent': 2}" in str(ds) - - def test_version_str_repr(self, load_version, save_version): - """Test that version is in string representation of the class instance.""" - filepath = "test.json" - ds_versioned = MetricsDataset( - filepath=filepath, version=Version(load_version, save_version) - ) - - assert filepath in str(ds_versioned) - ver_str = f"version=Version(load={load_version}, save='{save_version}')" - assert ver_str in str(ds_versioned) - assert "MetricsDataset" in str(ds_versioned) - assert "protocol" in str(ds_versioned) - # Default save_args - assert "save_args={'indent': 2}" in str(ds_versioned) - - def test_prevent_overwrite(self, explicit_versioned_metrics_dataset, dummy_data): - """Check the error when attempting to override the dataset if the - corresponding json file for a given save version already exists.""" - explicit_versioned_metrics_dataset.save(dummy_data) - pattern = ( - r"Save path \'.+\' for MetricsDataset\(.+\) must " - r"not exist if versioning is enabled\." - ) - with pytest.raises(DatasetError, match=pattern): - explicit_versioned_metrics_dataset.save(dummy_data) - - @pytest.mark.parametrize( - "load_version", ["2019-01-01T23.59.59.999Z"], indirect=True - ) - @pytest.mark.parametrize( - "save_version", ["2019-01-02T00.00.00.000Z"], indirect=True - ) - def test_save_version_warning( - self, explicit_versioned_metrics_dataset, load_version, save_version, dummy_data - ): - """Check the warning when saving to the path that differs from - the subsequent load path.""" - pattern = ( - f"Save version '{save_version}' did not match " - f"load version '{load_version}' for " - r"MetricsDataset\(.+\)" - ) - with pytest.warns(UserWarning, match=pattern): - explicit_versioned_metrics_dataset.save(dummy_data) - - def test_http_filesystem_no_versioning(self): - pattern = "Versioning is not supported for HTTP protocols." - - with pytest.raises(DatasetError, match=pattern): - MetricsDataset( - filepath="https://example.com/file.json", version=Version(None, None) - ) - - def test_preview(self, metrics_dataset, dummy_data): - expected_preview = {"col1": 1, "col2": 2, "col3": 3} - metrics_dataset.save(dummy_data) - preview = metrics_dataset.preview() - assert preview == expected_preview - assert ( - inspect.signature(metrics_dataset.preview).return_annotation.__name__ - == "MetricsTrackingPreview" - )