From b55fee1f7b04bad9026b20e9302893cbdd8dee41 Mon Sep 17 00:00:00 2001 From: Merel Theisen Date: Mon, 9 Oct 2023 16:36:27 +0100 Subject: [PATCH 01/12] Rename all DataSet mentions in kedro.io + tests/io to Dataset Signed-off-by: Merel Theisen --- kedro/io/__init__.py | 25 ++------- kedro/io/cached_dataset.py | 17 ------- kedro/io/core.py | 29 ----------- kedro/io/data_catalog.py | 30 +++++------ kedro/io/lambda_dataset.py | 17 ------- kedro/io/memory_dataset.py | 17 ------- kedro/io/partitioned_dataset.py | 25 +-------- tests/io/test_cached_dataset.py | 10 ++-- tests/io/test_core.py | 15 ++---- tests/io/test_data_catalog.py | 76 ++++++++++++++-------------- tests/io/test_incremental_dataset.py | 14 ++--- tests/io/test_partitioned_dataset.py | 70 ++++++++++++------------- 12 files changed, 107 insertions(+), 238 deletions(-) diff --git a/kedro/io/__init__.py b/kedro/io/__init__.py index 26d4c3619c..850254ba26 100644 --- a/kedro/io/__init__.py +++ b/kedro/io/__init__.py @@ -3,7 +3,7 @@ """ from __future__ import annotations -from .cached_dataset import CachedDataSet, CachedDataset +from .cached_dataset import CachedDataset from .core import ( AbstractDataset, AbstractVersionedDataset, @@ -13,22 +13,13 @@ Version, ) from .data_catalog import DataCatalog -from .lambda_dataset import LambdaDataSet, LambdaDataset -from .memory_dataset import MemoryDataSet, MemoryDataset +from .lambda_dataset import LambdaDataset +from .memory_dataset import MemoryDataset from .partitioned_dataset import ( - IncrementalDataSet, IncrementalDataset, - PartitionedDataSet, PartitionedDataset, ) -# https://github.com/pylint-dev/pylint/issues/4300#issuecomment-1043601901 -DataSetError: type[DatasetError] -DataSetNotFoundError: type[DatasetNotFoundError] -DataSetAlreadyExistsError: type[DatasetAlreadyExistsError] -AbstractDataSet: type[AbstractDataset] -AbstractVersionedDataSet: type[AbstractVersionedDataset] - def __getattr__(name): import kedro.io.core # noqa: import-outside-toplevel @@ -39,26 +30,16 @@ def __getattr__(name): __all__ = [ - "AbstractDataSet", "AbstractDataset", - "AbstractVersionedDataSet", "AbstractVersionedDataset", - "CachedDataSet", "CachedDataset", "DataCatalog", - "DataSetAlreadyExistsError", "DatasetAlreadyExistsError", - "DataSetError", "DatasetError", - "DataSetNotFoundError", "DatasetNotFoundError", - "IncrementalDataSet", "IncrementalDataset", - "LambdaDataSet", "LambdaDataset", - "MemoryDataSet", "MemoryDataset", - "PartitionedDataSet", "PartitionedDataset", "Version", ] diff --git a/kedro/io/cached_dataset.py b/kedro/io/cached_dataset.py index 6ec2a59fb7..e0935c8100 100644 --- a/kedro/io/cached_dataset.py +++ b/kedro/io/cached_dataset.py @@ -5,15 +5,11 @@ from __future__ import annotations import logging -import warnings from typing import Any from kedro.io.core import VERSIONED_FLAG_KEY, AbstractDataset, Version from kedro.io.memory_dataset import MemoryDataset -# https://github.com/pylint-dev/pylint/issues/4300#issuecomment-1043601901 -CachedDataSet: type[CachedDataset] - class CachedDataset(AbstractDataset): """``CachedDataset`` is a dataset wrapper which caches in memory the data saved, @@ -121,16 +117,3 @@ def __getstate__(self): logging.getLogger(__name__).warning("%s: clearing cache to pickle.", str(self)) self._cache.release() return self.__dict__ - - -def __getattr__(name): - if name == "CachedDataSet": - alias = CachedDataset - warnings.warn( - f"{repr(name)} has been renamed to {repr(alias.__name__)}, " - f"and the alias will be removed in Kedro 0.19.0", - DeprecationWarning, - stacklevel=2, - ) - return alias - raise AttributeError(f"module {repr(__name__)} has no attribute {repr(name)}") diff --git a/kedro/io/core.py b/kedro/io/core.py index 66dba46495..38307f58e4 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -29,13 +29,6 @@ PROTOCOL_DELIMITER = "://" CLOUD_PROTOCOLS = ("s3", "s3n", "s3a", "gcs", "gs", "adl", "abfs", "abfss", "gdrive") -# https://github.com/pylint-dev/pylint/issues/4300#issuecomment-1043601901 -DataSetError: type[DatasetError] -DataSetNotFoundError: type[DatasetNotFoundError] -DataSetAlreadyExistsError: type[DatasetAlreadyExistsError] -AbstractDataSet: type[AbstractDataset] -AbstractVersionedDataSet: type[AbstractVersionedDataset] - class DatasetError(Exception): """``DatasetError`` raised by ``AbstractDataset`` implementations @@ -757,25 +750,3 @@ def validate_on_forbidden_chars(**kwargs): raise DatasetError( f"Neither white-space nor semicolon are allowed in '{key}'." ) - - -_DEPRECATED_CLASSES = { - "DataSetError": DatasetError, - "DataSetNotFoundError": DatasetNotFoundError, - "DataSetAlreadyExistsError": DatasetAlreadyExistsError, - "AbstractDataSet": AbstractDataset, - "AbstractVersionedDataSet": AbstractVersionedDataset, -} - - -def __getattr__(name): - if name in _DEPRECATED_CLASSES: - alias = _DEPRECATED_CLASSES[name] - warnings.warn( - f"{repr(name)} has been renamed to {repr(alias.__name__)}, " - f"and the alias will be removed in Kedro 0.19.0", - DeprecationWarning, - stacklevel=2, - ) - return alias - raise AttributeError(f"module {repr(__name__)} has no attribute {repr(name)}") diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index 58aebfe73d..443d28b7cb 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -176,9 +176,9 @@ def __init__( # noqa: too-many-arguments Example: :: - >>> from kedro_datasets.pandas import CSVDataSet + >>> from kedro_datasets.pandas import CSVDataset >>> - >>> cars = CSVDataSet(filepath="cars.csv", + >>> cars = CSVDataset(filepath="cars.csv", >>> load_args=None, >>> save_args={"index": False}) >>> io = DataCatalog(datasets={'cars': cars}) @@ -246,14 +246,14 @@ class to be loaded is specified with the key ``type`` and their >>> config = { >>> "cars": { - >>> "type": "pandas.CSVDataSet", + >>> "type": "pandas.CSVDataset", >>> "filepath": "cars.csv", >>> "save_args": { >>> "index": False >>> } >>> }, >>> "boats": { - >>> "type": "pandas.CSVDataSet", + >>> "type": "pandas.CSVDataset", >>> "filepath": "s3://aws-bucket-name/boats.csv", >>> "credentials": "boats_credentials", >>> "save_args": { @@ -484,9 +484,9 @@ def load(self, name: str, version: str = None) -> Any: :: >>> from kedro.io import DataCatalog - >>> from kedro_datasets.pandas import CSVDataSet + >>> from kedro_datasets.pandas import CSVDataset >>> - >>> cars = CSVDataSet(filepath="cars.csv", + >>> cars = CSVDataset(filepath="cars.csv", >>> load_args=None, >>> save_args={"index": False}) >>> io = DataCatalog(datasets={'cars': cars}) @@ -524,9 +524,9 @@ def save(self, name: str, data: Any) -> None: >>> import pandas as pd >>> - >>> from kedro_datasets.pandas import CSVDataSet + >>> from kedro_datasets.pandas import CSVDataset >>> - >>> cars = CSVDataSet(filepath="cars.csv", + >>> cars = CSVDataset(filepath="cars.csv", >>> load_args=None, >>> save_args={"index": False}) >>> io = DataCatalog(datasets={'cars': cars}) @@ -598,13 +598,13 @@ def add( Example: :: - >>> from kedro_datasets.pandas import CSVDataSet + >>> from kedro_datasets.pandas import CSVDataset >>> >>> io = DataCatalog(datasets={ - >>> 'cars': CSVDataSet(filepath="cars.csv") + >>> 'cars': CSVDataset(filepath="cars.csv") >>> }) >>> - >>> io.add("boats", CSVDataSet(filepath="boats.csv")) + >>> io.add("boats", CSVDataset(filepath="boats.csv")) """ if dataset_name in self._datasets: if replace: @@ -634,14 +634,14 @@ def add_all( Example: :: - >>> from kedro_datasets.pandas import CSVDataSet, ParquetDataSet + >>> from kedro_datasets.pandas import CSVDataset, ParquetDataset >>> >>> io = DataCatalog(datasets={ - >>> "cars": CSVDataSet(filepath="cars.csv") + >>> "cars": CSVDataset(filepath="cars.csv") >>> }) >>> additional = { - >>> "planes": ParquetDataSet("planes.parq"), - >>> "boats": CSVDataSet(filepath="boats.csv") + >>> "planes": ParquetDataset("planes.parq"), + >>> "boats": CSVDataset(filepath="boats.csv") >>> } >>> >>> io.add_all(additional) diff --git a/kedro/io/lambda_dataset.py b/kedro/io/lambda_dataset.py index b5ec9f6232..bef5146ee7 100644 --- a/kedro/io/lambda_dataset.py +++ b/kedro/io/lambda_dataset.py @@ -4,14 +4,10 @@ """ from __future__ import annotations -import warnings from typing import Any, Callable from kedro.io.core import AbstractDataset, DatasetError -# https://github.com/pylint-dev/pylint/issues/4300#issuecomment-1043601901 -LambdaDataSet: type[LambdaDataset] - class LambdaDataset(AbstractDataset): """``LambdaDataset`` loads and saves data to a data set. @@ -121,16 +117,3 @@ def __init__( # noqa: too-many-arguments self.__exists = exists self.__release = release self.metadata = metadata - - -def __getattr__(name): - if name == "LambdaDataSet": - alias = LambdaDataset - warnings.warn( - f"{repr(name)} has been renamed to {repr(alias.__name__)}, " - f"and the alias will be removed in Kedro 0.19.0", - DeprecationWarning, - stacklevel=2, - ) - return alias - raise AttributeError(f"module {repr(__name__)} has no attribute {repr(name)}") diff --git a/kedro/io/memory_dataset.py b/kedro/io/memory_dataset.py index 5e52e6e1bd..5b1075fdb0 100644 --- a/kedro/io/memory_dataset.py +++ b/kedro/io/memory_dataset.py @@ -3,16 +3,12 @@ from __future__ import annotations import copy -import warnings from typing import Any from kedro.io.core import AbstractDataset, DatasetError _EMPTY = object() -# https://github.com/pylint-dev/pylint/issues/4300#issuecomment-1043601901 -MemoryDataSet: type[MemoryDataset] - class MemoryDataset(AbstractDataset): """``MemoryDataset`` loads and saves data from/to an in-memory @@ -140,16 +136,3 @@ def _copy_with_mode(data: Any, copy_mode: str) -> Any: ) return copied_data - - -def __getattr__(name): - if name == "MemoryDataSet": - alias = MemoryDataset - warnings.warn( - f"{repr(name)} has been renamed to {repr(alias.__name__)}, " - f"and the alias will be removed in Kedro 0.19.0", - DeprecationWarning, - stacklevel=2, - ) - return alias - raise AttributeError(f"module {repr(__name__)} has no attribute {repr(name)}") diff --git a/kedro/io/partitioned_dataset.py b/kedro/io/partitioned_dataset.py index 4b3e9eccb3..00f3364802 100644 --- a/kedro/io/partitioned_dataset.py +++ b/kedro/io/partitioned_dataset.py @@ -31,10 +31,6 @@ S3_PROTOCOLS = ("s3", "s3a", "s3n") -# https://github.com/pylint-dev/pylint/issues/4300#issuecomment-1043601901 -PartitionedDataSet: type[PartitionedDataset] -IncrementalDataSet: type[IncrementalDataset] - class PartitionedDataset(AbstractDataset): # noqa: too-many-instance-attributes,protected-access @@ -379,7 +375,7 @@ class IncrementalDataset(PartitionedDataset): >>> dataset.load() """ - DEFAULT_CHECKPOINT_TYPE = "kedro_datasets.text.TextDataSet" # TODO: PartitionedDataset should move to kedro-datasets + DEFAULT_CHECKPOINT_TYPE = "kedro_datasets.text.TextDataset" # TODO: PartitionedDataset should move to kedro-datasets DEFAULT_CHECKPOINT_FILENAME = "CHECKPOINT" def __init__( # noqa: too-many-arguments @@ -554,22 +550,3 @@ def confirm(self) -> None: partition_ids = [self._path_to_partition(p) for p in self._list_partitions()] if partition_ids: self._checkpoint.save(partition_ids[-1]) # checkpoint to last partition - - -_DEPRECATED_CLASSES = { - "PartitionedDataSet": PartitionedDataset, - "IncrementalDataSet": IncrementalDataset, -} - - -def __getattr__(name): - if name in _DEPRECATED_CLASSES: - alias = _DEPRECATED_CLASSES[name] - warnings.warn( - f"{repr(name)} has been renamed to {repr(alias.__name__)}, " - f"and the alias will be removed in Kedro 0.19.0", - DeprecationWarning, - stacklevel=2, - ) - return alias - raise AttributeError(f"module {repr(__name__)} has no attribute {repr(name)}") diff --git a/tests/io/test_cached_dataset.py b/tests/io/test_cached_dataset.py index 92499de81c..fa93dfaa2f 100644 --- a/tests/io/test_cached_dataset.py +++ b/tests/io/test_cached_dataset.py @@ -3,7 +3,7 @@ import pytest import yaml -from kedro_datasets.pandas import CSVDataSet +from kedro_datasets.pandas import CSVDataset from kedro.io import CachedDataset, DataCatalog, DatasetError, MemoryDataset @@ -11,7 +11,7 @@ test_ds: type: CachedDataset dataset: - type: kedro_datasets.pandas.CSVDataSet + type: kedro_datasets.pandas.CSVDataset filepath: example.csv """ @@ -20,7 +20,7 @@ type: CachedDataset versioned: true dataset: - type: kedro_datasets.pandas.CSVDataSet + type: kedro_datasets.pandas.CSVDataset filepath: example.csv """ @@ -28,7 +28,7 @@ test_ds: type: CachedDataset dataset: - type: kedro_datasets.pandas.CSVDataSet + type: kedro_datasets.pandas.CSVDataset filepath: example.csv versioned: true """ @@ -81,7 +81,7 @@ def test_from_yaml(self, mocker): catalog = DataCatalog.from_config(config) assert catalog.list() == ["test_ds"] mock = mocker.Mock() - assert isinstance(catalog._datasets["test_ds"]._dataset, CSVDataSet) + assert isinstance(catalog._datasets["test_ds"]._dataset, CSVDataset) catalog._datasets["test_ds"]._dataset = mock catalog.save("test_ds", 20) diff --git a/tests/io/test_core.py b/tests/io/test_core.py index 7274a0cd32..77774e7f50 100644 --- a/tests/io/test_core.py +++ b/tests/io/test_core.py @@ -1,6 +1,5 @@ from __future__ import annotations -import importlib from decimal import Decimal from fractions import Fraction from pathlib import PurePosixPath @@ -9,7 +8,6 @@ import pytest from kedro.io.core import ( - _DEPRECATED_CLASSES, AbstractDataset, _parse_filepath, get_filepath_str, @@ -33,14 +31,7 @@ ] -@pytest.mark.parametrize("module_name", ["kedro.io", "kedro.io.core"]) -@pytest.mark.parametrize("class_name", _DEPRECATED_CLASSES) -def test_deprecation(module_name, class_name): - with pytest.warns(DeprecationWarning, match=f"{repr(class_name)} has been renamed"): - getattr(importlib.import_module(module_name), class_name) - - -class MyDataSet(AbstractDataset): +class MyDataset(AbstractDataset): def __init__(self, var=None): self.var = var @@ -57,10 +48,10 @@ def _save(self, data): class TestCoreFunctions: @pytest.mark.parametrize("var", [1, True] + FALSE_BUILTINS) def test_str_representation(self, var): - assert str(MyDataSet(var)) == f"MyDataSet(var={var})" + assert str(MyDataset(var)) == f"MyDataset(var={var})" def test_str_representation_none(self): - assert str(MyDataSet()) == "MyDataSet()" + assert str(MyDataset()) == "MyDataset()" def test_get_filepath_str(self): path = get_filepath_str(PurePosixPath("example.com/test.csv"), "http") diff --git a/tests/io/test_data_catalog.py b/tests/io/test_data_catalog.py index b3d8dc7ef7..afd1707bea 100644 --- a/tests/io/test_data_catalog.py +++ b/tests/io/test_data_catalog.py @@ -7,7 +7,7 @@ import pandas as pd import pytest -from kedro_datasets.pandas import CSVDataSet, ParquetDataSet +from kedro_datasets.pandas import CSVDataset, ParquetDataset from pandas.testing import assert_frame_equal from kedro.io import ( @@ -42,9 +42,9 @@ def dummy_dataframe(): def sane_config(filepath): return { "catalog": { - "boats": {"type": "pandas.CSVDataSet", "filepath": filepath}, + "boats": {"type": "pandas.CSVDataset", "filepath": filepath}, "cars": { - "type": "pandas.CSVDataSet", + "type": "pandas.CSVDataset", "filepath": "s3://test_bucket/test_file.csv", "credentials": "s3_credentials", "layer": "raw", @@ -78,11 +78,11 @@ def sane_config_with_tracking_ds(tmp_path): return { "catalog": { "boats": { - "type": "pandas.CSVDataSet", + "type": "pandas.CSVDataset", "filepath": boat_path, "versioned": True, }, - "planes": {"type": "tracking.MetricsDataSet", "filepath": plane_path}, + "planes": {"type": "tracking.MetricsDataset", "filepath": plane_path}, }, } @@ -92,15 +92,15 @@ def config_with_dataset_factories(): return { "catalog": { "{brand}_cars": { - "type": "pandas.CSVDataSet", + "type": "pandas.CSVDataset", "filepath": "data/01_raw/{brand}_cars.csv", }, "audi_cars": { - "type": "pandas.ParquetDataSet", + "type": "pandas.ParquetDataset", "filepath": "data/01_raw/audi_cars.pq", }, "{type}_boats": { - "type": "pandas.CSVDataSet", + "type": "pandas.CSVDataset", "filepath": "data/01_raw/{type}_boats.csv", }, }, @@ -114,7 +114,7 @@ def config_with_dataset_factories_nested(): "{brand}_cars": { "type": "PartitionedDataset", "path": "data/01_raw", - "dataset": "pandas.CSVDataSet", + "dataset": "pandas.CSVDataset", "metadata": { "my-plugin": { "brand": "{brand}", @@ -133,7 +133,7 @@ def config_with_dataset_factories_nested(): @pytest.fixture def config_with_dataset_factories_with_default(config_with_dataset_factories): config_with_dataset_factories["catalog"]["{default_dataset}"] = { - "type": "pandas.CSVDataSet", + "type": "pandas.CSVDataset", "filepath": "data/01_raw/{default_dataset}.csv", } return config_with_dataset_factories @@ -142,7 +142,7 @@ def config_with_dataset_factories_with_default(config_with_dataset_factories): @pytest.fixture def config_with_dataset_factories_bad_pattern(config_with_dataset_factories): config_with_dataset_factories["catalog"]["{type}@planes"] = { - "type": "pandas.ParquetDataSet", + "type": "pandas.ParquetDataset", "filepath": "data/01_raw/{brand}_plane.pq", } return config_with_dataset_factories @@ -153,19 +153,19 @@ def config_with_dataset_factories_only_patterns(): return { "catalog": { "{default}": { - "type": "pandas.CSVDataSet", + "type": "pandas.CSVDataset", "filepath": "data/01_raw/{default}.csv", }, "{namespace}_{dataset}": { - "type": "pandas.CSVDataSet", + "type": "pandas.CSVDataset", "filepath": "data/01_raw/{namespace}_{dataset}.pq", }, "{country}_companies": { - "type": "pandas.CSVDataSet", + "type": "pandas.CSVDataset", "filepath": "data/01_raw/{country}_companies.csv", }, "{dataset}s": { - "type": "pandas.CSVDataSet", + "type": "pandas.CSVDataset", "filepath": "data/01_raw/{dataset}s.csv", }, }, @@ -174,13 +174,13 @@ def config_with_dataset_factories_only_patterns(): @pytest.fixture def dataset(filepath): - return CSVDataSet(filepath=filepath, save_args={"index": False}) + return CSVDataset(filepath=filepath, save_args={"index": False}) @pytest.fixture def multi_catalog(): - csv = CSVDataSet(filepath="abc.csv") - parq = ParquetDataSet(filepath="xyz.parq") + csv = CSVDataset(filepath="abc.csv") + parq = ParquetDataset(filepath="xyz.parq") layers = {"raw": {"abc.csv"}, "model": {"xyz.parq"}} return DataCatalog({"abc": csv, "xyz": parq}, layers=layers) @@ -364,14 +364,14 @@ def test_eq(self, multi_catalog, data_catalog): def test_datasets_on_init(self, data_catalog_from_config): """Check datasets are loaded correctly on construction""" - assert isinstance(data_catalog_from_config.datasets.boats, CSVDataSet) - assert isinstance(data_catalog_from_config.datasets.cars, CSVDataSet) + assert isinstance(data_catalog_from_config.datasets.boats, CSVDataset) + assert isinstance(data_catalog_from_config.datasets.cars, CSVDataset) def test_datasets_on_add(self, data_catalog_from_config): """Check datasets are updated correctly after adding""" - data_catalog_from_config.add("new_dataset", CSVDataSet("some_path")) - assert isinstance(data_catalog_from_config.datasets.new_dataset, CSVDataSet) - assert isinstance(data_catalog_from_config.datasets.boats, CSVDataSet) + data_catalog_from_config.add("new_dataset", CSVDataset("some_path")) + assert isinstance(data_catalog_from_config.datasets.new_dataset, CSVDataset) + assert isinstance(data_catalog_from_config.datasets.boats, CSVDataset) def test_adding_datasets_not_allowed(self, data_catalog_from_config): """Check error if user tries to update the datasets attribute""" @@ -454,15 +454,15 @@ def test_config_invalid_module(self, sane_config): """Check the error if the type points to nonexistent module""" sane_config["catalog"]["boats"][ "type" - ] = "kedro.invalid_module_name.io.CSVDataSet" + ] = "kedro.invalid_module_name.io.CSVDataset" - error_msg = "Class 'kedro.invalid_module_name.io.CSVDataSet' not found" + error_msg = "Class 'kedro.invalid_module_name.io.CSVDataset' not found" with pytest.raises(DatasetError, match=re.escape(error_msg)): DataCatalog.from_config(**sane_config) def test_config_relative_import(self, sane_config): """Check the error if the type points to a relative import""" - sane_config["catalog"]["boats"]["type"] = ".CSVDataSetInvalid" + sane_config["catalog"]["boats"]["type"] = ".CSVDatasetInvalid" pattern = "'type' class path does not support relative paths" with pytest.raises(DatasetError, match=re.escape(pattern)): @@ -480,20 +480,20 @@ def test_config_import_kedro_datasets(self, sane_config, mocker): # In Python 3.7 call_args.args is not available thus we access the call # arguments with less meaningful index. # The 1st index returns a tuple, the 2nd index return the name of module. - assert call_args[0][0] == f"{prefix}pandas.CSVDataSet" + assert call_args[0][0] == f"{prefix}pandas.CSVDataset" def test_config_import_extras(self, sane_config): """Test kedro_datasets default path to the dataset class""" - sane_config["catalog"]["boats"]["type"] = "pandas.CSVDataSet" + sane_config["catalog"]["boats"]["type"] = "pandas.CSVDataset" assert DataCatalog.from_config(**sane_config) def test_config_missing_class(self, sane_config): """Check the error if the type points to nonexistent class""" - sane_config["catalog"]["boats"]["type"] = "kedro.io.CSVDataSetInvalid" + sane_config["catalog"]["boats"]["type"] = "kedro.io.CSVDatasetInvalid" pattern = ( "An exception occurred when parsing config for dataset 'boats':\n" - "Class 'kedro.io.CSVDataSetInvalid' not found" + "Class 'kedro.io.CSVDatasetInvalid' not found" ) with pytest.raises(DatasetError, match=re.escape(pattern)): DataCatalog.from_config(**sane_config) @@ -570,10 +570,10 @@ def test_missing_dependency(self, sane_config, mocker): pattern = "dependency issue" def dummy_load(obj_path, *args, **kwargs): - if obj_path == "kedro_datasets.pandas.CSVDataSet": + if obj_path == "kedro_datasets.pandas.CSVDataset": raise AttributeError(pattern) if obj_path == "kedro_datasets.pandas.__all__": - return ["CSVDataSet"] + return ["CSVDataset"] mocker.patch("kedro.io.core.load_obj", side_effect=dummy_load) with pytest.raises(DatasetError, match=pattern): @@ -598,7 +598,7 @@ def test_confirm(self, tmp_path, caplog, mocker): catalog = { "ds_to_confirm": { "type": "IncrementalDataset", - "dataset": "pandas.CSVDataSet", + "dataset": "pandas.CSVDataset", "path": str(tmp_path), } } @@ -748,7 +748,7 @@ def test_load_version_on_unversioned_dataset( def test_replacing_nonword_characters(self): """Test replacing non-word characters in dataset names""" - csv = CSVDataSet(filepath="abc.csv") + csv = CSVDataset(filepath="abc.csv") datasets = {"ds1@spark": csv, "ds2_spark": csv, "ds3.csv": csv, "jalapeƱo": csv} catalog = DataCatalog(datasets=datasets) @@ -767,7 +767,7 @@ def test_no_versions_with_cloud_protocol(self, monkeypatch): monkeypatch.setenv("AWS_ACCESS_KEY_ID", "dummmy") monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "dummmy") version = Version(load=None, save=None) - versioned_dataset = CSVDataSet("s3://bucket/file.csv", version=version) + versioned_dataset = CSVDataset("s3://bucket/file.csv", version=version) pattern = re.escape( f"Did not find any versions for {versioned_dataset}. " f"This could be due to insufficient permission." @@ -785,7 +785,7 @@ def test_match_added_to_datasets_on_get(self, config_with_dataset_factories): assert "{brand}_cars" in catalog._dataset_patterns tesla_cars = catalog._get_dataset("tesla_cars") - assert isinstance(tesla_cars, CSVDataSet) + assert isinstance(tesla_cars, CSVDataset) assert "tesla_cars" in catalog._datasets @pytest.mark.parametrize( @@ -818,7 +818,7 @@ def test_explicit_entry_not_overwritten(self, config_with_dataset_factories): """Check that the existing catalog entry is not overwritten by config in pattern""" catalog = DataCatalog.from_config(**config_with_dataset_factories) audi_cars = catalog._get_dataset("audi_cars") - assert isinstance(audi_cars, ParquetDataSet) + assert isinstance(audi_cars, ParquetDataset) @pytest.mark.parametrize( "dataset_name,pattern", @@ -859,7 +859,7 @@ def test_default_dataset(self, config_with_dataset_factories_with_default, caplo "in the catalog will be used to override the default " "MemoryDataset creation for the dataset 'jet@planes'" in log_record.message ) - assert isinstance(jet_dataset, CSVDataSet) + assert isinstance(jet_dataset, CSVDataset) def test_unmatched_key_error_when_parsing_config( self, config_with_dataset_factories_bad_pattern diff --git a/tests/io/test_incremental_dataset.py b/tests/io/test_incremental_dataset.py index db9421e886..c36c6b62f9 100644 --- a/tests/io/test_incremental_dataset.py +++ b/tests/io/test_incremental_dataset.py @@ -8,15 +8,15 @@ import boto3 import pandas as pd import pytest -from kedro_datasets.pickle import PickleDataSet -from kedro_datasets.text import TextDataSet +from kedro_datasets.pickle import PickleDataset +from kedro_datasets.text import TextDataset from moto import mock_s3 from pandas.testing import assert_frame_equal from kedro.io import AbstractDataset, DatasetError, IncrementalDataset from kedro.io.data_catalog import CREDENTIALS_KEY -DATASET = "kedro_datasets.pandas.CSVDataSet" +DATASET = "kedro_datasets.pandas.CSVDataset" @pytest.fixture @@ -226,8 +226,8 @@ def test_checkpoint_path(self, local_csvs, partitioned_data_pandas): @pytest.mark.parametrize( "checkpoint_config,expected_checkpoint_class", [ - (None, TextDataSet), - ({"type": "kedro_datasets.pickle.PickleDataSet"}, PickleDataSet), + (None, TextDataset), + ({"type": "kedro_datasets.pickle.PickleDataset"}, PickleDataset), ({"type": "tests.io.test_incremental_dataset.DummyDataset"}, DummyDataset), ], ) @@ -372,7 +372,7 @@ def mocked_csvs_in_s3(mocked_s3_bucket, partitioned_data_pandas): return f"s3://{BUCKET_NAME}/{prefix}" -class TestPartitionedDataSetS3: +class TestPartitionedDatasetS3: os.environ["AWS_ACCESS_KEY_ID"] = "FAKE_ACCESS_KEY" os.environ["AWS_SECRET_ACCESS_KEY"] = "FAKE_SECRET_KEY" @@ -477,7 +477,7 @@ def test_force_checkpoint_checkpoint_file_exists( checkpoint_path = ( f"{mocked_csvs_in_s3}/{IncrementalDataset.DEFAULT_CHECKPOINT_FILENAME}" ) - checkpoint_value = TextDataSet(checkpoint_path).load() + checkpoint_value = TextDataset(checkpoint_path).load() assert checkpoint_value == "p04/data.csv" pds = IncrementalDataset( diff --git a/tests/io/test_partitioned_dataset.py b/tests/io/test_partitioned_dataset.py index 02903cca7c..0acece1eab 100644 --- a/tests/io/test_partitioned_dataset.py +++ b/tests/io/test_partitioned_dataset.py @@ -7,7 +7,7 @@ import pandas as pd import pytest import s3fs -from kedro_datasets.pandas import CSVDataSet, ParquetDataSet +from kedro_datasets.pandas import CSVDataset, ParquetDataset from moto import mock_s3 from pandas.testing import assert_frame_equal @@ -38,11 +38,11 @@ def local_csvs(tmp_path, partitioned_data_pandas): LOCAL_DATASET_DEFINITION = [ - "pandas.CSVDataSet", - "kedro_datasets.pandas.CSVDataSet", - CSVDataSet, - {"type": "pandas.CSVDataSet", "save_args": {"index": False}}, - {"type": CSVDataSet}, + "pandas.CSVDataset", + "kedro_datasets.pandas.CSVDataset", + CSVDataset, + {"type": "pandas.CSVDataset", "save_args": {"index": False}}, + {"type": CSVDataset}, ] @@ -101,7 +101,7 @@ def original_data(): def test_save_invalidates_cache(self, local_csvs, mocker): """Test that save calls invalidate partition cache""" - pds = PartitionedDataset(str(local_csvs), "pandas.CSVDataSet") + pds = PartitionedDataset(str(local_csvs), "pandas.CSVDataset") mocked_fs_invalidate = mocker.patch.object(pds._filesystem, "invalidate_cache") first_load = pds.load() assert pds._partition_cache.currsize == 1 @@ -125,7 +125,7 @@ def test_save_invalidates_cache(self, local_csvs, mocker): @pytest.mark.parametrize("overwrite,expected_num_parts", [(False, 6), (True, 1)]) def test_overwrite(self, local_csvs, overwrite, expected_num_parts): pds = PartitionedDataset( - str(local_csvs), "pandas.CSVDataSet", overwrite=overwrite + str(local_csvs), "pandas.CSVDataset", overwrite=overwrite ) original_data = pd.DataFrame({"foo": 42, "bar": ["a", "b", None]}) part_id = "new/data" @@ -137,9 +137,9 @@ def test_overwrite(self, local_csvs, overwrite, expected_num_parts): def test_release_instance_cache(self, local_csvs): """Test that cache invalidation does not affect other instances""" - ds_a = PartitionedDataset(str(local_csvs), "pandas.CSVDataSet") + ds_a = PartitionedDataset(str(local_csvs), "pandas.CSVDataset") ds_a.load() - ds_b = PartitionedDataset(str(local_csvs), "pandas.CSVDataSet") + ds_b = PartitionedDataset(str(local_csvs), "pandas.CSVDataset") ds_b.load() assert ds_a._partition_cache.currsize == 1 @@ -151,7 +151,7 @@ def test_release_instance_cache(self, local_csvs): # cache of the dataset B is unaffected assert ds_b._partition_cache.currsize == 1 - @pytest.mark.parametrize("dataset", ["pandas.CSVDataSet", "pandas.ParquetDataSet"]) + @pytest.mark.parametrize("dataset", ["pandas.CSVDataset", "pandas.ParquetDataset"]) def test_exists(self, local_csvs, dataset): assert PartitionedDataset(str(local_csvs), dataset).exists() @@ -192,7 +192,7 @@ def test_load_args(self, mocker): path = str(Path.cwd()) load_args = {"maxdepth": 42, "withdirs": True} - pds = PartitionedDataset(path, "pandas.CSVDataSet", load_args=load_args) + pds = PartitionedDataset(path, "pandas.CSVDataset", load_args=load_args) mocker.patch.object(pds, "_path_to_partition", return_value=fake_partition_name) assert pds.load().keys() == {fake_partition_name} @@ -207,7 +207,7 @@ def test_credentials( ): mocked_filesystem = mocker.patch("fsspec.filesystem") path = str(Path.cwd()) - pds = PartitionedDataset(path, "pandas.CSVDataSet", credentials=credentials) + pds = PartitionedDataset(path, "pandas.CSVDataset", credentials=credentials) assert mocked_filesystem.call_count == 2 mocked_filesystem.assert_called_with("file", **expected_pds_creds) @@ -233,13 +233,13 @@ def test_fs_args(self, mocker): mocked_filesystem = mocker.patch("fsspec.filesystem") path = str(Path.cwd()) - pds = PartitionedDataset(path, "pandas.CSVDataSet", fs_args=fs_args) + pds = PartitionedDataset(path, "pandas.CSVDataset", fs_args=fs_args) assert mocked_filesystem.call_count == 2 mocked_filesystem.assert_called_with("file", **fs_args) assert pds._dataset_config["fs_args"] == fs_args - @pytest.mark.parametrize("dataset", ["pandas.ParquetDataSet", ParquetDataSet]) + @pytest.mark.parametrize("dataset", ["pandas.ParquetDataset", ParquetDataset]) def test_invalid_dataset(self, dataset, local_csvs): pds = PartitionedDataset(str(local_csvs), dataset) loaded_partitions = pds.load() @@ -278,8 +278,8 @@ def test_invalid_dataset_config(self, dataset_config, error_pattern): @pytest.mark.parametrize( "dataset_config", [ - {"type": CSVDataSet, "versioned": True}, - {"type": "pandas.CSVDataSet", "versioned": True}, + {"type": CSVDataset, "versioned": True}, + {"type": "pandas.CSVDataset", "versioned": True}, ], ) def test_versioned_dataset_not_allowed(self, dataset_config): @@ -291,7 +291,7 @@ def test_versioned_dataset_not_allowed(self, dataset_config): PartitionedDataset(str(Path.cwd()), dataset_config) def test_no_partitions(self, tmpdir): - pds = PartitionedDataset(str(tmpdir), "pandas.CSVDataSet") + pds = PartitionedDataset(str(tmpdir), "pandas.CSVDataset") pattern = re.escape(f"No partitions found in '{tmpdir}'") with pytest.raises(DatasetError, match=pattern): @@ -303,14 +303,14 @@ def test_no_partitions(self, tmpdir): ( { "path": str(Path.cwd()), - "dataset": {"type": CSVDataSet, "filepath": "fake_path"}, + "dataset": {"type": CSVDataset, "filepath": "fake_path"}, }, "filepath", ), ( { "path": str(Path.cwd()), - "dataset": {"type": CSVDataSet, "other_arg": "fake_path"}, + "dataset": {"type": CSVDataset, "other_arg": "fake_path"}, "filepath_arg": "other_arg", }, "other_arg", @@ -330,7 +330,7 @@ def test_credentials_log_warning(self, caplog): the top-level ones""" pds = PartitionedDataset( path=str(Path.cwd()), - dataset={"type": CSVDataSet, "credentials": {"secret": "dataset"}}, + dataset={"type": CSVDataset, "credentials": {"secret": "dataset"}}, credentials={"secret": "global"}, ) log_message = KEY_PROPAGATION_WARNING % { @@ -345,7 +345,7 @@ def test_fs_args_log_warning(self, caplog): arguments will overwrite the top-level ones""" pds = PartitionedDataset( path=str(Path.cwd()), - dataset={"type": CSVDataSet, "fs_args": {"args": "dataset"}}, + dataset={"type": CSVDataset, "fs_args": {"args": "dataset"}}, fs_args={"args": "dataset"}, ) log_message = KEY_PROPAGATION_WARNING % { @@ -359,14 +359,14 @@ def test_fs_args_log_warning(self, caplog): "pds_config,expected_ds_creds,global_creds", [ ( - {"dataset": "pandas.CSVDataSet", "credentials": {"secret": "global"}}, + {"dataset": "pandas.CSVDataset", "credentials": {"secret": "global"}}, {"secret": "global"}, {"secret": "global"}, ), ( { "dataset": { - "type": CSVDataSet, + "type": CSVDataset, "credentials": {"secret": "expected"}, }, }, @@ -375,7 +375,7 @@ def test_fs_args_log_warning(self, caplog): ), ( { - "dataset": {"type": CSVDataSet, "credentials": None}, + "dataset": {"type": CSVDataset, "credentials": None}, "credentials": {"secret": "global"}, }, None, @@ -384,7 +384,7 @@ def test_fs_args_log_warning(self, caplog): ( { "dataset": { - "type": CSVDataSet, + "type": CSVDataset, "credentials": {"secret": "expected"}, }, "credentials": {"secret": "global"}, @@ -403,11 +403,11 @@ def test_dataset_creds(self, pds_config, expected_ds_creds, global_creds): BUCKET_NAME = "fake_bucket_name" S3_DATASET_DEFINITION = [ - "pandas.CSVDataSet", - "kedro_datasets.pandas.CSVDataSet", - CSVDataSet, - {"type": "pandas.CSVDataSet", "save_args": {"index": False}}, - {"type": CSVDataSet}, + "pandas.CSVDataset", + "kedro_datasets.pandas.CSVDataset", + CSVDataset, + {"type": "pandas.CSVDataset", "save_args": {"index": False}}, + {"type": CSVDataset}, ] @@ -456,7 +456,7 @@ def test_load_s3a(self, mocked_csvs_in_s3, partitioned_data_pandas, mocker): s3a_path = f"s3a://{path}" # any type is fine as long as it passes isinstance check # since _dataset_type is mocked later anyways - pds = PartitionedDataset(s3a_path, "pandas.CSVDataSet") + pds = PartitionedDataset(s3a_path, "pandas.CSVDataset") assert pds._protocol == "s3a" mocked_ds = mocker.patch.object(pds, "_dataset_type") @@ -481,7 +481,7 @@ def test_join_protocol_with_bucket_name_startswith_protocol( bucket name starts with the protocol name, i.e. `s3://s3_bucket/dummy_.txt` """ - pds = PartitionedDataset(mocked_csvs_in_s3, "pandas.CSVDataSet") + pds = PartitionedDataset(mocked_csvs_in_s3, "pandas.CSVDataset") assert pds._join_protocol(partition_path) == f"s3://{partition_path}" @pytest.mark.parametrize("dataset", S3_DATASET_DEFINITION) @@ -505,7 +505,7 @@ def test_save_s3a(self, mocked_csvs_in_s3, mocker): s3a_path = f"s3a://{path}" # any type is fine as long as it passes isinstance check # since _dataset_type is mocked later anyways - pds = PartitionedDataset(s3a_path, "pandas.CSVDataSet", filename_suffix=".csv") + pds = PartitionedDataset(s3a_path, "pandas.CSVDataset", filename_suffix=".csv") assert pds._protocol == "s3a" mocked_ds = mocker.patch.object(pds, "_dataset_type") @@ -517,7 +517,7 @@ def test_save_s3a(self, mocked_csvs_in_s3, mocker): mocked_ds.assert_called_once_with(filepath=f"{s3a_path}/{new_partition}.csv") mocked_ds.return_value.save.assert_called_once_with(data) - @pytest.mark.parametrize("dataset", ["pandas.CSVDataSet", "pandas.HDFDataSet"]) + @pytest.mark.parametrize("dataset", ["pandas.CSVDataset", "pandas.HDFDataset"]) def test_exists(self, dataset, mocked_csvs_in_s3): assert PartitionedDataset(mocked_csvs_in_s3, dataset).exists() From d724707c564c2233ecca3da3864439cef7bdc835 Mon Sep 17 00:00:00 2001 From: Merel Theisen Date: Mon, 9 Oct 2023 16:38:59 +0100 Subject: [PATCH 02/12] Rename any mentions in kedro/ from DataSet to Dataset Signed-off-by: Merel Theisen --- kedro/config/templated_config.py | 4 ++-- kedro/runner/parallel_runner.py | 5 +---- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/kedro/config/templated_config.py b/kedro/config/templated_config.py index 615b75fdda..8a4e3835a4 100644 --- a/kedro/config/templated_config.py +++ b/kedro/config/templated_config.py @@ -63,8 +63,8 @@ class TemplatedConfigLoader(AbstractConfigLoader): environment: "dev" datasets: - csv: "pandas.CSVDataSet" - spark: "spark.SparkDataSet" + csv: "pandas.CSVDataset" + spark: "spark.SparkDataset" folders: raw: "01_raw" diff --git a/kedro/runner/parallel_runner.py b/kedro/runner/parallel_runner.py index 07c8824274..7bddb6284d 100644 --- a/kedro/runner/parallel_runner.py +++ b/kedro/runner/parallel_runner.py @@ -32,9 +32,6 @@ # see https://github.com/python/cpython/blob/master/Lib/concurrent/futures/process.py#L114 _MAX_WINDOWS_WORKERS = 61 -# https://github.com/pylint-dev/pylint/issues/4300#issuecomment-1043601901 -_SharedMemoryDataSet: type[_SharedMemoryDataset] - class _SharedMemoryDataset: """``_SharedMemoryDataset`` is a wrapper class for a shared MemoryDataset in SyncManager. @@ -74,7 +71,7 @@ def save(self, data: Any): def __getattr__(name): - if name == "_SharedMemoryDataSet": + if name == "_SharedMemoryDataset": alias = _SharedMemoryDataset warnings.warn( f"{repr(name)} has been renamed to {repr(alias.__name__)}, " From 928e35acfc0958c659f86c934dd5601276f28aa1 Mon Sep 17 00:00:00 2001 From: Merel Theisen Date: Mon, 9 Oct 2023 16:41:42 +0100 Subject: [PATCH 03/12] Rename any mentions in tests/ from DataSet to Dataset Signed-off-by: Merel Theisen --- tests/config/test_config.py | 30 ++++++------ tests/config/test_omegaconf_config.py | 48 +++++++++---------- tests/config/test_templated_config.py | 42 ++++++++-------- tests/framework/cli/pipeline/test_pipeline.py | 6 +-- tests/framework/cli/test_catalog.py | 42 ++++++++-------- tests/framework/context/test_context.py | 10 ++-- tests/framework/session/conftest.py | 4 +- .../session/test_session_extension_hooks.py | 12 ++--- tests/runner/conftest.py | 10 ++-- tests/runner/test_parallel_runner.py | 2 +- 10 files changed, 103 insertions(+), 103 deletions(-) diff --git a/tests/config/test_config.py b/tests/config/test_config.py index fd34f8edf8..934eab0639 100644 --- a/tests/config/test_config.py +++ b/tests/config/test_config.py @@ -40,9 +40,9 @@ def _write_dummy_ini(filepath: Path): def base_config(tmp_path): filepath = str(tmp_path / "cars.csv") return { - "trains": {"type": "MemoryDataSet"}, + "trains": {"type": "MemoryDataset"}, "cars": { - "type": "pandas.CSVDataSet", + "type": "pandas.CSVDataset", "filepath": filepath, "save_args": {"index": True}, }, @@ -54,11 +54,11 @@ def local_config(tmp_path): filepath = str(tmp_path / "cars.csv") return { "cars": { - "type": "pandas.CSVDataSet", + "type": "pandas.CSVDataset", "filepath": filepath, "save_args": {"index": False}, }, - "boats": {"type": "MemoryDataSet"}, + "boats": {"type": "MemoryDataset"}, } @@ -85,7 +85,7 @@ def proj_catalog(tmp_path, base_config): @pytest.fixture def proj_catalog_nested(tmp_path): path = tmp_path / _BASE_ENV / "catalog" / "dir" / "nested.yml" - _write_yaml(path, {"nested": {"type": "MemoryDataSet"}}) + _write_yaml(path, {"nested": {"type": "MemoryDataset"}}) use_config_dir = pytest.mark.usefixtures("create_config_dir") @@ -101,9 +101,9 @@ def test_load_core_config_dict_get(self, tmp_path): catalog = conf["catalog"] assert params["param1"] == 1 - assert catalog["trains"]["type"] == "MemoryDataSet" - assert catalog["cars"]["type"] == "pandas.CSVDataSet" - assert catalog["boats"]["type"] == "MemoryDataSet" + assert catalog["trains"]["type"] == "MemoryDataset" + assert catalog["cars"]["type"] == "pandas.CSVDataset" + assert catalog["boats"]["type"] == "MemoryDataset" assert not catalog["cars"]["save_args"]["index"] @use_config_dir @@ -118,9 +118,9 @@ def test_load_local_config(self, tmp_path): assert params["param1"] == 1 assert db_conf["prod"]["url"] == "postgresql://user:pass@url_prod/db" - assert catalog["trains"]["type"] == "MemoryDataSet" - assert catalog["cars"]["type"] == "pandas.CSVDataSet" - assert catalog["boats"]["type"] == "MemoryDataSet" + assert catalog["trains"]["type"] == "MemoryDataset" + assert catalog["cars"]["type"] == "pandas.CSVDataset" + assert catalog["boats"]["type"] == "MemoryDataset" assert not catalog["cars"]["save_args"]["index"] @use_proj_catalog @@ -159,9 +159,9 @@ def test_nested(self, tmp_path): config_loader.default_run_env = "" catalog = config_loader.get("catalog*", "catalog*/**") assert catalog.keys() == {"cars", "trains", "nested"} - assert catalog["cars"]["type"] == "pandas.CSVDataSet" + assert catalog["cars"]["type"] == "pandas.CSVDataset" assert catalog["cars"]["save_args"]["index"] is True - assert catalog["nested"]["type"] == "MemoryDataSet" + assert catalog["nested"]["type"] == "MemoryDataset" @use_config_dir def test_nested_subdirs_duplicate(self, tmp_path, base_config): @@ -322,7 +322,7 @@ def test_yaml_parser_error(self, tmp_path): example_catalog = """ example_iris_data: - type: pandas.CSVDataSet + type: pandas.CSVDataset filepath: data/01_raw/iris.csv """ @@ -359,7 +359,7 @@ def test_adding_extra_keys_to_confloader(self, tmp_path): catalog = conf["catalog"] conf["spark"] = {"spark_config": "emr.blabla"} - assert catalog["trains"]["type"] == "MemoryDataSet" + assert catalog["trains"]["type"] == "MemoryDataset" assert conf["spark"] == {"spark_config": "emr.blabla"} @use_config_dir diff --git a/tests/config/test_omegaconf_config.py b/tests/config/test_omegaconf_config.py index 4a99458f19..162cdecb0b 100644 --- a/tests/config/test_omegaconf_config.py +++ b/tests/config/test_omegaconf_config.py @@ -46,9 +46,9 @@ def _write_dummy_ini(filepath: Path): def base_config(tmp_path): filepath = str(tmp_path / "cars.csv") return { - "trains": {"type": "MemoryDataSet"}, + "trains": {"type": "MemoryDataset"}, "cars": { - "type": "pandas.CSVDataSet", + "type": "pandas.CSVDataset", "filepath": filepath, "save_args": {"index": True}, }, @@ -60,11 +60,11 @@ def local_config(tmp_path): filepath = str(tmp_path / "cars.csv") return { "cars": { - "type": "pandas.CSVDataSet", + "type": "pandas.CSVDataset", "filepath": filepath, "save_args": {"index": False}, }, - "boats": {"type": "MemoryDataSet"}, + "boats": {"type": "MemoryDataset"}, } @@ -104,7 +104,7 @@ def proj_catalog(tmp_path, base_config): @pytest.fixture def proj_catalog_nested(tmp_path): path = tmp_path / _BASE_ENV / "catalog" / "dir" / "nested.yml" - _write_yaml(path, {"nested": {"type": "MemoryDataSet"}}) + _write_yaml(path, {"nested": {"type": "MemoryDataset"}}) @pytest.fixture @@ -138,7 +138,7 @@ def test_load_core_config_dict_syntax(self, tmp_path): catalog = conf["catalog"] assert params["param1"] == 1 - assert catalog["trains"]["type"] == "MemoryDataSet" + assert catalog["trains"]["type"] == "MemoryDataset" @use_config_dir def test_load_core_config_get_syntax(self, tmp_path): @@ -148,7 +148,7 @@ def test_load_core_config_get_syntax(self, tmp_path): catalog = conf.get("catalog") assert params["param1"] == 1 - assert catalog["trains"]["type"] == "MemoryDataSet" + assert catalog["trains"]["type"] == "MemoryDataset" @use_config_dir def test_load_local_config_overrides_base(self, tmp_path): @@ -159,9 +159,9 @@ def test_load_local_config_overrides_base(self, tmp_path): catalog = conf["catalog"] assert params["param1"] == 1 - assert catalog["trains"]["type"] == "MemoryDataSet" - assert catalog["cars"]["type"] == "pandas.CSVDataSet" - assert catalog["boats"]["type"] == "MemoryDataSet" + assert catalog["trains"]["type"] == "MemoryDataset" + assert catalog["cars"]["type"] == "pandas.CSVDataset" + assert catalog["boats"]["type"] == "MemoryDataset" assert not catalog["cars"]["save_args"]["index"] @use_proj_catalog @@ -204,9 +204,9 @@ def test_nested(self, tmp_path): catalog = config_loader["catalog"] assert catalog.keys() == {"cars", "trains", "nested"} - assert catalog["cars"]["type"] == "pandas.CSVDataSet" + assert catalog["cars"]["type"] == "pandas.CSVDataset" assert catalog["cars"]["save_args"]["index"] is True - assert catalog["nested"]["type"] == "MemoryDataSet" + assert catalog["nested"]["type"] == "MemoryDataset" @use_config_dir def test_nested_subdirs_duplicate(self, tmp_path, base_config): @@ -384,7 +384,7 @@ def test_yaml_parser_error(self, tmp_path): example_catalog = """ example_iris_data: - type: pandas.CSVDataSet + type: pandas.CSVDataset filepath: data/01_raw/iris.csv """ @@ -460,7 +460,7 @@ def test_adding_extra_keys_to_confloader(self, tmp_path): catalog = conf["catalog"] conf["spark"] = {"spark_config": "emr.blabla"} - assert catalog["trains"]["type"] == "MemoryDataSet" + assert catalog["trains"]["type"] == "MemoryDataset" assert conf["spark"] == {"spark_config": "emr.blabla"} @use_config_dir @@ -530,7 +530,7 @@ def test_load_config_from_tar_file(self, tmp_path): conf = OmegaConfigLoader(conf_source=f"{tmp_path}/tar_conf.tar.gz") catalog = conf["catalog"] - assert catalog["trains"]["type"] == "MemoryDataSet" + assert catalog["trains"]["type"] == "MemoryDataset" @use_config_dir def test_load_config_from_zip_file(self, tmp_path): @@ -554,7 +554,7 @@ def zipdir(path, ziph): conf = OmegaConfigLoader(conf_source=f"{tmp_path}/Python.zip") catalog = conf["catalog"] - assert catalog["trains"]["type"] == "MemoryDataSet" + assert catalog["trains"]["type"] == "MemoryDataset" @use_config_dir def test_variable_interpolation_with_correct_env(self, tmp_path): @@ -621,13 +621,13 @@ def test_variable_interpolation_in_catalog_with_templates(self, tmp_path): "type": "${_pandas.type}", "filepath": "data/01_raw/companies.csv", }, - "_pandas": {"type": "pandas.CSVDataSet"}, + "_pandas": {"type": "pandas.CSVDataset"}, } _write_yaml(base_catalog, catalog_config) conf = OmegaConfigLoader(str(tmp_path)) conf.default_run_env = "" - assert conf["catalog"]["companies"]["type"] == "pandas.CSVDataSet" + assert conf["catalog"]["companies"]["type"] == "pandas.CSVDataset" def test_variable_interpolation_in_catalog_with_separate_templates_file( self, tmp_path @@ -640,13 +640,13 @@ def test_variable_interpolation_in_catalog_with_separate_templates_file( } } tmp_catalog = tmp_path / _BASE_ENV / "catalog_temp.yml" - template = {"_pandas": {"type": "pandas.CSVDataSet"}} + template = {"_pandas": {"type": "pandas.CSVDataset"}} _write_yaml(base_catalog, catalog_config) _write_yaml(tmp_catalog, template) conf = OmegaConfigLoader(str(tmp_path)) conf.default_run_env = "" - assert conf["catalog"]["companies"]["type"] == "pandas.CSVDataSet" + assert conf["catalog"]["companies"]["type"] == "pandas.CSVDataset" def test_custom_resolvers(self, tmp_path): base_params = tmp_path / _BASE_ENV / "parameters.yml" @@ -696,7 +696,7 @@ def test_globals_resolution(self, tmp_path): "filepath": "data/01_raw/companies.csv", }, } - globals_config = {"x": 34, "dataset_type": "pandas.CSVDataSet"} + globals_config = {"x": 34, "dataset_type": "pandas.CSVDataset"} _write_yaml(base_params, param_config) _write_yaml(globals_params, globals_config) _write_yaml(base_catalog, catalog_config) @@ -883,7 +883,7 @@ def test_runtime_params_resolution(self, tmp_path): runtime_params = { "x": 45, "dataset": { - "type": "pandas.CSVDataSet", + "type": "pandas.CSVDataset", }, } param_config = { @@ -965,12 +965,12 @@ def test_runtime_params_default_global(self, tmp_path): } globals_config = { "dataset": { - "type": "pandas.CSVDataSet", + "type": "pandas.CSVDataset", } } catalog_config = { "companies": { - "type": "${runtime_params:type, ${globals:dataset.type, 'MemoryDataSet'}}", + "type": "${runtime_params:type, ${globals:dataset.type, 'MemoryDataset'}}", "filepath": "data/01_raw/companies.csv", }, } diff --git a/tests/config/test_templated_config.py b/tests/config/test_templated_config.py index 9a8edbd0d4..dec4b48ddf 100644 --- a/tests/config/test_templated_config.py +++ b/tests/config/test_templated_config.py @@ -41,7 +41,7 @@ def template_config(): "s3_bucket": "s3a://boat-and-car-bucket", "raw_data_folder": "01_raw", "boat_file_name": "boats.csv", - "boat_data_type": "SparkDataSet", + "boat_data_type": "SparkDataset", "string_type": "VARCHAR", "float_type": "FLOAT", "write_only_user": "ron", @@ -55,10 +55,10 @@ def catalog_with_jinja2_syntax(tmp_path): catalog = """ {% for speed in ['fast', 'slow'] %} {{ speed }}-trains: - type: MemoryDataSet + type: MemoryDataset {{ speed }}-cars: - type: pandas.CSVDataSet + type: pandas.CSVDataset filepath: ${s3_bucket}/{{ speed }}-cars.csv save_args: index: true @@ -86,7 +86,7 @@ def proj_catalog_globals(tmp_path, template_config): def normal_config_advanced(): return { "planes": { - "type": "SparkJDBCDataSet", + "type": "SparkJDBCDataset", "postgres_credentials": {"user": "Fakeuser", "password": "F@keP@55word"}, "batch_size": 10000, "need_permission": True, @@ -117,7 +117,7 @@ def param_config_advanced(): @pytest.fixture def template_config_advanced(): return { - "plane_data_type": "SparkJDBCDataSet", + "plane_data_type": "SparkJDBCDataset", "credentials": {"user": "Fakeuser", "password": "F@keP@55word"}, "batch_size": 10000, "permission_param": True, @@ -215,7 +215,7 @@ def test_get_catalog_config_with_dict_get(self, tmp_path, template_config): ) config_loader.default_run_env = "" catalog = config_loader["catalog"] - assert catalog["boats"]["type"] == "SparkDataSet" + assert catalog["boats"]["type"] == "SparkDataset" @pytest.mark.usefixtures("proj_catalog_param") def test_catalog_parameterized_w_dict(self, tmp_path, template_config): @@ -225,7 +225,7 @@ def test_catalog_parameterized_w_dict(self, tmp_path, template_config): ) config_loader.default_run_env = "" catalog = config_loader.get("catalog*.yml") - assert catalog["boats"]["type"] == "SparkDataSet" + assert catalog["boats"]["type"] == "SparkDataset" assert ( catalog["boats"]["filepath"] == "s3a://boat-and-car-bucket/01_raw/boats.csv" ) @@ -243,7 +243,7 @@ def test_catalog_parameterized_w_globals(self, tmp_path): str(tmp_path), globals_pattern="*globals.yml" ).get("catalog*.yml") - assert catalog["boats"]["type"] == "SparkDataSet" + assert catalog["boats"]["type"] == "SparkDataset" assert ( catalog["boats"]["filepath"] == "s3a://boat-and-car-bucket/01_raw/boats.csv" ) @@ -279,7 +279,7 @@ def test_catalog_advanced(self, tmp_path, normal_config_advanced): config_loader.default_run_env = "" catalog = config_loader.get("catalog*.yml") - assert catalog["planes"]["type"] == "SparkJDBCDataSet" + assert catalog["planes"]["type"] == "SparkJDBCDataset" assert catalog["planes"]["postgres_credentials"]["user"] == "Fakeuser" assert catalog["planes"]["postgres_credentials"]["password"] == "F@keP@55word" assert catalog["planes"]["batch_size"] == 10000 @@ -295,7 +295,7 @@ def test_catalog_parameterized_advanced(self, tmp_path, template_config_advanced config_loader.default_run_env = "" catalog = config_loader.get("catalog*.yml") - assert catalog["planes"]["type"] == "SparkJDBCDataSet" + assert catalog["planes"]["type"] == "SparkJDBCDataset" assert catalog["planes"]["postgres_credentials"]["user"] == "Fakeuser" assert catalog["planes"]["postgres_credentials"]["password"] == "F@keP@55word" assert catalog["planes"]["batch_size"] == 10000 @@ -312,7 +312,7 @@ def test_catalog_parameterized_w_dict_mixed(self, tmp_path, get_environ): str(tmp_path), globals_pattern="*globals.yml", globals_dict=get_environ ).get("catalog*.yml") - assert catalog["boats"]["type"] == "SparkDataSet" + assert catalog["boats"]["type"] == "SparkDataset" assert ( catalog["boats"]["filepath"] == "s3a://boat-and-car-bucket/01_raw/boats.csv" ) @@ -332,7 +332,7 @@ def test_catalog_parameterized_w_dict_namespaced( config_loader.default_run_env = "" catalog = config_loader.get("catalog*.yml") - assert catalog["boats"]["type"] == "SparkDataSet" + assert catalog["boats"]["type"] == "SparkDataset" assert ( catalog["boats"]["filepath"] == "s3a://boat-and-car-bucket/01_raw/boats.csv" ) @@ -362,15 +362,15 @@ def test_catalog_with_jinja2_syntax(self, tmp_path, template_config): config_loader.default_run_env = "" catalog = config_loader.get("catalog*.yml") expected_catalog = { - "fast-trains": {"type": "MemoryDataSet"}, + "fast-trains": {"type": "MemoryDataset"}, "fast-cars": { - "type": "pandas.CSVDataSet", + "type": "pandas.CSVDataset", "filepath": "s3a://boat-and-car-bucket/fast-cars.csv", "save_args": {"index": True}, }, - "slow-trains": {"type": "MemoryDataSet"}, + "slow-trains": {"type": "MemoryDataset"}, "slow-cars": { - "type": "pandas.CSVDataSet", + "type": "pandas.CSVDataset", "filepath": "s3a://boat-and-car-bucket/slow-cars.csv", "save_args": {"index": True}, }, @@ -389,15 +389,15 @@ def test_catalog_with_jinja2_syntax_and_globals_file(self, tmp_path): config_loader.default_run_env = "" catalog = config_loader.get("catalog*.yml") expected_catalog = { - "fast-trains": {"type": "MemoryDataSet"}, + "fast-trains": {"type": "MemoryDataset"}, "fast-cars": { - "type": "pandas.CSVDataSet", + "type": "pandas.CSVDataset", "filepath": "s3a://boat-and-car-bucket/fast-cars.csv", "save_args": {"index": True}, }, - "slow-trains": {"type": "MemoryDataSet"}, + "slow-trains": {"type": "MemoryDataset"}, "slow-cars": { - "type": "pandas.CSVDataSet", + "type": "pandas.CSVDataset", "filepath": "s3a://boat-and-car-bucket/slow-cars.csv", "save_args": {"index": True}, }, @@ -492,7 +492,7 @@ def test_adding_extra_keys_to_confloader(self, tmp_path, template_config): catalog = config_loader["catalog"] config_loader["spark"] = {"spark_config": "emr.blabla"} - assert catalog["boats"]["type"] == "SparkDataSet" + assert catalog["boats"]["type"] == "SparkDataset" assert config_loader["spark"] == {"spark_config": "emr.blabla"} @pytest.mark.usefixtures("proj_catalog_param") diff --git a/tests/framework/cli/pipeline/test_pipeline.py b/tests/framework/cli/pipeline/test_pipeline.py index f216d73917..0587dedd7d 100644 --- a/tests/framework/cli/pipeline/test_pipeline.py +++ b/tests/framework/cli/pipeline/test_pipeline.py @@ -4,7 +4,7 @@ import pytest import yaml from click.testing import CliRunner -from kedro_datasets.pandas import CSVDataSet +from kedro_datasets.pandas import CSVDataset from pandas import DataFrame from kedro.framework.cli.pipeline import _sync_dirs @@ -187,7 +187,7 @@ def test_catalog_and_params( conf_dir = fake_repo_path / settings.CONF_SOURCE / "base" catalog_dict = { "ds_from_pipeline": { - "type": "pandas.CSVDataSet", + "type": "pandas.CSVDataset", "filepath": "data/01_raw/iris.csv", } } @@ -204,7 +204,7 @@ def test_catalog_and_params( with KedroSession.create(PACKAGE_NAME) as session: ctx = session.load_context() - assert isinstance(ctx.catalog._datasets["ds_from_pipeline"], CSVDataSet) + assert isinstance(ctx.catalog._datasets["ds_from_pipeline"], CSVDataset) assert isinstance(ctx.catalog.load("ds_from_pipeline"), DataFrame) assert ctx.params["params_from_pipeline"] == params_dict["params_from_pipeline"] diff --git a/tests/framework/cli/test_catalog.py b/tests/framework/cli/test_catalog.py index f1394c4531..d6342be9bb 100644 --- a/tests/framework/cli/test_catalog.py +++ b/tests/framework/cli/test_catalog.py @@ -1,7 +1,7 @@ import pytest import yaml from click.testing import CliRunner -from kedro_datasets.pandas import CSVDataSet +from kedro_datasets.pandas import CSVDataset from kedro.io import DataCatalog, MemoryDataset from kedro.pipeline import node @@ -32,10 +32,10 @@ def mock_pipelines(mocker): def fake_catalog_config(): config = { "parquet_{factory_pattern}": { - "type": "pandas.ParquetDataSet", + "type": "pandas.ParquetDataset", "filepath": "test.pq", }, - "csv_{factory_pattern}": {"type": "pandas.CSVDataSet", "filepath": "test.csv"}, + "csv_{factory_pattern}": {"type": "pandas.CSVDataset", "filepath": "test.csv"}, } return config @@ -44,7 +44,7 @@ def fake_catalog_config(): def fake_catalog_with_overlapping_factories(): config = { "an_example_dataset": { - "type": "pandas.CSVDataSet", + "type": "pandas.CSVDataset", "filepath": "dummy_filepath", }, "an_example_{placeholder}": { @@ -71,13 +71,13 @@ def fake_catalog_with_overlapping_factories(): def fake_catalog_config_with_resolvable_dataset(): config = { "parquet_{factory_pattern}": { - "type": "pandas.ParquetDataSet", + "type": "pandas.ParquetDataset", "filepath": "test.pq", }, - "csv_{factory_pattern}": {"type": "pandas.CSVDataSet", "filepath": "test.csv"}, - "explicit_ds": {"type": "pandas.CSVDataSet", "filepath": "test.csv"}, + "csv_{factory_pattern}": {"type": "pandas.CSVDataset", "filepath": "test.csv"}, + "explicit_ds": {"type": "pandas.CSVDataset", "filepath": "test.csv"}, "{factory_pattern}_ds": { - "type": "pandas.ParquetDataSet", + "type": "pandas.ParquetDataset", "filepath": "test.pq", }, } @@ -134,11 +134,11 @@ def test_no_param_datasets_in_respose( yaml_dump_mock = mocker.patch("yaml.dump", return_value="Result YAML") mocked_context = fake_load_context.return_value catalog_datasets = { - "iris_data": CSVDataSet("test.csv"), + "iris_data": CSVDataset("test.csv"), "intermediate": MemoryDataset(), "parameters": MemoryDataset(), "params:data_ratio": MemoryDataset(), - "not_used": CSVDataSet("test2.csv"), + "not_used": CSVDataset("test2.csv"), } mocked_context.catalog = DataCatalog(datasets=catalog_datasets) @@ -177,7 +177,7 @@ def test_default_dataset( """ yaml_dump_mock = mocker.patch("yaml.dump", return_value="Result YAML") mocked_context = fake_load_context.return_value - catalog_datasets = {"some_dataset": CSVDataSet("test.csv")} + catalog_datasets = {"some_dataset": CSVDataset("test.csv")} mocked_context.catalog = DataCatalog(datasets=catalog_datasets) mocker.patch.object( mock_pipelines[PIPELINE_NAME], @@ -236,8 +236,8 @@ def test_list_factory_generated_datasets( expected_dict = { f"Datasets in '{PIPELINE_NAME}' pipeline": { "Datasets generated from factories": { - "pandas.CSVDataSet": ["csv_example"], - "pandas.ParquetDataSet": ["parquet_example"], + "pandas.CSVDataset": ["csv_example"], + "pandas.ParquetDataset": ["parquet_example"], } } } @@ -341,8 +341,8 @@ def test_no_missing_datasets( mocked_context = fake_load_context.return_value catalog_datasets = { - "input_data": CSVDataSet("test.csv"), - "output_data": CSVDataSet("test2.csv"), + "input_data": CSVDataset("test.csv"), + "output_data": CSVDataset("test2.csv"), } mocked_context.catalog = DataCatalog(datasets=catalog_datasets) mocked_context.project_path = fake_repo_path @@ -370,7 +370,7 @@ def test_missing_datasets_appended( data_catalog_file = catalog_path / f"catalog_{self.PIPELINE_NAME}.yml" catalog_config = { - "example_test_x": {"type": "pandas.CSVDataSet", "filepath": "test.csv"} + "example_test_x": {"type": "pandas.CSVDataset", "filepath": "test.csv"} } with data_catalog_file.open(mode="w") as catalog_file: yaml.safe_dump(catalog_config, catalog_file, default_flow_style=False) @@ -445,9 +445,9 @@ def test_rank_catalog_factories_with_no_factories( mocked_context = fake_load_context.return_value catalog_datasets = { - "iris_data": CSVDataSet("test.csv"), + "iris_data": CSVDataset("test.csv"), "intermediate": MemoryDataset(), - "not_used": CSVDataSet("test2.csv"), + "not_used": CSVDataset("test2.csv"), } mocked_context.catalog = DataCatalog(datasets=catalog_datasets) @@ -555,14 +555,14 @@ def test_no_param_datasets_in_resolve( catalog_config = { "iris_data": { - "type": "pandas.CSVDataSet", + "type": "pandas.CSVDataset", "filepath": "test.csv", }, - "intermediate": {"type": "MemoryDataSet"}, + "intermediate": {"type": "MemoryDataset"}, } catalog_datasets = { - "iris_data": CSVDataSet("test.csv"), + "iris_data": CSVDataset("test.csv"), "intermediate": MemoryDataset(), "parameters": MemoryDataset(), "params:data_ratio": MemoryDataset(), diff --git a/tests/framework/context/test_context.py b/tests/framework/context/test_context.py index 794cdb1fa7..7032b1ecef 100644 --- a/tests/framework/context/test_context.py +++ b/tests/framework/context/test_context.py @@ -73,9 +73,9 @@ def base_config(tmp_path): trains_filepath = (tmp_path / "trains.csv").as_posix() return { - "trains": {"type": "pandas.CSVDataSet", "filepath": trains_filepath}, + "trains": {"type": "pandas.CSVDataset", "filepath": trains_filepath}, "cars": { - "type": "pandas.CSVDataSet", + "type": "pandas.CSVDataset", "filepath": cars_filepath, "save_args": {"index": True}, }, @@ -90,19 +90,19 @@ def local_config(tmp_path): horses_filepath = "horses.csv" return { "cars": { - "type": "pandas.CSVDataSet", + "type": "pandas.CSVDataset", "filepath": cars_filepath, "save_args": {"index": False}, "versioned": True, }, "boats": { - "type": "pandas.CSVDataSet", + "type": "pandas.CSVDataset", "filepath": boats_filepath, "versioned": True, "layer": "raw", }, "horses": { - "type": "pandas.CSVDataSet", + "type": "pandas.CSVDataset", "filepath": horses_filepath, "versioned": True, }, diff --git a/tests/framework/session/conftest.py b/tests/framework/session/conftest.py index c38a363666..c7758d2d89 100644 --- a/tests/framework/session/conftest.py +++ b/tests/framework/session/conftest.py @@ -66,13 +66,13 @@ def local_config(tmp_path): boats_filepath = str(tmp_path / "boats.csv") return { "cars": { - "type": "pandas.CSVDataSet", + "type": "pandas.CSVDataset", "filepath": cars_filepath, "save_args": {"index": False}, "versioned": True, }, "boats": { - "type": "pandas.CSVDataSet", + "type": "pandas.CSVDataset", "filepath": boats_filepath, "versioned": True, }, diff --git a/tests/framework/session/test_session_extension_hooks.py b/tests/framework/session/test_session_extension_hooks.py index 3f407852b9..4e43c80880 100644 --- a/tests/framework/session/test_session_extension_hooks.py +++ b/tests/framework/session/test_session_extension_hooks.py @@ -18,7 +18,7 @@ settings, ) from kedro.framework.session import KedroSession -from kedro.io import DataCatalog, MemoryDataSet +from kedro.io import DataCatalog, MemoryDataset from kedro.pipeline import node, pipeline from kedro.pipeline.node import Node from kedro.runner import ParallelRunner @@ -286,7 +286,7 @@ def test_before_and_after_node_run_hooks_parallel_runner( assert set(record.outputs.keys()) <= {"planes", "ships"} -class TestDataSetHooks: +class TestDatasetHooks: @pytest.mark.usefixtures("mock_pipelines") def test_before_and_after_dataset_loaded_hooks_sequential_runner( self, mock_session, caplog, dummy_dataframe @@ -554,10 +554,10 @@ def load(self, name: str, version: str = None) -> Any: @pytest.fixture def memory_catalog(): - ds1 = MemoryDataSet({"data": 42}) - ds2 = MemoryDataSet({"data": 42}) - ds3 = MemoryDataSet({"data": 42}) - ds4 = MemoryDataSet({"data": 42}) + ds1 = MemoryDataset({"data": 42}) + ds2 = MemoryDataset({"data": 42}) + ds3 = MemoryDataset({"data": 42}) + ds4 = MemoryDataset({"data": 42}) return LogCatalog({"ds1": ds1, "ds2": ds2, "ds3": ds3, "ds4": ds4}) diff --git a/tests/runner/conftest.py b/tests/runner/conftest.py index 4c720a7a4a..0ce581e624 100644 --- a/tests/runner/conftest.py +++ b/tests/runner/conftest.py @@ -3,7 +3,7 @@ import pandas as pd import pytest -from kedro.io import DataCatalog, LambdaDataSet, MemoryDataSet +from kedro.io import DataCatalog, LambdaDataset, MemoryDataset from kedro.pipeline import node, pipeline @@ -42,7 +42,7 @@ def multi_input_list_output(arg1, arg2): @pytest.fixture def conflicting_feed_dict(pandas_df_feed_dict): - ds1 = MemoryDataSet({"data": 0}) + ds1 = MemoryDataset({"data": 0}) ds3 = pandas_df_feed_dict["ds3"] return {"ds1": ds1, "ds3": ds3} @@ -60,8 +60,8 @@ def catalog(): @pytest.fixture def memory_catalog(): - ds1 = MemoryDataSet({"data": 42}) - ds2 = MemoryDataSet([1, 2, 3, 4, 5]) + ds1 = MemoryDataset({"data": 42}) + ds2 = MemoryDataset([1, 2, 3, 4, 5]) return DataCatalog({"ds1": ds1, "ds2": ds2}) @@ -73,7 +73,7 @@ def _load(): def _save(arg): pass - persistent_dataset = LambdaDataSet(load=_load, save=_save) + persistent_dataset = LambdaDataset(load=_load, save=_save) return DataCatalog( { "ds0_A": persistent_dataset, diff --git a/tests/runner/test_parallel_runner.py b/tests/runner/test_parallel_runner.py index 27b91d6896..60e8bac015 100644 --- a/tests/runner/test_parallel_runner.py +++ b/tests/runner/test_parallel_runner.py @@ -35,7 +35,7 @@ def test_deprecation(): - class_name = "_SharedMemoryDataSet" + class_name = "_SharedMemoryDataset" with pytest.warns(DeprecationWarning, match=f"{repr(class_name)} has been renamed"): getattr(importlib.import_module("kedro.runner.parallel_runner"), class_name) From c74a03ce4de7ed2237c20f8c3c307bd94823f44a Mon Sep 17 00:00:00 2001 From: Merel Theisen Date: Mon, 9 Oct 2023 16:51:42 +0100 Subject: [PATCH 04/12] Replace more mentions of DataSet with Dataset Signed-off-by: Merel Theisen --- .circleci/continue_config.yml | 2 +- features/environment.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/continue_config.yml b/.circleci/continue_config.yml index 5446be9a41..0a2a2fcf98 100644 --- a/.circleci/continue_config.yml +++ b/.circleci/continue_config.yml @@ -52,7 +52,7 @@ commands: name: Install venv for some pre-commit hooks command: conda install -y virtualenv - run: - # pytables does not work properly with python 3.9 to handle our HDFDataSet + # pytables does not work properly with python 3.9 to handle our HDFDataset # if pip-installed, so we install this dependency via conda name: Install pytables command: conda install -c conda-forge pytables -y diff --git a/features/environment.py b/features/environment.py index a420a18a59..1be2de7227 100644 --- a/features/environment.py +++ b/features/environment.py @@ -118,6 +118,6 @@ def _install_project_requirements(context): .splitlines() ) install_reqs = [req for req in install_reqs if "{" not in req and "#" not in req] - install_reqs.append("kedro-datasets[pandas.CSVDataSet]") + install_reqs.append("kedro-datasets[pandas.CSVDataset]") call([context.pip, "install", *install_reqs], env=context.env) return context From 459145d480be5f6b958e870e6f0a91ed53e859a3 Mon Sep 17 00:00:00 2001 From: Merel Theisen Date: Mon, 9 Oct 2023 16:58:26 +0100 Subject: [PATCH 05/12] Rename DataSet to Dataset in e2e tests Signed-off-by: Merel Theisen --- features/steps/e2e_test_catalog.yml | 8 ++++---- .../{{ cookiecutter.repo_name }}/conf/base/catalog.yml | 10 +++++----- .../{{ cookiecutter.repo_name }}/requirements.txt | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/features/steps/e2e_test_catalog.yml b/features/steps/e2e_test_catalog.yml index 49cfe5450e..146714c761 100644 --- a/features/steps/e2e_test_catalog.yml +++ b/features/steps/e2e_test_catalog.yml @@ -1,20 +1,20 @@ A: - type: pandas.CSVDataSet + type: pandas.CSVDataset filepath: data/01_raw/input_1.csv save_args: index: False C: - type: pandas.CSVDataSet + type: pandas.CSVDataset filepath: data/01_raw/input_2.csv save_args: index: False E: - type: pandas.CSVDataSet + type: pandas.CSVDataset filepath: data/02_intermediate/output_1.csv save_args: index: False F: - type: pandas.CSVDataSet + type: pandas.CSVDataset filepath: data/02_intermediate/output_2.csv save_args: index: False diff --git a/features/steps/test_starter/{{ cookiecutter.repo_name }}/conf/base/catalog.yml b/features/steps/test_starter/{{ cookiecutter.repo_name }}/conf/base/catalog.yml index c0c61a3a2c..d34d27b3a8 100644 --- a/features/steps/test_starter/{{ cookiecutter.repo_name }}/conf/base/catalog.yml +++ b/features/steps/test_starter/{{ cookiecutter.repo_name }}/conf/base/catalog.yml @@ -8,11 +8,11 @@ # An example data set definition can look as follows: # #bikes: -# type: pandas.CSVDataSet +# type: pandas.CSVDataset # filepath: "data/01_raw/bikes.csv" # #weather: -# type: spark.SparkDataSet +# type: spark.SparkDataset # filepath: s3a://your_bucket/data/01_raw/weather* # file_format: csv # credentials: dev_s3 @@ -24,7 +24,7 @@ # header: True # #scooters: -# type: pandas.SQLTableDataSet +# type: pandas.SQLTableDataset # credentials: scooters_credentials # table_name: scooters # load_args: @@ -35,7 +35,7 @@ # # if_exists: 'fail' # # if_exists: 'append' # -# The Data Catalog supports being able to reference the same file using two different DataSet implementations +# The Data Catalog supports being able to reference the same file using two different Dataset implementations # (transcoding), templating and a way to reuse arguments that are frequently repeated. See more here: # https://kedro.readthedocs.io/en/stable/data/data_catalog.html # @@ -43,5 +43,5 @@ # template. Please feel free to remove it once you remove the example pipeline. example_iris_data: - type: pandas.CSVDataSet + type: pandas.CSVDataset filepath: data/01_raw/iris.csv diff --git a/features/steps/test_starter/{{ cookiecutter.repo_name }}/requirements.txt b/features/steps/test_starter/{{ cookiecutter.repo_name }}/requirements.txt index 39d93ecd53..4d3bf56da4 100644 --- a/features/steps/test_starter/{{ cookiecutter.repo_name }}/requirements.txt +++ b/features/steps/test_starter/{{ cookiecutter.repo_name }}/requirements.txt @@ -5,7 +5,7 @@ jupyter~=1.0 jupyterlab_server>=2.11.1, <2.16.0 jupyterlab~=3.0, <3.6.0 kedro~={{ cookiecutter.kedro_version}} -kedro-datasets[pandas.CSVDataSet] +kedro-datasets[pandas.CSVDataset] kedro-telemetry~=0.2.0 pytest-cov~=3.0 pytest-mock>=1.7.1, <2.0 From 0db0746ab4e08ef2664be9298dbff6055278cbd7 Mon Sep 17 00:00:00 2001 From: Merel Theisen Date: Mon, 9 Oct 2023 17:03:15 +0100 Subject: [PATCH 06/12] Remove deprecation check test Signed-off-by: Merel Theisen --- tests/runner/test_parallel_runner.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tests/runner/test_parallel_runner.py b/tests/runner/test_parallel_runner.py index 60e8bac015..1c3269ff46 100644 --- a/tests/runner/test_parallel_runner.py +++ b/tests/runner/test_parallel_runner.py @@ -1,6 +1,5 @@ from __future__ import annotations -import importlib import sys from concurrent.futures.process import ProcessPoolExecutor from typing import Any @@ -34,12 +33,6 @@ ) -def test_deprecation(): - class_name = "_SharedMemoryDataset" - with pytest.warns(DeprecationWarning, match=f"{repr(class_name)} has been renamed"): - getattr(importlib.import_module("kedro.runner.parallel_runner"), class_name) - - @pytest.mark.skipif( sys.platform.startswith("win"), reason="Due to bug in parallel runner" ) From 27b509931c6c0150b0e30326cef1cd650f27eddb Mon Sep 17 00:00:00 2001 From: Merel Theisen Date: Mon, 9 Oct 2023 17:09:22 +0100 Subject: [PATCH 07/12] Clean up Signed-off-by: Merel Theisen --- kedro/io/__init__.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/kedro/io/__init__.py b/kedro/io/__init__.py index 850254ba26..cbe00cab54 100644 --- a/kedro/io/__init__.py +++ b/kedro/io/__init__.py @@ -21,14 +21,6 @@ ) -def __getattr__(name): - import kedro.io.core # noqa: import-outside-toplevel - - if name in (kedro.io.core._DEPRECATED_CLASSES): # noqa: protected-access - return getattr(kedro.io.core, name) - raise AttributeError(f"module {repr(__name__)} has no attribute {repr(name)}") - - __all__ = [ "AbstractDataset", "AbstractVersionedDataset", From 97898c1442357da74bb854b9307f621e137a6d74 Mon Sep 17 00:00:00 2001 From: Merel Theisen Date: Mon, 9 Oct 2023 17:16:14 +0100 Subject: [PATCH 08/12] Fix lint Signed-off-by: Merel Theisen --- kedro/io/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/kedro/io/__init__.py b/kedro/io/__init__.py index cbe00cab54..ad1fc1f99f 100644 --- a/kedro/io/__init__.py +++ b/kedro/io/__init__.py @@ -20,7 +20,6 @@ PartitionedDataset, ) - __all__ = [ "AbstractDataset", "AbstractVersionedDataset", From ad23d524b399d1f89fcef474a03a0bea0c5d0386 Mon Sep 17 00:00:00 2001 From: Merel Theisen Date: Mon, 9 Oct 2023 17:28:25 +0100 Subject: [PATCH 09/12] Remove deprecation check from parallel runner Signed-off-by: Merel Theisen --- kedro/runner/parallel_runner.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/kedro/runner/parallel_runner.py b/kedro/runner/parallel_runner.py index 7bddb6284d..4fe4715cc7 100644 --- a/kedro/runner/parallel_runner.py +++ b/kedro/runner/parallel_runner.py @@ -70,19 +70,6 @@ def save(self, data: Any): raise exc -def __getattr__(name): - if name == "_SharedMemoryDataset": - alias = _SharedMemoryDataset - warnings.warn( - f"{repr(name)} has been renamed to {repr(alias.__name__)}, " - f"and the alias will be removed in Kedro 0.19.0", - DeprecationWarning, - stacklevel=2, - ) - return alias - raise AttributeError(f"module {repr(__name__)} has no attribute {repr(name)}") - - class ParallelRunnerManager(SyncManager): """``ParallelRunnerManager`` is used to create shared ``MemoryDataset`` objects as default data sets in a pipeline. From 5ab4b6c559b717a43c982c1a55a5c5a5ab804969 Mon Sep 17 00:00:00 2001 From: Merel Theisen Date: Mon, 9 Oct 2023 17:29:59 +0100 Subject: [PATCH 10/12] Remove deprecation check from parallel runner Signed-off-by: Merel Theisen --- kedro/runner/parallel_runner.py | 1 - 1 file changed, 1 deletion(-) diff --git a/kedro/runner/parallel_runner.py b/kedro/runner/parallel_runner.py index 4fe4715cc7..5f306f2e3e 100644 --- a/kedro/runner/parallel_runner.py +++ b/kedro/runner/parallel_runner.py @@ -7,7 +7,6 @@ import os import pickle import sys -import warnings from collections import Counter from concurrent.futures import FIRST_COMPLETED, ProcessPoolExecutor, wait from itertools import chain From 790b21d715feb7a9b19c05677e54c34d05b2c712 Mon Sep 17 00:00:00 2001 From: Merel Theisen <49397448+merelcht@users.noreply.github.com> Date: Tue, 10 Oct 2023 11:14:45 +0100 Subject: [PATCH 11/12] Apply suggestions from code review Co-authored-by: Deepyaman Datta --- .../{{ cookiecutter.repo_name }}/conf/base/catalog.yml | 2 +- kedro/io/partitioned_dataset.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/features/steps/test_starter/{{ cookiecutter.repo_name }}/conf/base/catalog.yml b/features/steps/test_starter/{{ cookiecutter.repo_name }}/conf/base/catalog.yml index d34d27b3a8..4d6170963e 100644 --- a/features/steps/test_starter/{{ cookiecutter.repo_name }}/conf/base/catalog.yml +++ b/features/steps/test_starter/{{ cookiecutter.repo_name }}/conf/base/catalog.yml @@ -35,7 +35,7 @@ # # if_exists: 'fail' # # if_exists: 'append' # -# The Data Catalog supports being able to reference the same file using two different Dataset implementations +# The Data Catalog supports being able to reference the same file using two different dataset implementations # (transcoding), templating and a way to reuse arguments that are frequently repeated. See more here: # https://kedro.readthedocs.io/en/stable/data/data_catalog.html # diff --git a/kedro/io/partitioned_dataset.py b/kedro/io/partitioned_dataset.py index 00f3364802..08a0a8569b 100644 --- a/kedro/io/partitioned_dataset.py +++ b/kedro/io/partitioned_dataset.py @@ -375,7 +375,7 @@ class IncrementalDataset(PartitionedDataset): >>> dataset.load() """ - DEFAULT_CHECKPOINT_TYPE = "kedro_datasets.text.TextDataset" # TODO: PartitionedDataset should move to kedro-datasets + DEFAULT_CHECKPOINT_TYPE = "kedro_datasets.text.TextDataset" DEFAULT_CHECKPOINT_FILENAME = "CHECKPOINT" def __init__( # noqa: too-many-arguments From b2405f34bf8382c5dd1a7bb1371a209ccaf5b308 Mon Sep 17 00:00:00 2001 From: Merel Theisen Date: Tue, 10 Oct 2023 12:26:06 +0100 Subject: [PATCH 12/12] Rename remaining mentions of DataSet Signed-off-by: Merel Theisen --- docs/source/kedro.io.rst | 8 ------ docs/source/kedro_datasets.rst | 43 ----------------------------- docs/source/tutorial/set_up_data.md | 10 +++---- 3 files changed, 5 insertions(+), 56 deletions(-) diff --git a/docs/source/kedro.io.rst b/docs/source/kedro.io.rst index 56c6a7d6d5..e86af85047 100644 --- a/docs/source/kedro.io.rst +++ b/docs/source/kedro.io.rst @@ -13,16 +13,11 @@ kedro.io kedro.io.AbstractDataset kedro.io.AbstractVersionedDataset - kedro.io.CachedDataSet kedro.io.CachedDataset kedro.io.DataCatalog - kedro.io.IncrementalDataSet kedro.io.IncrementalDataset - kedro.io.LambdaDataSet kedro.io.LambdaDataset - kedro.io.MemoryDataSet kedro.io.MemoryDataset - kedro.io.PartitionedDataSet kedro.io.PartitionedDataset kedro.io.Version @@ -32,9 +27,6 @@ kedro.io :toctree: :template: autosummary/class.rst - kedro.io.DataSetAlreadyExistsError - kedro.io.DataSetError - kedro.io.DataSetNotFoundError kedro.io.DatasetAlreadyExistsError kedro.io.DatasetError kedro.io.DatasetNotFoundError diff --git a/docs/source/kedro_datasets.rst b/docs/source/kedro_datasets.rst index d8db36ee0f..6d3077c338 100644 --- a/docs/source/kedro_datasets.rst +++ b/docs/source/kedro_datasets.rst @@ -11,91 +11,48 @@ kedro_datasets :toctree: :template: autosummary/class.rst - kedro_datasets.api.APIDataSet kedro_datasets.api.APIDataset - kedro_datasets.biosequence.BioSequenceDataSet kedro_datasets.biosequence.BioSequenceDataset - kedro_datasets.dask.ParquetDataSet kedro_datasets.dask.ParquetDataset - kedro_datasets.databricks.ManagedTableDataSet kedro_datasets.databricks.ManagedTableDataset - kedro_datasets.email.EmailMessageDataSet kedro_datasets.email.EmailMessageDataset - kedro_datasets.geopandas.GeoJSONDataSet kedro_datasets.geopandas.GeoJSONDataset kedro_datasets.holoviews.HoloviewsWriter - kedro_datasets.json.JSONDataSet kedro_datasets.json.JSONDataset kedro_datasets.matplotlib.MatplotlibWriter - kedro_datasets.networkx.GMLDataSet kedro_datasets.networkx.GMLDataset - kedro_datasets.networkx.GraphMLDataSet kedro_datasets.networkx.GraphMLDataset - kedro_datasets.networkx.JSONDataSet kedro_datasets.networkx.JSONDataset - kedro_datasets.pandas.CSVDataSet kedro_datasets.pandas.CSVDataset - kedro_datasets.pandas.DeltaTableDataSet kedro_datasets.pandas.DeltaTableDataset - kedro_datasets.pandas.ExcelDataSet kedro_datasets.pandas.ExcelDataset - kedro_datasets.pandas.FeatherDataSet kedro_datasets.pandas.FeatherDataset - kedro_datasets.pandas.GBQQueryDataSet kedro_datasets.pandas.GBQQueryDataset - kedro_datasets.pandas.GBQTableDataSet kedro_datasets.pandas.GBQTableDataset - kedro_datasets.pandas.GenericDataSet kedro_datasets.pandas.GenericDataset - kedro_datasets.pandas.HDFDataSet kedro_datasets.pandas.HDFDataset - kedro_datasets.pandas.JSONDataSet kedro_datasets.pandas.JSONDataset - kedro_datasets.pandas.ParquetDataSet kedro_datasets.pandas.ParquetDataset - kedro_datasets.pandas.SQLQueryDataSet kedro_datasets.pandas.SQLQueryDataset - kedro_datasets.pandas.SQLTableDataSet kedro_datasets.pandas.SQLTableDataset - kedro_datasets.pandas.XMLDataSet kedro_datasets.pandas.XMLDataset - kedro_datasets.pickle.PickleDataSet kedro_datasets.pickle.PickleDataset - kedro_datasets.pillow.ImageDataSet kedro_datasets.pillow.ImageDataset - kedro_datasets.plotly.JSONDataSet kedro_datasets.plotly.JSONDataset - kedro_datasets.plotly.PlotlyDataSet kedro_datasets.plotly.PlotlyDataset - kedro_datasets.polars.CSVDataSet kedro_datasets.polars.CSVDataset - kedro_datasets.polars.GenericDataSet kedro_datasets.polars.GenericDataset - kedro_datasets.redis.PickleDataSet kedro_datasets.redis.PickleDataset - kedro_datasets.snowflake.SnowparkTableDataSet kedro_datasets.snowflake.SnowparkTableDataset - kedro_datasets.spark.DeltaTableDataSet kedro_datasets.spark.DeltaTableDataset - kedro_datasets.spark.SparkDataSet kedro_datasets.spark.SparkDataset - kedro_datasets.spark.SparkHiveDataSet kedro_datasets.spark.SparkHiveDataset - kedro_datasets.spark.SparkJDBCDataSet kedro_datasets.spark.SparkJDBCDataset - kedro_datasets.spark.SparkStreamingDataSet kedro_datasets.spark.SparkStreamingDataset - kedro_datasets.svmlight.SVMLightDataSet kedro_datasets.svmlight.SVMLightDataset - kedro_datasets.tensorflow.TensorFlowModelDataSet kedro_datasets.tensorflow.TensorFlowModelDataset - kedro_datasets.text.TextDataSet kedro_datasets.text.TextDataset - kedro_datasets.tracking.JSONDataSet kedro_datasets.tracking.JSONDataset - kedro_datasets.tracking.MetricsDataSet kedro_datasets.tracking.MetricsDataset - kedro_datasets.video.VideoDataSet kedro_datasets.video.VideoDataset - kedro_datasets.yaml.YAMLDataSet kedro_datasets.yaml.YAMLDataset diff --git a/docs/source/tutorial/set_up_data.md b/docs/source/tutorial/set_up_data.md index 2315f04068..dfd1c5089b 100644 --- a/docs/source/tutorial/set_up_data.md +++ b/docs/source/tutorial/set_up_data.md @@ -28,11 +28,11 @@ Open `conf/base/catalog.yml` for the spaceflights project to inspect the content ```yaml companies: - type: pandas.CSVDataSet + type: pandas.CSVDataset filepath: data/01_raw/companies.csv reviews: - type: pandas.CSVDataSet + type: pandas.CSVDataset filepath: data/01_raw/reviews.csv ```
@@ -44,7 +44,7 @@ Likewise for the `xlsx` dataset: ```yaml shuttles: - type: pandas.ExcelDataSet + type: pandas.ExcelDataset filepath: data/01_raw/shuttles.xlsx load_args: engine: openpyxl # Use modern Excel engine (the default since Kedro 0.18.0) @@ -75,7 +75,7 @@ companies.head() Click to expand ``` -INFO Loading data from 'companies' (CSVDataSet) +INFO Loading data from 'companies' (CSVDataset) Out[1]: id company_rating company_location total_fleet_count iata_approved 0 35029 100% Niue 4.0 f @@ -100,7 +100,7 @@ You should see output such as the following: Click to expand ``` -INFO Loading data from 'shuttles' (ExcelDataSet) +INFO Loading data from 'shuttles' (ExcelDataset) Out[1]: id shuttle_location shuttle_type engine_type ... d_check_complete moon_clearance_complete price company_id 0 63561 Niue Type V5 Quantum ... f f $1,325.0 35029