Skip to content

Commit

Permalink
refactor(datasets): deprecate "DataSet" type names (api)
Browse files Browse the repository at this point in the history
Signed-off-by: Deepyaman Datta <[email protected]>
  • Loading branch information
deepyaman committed Sep 4, 2023
1 parent e6d0f6c commit efb95d1
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 40 deletions.
1 change: 1 addition & 0 deletions kedro-datasets/docs/source/kedro_datasets.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ kedro_datasets
:template: autosummary/class.rst

kedro_datasets.api.APIDataSet
kedro_datasets.api.APIDataset
kedro_datasets.biosequence.BioSequenceDataSet
kedro_datasets.dask.ParquetDataSet
kedro_datasets.databricks.ManagedTableDataSet
Expand Down
9 changes: 6 additions & 3 deletions kedro-datasets/kedro_datasets/api/__init__.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@
"""``APIDataSet`` loads the data from HTTP(S) APIs
"""``APIDataset`` loads the data from HTTP(S) APIs
and returns the response either as a string or as a JSON dict.
It uses the python requests library: https://requests.readthedocs.io/en/latest/
"""
from __future__ import annotations

from typing import Any

import lazy_loader as lazy

# https://github.com/pylint-dev/pylint/issues/4300#issuecomment-1043601901
APIDataSet: Any
APIDataSet: type[APIDataset]
APIDataset: Any

__getattr__, __dir__, __all__ = lazy.attach(
__name__, submod_attrs={"api_dataset": ["APIDataSet"]}
__name__, submod_attrs={"api_dataset": ["APIDataSet", "APIDataset"]}
)
39 changes: 29 additions & 10 deletions kedro-datasets/kedro_datasets/api/api_dataset.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
"""``APIDataSet`` loads the data from HTTP(S) APIs.
"""``APIDataset`` loads the data from HTTP(S) APIs.
It uses the python requests library: https://requests.readthedocs.io/en/latest/
"""
import json as json_ # make pylint happy
import warnings
from copy import deepcopy
from typing import Any, Dict, List, Tuple, Union

Expand All @@ -13,8 +14,8 @@
from .._io import DatasetError as DataSetError


class APIDataSet(AbstractDataSet[None, requests.Response]):
"""``APIDataSet`` loads/saves data from/to HTTP(S) APIs.
class APIDataset(AbstractDataSet[None, requests.Response]):
"""``APIDataset`` loads/saves data from/to HTTP(S) APIs.
It uses the python requests library: https://requests.readthedocs.io/en/latest/
Example usage for the `YAML API <https://kedro.readthedocs.io/en/stable/data/\
Expand All @@ -23,7 +24,7 @@ class APIDataSet(AbstractDataSet[None, requests.Response]):
.. code-block:: yaml
usda:
type: api.APIDataSet
type: api.APIDataset
url: https://quickstats.nass.usda.gov
params:
key: SOME_TOKEN,
Expand All @@ -36,10 +37,10 @@ class APIDataSet(AbstractDataSet[None, requests.Response]):
Example usage for the `Python API <https://kedro.readthedocs.io/en/stable/data/\
advanced_data_catalog_usage.html>`_: ::
>>> from kedro_datasets.api import APIDataSet
>>> from kedro_datasets.api import APIDataset
>>>
>>>
>>> data_set = APIDataSet(
>>> data_set = APIDataset(
>>> url="https://quickstats.nass.usda.gov",
>>> load_args={
>>> "params": {
Expand All @@ -55,12 +56,12 @@ class APIDataSet(AbstractDataSet[None, requests.Response]):
>>> )
>>> data = data_set.load()
``APIDataSet`` can also be used to save output on a remote server using HTTP(S)
``APIDataset`` can also be used to save output on a remote server using HTTP(S)
methods. ::
>>> example_table = '{"col1":["val1", "val2"], "col2":["val3", "val4"]}'
>>> data_set = APIDataSet(
>>> data_set = APIDataset(
method = "POST",
url = "url_of_remote_server",
save_args = {"chunk_size":1}
Expand All @@ -74,7 +75,7 @@ class APIDataSet(AbstractDataSet[None, requests.Response]):
used if the input of save method is a list. It will divide the request into chunks
of size `chunk_size`. For example, here we will send two requests each containing
one row of our example DataFrame.
If the data passed to the save method is not a list, ``APIDataSet`` will check if it
If the data passed to the save method is not a list, ``APIDataset`` will check if it
can be loaded as JSON. If true, it will send the data unchanged in a single request.
Otherwise, the ``_save`` method will try to dump the data in JSON format and execute
the request.
Expand All @@ -99,7 +100,7 @@ def __init__(
credentials: Union[Tuple[str, str], List[str], AuthBase] = None,
metadata: Dict[str, Any] = None,
) -> None:
"""Creates a new instance of ``APIDataSet`` to fetch data from an API endpoint.
"""Creates a new instance of ``APIDataset`` to fetch data from an API endpoint.
Args:
url: The API URL endpoint.
Expand Down Expand Up @@ -233,3 +234,21 @@ def _exists(self) -> bool:
with sessions.Session() as session:
response = self._execute_request(session)
return response.ok


# Map of deprecated public names to the classes that replaced them.
_DEPRECATED_CLASSES = {
    "APIDataSet": APIDataset,
}


def __getattr__(name):
    """Resolve deprecated attribute names on this module (PEP 562).

    Python only calls a module-level ``__getattr__`` when *name* is not
    found through normal lookup, so current class names are unaffected.
    Accessing a deprecated alias emits a ``DeprecationWarning`` and
    returns the renamed class; any other unknown name raises
    ``AttributeError`` as usual.
    """
    try:
        replacement = _DEPRECATED_CLASSES[name]
    except KeyError:
        raise AttributeError(
            f"module {repr(__name__)} has no attribute {repr(name)}"
        ) from None
    # stacklevel=2 points the warning at the caller's access site,
    # not at this shim.
    warnings.warn(
        f"{repr(name)} has been renamed to {repr(replacement.__name__)}, "
        f"and the alias will be removed in Kedro-Datasets 2.0.0",
        DeprecationWarning,
        stacklevel=2,
    )
    return replacement
65 changes: 38 additions & 27 deletions kedro-datasets/tests/api/test_api_dataset.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# pylint: disable=no-member
import base64
import importlib
import json
import socket
from typing import Any
Expand All @@ -9,7 +10,8 @@
from kedro.io.core import DataSetError
from requests.auth import HTTPBasicAuth

from kedro_datasets.api import APIDataSet
from kedro_datasets.api import APIDataset
from kedro_datasets.api.api_dataset import _DEPRECATED_CLASSES

POSSIBLE_METHODS = ["GET", "OPTIONS", "HEAD", "POST", "PUT", "PATCH", "DELETE"]
SAVE_METHODS = ["POST", "PUT"]
Expand All @@ -27,18 +29,27 @@
TEST_SAVE_DATA = [{"key1": "info1", "key2": "info2"}]


class TestAPIDataSet:
@pytest.mark.parametrize(
    "module_name", ["kedro_datasets.api", "kedro_datasets.api.api_dataset"]
)
@pytest.mark.parametrize("class_name", _DEPRECATED_CLASSES)
def test_deprecation(module_name, class_name):
    """Accessing a deprecated class name must emit a rename warning."""
    expected = f"{repr(class_name)} has been renamed"
    # Import inside the ``pytest.warns`` context so a warning raised at
    # import time would also be captured, matching attribute access.
    with pytest.warns(DeprecationWarning, match=expected):
        module = importlib.import_module(module_name)
        getattr(module, class_name)


class TestAPIDataset:
@pytest.mark.parametrize("method", POSSIBLE_METHODS)
def test_request_method(self, requests_mock, method):
if method in ["OPTIONS", "HEAD", "PATCH", "DELETE"]:
with pytest.raises(
ValueError,
match="Only GET, POST and PUT methods are supported",
):
APIDataSet(url=TEST_URL, method=method)
APIDataset(url=TEST_URL, method=method)

else:
api_data_set = APIDataSet(url=TEST_URL, method=method)
api_data_set = APIDataset(url=TEST_URL, method=method)

requests_mock.register_uri(method, TEST_URL, text=TEST_TEXT_RESPONSE_DATA)

Expand All @@ -59,7 +70,7 @@ def test_request_method(self, requests_mock, method):
],
)
def test_params_in_request(self, requests_mock, parameters_in, url_postfix):
api_data_set = APIDataSet(
api_data_set = APIDataset(
url=TEST_URL, method=TEST_METHOD, load_args={"params": parameters_in}
)
requests_mock.register_uri(
Expand All @@ -71,7 +82,7 @@ def test_params_in_request(self, requests_mock, parameters_in, url_postfix):
assert response.text == TEST_TEXT_RESPONSE_DATA

def test_json_in_request(self, requests_mock):
api_data_set = APIDataSet(
api_data_set = APIDataset(
url=TEST_URL,
method=TEST_METHOD,
load_args={"json": TEST_JSON_REQUEST_DATA},
Expand All @@ -82,7 +93,7 @@ def test_json_in_request(self, requests_mock):
assert response.request.json() == TEST_JSON_REQUEST_DATA

def test_headers_in_request(self, requests_mock):
api_data_set = APIDataSet(
api_data_set = APIDataset(
url=TEST_URL, method=TEST_METHOD, load_args={"headers": TEST_HEADERS}
)
requests_mock.register_uri(TEST_METHOD, TEST_URL, headers={"pan": "cake"})
Expand All @@ -93,7 +104,7 @@ def test_headers_in_request(self, requests_mock):
assert response.headers["pan"] == "cake"

def test_api_cookies(self, requests_mock):
api_data_set = APIDataSet(
api_data_set = APIDataset(
url=TEST_URL, method=TEST_METHOD, load_args={"cookies": {"pan": "cake"}}
)
requests_mock.register_uri(TEST_METHOD, TEST_URL, text="text")
Expand All @@ -107,7 +118,7 @@ def test_credentials_auth_error(self):
the constructor should raise a ValueError.
"""
with pytest.raises(ValueError, match="both auth and credentials"):
APIDataSet(
APIDataset(
url=TEST_URL, method=TEST_METHOD, load_args={"auth": []}, credentials={}
)

Expand All @@ -128,7 +139,7 @@ def _basic_auth(username, password):
],
)
def test_auth_sequence(self, requests_mock, auth_kwarg):
api_data_set = APIDataSet(url=TEST_URL, method=TEST_METHOD, **auth_kwarg)
api_data_set = APIDataset(url=TEST_URL, method=TEST_METHOD, **auth_kwarg)
requests_mock.register_uri(
TEST_METHOD,
TEST_URL,
Expand All @@ -137,7 +148,7 @@ def test_auth_sequence(self, requests_mock, auth_kwarg):

response = api_data_set.load()
assert isinstance(response, requests.Response)
assert response.request.headers["Authorization"] == TestAPIDataSet._basic_auth(
assert response.request.headers["Authorization"] == TestAPIDataset._basic_auth(
"john", "doe"
)
assert response.text == TEST_TEXT_RESPONSE_DATA
Expand All @@ -151,7 +162,7 @@ def test_auth_sequence(self, requests_mock, auth_kwarg):
],
)
def test_api_timeout(self, requests_mock, timeout_in, timeout_out):
api_data_set = APIDataSet(
api_data_set = APIDataset(
url=TEST_URL, method=TEST_METHOD, load_args={"timeout": timeout_in}
)
requests_mock.register_uri(TEST_METHOD, TEST_URL)
Expand All @@ -161,7 +172,7 @@ def test_api_timeout(self, requests_mock, timeout_in, timeout_out):
def test_stream(self, requests_mock):
text = "I am being streamed."

api_data_set = APIDataSet(
api_data_set = APIDataset(
url=TEST_URL, method=TEST_METHOD, load_args={"stream": True}
)

Expand All @@ -175,7 +186,7 @@ def test_stream(self, requests_mock):
assert chunks == ["I ", "am", " b", "ei", "ng", " s", "tr", "ea", "me", "d."]

def test_proxy(self, requests_mock):
api_data_set = APIDataSet(
api_data_set = APIDataset(
url="ftp://example.com/api/test",
method=TEST_METHOD,
load_args={"proxies": {"ftp": "ftp://127.0.0.1:3000"}},
Expand All @@ -198,7 +209,7 @@ def test_proxy(self, requests_mock):
],
)
def test_certs(self, requests_mock, cert_in, cert_out):
api_data_set = APIDataSet(
api_data_set = APIDataset(
url=TEST_URL, method=TEST_METHOD, load_args={"cert": cert_in}
)
requests_mock.register_uri(TEST_METHOD, TEST_URL)
Expand All @@ -210,7 +221,7 @@ def test_exists_http_error(self, requests_mock):
In case of an unexpected HTTP error,
``exists()`` should not silently catch it.
"""
api_data_set = APIDataSet(
api_data_set = APIDataset(
url=TEST_URL,
method=TEST_METHOD,
load_args={"params": TEST_PARAMS, "headers": TEST_HEADERS},
Expand All @@ -230,7 +241,7 @@ def test_exists_ok(self, requests_mock):
If the file actually exists and server responds 200,
``exists()`` should return True
"""
api_data_set = APIDataSet(
api_data_set = APIDataset(
url=TEST_URL,
method=TEST_METHOD,
load_args={"params": TEST_PARAMS, "headers": TEST_HEADERS},
Expand All @@ -245,7 +256,7 @@ def test_exists_ok(self, requests_mock):
assert api_data_set.exists()

def test_http_error(self, requests_mock):
api_data_set = APIDataSet(
api_data_set = APIDataset(
url=TEST_URL,
method=TEST_METHOD,
load_args={"params": TEST_PARAMS, "headers": TEST_HEADERS},
Expand All @@ -262,7 +273,7 @@ def test_http_error(self, requests_mock):
api_data_set.load()

def test_socket_error(self, requests_mock):
api_data_set = APIDataSet(
api_data_set = APIDataset(
url=TEST_URL,
method=TEST_METHOD,
load_args={"params": TEST_PARAMS, "headers": TEST_HEADERS},
Expand All @@ -281,7 +292,7 @@ def test_socket_error(self, requests_mock):
def test_successful_save(self, requests_mock, method, data):
"""
When we want to save some data on a server
Given an APIDataSet class
Given an APIDataset class
Then check that the response is OK and the sent data is in the correct form.
"""

Expand All @@ -292,7 +303,7 @@ def json_callback(
return request.json()

if method in ["PUT", "POST"]:
api_data_set = APIDataSet(
api_data_set = APIDataset(
url=TEST_URL,
method=method,
save_args={"params": TEST_PARAMS, "headers": TEST_HEADERS},
Expand All @@ -309,7 +320,7 @@ def json_callback(
assert response.json() == TEST_SAVE_DATA

elif method == "GET":
api_data_set = APIDataSet(
api_data_set = APIDataset(
url=TEST_URL,
method=method,
save_args={"params": TEST_PARAMS, "headers": TEST_HEADERS},
Expand All @@ -321,13 +332,13 @@ def json_callback(
ValueError,
match="Only GET, POST and PUT methods are supported",
):
APIDataSet(url=TEST_URL, method=method)
APIDataset(url=TEST_URL, method=method)

@pytest.mark.parametrize("save_methods", SAVE_METHODS)
def test_successful_save_with_json(self, requests_mock, save_methods):
"""
When we want to save with json parameters
Given an APIDataSet class
Given an APIDataset class
Then check we get a response
"""

Expand All @@ -337,7 +348,7 @@ def json_callback(
"""Callback that sends back the json."""
return request.json()

api_data_set = APIDataSet(
api_data_set = APIDataset(
url=TEST_URL,
method=save_methods,
save_args={"json": TEST_JSON_RESPONSE_DATA, "headers": TEST_HEADERS},
Expand All @@ -363,7 +374,7 @@ def json_callback(

@pytest.mark.parametrize("save_methods", SAVE_METHODS)
def test_save_http_error(self, requests_mock, save_methods):
api_data_set = APIDataSet(
api_data_set = APIDataset(
url=TEST_URL,
method=save_methods,
save_args={"params": TEST_PARAMS, "headers": TEST_HEADERS, "chunk_size": 2},
Expand All @@ -384,7 +395,7 @@ def test_save_http_error(self, requests_mock, save_methods):

@pytest.mark.parametrize("save_methods", SAVE_METHODS)
def test_save_socket_error(self, requests_mock, save_methods):
api_data_set = APIDataSet(
api_data_set = APIDataset(
url=TEST_URL,
method=save_methods,
save_args={"params": TEST_PARAMS, "headers": TEST_HEADERS},
Expand Down

0 comments on commit efb95d1

Please sign in to comment.