Skip to content

Commit

Permalink
Merge branch 'kedro-org:main' into fix/dataset-naming-tensorflowmodeldataset
Browse files Browse the repository at this point in the history
  • Loading branch information
BrianCechmanek authored Apr 25, 2023
2 parents 871a9ef + 61e6797 commit 0018cc4
Show file tree
Hide file tree
Showing 8 changed files with 276 additions and 152 deletions.
3 changes: 0 additions & 3 deletions .github/workflows/check-plugin.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,6 @@ jobs:
run: |
cd ${{ inputs.plugin }}
pip install -r test_requirements.txt
- name: Install pytables (only for kedro-datasets on windows)
if: matrix.os == 'windows-latest' && inputs.plugin == 'kedro-datasets'
run: pip install tables
- name: pip freeze
run: pip freeze
- name: Run unit tests for Linux / all plugins
Expand Down
33 changes: 33 additions & 0 deletions .gitpod.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Learn more from ready-to-use templates: https://www.gitpod.io/docs/introduction/getting-started/quickstart
image: gitpod/workspace-python-3.10:2023-04-20-16-32-37


tasks:
# We want packages installed during the pre-build init steps to go to /workspace
# rather than ~ so that they are persisted. Gitpod sets PIP_USER=yes to ensure this,
# but pre-commit requires PIP_USER=no. Hence we set PIP_USER=no and use
# pip install --user to install to /workspace.
- name: kedro-plugins
before: |
echo PIP_USER=no >> ~/.bashrc && export PIP_USER=no
init: |
make sign-off
command: |
pre-commit install --install-hooks
clear
github:
prebuilds:
# enable for the master/default branch (defaults to true)
master: true
# enable for all branches in this repo (defaults to false)
branches: true
# enable for pull requests coming from this repo (defaults to true)
pullRequests: true
# enable for pull requests coming from forks (defaults to false)
pullRequestsFromForks: true
# add a "Review in Gitpod" button as a comment to pull requests (defaults to true)
addComment: false
# add a "Review in Gitpod" button to pull requests (defaults to false)
addBadge: true
2 changes: 1 addition & 1 deletion kedro-datasets/RELEASE.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
## Major features and improvements:
* Added pandas 2.0 support.
* Added SQLAlchemy 2.0 support (and dropped support for versions below 1.4).
* Reduced constructor arguments for `APIDataSet` by replacing most arguments with a single constructor argument `load_args`. This makes it more consistent with other Kedro DataSets and the underlying `requests` API, and automatically enables the full configuration domain: stream, certificates, proxies, and more.

## Bug fixes and other changes
* Relaxed `delta-spark` upper bound to allow compatibility with Spark 3.1.x and 3.2.x.

Expand Down
111 changes: 59 additions & 52 deletions kedro-datasets/kedro_datasets/api/api_dataset.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,17 @@
"""``APIDataSet`` loads the data from HTTP(S) APIs.
It uses the python requests library: https://requests.readthedocs.io/en/latest/
"""
from typing import Any, Dict, Iterable, List, NoReturn, Union
from typing import Any, Dict, List, NoReturn, Tuple, Union

import requests
from kedro.io.core import AbstractDataSet, DataSetError
from requests import Session, sessions
from requests.auth import AuthBase

# NOTE: kedro.extras.datasets will be removed in Kedro 0.19.0.
# Any contribution to datasets should be made in kedro-datasets
# in kedro-plugins (https://github.com/kedro-org/kedro-plugins)


class APIDataSet(AbstractDataSet[None, requests.Response]):
"""``APIDataSet`` loads the data from HTTP(S) APIs.
Expand Down Expand Up @@ -34,88 +39,89 @@ class APIDataSet(AbstractDataSet[None, requests.Response]):
data_catalog.html#use-the-data-catalog-with-the-code-api>`_:
::
>>> from kedro_datasets.api import APIDataSet
>>> from kedro.extras.datasets.api import APIDataSet
>>>
>>>
>>> data_set = APIDataSet(
>>> url="https://quickstats.nass.usda.gov",
>>> params={
>>> "key": "SOME_TOKEN",
>>> "format": "JSON",
>>> "commodity_desc": "CORN",
>>> "statisticcat_des": "YIELD",
>>> "agg_level_desc": "STATE",
>>> "year": 2000
>>> }
>>> load_args={
>>> "params": {
>>> "key": "SOME_TOKEN",
>>> "format": "JSON",
>>> "commodity_desc": "CORN",
>>> "statisticcat_des": "YIELD",
>>> "agg_level_desc": "STATE",
>>> "year": 2000
>>> }
>>> },
>>> credentials=("username", "password")
>>> )
>>> data = data_set.load()
"""

# pylint: disable=too-many-arguments
def __init__(
self,
url: str,
method: str = "GET",
data: Any = None,
params: Dict[str, Any] = None,
headers: Dict[str, Any] = None,
auth: Union[Iterable[str], AuthBase] = None,
json: Union[List, Dict[str, Any]] = None,
timeout: int = 60,
credentials: Union[Iterable[str], AuthBase] = None,
load_args: Dict[str, Any] = None,
credentials: Union[Tuple[str, str], List[str], AuthBase] = None,
) -> None:
"""Creates a new instance of ``APIDataSet`` to fetch data from an API endpoint.
Args:
url: The API URL endpoint.
method: The Method of the request, GET, POST, PUT, DELETE, HEAD, etc...
data: The request payload, used for POST, PUT, etc requests
https://requests.readthedocs.io/en/latest/user/quickstart/#more-complicated-post-requests
params: The url parameters of the API.
https://requests.readthedocs.io/en/latest/user/quickstart/#passing-parameters-in-urls
headers: The HTTP headers.
https://requests.readthedocs.io/en/latest/user/quickstart/#custom-headers
auth: Anything ``requests`` accepts. Normally it's either ``('login', 'password')``,
or ``AuthBase``, ``HTTPBasicAuth`` instance for more complex cases. Any
iterable will be cast to a tuple.
json: The request payload, used for POST, PUT, etc requests, passed in
to the json kwarg in the requests object.
https://requests.readthedocs.io/en/latest/user/quickstart/#more-complicated-post-requests
timeout: The wait time in seconds for a response, defaults to 1 minute.
https://requests.readthedocs.io/en/latest/user/quickstart/#timeouts
credentials: same as ``auth``. Allows specifying ``auth`` secrets in
credentials.yml.
load_args: Additional parameters to be fed to requests.request.
https://requests.readthedocs.io/en/latest/api/#requests.request
credentials: Allows specifying secrets in credentials.yml.
Expected format is ``('login', 'password')`` if given as a tuple or list.
An ``AuthBase`` instance can be provided for more complex cases.
Raises:
ValueError: if both ``credentials`` and ``auth`` are specified.
ValueError: if both ``auth`` in ``load_args`` and ``credentials`` are specified.
"""
super().__init__()

if credentials is not None and auth is not None:
self._load_args = load_args or {}
self._load_args_auth = self._load_args.pop("auth", None)

if credentials is not None and self._load_args_auth is not None:
raise ValueError("Cannot specify both auth and credentials.")

auth = credentials or auth
self._auth = credentials or self._load_args_auth

if "cert" in self._load_args:
self._load_args["cert"] = self._convert_type(self._load_args["cert"])

if isinstance(auth, Iterable):
auth = tuple(auth)
if "timeout" in self._load_args:
self._load_args["timeout"] = self._convert_type(self._load_args["timeout"])

self._request_args: Dict[str, Any] = {
"url": url,
"method": method,
"data": data,
"params": params,
"headers": headers,
"auth": auth,
"json": json,
"timeout": timeout,
"auth": self._convert_type(self._auth),
**self._load_args,
}

@staticmethod
def _convert_type(value: Any):
    """Cast a list to a tuple; return any other value unchanged.

    From the Data Catalog, iterables are provided as Lists.
    However, for some parameters in the Python requests library,
    only Tuples are allowed.
    """
    if isinstance(value, List):
        return tuple(value)
    return value

def _describe(self) -> Dict[str, Any]:
return {**self._request_args}
# prevent auth from logging
request_args_cp = self._request_args.copy()
request_args_cp.pop("auth", None)
return request_args_cp

def _execute_request(self) -> requests.Response:
def _execute_request(self, session: Session) -> requests.Response:
try:
response = requests.request(**self._request_args)
response = session.request(**self._request_args)
response.raise_for_status()
except requests.exceptions.HTTPError as exc:
raise DataSetError("Failed to fetch data", exc) from exc
Expand All @@ -125,12 +131,13 @@ def _execute_request(self) -> requests.Response:
return response

def _load(self) -> requests.Response:
return self._execute_request()
with sessions.Session() as session:
return self._execute_request(session)

def _save(self, data: None) -> NoReturn:
    """Always raise ``DataSetError``: this data set type is read-only."""
    raise DataSetError(f"{self.__class__.__name__} is a read only data set type")

def _exists(self) -> bool:
response = self._execute_request()

with sessions.Session() as session:
response = self._execute_request(session)
return response.ok
3 changes: 1 addition & 2 deletions kedro-datasets/test_requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,7 @@ scikit-learn~=1.0.2
scipy~=1.7.3
snowflake-snowpark-python~=1.0.0; python_version == '3.8'
SQLAlchemy>=1.4, <3.0 # The `Inspector.has_table()` method replaces the `Engine.has_table()` method in version 1.4.
tables~=3.6.0; platform_system == "Windows" and python_version < '3.9'
tables~=3.6; platform_system != "Windows"
tables~=3.7
tensorflow-macos~=2.0; platform_system == "Darwin" and platform_machine == "arm64"
tensorflow~=2.0; platform_system != "Darwin" or platform_machine != "arm64"
triad>=0.6.7, <1.0
Expand Down
Loading

0 comments on commit 0018cc4

Please sign in to comment.