From 6c24d8bcdf1a13068acb0842a8ef9268563ddc03 Mon Sep 17 00:00:00 2001 From: Amrit Ghimire Date: Thu, 7 Nov 2024 17:01:24 +0545 Subject: [PATCH 1/3] Add studio flag for rm-dataset and edit-dataset This adds support for the `--studio` flag to the edit-dataset and rm-dataset commands. If the --studio flag is passed, it will use the Studio client to process the operation. Some examples: - `datachain rm-dataset "new_test_dataset" --studio --version 1` - `datachain edit-dataset png_files --studio --new-name new_dataset_name` TODO: - Add tests Studio PR: https://github.com/iterative/studio/pull/10890 --- src/datachain/cli.py | 69 +++++++++++++++++++++++++++++++--- src/datachain/remote/studio.py | 40 ++++++++++++++++---- src/datachain/studio.py | 29 ++++++++++++++ 3 files changed, 125 insertions(+), 13 deletions(-) diff --git a/src/datachain/cli.py b/src/datachain/cli.py index 9859c1af..1c8e6ef7 100644 --- a/src/datachain/cli.py +++ b/src/datachain/cli.py @@ -18,7 +18,12 @@ from datachain.config import Config from datachain.error import DataChainError from datachain.lib.dc import DataChain -from datachain.studio import list_datasets, process_studio_cli_args +from datachain.studio import ( + edit_studio_dataset, + list_datasets, + process_studio_cli_args, + remove_studio_dataset, +) from datachain.telemetry import telemetry if TYPE_CHECKING: @@ -418,6 +423,18 @@ def get_parser() -> ArgumentParser: # noqa: PLR0915 nargs="+", help="Dataset labels", ) + parse_edit_dataset.add_argument( + "--studio", + action="store_true", + default=False, + help="Remove dataset from Studio", + ) + parse_edit_dataset.add_argument( + "--team", + action="store", + default=None, + help="The team to edit a dataset. 
By default, it will use team from config.", + ) datasets_parser = subp.add_parser( "datasets", parents=[parent_parser], description="List datasets" @@ -466,6 +483,18 @@ def get_parser() -> ArgumentParser: # noqa: PLR0915 action=BooleanOptionalAction, help="Force delete registered dataset with all of it's versions", ) + rm_dataset_parser.add_argument( + "--studio", + action="store_true", + default=False, + help="Remove dataset from Studio", + ) + rm_dataset_parser.add_argument( + "--team", + action="store", + default=None, + help="The team to delete a dataset. By default, it will use team from config.", + ) dataset_stats_parser = subp.add_parser( "dataset-stats", @@ -909,8 +938,28 @@ def rm_dataset( name: str, version: Optional[int] = None, force: Optional[bool] = False, + studio: bool = False, + team: Optional[str] = None, +): + if studio: + remove_studio_dataset(team, name, version, force) + else: + catalog.remove_dataset(name, version=version, force=force) + + +def edit_dataset( + catalog: "Catalog", + name: str, + new_name: Optional[str] = None, + description: Optional[str] = None, + labels: Optional[list[str]] = None, + studio: bool = False, + team: Optional[str] = None, ): - catalog.remove_dataset(name, version=version, force=force) + if studio: + edit_studio_dataset(team, name, new_name, description, labels) + else: + catalog.edit_dataset(name, new_name, description, labels) def dataset_stats( @@ -1127,11 +1176,14 @@ def main(argv: Optional[list[str]] = None) -> int: # noqa: C901, PLR0912, PLR09 edatachain_file=args.edatachain_file, ) elif args.command == "edit-dataset": - catalog.edit_dataset( + edit_dataset( + catalog, args.name, - description=args.description, new_name=args.new_name, + description=args.description, labels=args.labels, + studio=args.studio, + team=args.team, ) elif args.command == "ls": ls( @@ -1164,7 +1216,14 @@ def main(argv: Optional[list[str]] = None) -> int: # noqa: C901, PLR0912, PLR09 schema=args.schema, ) elif args.command == 
"rm-dataset": - rm_dataset(catalog, args.name, version=args.version, force=args.force) + rm_dataset( + catalog, + args.name, + version=args.version, + force=args.force, + studio=args.studio, + team=args.team, + ) elif args.command == "dataset-stats": dataset_stats( catalog, diff --git a/src/datachain/remote/studio.py b/src/datachain/remote/studio.py index c0b1bb00..28e494d5 100644 --- a/src/datachain/remote/studio.py +++ b/src/datachain/remote/studio.py @@ -178,17 +178,9 @@ def _send_request(self, route: str, data: dict[str, Any]) -> Response[Any]: data = {} if not ok: - logger.error( - "Got bad response from Studio, content is %s", - response.content.decode("utf-8"), - ) if response.status_code == 403: message = f"Not authorized for the team {self.team}" else: - logger.error( - "Got bad response from Studio, content is %s", - response.content.decode("utf-8"), - ) message = data.get("message", "") else: message = "" @@ -230,6 +222,38 @@ def ls(self, paths: Iterable[str]) -> Iterator[tuple[str, Response[LsData]]]: def ls_datasets(self) -> Response[LsData]: return self._send_request("datachain/ls-datasets", {}) + def edit_dataset( + self, + name: str, + new_name: Optional[str] = None, + description: Optional[str] = None, + labels: Optional[list[str]] = None, + ) -> Response[DatasetInfoData]: + return self._send_request( + "datachain/edit-dataset", + { + "dataset_name": name, + "new_name": new_name, + "description": description, + "labels": labels, + }, + ) + + def rm_dataset( + self, + name: str, + version: Optional[int] = None, + force: Optional[bool] = False, + ) -> Response[DatasetInfoData]: + return self._send_request( + "datachain/rm-dataset", + { + "dataset_name": name, + "version": version, + "force": force, + }, + ) + def dataset_info(self, name: str) -> Response[DatasetInfoData]: def _parse_dataset_info(dataset_info): _parse_dates(dataset_info, ["created_at", "finished_at"]) diff --git a/src/datachain/studio.py b/src/datachain/studio.py index 
9ef39017..25f82e88 100644 --- a/src/datachain/studio.py +++ b/src/datachain/studio.py @@ -130,6 +130,35 @@ def list_datasets(team: Optional[str] = None): yield (name, version) +def edit_studio_dataset( + team_name: Optional[str], + name: str, + new_name: Optional[str] = None, + description: Optional[str] = None, + labels: Optional[list[str]] = None, +): + client = StudioClient(team=team_name) + response = client.edit_dataset(name, new_name, description, labels) + if not response.ok: + raise_remote_error(response.message) + + print(f"Dataset {name} updated") + + +def remove_studio_dataset( + team_name: Optional[str], + name: str, + version: Optional[int] = None, + force: Optional[bool] = False, +): + client = StudioClient(team=team_name) + response = client.rm_dataset(name, version, force) + if not response.ok: + raise_remote_error(response.message) + + print(f"Dataset {name} removed") + + def save_config(hostname, token): config = Config(ConfigLevel.GLOBAL) with config.edit() as conf: From 12dd3fd7d045cf4a27010e38e0fa4fb56f65b6e3 Mon Sep 17 00:00:00 2001 From: Amrit Ghimire Date: Thu, 7 Nov 2024 22:46:48 +0545 Subject: [PATCH 2/3] Add tests --- src/datachain/cli.py | 3 - src/datachain/remote/studio.py | 20 ++++-- tests/test_cli_studio.py | 119 +++++++++++++++++++++++++++++++++ 3 files changed, 133 insertions(+), 9 deletions(-) diff --git a/src/datachain/cli.py b/src/datachain/cli.py index 1c8e6ef7..4f68fe67 100644 --- a/src/datachain/cli.py +++ b/src/datachain/cli.py @@ -408,18 +408,15 @@ def get_parser() -> ArgumentParser: # noqa: PLR0915 parse_edit_dataset.add_argument( "--new-name", action="store", - default="", help="Dataset new name", ) parse_edit_dataset.add_argument( "--description", action="store", - default="", help="Dataset description", ) parse_edit_dataset.add_argument( "--labels", - default=[], nargs="+", help="Dataset labels", ) diff --git a/src/datachain/remote/studio.py b/src/datachain/remote/studio.py index 28e494d5..5dce6e89 100644 --- 
a/src/datachain/remote/studio.py +++ b/src/datachain/remote/studio.py @@ -229,14 +229,22 @@ def edit_dataset( description: Optional[str] = None, labels: Optional[list[str]] = None, ) -> Response[DatasetInfoData]: + body = { + "dataset_name": name, + } + + if new_name is not None: + body["new_name"] = new_name + + if description is not None: + body["description"] = description + + if labels is not None: + body["labels"] = labels # type: ignore[assignment] + return self._send_request( "datachain/edit-dataset", - { - "dataset_name": name, - "new_name": new_name, - "description": description, - "labels": labels, - }, + body, ) def rm_dataset( diff --git a/tests/test_cli_studio.py b/tests/test_cli_studio.py index 02054196..e34a8f33 100644 --- a/tests/test_cli_studio.py +++ b/tests/test_cli_studio.py @@ -1,3 +1,4 @@ +import requests_mock from dvc_studio_client.auth import AuthorizationExpiredError from tabulate import tabulate @@ -164,3 +165,121 @@ def list_datasets_local(_): assert main(["datasets"]) == 0 out = capsys.readouterr().out assert sorted(out.splitlines()) == sorted(both_output.splitlines()) + + +def test_studio_edit_dataset(capsys, mocker): + with requests_mock.mock() as m: + m.post(f"{STUDIO_URL}/api/datachain/edit-dataset", json={}) + + # Studio token is required + assert ( + main( + [ + "edit-dataset", + "name", + "--new-name", + "new-name", + "--team", + "team_name", + "--studio", + ] + ) + == 1 + ) + out = capsys.readouterr().err + assert "Studio token is not set" in out + + # Set the studio token + with Config(ConfigLevel.GLOBAL).edit() as conf: + conf["studio"] = {"token": "isat_access_token", "team": "team_name"} + + assert ( + main( + [ + "edit-dataset", + "name", + "--new-name", + "new-name", + "--team", + "team_name", + "--studio", + ] + ) + == 0 + ) + + assert m.called + + last_request = m.last_request + assert last_request.json() == { + "dataset_name": "name", + "new_name": "new-name", + "team_name": "team_name", + } + + # With all arguments + 
assert ( + main( + [ + "edit-dataset", + "name", + "--new-name", + "new-name", + "--description", + "description", + "--labels", + "label1", + "--team", + "team_name", + "--studio", + ] + ) + == 0 + ) + last_request = m.last_request + assert last_request.json() == { + "dataset_name": "name", + "new_name": "new-name", + "description": "description", + "labels": ["label1"], + "team_name": "team_name", + } + + +def test_studio_rm_dataset(capsys, mocker): + with requests_mock.mock() as m: + m.post(f"{STUDIO_URL}/api/datachain/rm-dataset", json={}) + + # Studio token is required + assert main(["rm-dataset", "name", "--team", "team_name", "--studio"]) == 1 + out = capsys.readouterr().err + assert "Studio token is not set" in out + + # Set the studio token + with Config(ConfigLevel.GLOBAL).edit() as conf: + conf["studio"] = {"token": "isat_access_token", "team": "team_name"} + + assert ( + main( + [ + "rm-dataset", + "name", + "--team", + "team_name", + "--version", + "1", + "--force", + "--studio", + ] + ) + == 0 + ) + assert m.called + + last_request = m.last_request + assert last_request.json() == { + "dataset_name": "name", + "team_name": "team_name", + "version": 1, + "force": True, + } From a1610f5fee5237184c8b92203044c0ce86403fe7 Mon Sep 17 00:00:00 2001 From: Amrit Ghimire Date: Fri, 8 Nov 2024 08:10:37 +0545 Subject: [PATCH 3/3] Fix the help message --- src/datachain/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datachain/cli.py b/src/datachain/cli.py index 4f68fe67..ecdc5d32 100644 --- a/src/datachain/cli.py +++ b/src/datachain/cli.py @@ -424,7 +424,7 @@ def get_parser() -> ArgumentParser: # noqa: PLR0915 "--studio", action="store_true", default=False, - help="Remove dataset from Studio", + help="Edit dataset from Studio", ) parse_edit_dataset.add_argument( "--team",