Add studio flag for rm-dataset and edit-dataset #572

Open · wants to merge 4 commits into main
72 changes: 64 additions & 8 deletions src/datachain/cli.py
@@ -18,7 +18,12 @@
from datachain.config import Config
from datachain.error import DataChainError
from datachain.lib.dc import DataChain
from datachain.studio import list_datasets, process_studio_cli_args
from datachain.studio import (
edit_studio_dataset,
list_datasets,
process_studio_cli_args,
remove_studio_dataset,
)
from datachain.telemetry import telemetry

if TYPE_CHECKING:
@@ -403,21 +408,30 @@ def get_parser() -> ArgumentParser: # noqa: PLR0915
parse_edit_dataset.add_argument(
"--new-name",
action="store",
default="",
help="Dataset new name",
)
parse_edit_dataset.add_argument(
"--description",
action="store",
default="",
help="Dataset description",
)
parse_edit_dataset.add_argument(
"--labels",
default=[],
nargs="+",
help="Dataset labels",
)
parse_edit_dataset.add_argument(
"--studio",
action="store_true",
default=False,
help="Edit dataset from Studio",
)
parse_edit_dataset.add_argument(
"--team",
action="store",
default=None,
help="The team to edit a dataset. By default, it will use team from config.",
)

datasets_parser = subp.add_parser(
"datasets", parents=[parent_parser], description="List datasets"
@@ -466,6 +480,18 @@ def get_parser() -> ArgumentParser: # noqa: PLR0915
action=BooleanOptionalAction,
help="Force delete registered dataset with all of it's versions",
)
rm_dataset_parser.add_argument(
"--studio",
action="store_true",
default=False,
help="Remove dataset from Studio",
)
rm_dataset_parser.add_argument(
"--team",
action="store",
default=None,
help="The team to delete a dataset. By default, it will use team from config.",
)

dataset_stats_parser = subp.add_parser(
"dataset-stats",
@@ -909,8 +935,28 @@ def rm_dataset(
name: str,
version: Optional[int] = None,
force: Optional[bool] = False,
studio: bool = False,
team: Optional[str] = None,
):
catalog.remove_dataset(name, version=version, force=force)
if studio:
remove_studio_dataset(team, name, version, force)
else:
catalog.remove_dataset(name, version=version, force=force)


def edit_dataset(
catalog: "Catalog",
name: str,
new_name: Optional[str] = None,
description: Optional[str] = None,
labels: Optional[list[str]] = None,
studio: bool = False,
team: Optional[str] = None,
):
if studio:
edit_studio_dataset(team, name, new_name, description, labels)
else:
catalog.edit_dataset(name, new_name, description, labels)

Review thread:

Member: [Q] Is there a reason for deviating from the behaviour implemented in #561? Why can't a user edit/remove a dataset from both local and remote in the same call?

Contributor Author: For the list command, having the option to list datasets from both local and remote seems like a frequent operation. But I am not sure how helpful it would be to edit or remove a dataset from both local and remote in the same call. I was also looking at the implementation of pull, where the ds:// prefix is used to identify whether a dataset is remote or local. I don't know whether it would be confusing to users to implement that in edit and remove as well. cc @shcheklein

Member: > But I am not sure how helpful it would be to edit or remove a dataset from both local and remote in the same call.

If you think about this in terms of #578, users would definitely want to be able to rename both at the same time. We want to avoid users ending up with datasets with the same identifier being named differently between local and remote as much as possible.
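To make the reviewer's point concrete, here is a minimal sketch of an edit that updates both the local and Studio copies in one call, reusing the helpers this PR adds; the combined behaviour is hypothetical and not part of this change, and it assumes the imports already present in cli.py:

def edit_dataset_everywhere(
    catalog: "Catalog",
    name: str,
    new_name: Optional[str] = None,
    description: Optional[str] = None,
    labels: Optional[list[str]] = None,
    team: Optional[str] = None,
):
    # Hypothetical variant: update the local dataset first ...
    catalog.edit_dataset(name, new_name, description, labels)
    # ... then mirror the change in Studio so identifiers stay in sync.
    edit_studio_dataset(team, name, new_name, description, labels)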


def dataset_stats(
@@ -1127,11 +1173,14 @@ def main(argv: Optional[list[str]] = None) -> int: # noqa: C901, PLR0912, PLR09
edatachain_file=args.edatachain_file,
)
elif args.command == "edit-dataset":
catalog.edit_dataset(
edit_dataset(
catalog,
args.name,
description=args.description,
new_name=args.new_name,
description=args.description,
labels=args.labels,
studio=args.studio,
team=args.team,
)
elif args.command == "ls":
ls(
@@ -1164,7 +1213,14 @@ def main(argv: Optional[list[str]] = None) -> int: # noqa: C901, PLR0912, PLR09
schema=args.schema,
)
elif args.command == "rm-dataset":
rm_dataset(catalog, args.name, version=args.version, force=args.force)
rm_dataset(
catalog,
args.name,
version=args.version,
force=args.force,
studio=args.studio,
team=args.team,
)
elif args.command == "dataset-stats":
dataset_stats(
catalog,
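For an end-to-end picture of the new flags, here is a sketch of driving the commands through the CLI entrypoint, in the same style as the tests further down; the import path mirrors how the tests call main, and the dataset and team names are placeholders:

from datachain.cli import main

# Rename a dataset in Studio for a specific team (placeholder names).
main(["edit-dataset", "my-dataset", "--new-name", "renamed", "--studio", "--team", "my-team"])

# Remove version 1 of a Studio dataset, forcing removal of a registered dataset.
main(["rm-dataset", "my-dataset", "--version", "1", "--force", "--studio", "--team", "my-team"])

As the tests show, a Studio token must be configured first; otherwise the command exits with "Studio token is not set".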
48 changes: 40 additions & 8 deletions src/datachain/remote/studio.py
@@ -178,17 +178,9 @@ def _send_request(self, route: str, data: dict[str, Any]) -> Response[Any]:
data = {}

if not ok:
logger.error(
"Got bad response from Studio, content is %s",
response.content.decode("utf-8"),
)
if response.status_code == 403:
message = f"Not authorized for the team {self.team}"
else:
logger.error(
"Got bad response from Studio, content is %s",
response.content.decode("utf-8"),
)
message = data.get("message", "")
else:
message = ""
@@ -230,6 +222,46 @@ def ls(self, paths: Iterable[str]) -> Iterator[tuple[str, Response[LsData]]]:
def ls_datasets(self) -> Response[LsData]:
return self._send_request("datachain/ls-datasets", {})

def edit_dataset(
self,
name: str,
new_name: Optional[str] = None,
description: Optional[str] = None,
labels: Optional[list[str]] = None,
) -> Response[DatasetInfoData]:
body = {
"dataset_name": name,
}

if new_name is not None:
body["new_name"] = new_name

if description is not None:
body["description"] = description

if labels is not None:
body["labels"] = labels # type: ignore[assignment]

return self._send_request(
"datachain/edit-dataset",
body,
)

Review thread:

Member: Related to the groups of commands, etc., let's please make the API clean as well:

datachain/datasets/ls
datachain/datasets/rename

etc. rm-dataset and similar look very weird for a REST API.

Member (@skshetry, Nov 9, 2024): It could even be a single endpoint that lists on GET, renames via PUT (assuming the dataset name is not part of the resource URI, since PUT has to be idempotent), and removes via DELETE. Since a dataset is part of a team, ideally the URI should be /datachain/<team>/datasets.

Contributor Author: Added this comment to #576 as well. We can take it as part of the refactor.
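To illustrate the resource-style layout suggested in the thread above, a rough sketch follows; these routes are hypothetical and not implemented in this PR:

# Reviewer suggestion (not part of this PR): one collection URI per team,
# with the HTTP verb selecting the operation and the dataset name in the body.
SUGGESTED_DATASET_ROUTES = {
    ("GET", "/datachain/<team>/datasets"): "list datasets",
    ("PUT", "/datachain/<team>/datasets"): "rename or edit a dataset",
    ("DELETE", "/datachain/<team>/datasets"): "remove a dataset",
}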

def rm_dataset(
self,
name: str,
version: Optional[int] = None,
force: Optional[bool] = False,
) -> Response[DatasetInfoData]:
return self._send_request(
"datachain/rm-dataset",
{
"dataset_name": name,
"version": version,
"force": force,
},
)

def dataset_info(self, name: str) -> Response[DatasetInfoData]:
def _parse_dataset_info(dataset_info):
_parse_dates(dataset_info, ["created_at", "finished_at"])
29 changes: 29 additions & 0 deletions src/datachain/studio.py
@@ -130,6 +130,35 @@
yield (name, version)


def edit_studio_dataset(
team_name: Optional[str],
name: str,
new_name: Optional[str] = None,
description: Optional[str] = None,
labels: Optional[list[str]] = None,
):
client = StudioClient(team=team_name)
response = client.edit_dataset(name, new_name, description, labels)
if not response.ok:
raise_remote_error(response.message)

Codecov / codecov/patch warning: added line src/datachain/studio.py#L143 was not covered by tests.

print(f"Dataset {name} updated")


def remove_studio_dataset(
team_name: Optional[str],
name: str,
version: Optional[int] = None,
force: Optional[bool] = False,
):
client = StudioClient(team=team_name)
response = client.rm_dataset(name, version, force)
if not response.ok:
raise_remote_error(response.message)

Codecov / codecov/patch warning: added line src/datachain/studio.py#L157 was not covered by tests.

print(f"Dataset {name} removed")


def save_config(hostname, token):
config = Config(ConfigLevel.GLOBAL)
with config.edit() as conf:
119 changes: 119 additions & 0 deletions tests/test_cli_studio.py
@@ -1,3 +1,4 @@
import requests_mock
from dvc_studio_client.auth import AuthorizationExpiredError
from tabulate import tabulate

@@ -164,3 +165,121 @@ def list_datasets_local(_):
assert main(["datasets"]) == 0
out = capsys.readouterr().out
assert sorted(out.splitlines()) == sorted(both_output.splitlines())


def test_studio_edit_dataset(capsys, mocker):
with requests_mock.mock() as m:
m.post(f"{STUDIO_URL}/api/datachain/edit-dataset", json={})

# Studio token is required
assert (
main(
[
"edit-dataset",
"name",
"--new-name",
"new-name",
"--team",
"team_name",
"--studio",
]
)
== 1
)
out = capsys.readouterr().err
assert "Studio token is not set" in out

# Set the studio token
with Config(ConfigLevel.GLOBAL).edit() as conf:
conf["studio"] = {"token": "isat_access_token", "team": "team_name"}

assert (
main(
[
"edit-dataset",
"name",
"--new-name",
"new-name",
"--team",
"team_name",
"--studio",
]
)
== 0
)

assert m.called

last_request = m.last_request
assert last_request.json() == {
"dataset_name": "name",
"new_name": "new-name",
"team_name": "team_name",
}

# With all arguments
assert (
main(
[
"edit-dataset",
"name",
"--new-name",
"new-name",
"--description",
"description",
"--labels",
"label1",
"--team",
"team_name",
"--studio",
]
)
== 0
)
last_request = m.last_request
assert last_request.json() == {
"dataset_name": "name",
"new_name": "new-name",
"description": "description",
"labels": ["label1"],
"team_name": "team_name",
}


def test_studio_rm_dataset(capsys, mocker):
with requests_mock.mock() as m:
m.post(f"{STUDIO_URL}/api/datachain/rm-dataset", json={})

# Studio token is required
assert main(["rm-dataset", "name", "--team", "team_name", "--studio"]) == 1
out = capsys.readouterr().err
assert "Studio token is not set" in out

# Set the studio token
with Config(ConfigLevel.GLOBAL).edit() as conf:
conf["studio"] = {"token": "isat_access_token", "team": "team_name"}

assert (
main(
[
"rm-dataset",
"name",
"--team",
"team_name",
"--version",
"1",
"--force",
"--studio",
]
)
== 0
)
assert m.called

last_request = m.last_request
assert last_request.json() == {
"dataset_name": "name",
"team_name": "team_name",
"version": 1,
"force": True,
}