From 9bb4501bf00c55417b825a6500eb37a4f537f03a Mon Sep 17 00:00:00 2001
From: skshetry <18718008+skshetry@users.noreply.github.com>
Date: Tue, 27 Feb 2024 20:06:38 +0545
Subject: [PATCH] ds add: use --dvc/--dvcx/--url flags to accept url (#10326)

* ds add: use --dvc/--dvcx/--url flags to accept url

.. instead of relying on `scheme://`

I think this is simpler from the user's perspective, easier to teach, and
easier to maintain, as we won't have to do URL parsing.

I am taking inspiration from `cargo install`, which has `--git`/`--rev`, etc.
to install from alternate sources.

```console
Options:
      --version   Specify a version to install
      --index     Registry index to install from
      --registry  Registry to use
      --git       Git URL to install the specified crate from
      --branch    Branch to use when installing from git
      --tag       Tag to use when installing from git
      --rev       Specific commit to use when installing from git
      --path      Filesystem path to local crate to install
      --root      Directory to install packages into
```

The `name` is also changed from an option to an argument.

### Examples

```console
dvc ds add example-get-started --dvc git@github.com:iterative/example-get-started.git
dvc ds add dogs --dvcx dogs
dvc ds add dogs --url s3://bucket/key/path
```

* add --version alias
---
 dvc/commands/dataset.py            | 48 ++++++++++++++++++++----------
 dvc/repo/datasets.py               | 35 +++++++---------------
 tests/func/test_dataset.py         | 26 ++++++++--------
 tests/unit/command/test_dataset.py |  4 +--
 tests/unit/test_dataset.py         | 45 ----------------------------
 5 files changed, 58 insertions(+), 100 deletions(-)
 delete mode 100644 tests/unit/test_dataset.py

diff --git a/dvc/commands/dataset.py b/dvc/commands/dataset.py
index 0ab245f2ec..d4c35b61bc 100644
--- a/dvc/commands/dataset.py
+++ b/dvc/commands/dataset.py
@@ -49,6 +49,17 @@ def display(cls, name: str, dataset: "Dataset", action: str = "Adding"):
         ui.write(action, ui.rich_text(name, "cyan"), text, styled=True)
 
     def run(self):
+        if not self.args.dvc and self.args.rev:
+            raise DvcException("--rev can't be used without --dvc")
+        if not self.args.dvc and self.args.path:
+            raise DvcException("--path can't be used without --dvc")
+
+        d = vars(self.args)
+        for key in ["dvc", "dvcx", "url"]:
+            if url := d.pop(key, None):
+                d.update({"type": key, "url": url})
+                break
+
         existing = self.repo.datasets.get(self.args.name)
         with self.repo.scm_context:
             if not self.args.force and existing:
@@ -57,7 +68,7 @@ def run(self):
                     f"{self.args.name} already exists in {path}, "
                     "use the --force to overwrite"
                 )
-            dataset = self.repo.datasets.add(**vars(self.args))
+            dataset = self.repo.datasets.add(**d)
             self.display(self.args.name, dataset)
         return 0
 
@@ -154,33 +165,37 @@ def add_parser(subparsers, parent_parser):
         formatter_class=formatter.RawTextHelpFormatter,
         help=dataset_add_help,
     )
-    ds_add_parser.add_argument(
+
+    url_exclusive_group = ds_add_parser.add_mutually_exclusive_group(required=True)
+    url_exclusive_group.add_argument(
+        "--dvcx", metavar="name", help="Name of the dvcx dataset to track"
+    )
+    url_exclusive_group.add_argument(
+        "--dvc",
+        help="Path or URL to a Git/DVC repository to track",
+        metavar="url",
+    )
+    url_exclusive_group.add_argument(
         "--url",
-        required=True,
         help="""\
-Location of the data to download. Supported URLs:
+URL of a cloud-versioned remote to track. Supported URLs:
 
 s3://bucket/key/path
 gs://bucket/path/to/file/or/dir
 azure://mycontainer/path
 remote://remote_name/path/to/file/or/dir (see `dvc remote`)
-dvcx://dataset_name
-
-To import data from dvc/git repositories, \
-add dvc:// schema to the repo url, e.g:
-dvc://git@github.com:iterative/example-get-started.git
-dvc+https://github.com/iterative/example-get-started.git""",
-    )
-    ds_add_parser.add_argument(
-        "--name", help="Name of the dataset to add", required=True
+""",
     )
+    ds_add_parser.add_argument("name", help="Name of the dataset to add")
     ds_add_parser.add_argument(
         "--rev",
-        help="Git revision, e.g. SHA, branch, tag "
-        "(only applicable for dvc/git repository)",
+        help="Git revision, e.g. SHA, branch, tag (only applicable with --dvc)",
+        metavar="",
     )
     ds_add_parser.add_argument(
-        "--path", help="Path to a file or directory within the git repository"
+        "--path",
+        help="Path to a file or a directory within a git repository "
+        "(only applicable with --dvc)",
     )
     ds_add_parser.add_argument(
         "-f",
@@ -202,6 +217,7 @@ def add_parser(subparsers, parent_parser):
     ds_update_parser.add_argument("name", help="Name of the dataset to update")
     ds_update_parser.add_argument(
         "--rev",
+        "--version",
         nargs="?",
         help="DVCX dataset version or Git revision (e.g. SHA, branch, tag)",
         metavar="",
diff --git a/dvc/repo/datasets.py b/dvc/repo/datasets.py
index b3af559a3c..c06308f497 100644
--- a/dvc/repo/datasets.py
+++ b/dvc/repo/datasets.py
@@ -26,26 +26,6 @@ logger = logger.getChild(__name__)
 
 
-def parse_url_and_type(url: str):
-    from urllib.parse import urlsplit
-
-    if os.path.exists(url):
-        return {"type": "dvc", "url": url}
-
-    url_obj = urlsplit(url)
-    if url_obj.scheme == "dvcx":
-        return {"type": "dvcx", "url": url}
-    if url_obj.scheme and not url_obj.scheme.startswith("dvc"):
-        return {"type": "url", "url": url}
-
-    protos = tuple(url_obj.scheme.split("+"))
-    if not protos or protos == ("dvc",):
-        url = url_obj.netloc + url_obj.path
-    else:
-        url = url_obj._replace(scheme=protos[1]).geturl()
-    return {"type": "dvc", "url": url}
-
-
 def _get_dataset_record(name: str) -> "DatasetRecord":
     from dvc.exceptions import DvcException
 
@@ -189,10 +169,15 @@ class DVCXDataset:
     type: ClassVar[Literal["dvcx"]] = "dvcx"
 
+    @property
+    def pinned(self) -> bool:
+        return self.name_version[1] is not None
+
     @property
     def name_version(self) -> tuple[str, Optional[int]]:
         url = urlparse(self.spec.url)
-        parts = url.netloc.split("@v")
+        path = url.netloc + url.path
+        parts = path.split("@v")
         assert parts
         name = parts[0]
@@ -384,13 +369,15 @@ def _build_dataset(
 
     def add(
         self,
-        url: str,
         name: str,
+        url: str,
+        type: str,  # noqa: A002
         manifest_path: StrPath = "dvc.yaml",
         **kwargs: Any,
     ) -> Dataset:
-        spec = kwargs | parse_url_and_type(url) | {"name": name}
-        dataset = self._build_dataset(os.path.abspath(manifest_path), spec)
+        assert type in {"dvc", "dvcx", "url"}
+        kwargs.update({"name": name, "url": url, "type": type})
+        dataset = self._build_dataset(os.path.abspath(manifest_path), kwargs)
         dataset = dataset.update(self.repo)
         self.dump(dataset)
diff --git a/tests/func/test_dataset.py b/tests/func/test_dataset.py
index ad1f9be859..77a83e1fe2 100644
--- a/tests/func/test_dataset.py
+++ b/tests/func/test_dataset.py
@@ -48,7 +48,7 @@ def test_dvc(tmp_dir, scm, dvc: "Repo"):
     datasets = dvc.datasets
     tmp_dir.scm_gen("file", "file", commit="add file")
 
-    dataset = datasets.add(tmp_dir.fs_path, name="mydataset", path="file")
+    dataset = datasets.add("mydataset", tmp_dir.fs_path, "dvc", path="file")
     expected = DVCDataset(
         manifest_path=(tmp_dir / "dvc.yaml").fs_path,
         spec=DVCDatasetSpec(
@@ -84,13 +84,13 @@ def test_dvcx(tmp_dir, dvc, mocker):
     version_info.append(version_info[1])
     mocker.patch("dvc.repo.datasets._get_dataset_info", side_effect=version_info)
 
-    dataset = datasets.add(url="dvcx://dataset", name="mydataset")
+    dataset = datasets.add("mydataset", "dataset", "dvcx")
     expected = DVCXDataset(
         manifest_path=(tmp_dir / "dvc.yaml").fs_path,
-        spec=DatasetSpec(name="mydataset", url="dvcx://dataset", type="dvcx"),
+        spec=DatasetSpec(name="mydataset", url="dataset", type="dvcx"),
         lock=DVCXDatasetLock(
             name="mydataset",
-            url="dvcx://dataset",
+            url="dataset",
             type="dvcx",
             version=1,
             created_at=version_info[0].created_at,
@@ -125,7 +125,7 @@ def mocked_save(d):
 
     mocker.patch.object(Dependency, "save", mocked_save)
 
-    dataset = datasets.add(url="s3://dataset", name="mydataset")
+    dataset = datasets.add("mydataset", "s3://dataset", "url")
     expected = URLDataset(
         manifest_path=(tmp_dir / "dvc.yaml").fs_path,
         spec=DatasetSpec(name="mydataset", url="s3://dataset", type="url"),
@@ -191,14 +191,14 @@ def test_dvc_dump(tmp_dir, dvc):
 
 def test_dvcx_dump(tmp_dir, dvc):
     manifest_path = os.path.join(tmp_dir, "dvc.yaml")
-    spec = DatasetSpec(name="mydataset", url="dvcx://dataset", type="dvcx")
+    spec = DatasetSpec(name="mydataset", url="dataset", type="dvcx")
     dt = datetime.now(tz=timezone.utc)
     lock = DVCXDatasetLock(version=1, created_at=dt, **spec.to_dict())
     dataset = DVCXDataset(manifest_path=manifest_path, spec=spec, lock=lock)
 
     dvc.datasets.dump(dataset)
 
-    spec_d = {"name": "mydataset", "type": "dvcx", "url": "dvcx://dataset"}
+    spec_d = {"name": "mydataset", "type": "dvcx", "url": "dataset"}
     assert (tmp_dir / "dvc.yaml").parse() == {"datasets": [spec_d]}
     assert (tmp_dir / "dvc.lock").parse() == {
         "schema": "2.0",
@@ -246,7 +246,7 @@ def test_invalidation(tmp_dir, dvc):
     spec = DatasetSpec(name="mydataset", url="url1", type="url")
     lock = DVCXDatasetLock(
         name="mydataset",
-        url="dvcx://dataset",
+        url="dataset",
         type="dvcx",
         version=1,
         created_at=datetime.now(tz=timezone.utc),
@@ -262,7 +262,7 @@ def test_invalidation(tmp_dir, dvc):
 
 
 def test_dvc_dataset_pipeline(tmp_dir, dvc, scm):
-    dvc.datasets.add(name="mydataset", url=tmp_dir.fs_path)
+    dvc.datasets.add("mydataset", tmp_dir.fs_path, "dvc")
     stage = dvc.stage.add(cmd="echo", name="train", deps=["ds://mydataset"])
 
     assert (tmp_dir / "dvc.yaml").parse() == {
@@ -293,11 +293,11 @@ def test_dvcx_dataset_pipeline(mocker, tmp_dir, dvc):
     version_info = [MockedDVCXVersionInfo(1), MockedDVCXVersionInfo(2)]
     mocker.patch("dvc.repo.datasets._get_dataset_info", side_effect=version_info)
 
-    dvc.datasets.add(name="mydataset", url="dvcx://mydataset")
+    dvc.datasets.add("mydataset", "dataset", "dvcx")
     stage = dvc.stage.add(cmd="echo", name="train", deps=["ds://mydataset"])
 
     assert (tmp_dir / "dvc.yaml").parse() == {
-        "datasets": [{"name": "mydataset", "url": "dvcx://mydataset", "type": "dvcx"}],
+        "datasets": [{"name": "mydataset", "url": "dataset", "type": "dvcx"}],
         "stages": {"train": {"cmd": "echo", "deps": ["ds://mydataset"]}},
     }
@@ -330,7 +330,7 @@ def mocked_save(d):
 
     mocker.patch.object(Dependency, "save", mocked_save)
 
-    dvc.datasets.add(name="mydataset", url="s3://mydataset")
+    dvc.datasets.add("mydataset", "s3://mydataset", "url")
     stage = dvc.stage.add(cmd="echo", name="train", deps=["ds://mydataset"])
 
     assert (tmp_dir / "dvc.yaml").parse() == {
@@ -362,7 +362,7 @@ def test_pipeline_when_not_in_sync(tmp_dir, dvc):
     spec = DatasetSpec(name="mydataset", url="url1", type="url")
     lock = DVCXDatasetLock(
         name="mydataset",
-        url="dvcx://dataset",
+        url="dataset",
         type="dvcx",
         version=1,
         created_at=datetime.now(tz=timezone.utc),
diff --git a/tests/unit/command/test_dataset.py b/tests/unit/command/test_dataset.py
index 616d91345b..15ac37b321 100644
--- a/tests/unit/command/test_dataset.py
+++ b/tests/unit/command/test_dataset.py
@@ -33,7 +33,7 @@ def test_add(dvc, capsys, mocker, spec, lock, expected_output):
     m = mocker.patch("dvc.repo.datasets.Datasets.add", return_value=dataset)
 
-    assert main(["dataset", "add", "--url", spec["url"], "--name", spec["name"]]) == 0
+    assert main(["dataset", "add", spec["name"], f"--{spec['type']}", spec["url"]]) == 0
 
     out, err = capsys.readouterr()
     assert out == expected_output
     assert not err
@@ -45,7 +45,7 @@ def test_add_already_exists(dvc, caplog, mocker):
     dataset = dvc.datasets._build_dataset("dvc.yaml", spec, None)
     mocker.patch("dvc.repo.datasets.Datasets.get", return_value=dataset)
 
-    assert main(["dataset", "add", "--url", "url", "--name", "ds"]) == 255
+    assert main(["dataset", "add", "ds", "--dvcx", "dataset"]) == 255
     assert "ds already exists in dvc.yaml, use the --force to overwrite" in caplog.text
diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py
deleted file mode 100644
index 22beaf581e..0000000000
--- a/tests/unit/test_dataset.py
+++ /dev/null
@@ -1,45 +0,0 @@
-import pytest
-
-from dvc.repo.datasets import parse_url_and_type
-
-
-@pytest.mark.parametrize(
-    "url,expected",
-    [
-        ("s3://bucket/path", {"type": "url", "url": "s3://bucket/path"}),
-        ("gs://bucket/path", {"type": "url", "url": "gs://bucket/path"}),
-        ("azure://container/path", {"type": "url", "url": "azure://container/path"}),
-        (
-            "remote://remote_name/path",
-            {"type": "url", "url": "remote://remote_name/path"},
-        ),
-        ("dvcx://dataset_name", {"type": "dvcx", "url": "dvcx://dataset_name"}),
-        (
-            "dvc+file:///home/user/repository",
-            {
-                "type": "dvc",
-                "url": "file:///home/user/repository",
-            },
-        ),
-        (
-            "dvc://git@github.com:iterative/example-get-started.git",
-            {"type": "dvc", "url": "git@github.com:iterative/example-get-started.git"},
-        ),
-        (
-            "dvc+https://github.com/iterative/example-get-started.git",
-            {
-                "type": "dvc",
-                "url": "https://github.com/iterative/example-get-started.git",
-            },
-        ),
-        (
-            "dvc+ssh://github.com/iterative/example-get-started.git",
-            {
-                "type": "dvc",
-                "url": "ssh://github.com/iterative/example-get-started.git",
-            },
-        ),
-    ],
-)
-def test_url_parsing(url, expected):
-    assert parse_url_and_type(url) == expected