diff --git a/python/lsst/daf/butler/_butler.py b/python/lsst/daf/butler/_butler.py
index fbfedd6bb7..1caa6c6ee3 100644
--- a/python/lsst/daf/butler/_butler.py
+++ b/python/lsst/daf/butler/_butler.py
@@ -29,6 +29,9 @@
 
 __all__ = ["Butler"]
 
+import posixpath
+import urllib.parse
+import uuid
 from abc import abstractmethod
 from collections.abc import Collection, Iterable, Mapping, Sequence
 from contextlib import AbstractContextManager
@@ -525,6 +528,97 @@ def get_known_repos(cls) -> set[str]:
         """
         return ButlerRepoIndex.get_known_repos()
 
+    @classmethod
+    def parse_dataset_uri(cls, uri: str) -> tuple[str, DatasetId]:
+        """Extract the butler label and dataset ID from a dataset URI.
+
+        Parameters
+        ----------
+        uri : `str`
+            The dataset URI to parse.
+
+        Returns
+        -------
+        label : `str`
+            The label associated with the butler repository from which this
+            dataset originates.
+        dataset_id : `DatasetId`
+            The ID of the dataset.
+
+        Raises
+        ------
+        ValueError
+            Raised if the URI scheme is not ``ivo`` or ``butler``, if no
+            repository label can be extracted, or if the final component is
+            not a valid UUID string.
+
+        Notes
+        -----
+        Supports dataset URIs of the forms ``ivo://rubin/butler_label/UUID``
+        and ``butler://butler_label/UUID``. In ``ivo`` URIs the butler label
+        can include ``/`` and the leading ``/`` is always stripped. If the
+        repository label starts with ``/`` then it must be doubled up, e.g.::
+
+            ivo://rubin//repo/main/82d79caa-0823-4300-9874-67b737367ee0
+
+        will return a label of ``/repo/main``.
+
+        This method does not attempt to check that the dataset exists in the
+        labeled butler.
+        """
+        parsed = urllib.parse.urlparse(uri)
+        if parsed.scheme == "ivo":
+            # TODO: Validate netloc component.
+            # URI paths always use "/" as the separator, so split with
+            # posixpath rather than the platform-dependent os.path.
+            label, id_ = posixpath.split(parsed.path)
+            # Strip the leading /.
+            label = label[1:]
+        elif parsed.scheme == "butler":
+            label = parsed.netloc
+            # Need to strip the leading /.
+            id_ = parsed.path[1:]
+        else:
+            raise ValueError(f"Unrecognized URI scheme: {uri!r}")
+        if not label:
+            raise ValueError(f"No butler repository label found in uri {uri!r}")
+        try:
+            dataset_id = uuid.UUID(hex=id_)
+        except ValueError as e:
+            e.add_note(f"Error extracting dataset ID from uri {uri!r} with dataset ID string {id_!r}")
+            raise
+
+        return label, dataset_id
+
+    @classmethod
+    def get_dataset_from_uri(cls, uri: str) -> DatasetRef | None:
+        """Get the dataset associated with the given dataset URI.
+
+        Parameters
+        ----------
+        uri : `str`
+            The URI associated with a dataset.
+
+        Returns
+        -------
+        ref : `DatasetRef` or `None`
+            The dataset associated with that URI, or `None` if the UUID
+            is valid but the dataset is not known to this butler.
+
+        Raises
+        ------
+        ValueError
+            Raised if the URI cannot be parsed by `parse_dataset_uri`.
+
+        Notes
+        -----
+        It might be possible to pass in an optional ``LabeledButlerFactory``
+        but how would a caller know the right access token to supply?
+        """
+        label, dataset_id = cls.parse_dataset_uri(uri)
+        butler = cls.from_config(label)
+        return butler.get_dataset(dataset_id)
+
     @abstractmethod
     def _caching_context(self) -> AbstractContextManager[None]:
         """Context manager that enables caching."""
diff --git a/tests/test_simpleButler.py b/tests/test_simpleButler.py
index 595b08957f..56ff44de53 100644
--- a/tests/test_simpleButler.py
+++ b/tests/test_simpleButler.py
@@ -54,7 +54,7 @@
 from lsst.daf.butler.datastore.file_templates import FileTemplate
 from lsst.daf.butler.registry import RegistryConfig, RegistryDefaults, _RegistryFactory
 from lsst.daf.butler.tests import DatastoreMock
-from lsst.daf.butler.tests.utils import TestCaseMixin, makeTestTempDir, removeTestTempDir
+from lsst.daf.butler.tests.utils import TestCaseMixin, makeTestTempDir, mock_env, removeTestTempDir
 
 try:
     from lsst.daf.butler.tests.server import create_test_server
@@ -882,10 +882,55 @@ def makeButler(self, writeable: bool = False) -> Butler:
         registryConfig = RegistryConfig(config.get("registry"))
         _RegistryFactory(registryConfig).create_from_config()
 
+        # Write the YAML file so that some tests can recreate butler from it.
+        config.dumpToUri(os.path.join(self.root, "butler.yaml"))
         butler = Butler.from_config(config, writeable=writeable)
         DatastoreMock.apply(butler)
         return butler
 
+    def test_dataset_uris(self):
+        """Test that dataset URIs can be parsed and retrieved."""
+        butler = self.makeButler(writeable=True)
+        butler.import_(filename=os.path.join(TESTDIR, "data", "registry", "base.yaml"))
+        butler.import_(filename=os.path.join(TESTDIR, "data", "registry", self.datasetsImportFile))
+
+        butler.registry.defaults = RegistryDefaults(collections=["imported_g"])
+        ref = butler.find_dataset("flat", detector=2, physical_filter="Cam1-G")
+        self.assertIsInstance(ref, DatasetRef)
+
+        # Get the butler root for the URI.
+        config_dir = butler._config["root"]
+
+        # Read it via a repo label and a path.
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml") as index_file:
+            label = "test_repo"
+            index_file.write(f"{label}: {config_dir}\n")
+            index_file.flush()
+            with mock_env({"DAF_BUTLER_REPOSITORY_INDEX": index_file.name}):
+                for dataset_uri in (
+                    f"ivo://rubin/{config_dir}/{ref.id}",
+                    f"ivo://rubin/{config_dir}/butler.yaml/{ref.id}",
+                    f"butler://{label}/{ref.id}",
+                    f"ivo://rubin/{label}/{ref.id}",
+                ):
+                    ref2 = Butler.get_dataset_from_uri(dataset_uri)
+                    self.assertEqual(ref, ref2)
+
+                # Nonexistent dataset.
+                missing_id = str(ref.id).replace("2", "3")
+                no_ref = Butler.get_dataset_from_uri(f"butler://{label}/{missing_id}")
+                self.assertIsNone(no_ref)
+
+        # Test some failure modes.
+        with self.assertRaises(ValueError):
+            Butler.parse_dataset_uri("butler://label/1234")
+        with self.assertRaises(ValueError):
+            Butler.parse_dataset_uri("butler://1234")
+        with self.assertRaises(ValueError):
+            Butler.parse_dataset_uri(f"ivo://rubin/{ref.id}")
+        with self.assertRaises(ValueError):
+            Butler.parse_dataset_uri("https://something.edu/1234")
+
 
 class NameKeyCollectionManagerDirectSimpleButlerTestCase(DirectSimpleButlerTestCase, unittest.TestCase):
     """Run tests against DirectButler implementation using the