Add API for parsing butler dataset URIs (butler and ivo)

lsst · Nov 1, 2024 · 57d6ec3 · 57d6ec3
1 parent 6cb0823
commit 57d6ec3
Show file tree

Hide file tree

Showing 2 changed files with 124 additions and 1 deletion.
diff --git a/python/lsst/daf/butler/_butler.py b/python/lsst/daf/butler/_butler.py
@@ -29,6 +29,9 @@
 
 __all__ = ["Butler"]
 
+import os
+import posixpath
+import urllib.parse
+import uuid
 from abc import abstractmethod
 from collections.abc import Collection, Iterable, Mapping, Sequence
 from contextlib import AbstractContextManager
@@ -525,6 +528,83 @@ def get_known_repos(cls) -> set[str]:
         """
         return ButlerRepoIndex.get_known_repos()
 
+    @classmethod
+    def parse_dataset_uri(cls, uri: str) -> tuple[str, DatasetId]:
+        """Extract the butler label and dataset ID from a dataset URI.
+
+        Parameters
+        ----------
+        uri : `str`
+            The dataset URI to parse.
+
+        Returns
+        -------
+        label : `str`
+            The label associated with the butler repository from which this
+            dataset originates.
+        dataset_id : `DatasetId`
+            The ID of the dataset.
+
+        Raises
+        ------
+        ValueError
+            Raised if the URI scheme is neither ``ivo`` nor ``butler``, if no
+            repository label can be extracted from the URI, or if the dataset
+            ID portion of the URI is not a valid UUID string.
+
+        Notes
+        -----
+        Supports dataset URIs of the forms ``ivo://rubin/butler_label/UUID``
+        and ``butler://butler_label/UUID``. In ``ivo`` URIs the butler label
+        can include ``/`` and the leading ``/`` is always stripped. If the
+        repository label starts with ``/`` then it must be doubled up. e.g.,
+
+            ivo://rubin//repo/main/82d79caa-0823-4300-9874-67b737367ee0
+
+        will return a label of ``/repo/main``.
+
+        This method does not attempt to check that the dataset exists in the
+        labeled butler.
+        """
+        parsed = urllib.parse.urlparse(uri)
+        if parsed.scheme == "ivo":
+            # TODO: Validate netloc component.
+            # URI paths are always "/"-delimited, so split using POSIX rules
+            # rather than those of the host platform. A platform-dependent
+            # split can misinterpret the doubled leading "/" (e.g. as a UNC
+            # share on Windows) and return a different label.
+            label, id_ = posixpath.split(parsed.path)
+            # Strip the leading /.
+            label = label[1:]
+        elif parsed.scheme == "butler":
+            label = parsed.netloc
+            # Need to strip the leading /.
+            id_ = parsed.path[1:]
+        else:
+            raise ValueError(f"Unrecognized URI scheme: {uri!r}")
+        if not label:
+            raise ValueError(f"No butler repository label found in uri {uri!r}")
+        try:
+            dataset_id = uuid.UUID(hex=id_)
+        except ValueError as e:
+            # A malformed ID string is the only failure mode for a `str`
+            # argument; annotate it with the URI and re-raise unchanged.
+            e.add_note(f"Error extracting dataset ID from uri {uri!r} with dataset ID string {id_!r}")
+            raise
+
+        return label, dataset_id
+
+    @classmethod
+    def get_dataset_from_uri(cls, uri: str) -> DatasetRef | None:
+        """Look up the dataset that a dataset URI refers to.
+
+        Parameters
+        ----------
+        uri : `str`
+            A dataset URI in one of the forms understood by
+            `parse_dataset_uri`.
+
+        Returns
+        -------
+        ref : `DatasetRef` or `None`
+            The dataset identified by the URI, or `None` if the UUID is
+            valid but the butler named by the URI does not know the dataset.
+
+        Notes
+        -----
+        A butler is constructed from the URI's repository label on each
+        call. Accepting an optional ``LabeledButlerFactory`` has been
+        considered, but it is unclear how a caller would know the right
+        access token to supply.
+        """
+        repo_label, ref_id = cls.parse_dataset_uri(uri)
+        # Build the butler for the labeled repository and query it directly.
+        return cls.from_config(repo_label).get_dataset(ref_id)
+
     @abstractmethod
     def _caching_context(self) -> AbstractContextManager[None]:
         """Context manager that enables caching."""

diff --git a/tests/test_simpleButler.py b/tests/test_simpleButler.py
@@ -54,7 +54,7 @@
 from lsst.daf.butler.datastore.file_templates import FileTemplate
 from lsst.daf.butler.registry import RegistryConfig, RegistryDefaults, _RegistryFactory
 from lsst.daf.butler.tests import DatastoreMock
-from lsst.daf.butler.tests.utils import TestCaseMixin, makeTestTempDir, removeTestTempDir
+from lsst.daf.butler.tests.utils import TestCaseMixin, makeTestTempDir, mock_env, removeTestTempDir
 
 try:
     from lsst.daf.butler.tests.server import create_test_server
@@ -882,10 +882,53 @@ def makeButler(self, writeable: bool = False) -> Butler:
         registryConfig = RegistryConfig(config.get("registry"))
         _RegistryFactory(registryConfig).create_from_config()
 
+        # Write the YAML file so that some tests can recreate butler from it.
+        config.dumpToUri(os.path.join(self.root, "butler.yaml"))
         butler = Butler.from_config(config, writeable=writeable)
         DatastoreMock.apply(butler)
         return butler
 
+    def test_dataset_uris(self):
+        """Check parsing of dataset URIs and retrieval of their datasets."""
+        butler = self.makeButler(writeable=True)
+        registry_data = os.path.join(TESTDIR, "data", "registry")
+        for import_file in ("base.yaml", self.datasetsImportFile):
+            butler.import_(filename=os.path.join(registry_data, import_file))
+
+        butler.registry.defaults = RegistryDefaults(collections=["imported_g"])
+        ref = butler.find_dataset("flat", detector=2, physical_filter="Cam1-G")
+        self.assertIsInstance(ref, DatasetRef)
+
+        # The repository root is used directly as a path-style label below.
+        config_dir = butler._config["root"]
+        label = "test_repo"
+
+        # Map the label to the repository root in a temporary index file so
+        # the dataset can be requested both by label and by path.
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml") as index_file:
+            index_file.write(f"{label}: {config_dir}\n")
+            index_file.flush()
+            with mock_env({"DAF_BUTLER_REPOSITORY_INDEX": index_file.name}):
+                good_uris = (
+                    f"ivo://rubin/{config_dir}/{ref.id}",
+                    f"ivo://rubin/{config_dir}/butler.yaml/{ref.id}",
+                    f"butler://{label}/{ref.id}",
+                    f"ivo://rubin/{label}/{ref.id}",
+                )
+                for dataset_uri in good_uris:
+                    self.assertEqual(ref, Butler.get_dataset_from_uri(dataset_uri))
+
+                # A well-formed UUID that is not in the repository.
+                missing_id = str(ref.id).replace("2", "3")
+                self.assertIsNone(Butler.get_dataset_from_uri(f"butler://{label}/{missing_id}"))
+
+        # Malformed ID, missing ID, and unsupported scheme must all fail.
+        for bad_uri in (
+            "butler://label/1234",
+            "butler://1234",
+            "https://something.edu/1234",
+        ):
+            with self.assertRaises(ValueError):
+                Butler.parse_dataset_uri(bad_uri)
+
+
 
 class NameKeyCollectionManagerDirectSimpleButlerTestCase(DirectSimpleButlerTestCase, unittest.TestCase):
     """Run tests against DirectButler implementation using the