Skip to content

Commit

Permalink
Add API for parsing butler dataset URIs (butler and ivo)
Browse files Browse the repository at this point in the history
  • Loading branch information
timj committed Nov 22, 2024
1 parent 09cb37a commit ea1c47e
Show file tree
Hide file tree
Showing 2 changed files with 126 additions and 1 deletion.
80 changes: 80 additions & 0 deletions python/lsst/daf/butler/_butler.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@

__all__ = ["Butler"]

import os
import urllib.parse
import uuid
from abc import abstractmethod
from collections.abc import Collection, Iterable, Iterator, Mapping, Sequence
from contextlib import AbstractContextManager
Expand Down Expand Up @@ -526,6 +529,83 @@ def get_known_repos(cls) -> set[str]:
"""
return ButlerRepoIndex.get_known_repos()

@classmethod
def parse_dataset_uri(cls, uri: str) -> tuple[str, DatasetId]:
"""Extract the butler label and dataset ID from a dataset URI.
Parameters
----------
uri : `str`
The dataset URI to parse.
Returns
-------
label : `str`
The label associated with the butler repository from which this
dataset originates.
dataset_id : `DatasetId`
The ID of the dataset.
Notes
-----
Supports dataset URIs of the forms ``ivo://rubin/butler_label/UUID``
and ``butler://butler_label/UUID``. In ``ivo`` URIs the butler label
can include ``/`` and the leading ``/`` is always stripped. If the
repository label starts with ``/`` then it must be doubled up. e.g.,
ivo://rubin//repo/main/82d79caa-0823-4300-9874-67b737367ee0
will return a label of ``/repo/main``.
This method does not attempt to check that the dataset exists in the
labeled butler.
"""
parsed = urllib.parse.urlparse(uri)
if parsed.scheme == "ivo":
# TODO: Validate netloc component.
label, id_ = os.path.split(parsed.path)
# Strip the leading /.
label = label[1:]
elif parsed.scheme == "butler":
label = parsed.netloc
# Need to strip the leading /.
id_ = parsed.path[1:]
else:
raise ValueError(f"Unrecognized URI scheme: {uri!r}")
if not label:
raise ValueError(f"No butler repository label found in uri {uri!r}")
try:
dataset_id = uuid.UUID(hex=id_)
except Exception as e:
e.add_note(f"Error extracting dataset ID from uri {uri!r} with dataset ID string {id_!r}")
raise

return label, dataset_id

@classmethod
def get_dataset_from_uri(cls, uri: str) -> DatasetRef | None:
    """Look up the dataset that a dataset URI refers to.

    The URI is split into a repository label and a dataset ID with
    `parse_dataset_uri`, a butler is created from that label with
    `from_config`, and the ID is passed to its ``get_dataset`` method.

    Parameters
    ----------
    uri : `str`
        The URI associated with a dataset.

    Returns
    -------
    ref : `DatasetRef` or `None`
        The dataset associated with that URI, or `None` if the UUID
        is valid but the dataset is not known to this butler.

    Notes
    -----
    It might be possible to pass in an optional ``LabeledButlerFactory``
    but how would a caller know the right access token to supply?
    """
    repo_label, ref_id = cls.parse_dataset_uri(uri)
    return cls.from_config(repo_label).get_dataset(ref_id)

@abstractmethod
def _caching_context(self) -> AbstractContextManager[None]:
"""Context manager that enables caching."""
Expand Down
47 changes: 46 additions & 1 deletion tests/test_simpleButler.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@
from lsst.daf.butler.datastore.file_templates import FileTemplate
from lsst.daf.butler.registry import RegistryConfig, RegistryDefaults, _RegistryFactory
from lsst.daf.butler.tests import DatastoreMock
from lsst.daf.butler.tests.utils import TestCaseMixin, makeTestTempDir, removeTestTempDir
from lsst.daf.butler.tests.utils import TestCaseMixin, makeTestTempDir, mock_env, removeTestTempDir

try:
from lsst.daf.butler.tests.server import create_test_server
Expand Down Expand Up @@ -882,10 +882,55 @@ def makeButler(self, writeable: bool = False) -> Butler:
registryConfig = RegistryConfig(config.get("registry"))
_RegistryFactory(registryConfig).create_from_config()

# Write the YAML file so that some tests can recreate butler from it.
config.dumpToUri(os.path.join(self.root, "butler.yaml"))
butler = Butler.from_config(config, writeable=writeable)
DatastoreMock.apply(butler)
return butler

def test_dataset_uris(self):
    """Check that dataset URIs can be parsed and resolved to datasets."""
    registry_data = os.path.join(TESTDIR, "data", "registry")
    butler = self.makeButler(writeable=True)
    butler.import_(filename=os.path.join(registry_data, "base.yaml"))
    butler.import_(filename=os.path.join(registry_data, self.datasetsImportFile))

    butler.registry.defaults = RegistryDefaults(collections=["imported_g"])
    expected = butler.find_dataset("flat", detector=2, physical_filter="Cam1-G")
    self.assertIsInstance(expected, DatasetRef)

    # The repository root doubles as the "label" in path-style URIs.
    repo_root = butler._config["root"]
    label = "test_repo"

    # Resolve the dataset both through a repository-index label and
    # through the repository path itself.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml") as index_file:
        index_file.write(f"{label}: {repo_root}\n")
        index_file.flush()
        with mock_env({"DAF_BUTLER_REPOSITORY_INDEX": index_file.name}):
            good_uris = (
                f"ivo://rubin/{repo_root}/{expected.id}",
                f"ivo://rubin/{repo_root}/butler.yaml/{expected.id}",
                f"butler://{label}/{expected.id}",
                f"ivo://rubin/{label}/{expected.id}",
            )
            for dataset_uri in good_uris:
                found = Butler.get_dataset_from_uri(dataset_uri)
                self.assertEqual(expected, found)

            # A well-formed UUID that does not match any dataset.
            unknown_id = str(expected.id).replace("2", "3")
            self.assertIsNone(Butler.get_dataset_from_uri(f"butler://{label}/{unknown_id}"))

    # Malformed or unsupported URIs must be rejected by the parser.
    bad_uris = (
        "ivo://rubin/1234",
        "butler://label/1234",
        "butler://1234",
        "https://something.edu/1234",
    )
    for bad_uri in bad_uris:
        with self.assertRaises(ValueError):
            Butler.parse_dataset_uri(bad_uri)


class NameKeyCollectionManagerDirectSimpleButlerTestCase(DirectSimpleButlerTestCase, unittest.TestCase):
"""Run tests against DirectButler implementation using the
Expand Down

0 comments on commit ea1c47e

Please sign in to comment.