From ef2c5d0469698d89be321cd903ec65d7dc8e59ce Mon Sep 17 00:00:00 2001 From: Johannes Laurin Hoermann Date: Thu, 29 Aug 2024 13:55:26 +0200 Subject: [PATCH 1/7] WIP: start to implement dtool secial remote --- datalad_dtool/dtool_remote.py | 74 +++++++++++++++++++ datalad_dtool/import.py | 130 ++++++++++++++++++++++++++++++++++ setup.cfg | 2 + 3 files changed, 206 insertions(+) create mode 100644 datalad_dtool/dtool_remote.py create mode 100644 datalad_dtool/import.py diff --git a/datalad_dtool/dtool_remote.py b/datalad_dtool/dtool_remote.py new file mode 100644 index 0000000..65516ce --- /dev/null +++ b/datalad_dtool/dtool_remote.py @@ -0,0 +1,74 @@ +import logging +import shutil + +from annexremote import Master +from annexremote import SpecialRemote +from annexremote import RemoteError + +from dtoolcore import DataSet + +logger = logging.getLogger(__name__) + + +class DtoolRemote(SpecialRemote): + """A read-only special remote for retrieving files from dtool datasets.""" + transfer_store = None + remove = None + + def __init__(self, annex): + super().__init__(annex) + self.configs = { + 'uri': "dtool dataset URI" + } + + def initremote(self) -> None: + # initialize the remote, e.g. create the folders + # raise RemoteError if the remote couldn't be initialized + self.uri = self.annex.getconfig("uri") + if not self.uri: + raise RemoteError("You need to set uri=") + logger.debug("Set dtool dataset uri=%s", self.uri) + + def prepare(self) -> None: + # prepare to be used, eg. open TCP connection, authenticate with the server etc. + # raise RemoteError if not ready to use + self.uri = self.annex.getconfig("uri") + self.dtool_dataset = DataSet.from_uri(self.uri) + + def transfer_retrieve(self, key, filename): + # get the file identified by `key` and store it to `filename` + # raise RemoteError if the file couldn't be retrieved + try: + fpath = self.dtool_dataset.item_content_abspath(key) + except Exception as e: + raise RemoteError(e) + shutil.copyfile(fpath, filename) + + def checkpresent(self, key): + # return True if the key is present in the remote + # return False if the key is not present + # raise RemoteError if the presence of the key couldn't be determined, eg. in case of connection error + if key in self.dtool_dataset.identifiers: + return True + else: + return False + + def claimurl(self, url: str) -> bool: + return url.startswith("dtool:") + + def checkurl(self, url: str) -> bool: + return url.startswith("dtool:") + + def getcost(self) -> int: + # This is a very expensive remote + return 1000 + + def getavailability(self) -> str: + return "global" + + +def main() -> None: + master = Master() + remote = DtoolRemote(master) + master.LinkRemote(remote) + master.Listen() \ No newline at end of file diff --git a/datalad_dtool/import.py b/datalad_dtool/import.py new file mode 100644 index 0000000..0519dd3 --- /dev/null +++ b/datalad_dtool/import.py @@ -0,0 +1,130 @@ +"""DataLad extension for the Climate Data Store""" + +__docformat__ = "restructuredtext" +import logging +from typing import Iterable, Literal, Optional, Union + +from datalad.distribution.dataset import ( + EnsureDataset, + datasetmethod, + require_dataset, +) +from datalad.interface.base import Interface, build_doc, eval_results +from datalad.interface.common_opts import nosave_opt, save_message_opt +from datalad.interface.results import get_status_dict +from datalad.support.annexrepo import AnnexRepo +from datalad.support.constraints import EnsureNone, EnsureStr +from datalad.support.param import Parameter + +import datalad_dtool.dtool_remote +# import datalad_cds.spec + +logger = logging.getLogger("datalad.cds.download_cds") + + +# decoration auto-generates standard help +@build_doc +# all commands must be derived from Interface +class DownloadCDS(Interface): + """Downloads specified datasets from the CDS data store""" + + _params_ = dict( + spec=Parameter( + doc="""A json string or python dictionary containing the key + "dataset" with the datasets name (i.e. what is shown as the first + parameter to cdsapi.Client.retrieve if you do a "Show API request" + on some dataset in the CDS) and the key "sub-selection" with the + sub-selection of the dataset that should be fetched (i.e. what is + shown as the second parameter to cdsapi.Client.retrieve).""", + ), + dataset=Parameter( + args=("-d", "--dataset"), + metavar="PATH", + doc="""specify the dataset to add files to. If no dataset is given, + an attempt is made to identify the dataset based on the current + working directory. Use [CMD: --nosave CMD][PY: save=False PY] to + prevent adding files to the dataset.""", + constraints=EnsureDataset() | EnsureNone(), + ), + path=Parameter( + args=("-O", "--path"), + doc="""target path to download to.""", + constraints=EnsureStr(), + ), + lazy=Parameter( + args=("--lazy",), + action="store_true", + doc="""By default the file will be immediately downloaded. If the + lazy flag is supplied then the dtool dataset and item is only recorded as a + source for the file, but no download is initiated. Keep in mind that + there is no way to validate the correctness of the request if the + lazy flag is used.""", + ), + save=nosave_opt, + message=save_message_opt, + ) + + @staticmethod + @datasetmethod(name="import_dtool") + @eval_results + def __call__( + spec: Union[str, dict], + path: str, + *, + dataset: Optional[str] = None, + message: Optional[str] = None, + save: bool = True, + lazy: bool = False, + ) -> Iterable[dict]: + if isinstance(spec, dict): + parsed_spec = datalad_cds.spec.Spec.from_dict(spec) + elif isinstance(spec, str): + parsed_spec = datalad_cds.spec.Spec.from_json(spec) + else: + raise TypeError("spec could not be parsed") + ds = require_dataset(dataset, check_installed=True) + ensure_special_remote_exists_and_is_enabled(ds.repo, "cds") + pathobj = ds.pathobj / path + url = parsed_spec.to_url() + options = [] + if lazy: + options.append("--relaxed") + ds.repo.add_url_to_file(pathobj, url, options=options) + if save: + msg = ( + message + if message is not None + else "[DATALAD] Download from Climate Data Store" + ) + yield ds.save(pathobj, message=msg) + yield get_status_dict(action="cds", ds=ds, status="ok") + + +def ensure_special_remote_exists_and_is_enabled( + repo: AnnexRepo, remote: Literal["dtool"] +) -> None: + """Initialize and enable the dtool special remote, if it isn't already. + + Very similar to datalad.customremotes.base.ensure_datalad_remote. + """ + + uuids = {"cds": datalad_dtool.dtool_remote.DTOOL_REMOTE_UUID} + uuid = uuids[remote] + + name = repo.get_special_remotes().get(uuid, {}).get("name") + if not name: + repo.init_remote( + remote, + [ + "encryption=none", + "type=external", + "autoenable=true", + "externaltype={}".format(remote), + "uuid={}".format(uuid), + ], + ) + elif repo.is_special_annex_remote(name, check_if_known=False): + logger.debug("special remote %s is enabled", name) + else: + logger.debug("special remote %s found, enabling", name) + repo.enable_remote(name) \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index de04f76..ac1bdb1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -41,6 +41,8 @@ datalad.extensions = # the entrypoint can point to any symbol of any name, as long it is # valid datalad interface specification (see demo in this extensions) dtool = datalad_dtool:command_suite +console_scripts = + git-annex-remote-dtool = datalad_dtool.dtool_remote:main [versioneer] # See the docstring in versioneer.py for instructions. Note that you must From 29b3ea00c1c7b89e9e3a1023f9d5e7fb6f9c3df2 Mon Sep 17 00:00:00 2001 From: Johannes Laurin Hoermann Date: Thu, 5 Sep 2024 15:31:26 +0200 Subject: [PATCH 2/7] ENH: added test script for dtool special remote --- examples/test_git-annex-remote-dtool | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100755 examples/test_git-annex-remote-dtool diff --git a/examples/test_git-annex-remote-dtool b/examples/test_git-annex-remote-dtool new file mode 100755 index 0000000..9a8f9eb --- /dev/null +++ b/examples/test_git-annex-remote-dtool @@ -0,0 +1,28 @@ +#!/bin/bash +# Test script to drive the example external remote + +set -eu -o pipefail -x + +cd $(dirname "$0") + +export PATH=$PWD:$PATH + +TMP="$(mktemp -d "${TMPDIR:-/tmp}/gar-XXXXXXX")" +# so there is no global git config side-effects +export HOME="$TMP" + +SOURCE_DATASET="file://$TMP/test-dataset" +SOURCE_DATASET_NAME="test-dataset" +REPO_DIR="$TMP/repo" +# mkdir -p "$REMOTE_DIR" +cd "$TMP" +dtool create ${SOURCE_DATASET_NAME} +mkdir -p "$REPO_DIR" + +cd "$REPO_DIR" +git init +git config user.email "someuser@gmail.com" +git config user.name "Some User" +git annex init +git annex initremote --verbose --debug dtool_remote type=external externaltype=dtool encryption=none uri="${SOURCE_DATASET}" +git annex testremote --verbose --debug dtool_remote 2>&1 | tail -n 1000 From 8dddf2cccfd7de7d864d2c7bb72c80bf10e34aa0 Mon Sep 17 00:00:00 2001 From: Johannes Laurin Hoermann Date: Thu, 10 Oct 2024 16:01:27 +0200 Subject: [PATCH 3/7] ENH: read-only test working for dtool special remote --- datalad_dtool/dtool_remote.py | 51 +++++++++++++++++++++------- examples/test_git-annex-remote-dtool | 30 ++++++++++++++-- 2 files changed, 66 insertions(+), 15 deletions(-) diff --git a/datalad_dtool/dtool_remote.py b/datalad_dtool/dtool_remote.py index 65516ce..e1b6dbf 100644 --- a/datalad_dtool/dtool_remote.py +++ b/datalad_dtool/dtool_remote.py @@ -5,7 +5,7 @@ from annexremote import SpecialRemote from annexremote import RemoteError -from dtoolcore import DataSet +from dtoolcore import DataSet, ProtoDataSet, DtoolCoreTypeError logger = logging.getLogger(__name__) @@ -33,31 +33,56 @@ def prepare(self) -> None: # prepare to be used, eg. open TCP connection, authenticate with the server etc. # raise RemoteError if not ready to use self.uri = self.annex.getconfig("uri") - self.dtool_dataset = DataSet.from_uri(self.uri) + try: + self.dtool_dataset = DataSet.from_uri(self.uri) + logger.debug("Dataset uri=%s frozen, immutable.", self.uri) + except DtoolCoreTypeError as exc: + logger.warning(exc) + self.dtool_dataset = ProtoDataSet.from_uri(self.uri) def transfer_retrieve(self, key, filename): # get the file identified by `key` and store it to `filename` # raise RemoteError if the file couldn't be retrieved - try: - fpath = self.dtool_dataset.item_content_abspath(key) - except Exception as e: - raise RemoteError(e) - shutil.copyfile(fpath, filename) + + if isinstance(self.dtool_dataset, ProtoDataSet): + self.dtool_dataset.freeze() + self.dtool_dataset = DataSet.from_uri(self.uri) + + manifest = self.dtool_dataset.generate_manifest() + for uuid, entry in manifest['items'].items(): + if entry["relpath"] == key: + try: + fpath = self.dtool_dataset.item_content_abspath(uuid) + except Exception as e: + raise RemoteError(e) + shutil.copyfile(fpath, filename) + return + + raise RemoteError() def checkpresent(self, key): # return True if the key is present in the remote # return False if the key is not present # raise RemoteError if the presence of the key couldn't be determined, eg. in case of connection error - if key in self.dtool_dataset.identifiers: - return True - else: - return False + logger.debug("Looking for item %s in dataset %s", key, self.uri) + + if isinstance(self.dtool_dataset, ProtoDataSet): + self.dtool_dataset.freeze() + self.dtool_dataset = DataSet.from_uri(self.uri) + + manifest = self.dtool_dataset.generate_manifest() + for _, entry in manifest['items'].items(): + if entry["relpath"] == key: + return True + + return False def claimurl(self, url: str) -> bool: - return url.startswith("dtool:") + return url.startswith("dtool") def checkurl(self, url: str) -> bool: - return url.startswith("dtool:") + return url.startswith("dtool") + # TODO: implement more sophisticated checking on URL def getcost(self) -> int: # This is a very expensive remote diff --git a/examples/test_git-annex-remote-dtool b/examples/test_git-annex-remote-dtool index 9a8f9eb..09b02a4 100755 --- a/examples/test_git-annex-remote-dtool +++ b/examples/test_git-annex-remote-dtool @@ -14,9 +14,14 @@ export HOME="$TMP" SOURCE_DATASET="file://$TMP/test-dataset" SOURCE_DATASET_NAME="test-dataset" REPO_DIR="$TMP/repo" -# mkdir -p "$REMOTE_DIR" cd "$TMP" + +echo "This is a test file." > testfile.txt + +TESTFILE_PATH="$TMP/testfile.txt" + dtool create ${SOURCE_DATASET_NAME} + mkdir -p "$REPO_DIR" cd "$REPO_DIR" @@ -24,5 +29,26 @@ git init git config user.email "someuser@gmail.com" git config user.name "Some User" git annex init + +cp ${TESTFILE_PATH} . +git annex add testfile.txt +git commit -m "Add test file to git annex" + +# get git annex-generated key +ANNEX_KEY=$(git annex lookupkey testfile.txt) + +# put item into dataset at git annex-expected key +dtool add item testfile.txt "${SOURCE_DATASET}" "${ANNEX_KEY}" +dtool freeze "${SOURCE_DATASET}" + git annex initremote --verbose --debug dtool_remote type=external externaltype=dtool encryption=none uri="${SOURCE_DATASET}" -git annex testremote --verbose --debug dtool_remote 2>&1 | tail -n 1000 + +git annex info + +SPECIAL_REMOTE_UUID=$(git config --get remote.dtool_remote.annex-uuid) + +# tell git annex that file is retrievable from special remote +git annex setpresentkey "${ANNEX_KEY}" "${SPECIAL_REMOTE_UUID}" 1 + +# test read-only special remote +git annex testremote --verbose --debug dtool_remote --test-readonly=testfile.txt 2>&1 | tail -n 1000 From 7174f989a2a4ec0d7a0a190210f08c236259a1e8 Mon Sep 17 00:00:00 2001 From: Johannes Laurin Hoermann Date: Mon, 14 Oct 2024 16:25:11 +0200 Subject: [PATCH 4/7] ENH: addurl working --- datalad_dtool/dtool_remote.py | 78 +++++++++++++++++++---------------- 1 file changed, 43 insertions(+), 35 deletions(-) diff --git a/datalad_dtool/dtool_remote.py b/datalad_dtool/dtool_remote.py index e1b6dbf..48d9e3e 100644 --- a/datalad_dtool/dtool_remote.py +++ b/datalad_dtool/dtool_remote.py @@ -17,48 +17,38 @@ class DtoolRemote(SpecialRemote): def __init__(self, annex): super().__init__(annex) - self.configs = { - 'uri': "dtool dataset URI" - } def initremote(self) -> None: # initialize the remote, e.g. create the folders # raise RemoteError if the remote couldn't be initialized - self.uri = self.annex.getconfig("uri") - if not self.uri: - raise RemoteError("You need to set uri=") - logger.debug("Set dtool dataset uri=%s", self.uri) + pass def prepare(self) -> None: # prepare to be used, eg. open TCP connection, authenticate with the server etc. # raise RemoteError if not ready to use - self.uri = self.annex.getconfig("uri") - try: - self.dtool_dataset = DataSet.from_uri(self.uri) - logger.debug("Dataset uri=%s frozen, immutable.", self.uri) - except DtoolCoreTypeError as exc: - logger.warning(exc) - self.dtool_dataset = ProtoDataSet.from_uri(self.uri) + pass def transfer_retrieve(self, key, filename): # get the file identified by `key` and store it to `filename` # raise RemoteError if the file couldn't be retrieved - - if isinstance(self.dtool_dataset, ProtoDataSet): - self.dtool_dataset.freeze() - self.dtool_dataset = DataSet.from_uri(self.uri) - - manifest = self.dtool_dataset.generate_manifest() - for uuid, entry in manifest['items'].items(): - if entry["relpath"] == key: - try: - fpath = self.dtool_dataset.item_content_abspath(uuid) - except Exception as e: - raise RemoteError(e) + urls = self.annex.geturls(key, "dtool:") + logger.debug("Retrieve from %s", urls) + + exceptions = [] + for url in urls: + url = url[len('dtool:'):] + try: + dataset_uri, item_uuid = url.rsplit('/', 1) + logger.debug("Try to retrieve item %s from dataset %s", item_uuid, dataset_uri) + dtool_dataset = DataSet.from_uri(dataset_uri) + fpath = dtool_dataset.item_content_abspath(item_uuid) + logger.debug("Cached item content at %s", fpath) shutil.copyfile(fpath, filename) - return - - raise RemoteError() + break + except Exception as e: + exceptions.append(e) + else: + raise RemoteError(exceptions) def checkpresent(self, key): # return True if the key is present in the remote @@ -66,22 +56,40 @@ def checkpresent(self, key): # raise RemoteError if the presence of the key couldn't be determined, eg. in case of connection error logger.debug("Looking for item %s in dataset %s", key, self.uri) + urls = self.annex.geturls(key, "dtool:") + + for url in urls: + url = url[len('dtool:'):] + try: + dataset_uri, item_uuid = url.rsplit('/', 1) + logger.debug("Try to locate item %s in dataset %s", item_uuid, dataset_uri) + + dtool_dataset = DataSet.from_uri(dataset_uri) + manifest = dtool_dataset.generate_manifest() + if item_uuid in manifest['items']: + logger.debug("Located item %s in dataset %s", item_uuid, dataset_uri) + return True + + except Exception as e: + exceptions.append(e) + + return False + + logger.debug("Present at %s", urls) + if isinstance(self.dtool_dataset, ProtoDataSet): self.dtool_dataset.freeze() self.dtool_dataset = DataSet.from_uri(self.uri) - manifest = self.dtool_dataset.generate_manifest() - for _, entry in manifest['items'].items(): - if entry["relpath"] == key: - return True + return False def claimurl(self, url: str) -> bool: - return url.startswith("dtool") + return url.startswith("dtool:") def checkurl(self, url: str) -> bool: - return url.startswith("dtool") + return url.startswith("dtool:") # TODO: implement more sophisticated checking on URL def getcost(self) -> int: From c10df9c497a6fe0d99e412d10d26676b039c71bb Mon Sep 17 00:00:00 2001 From: Johannes Laurin Hoermann Date: Mon, 14 Oct 2024 16:34:40 +0200 Subject: [PATCH 5/7] ENH: working tests --- datalad_dtool/dtool_remote.py | 15 ++++----------- examples/test_git-annex-remote-dtool | 20 +++++--------------- 2 files changed, 9 insertions(+), 26 deletions(-) diff --git a/datalad_dtool/dtool_remote.py b/datalad_dtool/dtool_remote.py index 48d9e3e..6737843 100644 --- a/datalad_dtool/dtool_remote.py +++ b/datalad_dtool/dtool_remote.py @@ -54,10 +54,9 @@ def checkpresent(self, key): # return True if the key is present in the remote # return False if the key is not present # raise RemoteError if the presence of the key couldn't be determined, eg. in case of connection error - logger.debug("Looking for item %s in dataset %s", key, self.uri) - urls = self.annex.geturls(key, "dtool:") + exceptions = [] for url in urls: url = url[len('dtool:'):] try: @@ -73,17 +72,11 @@ def checkpresent(self, key): except Exception as e: exceptions.append(e) + if len(exceptions) > 0: + raise exceptions[-1] + return False - logger.debug("Present at %s", urls) - - if isinstance(self.dtool_dataset, ProtoDataSet): - self.dtool_dataset.freeze() - self.dtool_dataset = DataSet.from_uri(self.uri) - - - - return False def claimurl(self, url: str) -> bool: return url.startswith("dtool:") diff --git a/examples/test_git-annex-remote-dtool b/examples/test_git-annex-remote-dtool index 09b02a4..07a532e 100755 --- a/examples/test_git-annex-remote-dtool +++ b/examples/test_git-annex-remote-dtool @@ -30,25 +30,15 @@ git config user.email "someuser@gmail.com" git config user.name "Some User" git annex init -cp ${TESTFILE_PATH} . -git annex add testfile.txt -git commit -m "Add test file to git annex" - -# get git annex-generated key -ANNEX_KEY=$(git annex lookupkey testfile.txt) - # put item into dataset at git annex-expected key -dtool add item testfile.txt "${SOURCE_DATASET}" "${ANNEX_KEY}" +dtool add item "${TESTFILE_PATH}" "${SOURCE_DATASET}" dtool freeze "${SOURCE_DATASET}" -git annex initremote --verbose --debug dtool_remote type=external externaltype=dtool encryption=none uri="${SOURCE_DATASET}" - -git annex info +ITEM_UUID=$(dtool ls "${SOURCE_DATASET}" | awk '{ print $1 }') -SPECIAL_REMOTE_UUID=$(git config --get remote.dtool_remote.annex-uuid) +git annex initremote --verbose --debug dtool_remote type=external externaltype=dtool encryption=none -# tell git annex that file is retrievable from special remote -git annex setpresentkey "${ANNEX_KEY}" "${SPECIAL_REMOTE_UUID}" 1 +git annex addurl --file testfile.txt "dtool:${SOURCE_DATASET}/${ITEM_UUID}" # test read-only special remote -git annex testremote --verbose --debug dtool_remote --test-readonly=testfile.txt 2>&1 | tail -n 1000 +git annex testremote --debug --verbose dtool_remote --test-readonly=testfile.txt 2>&1 | tail -n 1000 From 432e9449dbee70e1e87f4caa5a8d23e8478f5ee1 Mon Sep 17 00:00:00 2001 From: Johannes Laurin Hoermann Date: Thu, 17 Oct 2024 14:28:45 +0200 Subject: [PATCH 6/7] MAINT: renamed git annex remote test --- ...nnex-remote-dtool => test_readonly_git-annex-remote-dtool} | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename examples/{test_git-annex-remote-dtool => test_readonly_git-annex-remote-dtool} (86%) diff --git a/examples/test_git-annex-remote-dtool b/examples/test_readonly_git-annex-remote-dtool similarity index 86% rename from examples/test_git-annex-remote-dtool rename to examples/test_readonly_git-annex-remote-dtool index 07a532e..b1bfc48 100755 --- a/examples/test_git-annex-remote-dtool +++ b/examples/test_readonly_git-annex-remote-dtool @@ -36,9 +36,9 @@ dtool freeze "${SOURCE_DATASET}" ITEM_UUID=$(dtool ls "${SOURCE_DATASET}" | awk '{ print $1 }') -git annex initremote --verbose --debug dtool_remote type=external externaltype=dtool encryption=none +git annex initremote --verbose --debug dtool_remote type=external externaltype=dtool encryption=none uri="${SOURCE_DATASET}" -git annex addurl --file testfile.txt "dtool:${SOURCE_DATASET}/${ITEM_UUID}" +git annex addurl --backend=MD5E --file testfile.txt "dtool:${SOURCE_DATASET}/${ITEM_UUID}" # test read-only special remote git annex testremote --debug --verbose dtool_remote --test-readonly=testfile.txt 2>&1 | tail -n 1000 From dd8813c58576bdc47a7f20ac95a648746fdf4a71 Mon Sep 17 00:00:00 2001 From: Johannes Laurin Hoermann Date: Wed, 23 Oct 2024 15:34:37 +0200 Subject: [PATCH 7/7] WIP: md5 keys mapping --- datalad_dtool/dtool_remote.py | 148 ++++++++++++++++++++++++--- examples/test_git-annex-remote-dtool | 44 ++++++++ 2 files changed, 176 insertions(+), 16 deletions(-) create mode 100755 examples/test_git-annex-remote-dtool diff --git a/datalad_dtool/dtool_remote.py b/datalad_dtool/dtool_remote.py index 6737843..cd5b853 100644 --- a/datalad_dtool/dtool_remote.py +++ b/datalad_dtool/dtool_remote.py @@ -2,69 +2,160 @@ import shutil from annexremote import Master -from annexremote import SpecialRemote +from annexremote import ExportRemote from annexremote import RemoteError from dtoolcore import DataSet, ProtoDataSet, DtoolCoreTypeError +logging.basicConfig(level=logging.DEBUG) + logger = logging.getLogger(__name__) -class DtoolRemote(SpecialRemote): +def extract_backend(key): + # Split the key by "--" + parts = key.split("-") + + if len(parts) < 2: + return None # Invalid key format + + # Get the last part (hash + possible extension) + return parts[0] + + + return hash_only + + +def extract_hash(key): + # Split the key by "--" + parts = key.split("--") + + if len(parts) < 2: + return None # Invalid key format + + # Get the last part (hash + possible extension) + hash_part = parts[-1] + + # Remove file extension if present + hash_only = hash_part.split('.')[0] + + return hash_only + + +class DtoolRemote(ExportRemote): """A read-only special remote for retrieving files from dtool datasets.""" transfer_store = None remove = None def __init__(self, annex): super().__init__(annex) + self.configs = { + 'uri': "dtool dataset URI" + } def initremote(self) -> None: # initialize the remote, e.g. create the folders # raise RemoteError if the remote couldn't be initialized - pass + self.uri = self.annex.getconfig("uri") + if not self.uri: + raise RemoteError("You need to set uri=") + logger.debug("Set dtool dataset uri=%s", self.uri) def prepare(self) -> None: # prepare to be used, eg. open TCP connection, authenticate with the server etc. # raise RemoteError if not ready to use + self.uri = self.annex.getconfig("uri") + try: + self.dtool_dataset = DataSet.from_uri(self.uri) + logger.debug("Dataset uri=%s frozen, immutable.", self.uri) + except DtoolCoreTypeError as exc: + logger.warning(exc) + self.dtool_dataset = ProtoDataSet.from_uri(self.uri) pass def transfer_retrieve(self, key, filename): # get the file identified by `key` and store it to `filename` # raise RemoteError if the file couldn't be retrieved - urls = self.annex.geturls(key, "dtool:") + exceptions = [] + + backend = self.annex.getconfig('keybackend_' + key) + logger.debug("Key %s uses backend %s", key, backend) + + file_hash = extract_backend(key) + + logger.debug("Try to locate file of chekcsum/hash %s in dataset %s", file_hash, self.uri) + manifest = self.dtool_dataset.generate_manifest() + if backend.startswith('MD5') and (manifest["hash_function"] == "md5sum_hexdigest"): + for uuid, entry in manifest['items'].items(): + if entry["hash"] == file_hash: + try: + fpath = self.dtool_dataset.item_content_abspath(uuid) + shutil.copyfile(fpath, filename) + return + except Exception as e: + exceptions.append(e) + + urls = self.annex.geturls(key, f"dtool:{self.uri}") logger.debug("Retrieve from %s", urls) - exceptions = [] for url in urls: url = url[len('dtool:'):] try: dataset_uri, item_uuid = url.rsplit('/', 1) + + assert dataset_uri == self.uri + logger.debug("Try to retrieve item %s from dataset %s", item_uuid, dataset_uri) - dtool_dataset = DataSet.from_uri(dataset_uri) - fpath = dtool_dataset.item_content_abspath(item_uuid) + # dtool_dataset = DataSet.from_uri(dataset_uri) + fpath = self.dtool_dataset.item_content_abspath(item_uuid) logger.debug("Cached item content at %s", fpath) shutil.copyfile(fpath, filename) - break + return except Exception as e: exceptions.append(e) - else: - raise RemoteError(exceptions) + + raise RemoteError(exceptions) def checkpresent(self, key): # return True if the key is present in the remote # return False if the key is not present # raise RemoteError if the presence of the key couldn't be determined, eg. in case of connection error - urls = self.annex.geturls(key, "dtool:") + + # first, try to identify file from actual md5 key exceptions = [] + + backend = extract_backend(key) + logger.debug("Key %s uses backend %s", key, backend) + + try: + file_hash = extract_hash(key) + logger.debug("Try to locate hash/checksum %s in dataset %s", file_hash, self.uri) + + manifest = self.dtool_dataset.generate_manifest() + if backend.startswith('MD5') and (manifest["hash_function"] == "md5sum_hexdigest"): + for uuid, entry in manifest['items'].items(): + if entry["hash"] == file_hash: + logger.debug("Located item %s in dataset %s", uuid, self.uri) + return True + except Exception as e: + exceptions.append(e) + + # next, try to identify file from dtool URLs + + urls = self.annex.geturls(key, f"dtool:{self.uri}") + for url in urls: url = url[len('dtool:'):] try: dataset_uri, item_uuid = url.rsplit('/', 1) + + assert dataset_uri == self.uri + logger.debug("Try to locate item %s in dataset %s", item_uuid, dataset_uri) - dtool_dataset = DataSet.from_uri(dataset_uri) - manifest = dtool_dataset.generate_manifest() + # dtool_dataset = DataSet.from_uri(dataset_uri) + manifest = self.dtool_dataset.generate_manifest() if item_uuid in manifest['items']: logger.debug("Located item %s in dataset %s", item_uuid, dataset_uri) return True @@ -77,12 +168,12 @@ def checkpresent(self, key): return False - def claimurl(self, url: str) -> bool: - return url.startswith("dtool:") + logger.debug("Check claim to URL %s", url) + return url.startswith(f"dtool:{self.uri}/") def checkurl(self, url: str) -> bool: - return url.startswith("dtool:") + return url.startswith(f"dtool:{self.uri}/") # TODO: implement more sophisticated checking on URL def getcost(self) -> int: @@ -92,6 +183,31 @@ def getcost(self) -> int: def getavailability(self) -> str: return "global" + ## Export methods + def transferexport_store(self, key, local_file, remote_file): + pass + + def transferexport_retrieve(self, key, local_file, remote_file): + manifest = self.dtool_dataset.generate_manifest() + for uuid, entry in manifest['items'].items(): + if entry["relpath"] == remote_file: + try: + fpath = self.dtool_dataset.item_content_abspath(uuid) + shutil.copyfile(fpath, local_file) + except Exception as e: + raise RemoteError(e) + + pass + + def checkpresentexport(self, key, remote_file): + pass + + def removeexport(self, key, remote_file): + pass + + def removeexportdirectory(self, remote_directory): + pass + def main() -> None: master = Master() diff --git a/examples/test_git-annex-remote-dtool b/examples/test_git-annex-remote-dtool new file mode 100755 index 0000000..0ab5048 --- /dev/null +++ b/examples/test_git-annex-remote-dtool @@ -0,0 +1,44 @@ +#!/bin/bash +# Test script to drive the example external remote + +set -eu -o pipefail -x + +cd $(dirname "$0") + +export PATH=$PWD:$PATH + +TMP="$(mktemp -d "${TMPDIR:-/tmp}/gar-XXXXXXX")" +# so there is no global git config side-effects +export HOME="$TMP" + +SOURCE_DATASET="file://$TMP/test-dataset" +SOURCE_DATASET_NAME="test-dataset" +REPO_DIR="$TMP/repo" +cd "$TMP" + +echo "This is a test file." > testfile.txt + +TESTFILE_PATH="$TMP/testfile.txt" + +dtool create ${SOURCE_DATASET_NAME} + +mkdir -p "$REPO_DIR" + +cd "$REPO_DIR" +git init +git config user.email "someuser@gmail.com" +git config user.name "Some User" +git annex init + +# put item into dataset at git annex-expected key +dtool add item "${TESTFILE_PATH}" "${SOURCE_DATASET}" +dtool freeze "${SOURCE_DATASET}" + +ITEM_UUID=$(dtool ls "${SOURCE_DATASET}" | awk '{ print $1 }') + +git annex initremote --verbose --debug dtool_remote type=external externaltype=dtool encryption=none exporttree=yes uri="${SOURCE_DATASET}" + +git annex addurl --backend=MD5E --file testfile.txt "dtool:${SOURCE_DATASET}/${ITEM_UUID}" + +# test read-only special remote +git annex testremote --debug --verbose dtool_remote --test-readonly=testfile.txt 2>&1 | tail -n 1000