From a70464b6c7d33fac1a7efbf335464710f8f4fc19 Mon Sep 17 00:00:00 2001 From: Michael Hanke Date: Mon, 4 Nov 2024 14:45:48 +0100 Subject: [PATCH] feat: `iter_gittree()` --- datalad_core/iter_collections/gittree.py | 93 +++++++++++++++++++ .../tests/test_itergittree.py | 90 ++++++++++++++++++ datalad_core/iter_collections/utils.py | 20 +++- 3 files changed, 200 insertions(+), 3 deletions(-) create mode 100644 datalad_core/iter_collections/gittree.py create mode 100644 datalad_core/iter_collections/tests/test_itergittree.py diff --git a/datalad_core/iter_collections/gittree.py b/datalad_core/iter_collections/gittree.py new file mode 100644 index 0000000..1cbf915 --- /dev/null +++ b/datalad_core/iter_collections/gittree.py @@ -0,0 +1,93 @@ +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import ( + Path, + PurePosixPath, +) +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from collections.abc import Generator + +from datalad_core.iter_collections.utils import ( + GitTreeItemType, + git_ls_tree, + git_mode_type_map, +) + + +@dataclass(frozen=True) +class GitTreeItem: + """Item in a Git tree""" + + relpath: PurePosixPath + # gitsha is not the sha1 of the file content, but the output + # of `git hash-object` which does something like + # `printf "blob $(wc -c < "$file_name")\0$(cat "$file_name")" | sha1sum` + gitsha: str | None = None + gittype: GitTreeItemType | None = None + + +def iter_gittree( + path: Path, + treeish: str, + *, + recursive: str = 'repository', +) -> Generator[GitTreeItem]: + """Uses ``git ls-tree`` to report on a tree in a Git repository + + Parameters + ---------- + path: Path + Path of a directory in a Git repository to report on. This directory + need not be the root directory of the repository, but must be part of + the repository. If the directory is not the root directory of a + non-bare repository, the iterator is constrained to items underneath + that directory. 
+ recursive: {'repository', 'no'}, optional + Behavior for recursion into subtrees. By default (``repository``), + all trees within the repository (underneath ``path``) are reported, + but not trees within submodules. If ``no``, only direct children + are reported on. + + Yields + ------ + :class:`GitTreeItem` + The ``relpath`` attribute of an item is a ``PurePosixPath`` with the + corresponding (relative) path, as reported by Git (in POSIX conventions). + """ + # we force-convert to Path to give us the peace of mind we want. + # The docs already ask for that, but it is easy to + # forget/ignore and leads to non-obvious errors. Running this once is + # a cheap safety net + path = Path(path) + + # although it would be easy to also query the object size, we do not + # do so, because it has a substantial runtime impact. It is unclear + # what the main factor for the slowdown is, but in test cases I can + # see 10x slower + # lstree_args = ['--long'] + # we do not go for a custom format that would allow for a single split + # by tab, because if we do, Git starts quoting paths with special + # characters (like tab) again + # lstree_args = ['--format=%(objectmode)%x09%(objectname)%x09%(path)'] + lstree_args = [] + if recursive == 'repository': + lstree_args.append('-r') + + for line in git_ls_tree(path, treeish, *lstree_args): + yield _get_tree_item(line) + + +def _get_tree_item(spec: str) -> GitTreeItem: + props, path = spec.split('\t', maxsplit=1) + # 0::2 gets the first and third (last) item, effectively skipping the + # type name (blob/tree etc.), we have the mode lookup for that, which + # provides more detail + mode, sha = props.split(' ')[0::2] + return GitTreeItem( + relpath=PurePosixPath(path), + gitsha=sha, + gittype=git_mode_type_map[mode], + ) diff --git a/datalad_core/iter_collections/tests/test_itergittree.py b/datalad_core/iter_collections/tests/test_itergittree.py new file mode 100644 index 0000000..a21a63d --- /dev/null +++ 
b/datalad_core/iter_collections/tests/test_itergittree.py @@ -0,0 +1,90 @@ +from pathlib import ( + PurePosixPath, +) + +import pytest + +from datalad_core.consts import PRE_INIT_COMMIT_SHA +from datalad_core.runners import CommandError +from datalad_core.tests import ( + call_git_addcommit, +) + +from ..gittree import ( + GitTreeItem, + GitTreeItemType, + iter_gittree, +) + + +def test_iter_gittree(gitrepo): + # an initial commit + (gitrepo / '.gitkeep').touch() + call_git_addcommit(gitrepo) + + # we add a new file and test its expected properties + probe_name = 'probe.txt' + # on crippled FS we are testing the managed branch which contains + # pointer files, not symlinks + expected_probe_sha = '24ae15ce9741d53115a9fc71c2b761790ca47995' + probe = gitrepo / 'subdir' / probe_name + probe.parent.mkdir() + probe.write_text('probe') + call_git_addcommit(gitrepo) + + tracked_items = list(iter_gittree(gitrepo, 'HEAD')) + # without untracked's and no link resolution this is plain and fast + assert all( + isinstance(i, GitTreeItem) and i.gitsha and i.gittype for i in tracked_items + ) + assert any( + # let's query a Path instance here, to get that covered too + i.relpath == PurePosixPath(f'subdir/{probe_name}') + and i.gitsha == expected_probe_sha + and i.gittype == GitTreeItemType.file + for i in iter_gittree(gitrepo, 'HEAD') + ) + # if we check the prior version, we do not see it (hence the + # tree-ish passing is working) + assert not any( + i.relpath == PurePosixPath(f'subdir/{probe_name}') + for i in iter_gittree(gitrepo, 'HEAD~1') + ) + + # if we disable recursion, the probe is not listed, but its + # parent dir is + tracked_toplevel_items = list(iter_gittree(gitrepo, 'HEAD', recursive='no')) + assert not any(i.relpath == f'subdir/{probe_name}' for i in tracked_toplevel_items) + assert any( + i.relpath == PurePosixPath('subdir') + and i.gitsha == '0dd69202ba4657a5d9c37d5716d5b27127c4b57b' + and i.gittype == GitTreeItemType.directory + for i in tracked_toplevel_items + 
) + # iterating on a subdir does constrain the report + tracked_subdir_items = list(iter_gittree(probe.parent, 'HEAD')) + assert len(tracked_subdir_items) == 1 + probe_item = tracked_subdir_items[0] + assert probe_item.relpath.name == probe_name + assert probe_item.gitsha == expected_probe_sha + + +def test_name_starting_with_tab(gitrepo): + tabbed_file_name = '\ttab.txt' + tabbed_file = gitrepo / tabbed_file_name + try: + tabbed_file.write_text('name of this file starts with a tab') + except OSError: + pytest.skip('not applicable on crippled filesystems') + + call_git_addcommit(gitrepo) + iter_names = [item.relpath for item in iter_gittree(gitrepo, 'HEAD')] + assert PurePosixPath(tabbed_file_name) in iter_names + + +def test_iter_gittree_empty(gitrepo): + with pytest.raises(CommandError, match='Not a valid object name HEAD'): + list(iter_gittree(gitrepo, 'HEAD')) + + all_items = list(iter_gittree(gitrepo, PRE_INIT_COMMIT_SHA)) + assert len(all_items) == 0 diff --git a/datalad_core/iter_collections/utils.py b/datalad_core/iter_collections/utils.py index 0f3f830..947b1cc 100644 --- a/datalad_core/iter_collections/utils.py +++ b/datalad_core/iter_collections/utils.py @@ -15,8 +15,12 @@ from datalad_core.runners import iter_git_subproc -def git_ls_files(path: Path, *args: str) -> Iterator[str]: - """Run ``git ls-files`` at a given ``path`` and with ``args`` +def iter_gitcmd_zlines( + path: Path, + cmd: str, + *args: str, +) -> Iterator[str]: + """Run ``git <cmd>`` at a given ``path`` and with ``args`` An unconditional ``-z`` argument is used to get zero-byte separation of output items, internally. 
A generator is returned that yields ``str`` @@ -24,7 +28,7 @@ def git_ls_files(path: Path, *args: str) -> Iterator[str]: """ with iter_git_subproc( [ - 'ls-files', + cmd, # we rely on zero-byte splitting below '-z', # otherwise take whatever is coming in @@ -39,6 +43,16 @@ def git_ls_files(path: Path, *args: str) -> Iterator[str]: ) +def git_ls_files(path: Path, *args: str) -> Iterator[str]: + """Run ``git ls-files`` at a given ``path`` and with ``args``""" + return iter_gitcmd_zlines(path, 'ls-files', *args) + + +def git_ls_tree(path: Path, *args) -> Iterator[str]: + """Run ``git ls-tree`` at a given ``path`` and with ``args``""" + return iter_gitcmd_zlines(path, 'ls-tree', *args) + + # TODO: Could be `StrEnum`, came with PY3.11 class GitTreeItemType(Enum): """Enumeration of item types of Git trees"""