Skip to content

Commit

Permalink
feat: iter_gittree()
Browse files Browse the repository at this point in the history
  • Loading branch information
mih committed Nov 4, 2024
1 parent 3f4402d commit a70464b
Show file tree
Hide file tree
Showing 3 changed files with 200 additions and 3 deletions.
93 changes: 93 additions & 0 deletions datalad_core/iter_collections/gittree.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
from __future__ import annotations

from dataclasses import dataclass
from pathlib import (
Path,
PurePosixPath,
)
from typing import TYPE_CHECKING

if TYPE_CHECKING:
from collections.abc import Generator

from datalad_core.iter_collections.utils import (
GitTreeItemType,
git_ls_tree,
git_mode_type_map,
)


@dataclass(frozen=True)
class GitTreeItem:
    """Immutable record describing a single item of a Git tree

    Attributes
    ----------
    relpath: PurePosixPath
      (Relative) path of the item, in POSIX conventions.
    gitsha: str or None
      Git object identifier of the item. This is NOT the SHA1 of the plain
      file content, but what ``git hash-object`` reports, i.e. roughly
      ``printf "blob $(wc -c < "$file_name")\\0$(cat "$file_name")" | sha1sum``
    gittype: GitTreeItemType or None
      Type classification of the item.
    """

    relpath: PurePosixPath
    # both properties are optional and default to "unknown" (``None``)
    gitsha: str | None = None
    gittype: GitTreeItemType | None = None


def iter_gittree(
    path: Path,
    treeish: str,
    *,
    recursive: str = 'repository',
) -> Generator[GitTreeItem]:
    """Report on the items of a tree in a Git repository via ``git ls-tree``

    Parameters
    ----------
    path: Path
      Path of a directory in a Git repository to report on. This directory
      need not be the root directory of the repository, but must be part of
      the repository. If the directory is not the root directory of a
      non-bare repository, the iterator is constrained to items underneath
      that directory.
    treeish: str
      Identifier of the tree to report on (e.g., ``HEAD``). It is passed
      on to ``git ls-tree`` as-is.
    recursive: {'repository', 'no'}, optional
      Behavior for recursion into subtrees. By default (``repository``),
      all trees within the repository (underneath ``path``) are reported,
      but not trees within submodules. If ``no``, only direct children
      are reported on.

    Yields
    ------
    :class:`GitTreeItem`
      The ``relpath`` attribute of an item is a ``PurePosixPath`` with the
      corresponding (relative) path, as reported by Git (in POSIX
      conventions).
    """
    # Notes on ``git ls-tree`` options that are deliberately NOT used:
    #
    # - ``--long`` would also report the object size, but it has a
    #   substantial runtime impact (about 10x slower in test cases; the
    #   main factor for the slowdown is unclear).
    # - a custom ``--format=%(objectmode)%x09%(objectname)%x09%(path)``
    #   would allow for a single split by tab, but with it Git starts
    #   quoting paths with special characters (like tab) again.
    #
    # Only ``repository`` mode requests recursion into subtrees.
    lstree_args = ['-r'] if recursive == 'repository' else []

    # Force-convert to ``Path`` for peace of mind. The docs already ask for
    # a ``Path``, but that is easy to forget/ignore and leads to
    # non-obvious errors. Doing this once is a cheap safety net.
    records = git_ls_tree(Path(path), treeish, *lstree_args)
    yield from map(_get_tree_item, records)


def _get_tree_item(spec: str) -> GitTreeItem:
    """Convert a single ``git ls-tree`` output record into a ``GitTreeItem``

    ``spec`` is expected to consist of space-separated properties (mode,
    type name, object id), followed by a tab, followed by the path.
    Only the first tab is treated as a separator, so paths containing tabs
    are preserved.
    """
    header, relpath = spec.split('\t', maxsplit=1)
    # take every second property: the first (mode) and the third/last
    # (object id). The type name (blob/tree etc.) in between is skipped,
    # because the mode lookup below provides more detail.
    fields = header.split(' ')
    mode, sha = fields[::2]
    item_type = git_mode_type_map[mode]
    return GitTreeItem(
        relpath=PurePosixPath(relpath),
        gitsha=sha,
        gittype=item_type,
    )
90 changes: 90 additions & 0 deletions datalad_core/iter_collections/tests/test_itergittree.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
from pathlib import (
PurePosixPath,
)

import pytest

from datalad_core.consts import PRE_INIT_COMMIT_SHA
from datalad_core.runners import CommandError
from datalad_core.tests import (
call_git_addcommit,
)

from ..gittree import (
GitTreeItem,
GitTreeItemType,
iter_gittree,
)


def test_iter_gittree(gitrepo):
    """Verify item reporting, tree-ish selection, recursion modes, and
    subdirectory constraints of ``iter_gittree()``"""
    # an initial commit
    (gitrepo / '.gitkeep').touch()
    call_git_addcommit(gitrepo)

    # we add a new file and test its expected properties
    probe_name = 'probe.txt'
    # on crippled FS we are testing the managed branch which contains
    # pointer files, not symlinks
    expected_probe_sha = '24ae15ce9741d53115a9fc71c2b761790ca47995'
    # item paths are reported as `PurePosixPath`, hence all comparisons
    # must be made against this type (a path never equals a plain `str`)
    probe_relpath = PurePosixPath(f'subdir/{probe_name}')
    probe = gitrepo / 'subdir' / probe_name
    probe.parent.mkdir()
    probe.write_text('probe')
    call_git_addcommit(gitrepo)

    tracked_items = list(iter_gittree(gitrepo, 'HEAD'))
    # without untracked's and no link resolution this is plain and fast
    assert all(
        isinstance(i, GitTreeItem) and i.gitsha and i.gittype for i in tracked_items
    )
    assert any(
        i.relpath == probe_relpath
        and i.gitsha == expected_probe_sha
        and i.gittype == GitTreeItemType.file
        for i in iter_gittree(gitrepo, 'HEAD')
    )
    # if we check the prior version, we do not see it (hence the
    # tree-ish passing is working)
    assert not any(
        i.relpath == probe_relpath for i in iter_gittree(gitrepo, 'HEAD~1')
    )

    # if we disable recursion, the probe is not listed, but its
    # parent dir is
    tracked_toplevel_items = list(iter_gittree(gitrepo, 'HEAD', recursive='no'))
    # this used to compare against a `str`, which can never match a
    # `PurePosixPath` and made the assertion pass unconditionally
    assert not any(i.relpath == probe_relpath for i in tracked_toplevel_items)
    assert any(
        i.relpath == PurePosixPath('subdir')
        and i.gitsha == '0dd69202ba4657a5d9c37d5716d5b27127c4b57b'
        and i.gittype == GitTreeItemType.directory
        for i in tracked_toplevel_items
    )
    # iterating on a subdir does constrain the report
    tracked_subdir_items = list(iter_gittree(probe.parent, 'HEAD'))
    assert len(tracked_subdir_items) == 1
    probe_item = tracked_subdir_items[0]
    assert probe_item.relpath.name == probe_name
    assert probe_item.gitsha == expected_probe_sha


def test_name_starting_with_tab(gitrepo):
    """A tracked file whose name starts with a tab is reported verbatim"""
    fname = '\ttab.txt'
    try:
        (gitrepo / fname).write_text('name of this file starts with a tab')
    except OSError:
        # the filesystem refuses such a file name, nothing to test
        pytest.skip('not applicable on crippled filesystems')

    call_git_addcommit(gitrepo)
    reported_paths = {item.relpath for item in iter_gittree(gitrepo, 'HEAD')}
    assert PurePosixPath(fname) in reported_paths


def test_iter_gittree_empty(gitrepo):
    """Behavior on a repository without any commit"""
    # there is no commit yet, hence HEAD cannot be resolved
    with pytest.raises(CommandError, match='Not a valid object name HEAD'):
        list(iter_gittree(gitrepo, 'HEAD'))

    # PRE_INIT_COMMIT_SHA can be queried nevertheless, and yields no items
    assert list(iter_gittree(gitrepo, PRE_INIT_COMMIT_SHA)) == []
20 changes: 17 additions & 3 deletions datalad_core/iter_collections/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,20 @@
from datalad_core.runners import iter_git_subproc


def git_ls_files(path: Path, *args: str) -> Iterator[str]:
"""Run ``git ls-files`` at a given ``path`` and with ``args``
def iter_gitcmd_zlines(
path: Path,
cmd: str,
*args: str,
) -> Iterator[str]:
"""Run ``git <cmd>`` at a given ``path`` and with ``args``
An unconditional ``-z`` argument is used to get zero-byte separation
of output items, internally. A generator is returned that yields ``str``
type values corresponding to these items.
"""
with iter_git_subproc(
[
'ls-files',
cmd,
# we rely on zero-byte splitting below
'-z',
# otherwise take whatever is coming in
Expand All @@ -39,6 +43,16 @@ def git_ls_files(path: Path, *args: str) -> Iterator[str]:
)


def git_ls_files(path: Path, *args: str) -> Iterator[str]:
    """Iterate over the output items of ``git ls-files`` run at ``path``

    All ``args`` are passed on to the command. This is a thin wrapper
    around :func:`iter_gitcmd_zlines`.
    """
    ls_files_items = iter_gitcmd_zlines(path, 'ls-files', *args)
    return ls_files_items


def git_ls_tree(path: Path, *args: str) -> Iterator[str]:
    """Run ``git ls-tree`` at a given ``path`` and with ``args``

    All ``args`` (e.g., a tree-ish and options like ``-r``) are passed on
    to the command as-is. This is a thin wrapper around
    :func:`iter_gitcmd_zlines`, mirroring :func:`git_ls_files`.
    """
    return iter_gitcmd_zlines(path, 'ls-tree', *args)


# TODO: Could be `StrEnum`, came with PY3.11
class GitTreeItemType(Enum):
"""Enumeration of item types of Git trees"""
Expand Down

0 comments on commit a70464b

Please sign in to comment.