From 77fe158f7f3e810ace42fb02f4a25837bf3bb534 Mon Sep 17 00:00:00 2001 From: Michael Hanke Date: Fri, 8 Nov 2024 09:38:32 +0100 Subject: [PATCH] feat: `iter_gitdiff()` --- datalad_core/iter_collections/gitdiff.py | 641 ++++++++++++++++++ .../tests/test_itergitdiff.py | 360 ++++++++++ datalad_core/iter_collections/utils.py | 37 + 3 files changed, 1038 insertions(+) create mode 100644 datalad_core/iter_collections/gitdiff.py create mode 100644 datalad_core/iter_collections/tests/test_itergitdiff.py diff --git a/datalad_core/iter_collections/gitdiff.py b/datalad_core/iter_collections/gitdiff.py new file mode 100644 index 0000000..99f0470 --- /dev/null +++ b/datalad_core/iter_collections/gitdiff.py @@ -0,0 +1,641 @@ +from __future__ import annotations + +from contextlib import suppress +from copy import deepcopy +from dataclasses import ( + dataclass, + replace, +) +from itertools import chain +from pathlib import ( + Path, + PurePosixPath, +) +from typing import ( + TYPE_CHECKING, +) + +if TYPE_CHECKING: + from collections.abc import Generator + + +from datasalad.gitpathspec import GitPathSpecs + +from datalad_core.consts import PRE_INIT_COMMIT_SHA +from datalad_core.iter_collections.gittree import ( + GitTreeItem, +) +from datalad_core.iter_collections.utils import ( + GitContainerModificationType, + GitDiffStatus, + GitTreeItemType, + git_diffstatus_map, + git_mode_type_map, + iter_gitcmd_zlines, +) +from datalad_core.runners import ( + CommandError, + call_git, + call_git_oneline, +) + + +@dataclass(frozen=True) +class GitDiffItem(GitTreeItem): + """``GitTreeItem`` with "previous" property values given a state comparison""" + + prev_relpath: PurePosixPath | None = None + prev_gitsha: str | None = None + prev_gittype: GitTreeItemType | None = None + + status: GitDiffStatus | None = None + percentage: int | None = None + """This is the percentage of similarity for copy-status and + rename-status diff items, and the percentage of dissimilarity + for modifications.""" 
+ modification_types: tuple[GitContainerModificationType, ...] | None = None + """Qualifiers for modification types of container-type + items (directories, submodules).""" + + +def iter_gitdiff( + path: Path, + from_treeish: str | None, + to_treeish: str | None, + *, + recursive: str = 'repository', + find_renames: int | None = None, + find_copies: int | None = None, + yield_tree_items: str | None = None, + eval_submodule_state: str = 'full', + pathspecs: list[str] | GitPathSpecs | None = None, +) -> Generator[GitDiffItem, None, None]: + """Report differences between Git tree-ishes or tracked worktree content + + This function is a wrapper around the Git command ``diff-tree`` and + ``diff-index``. Therefore most semantics also apply here. + + The main difference with respect to the Git commands are: 1) uniform + support for non-recursive, single tree reporting (no subtrees); and + 2) support for submodule recursion. + + Notes on 'no' recursion mode + + When comparing to the worktree, ``git diff-index`` always reports on + subdirectories. For homogeneity with the report on a committed tree, + a non-recursive mode emulation is implemented. It compresses all reports + from a direct subdirectory into a single report on that subdirectory. + The ``gitsha`` of that directory item will always be ``None``. Moreover, + no type or typechange inspection, or further filesystem queries are + performed. Therefore, ``prev_gittype`` will always be ``None``, and + any change other than the addition of the directory will be labeled + as a ``GitDiffStatus.modification``. + + Parameters + ---------- + path: Path + Path of a directory in a Git repository to report on. This directory + need not be the root directory of the repository, but must be part of + the repository. If the directory is not the root directory of a + non-bare repository, the iterator is constrained to items underneath + that directory. 
+ from_treeish: str or None + Git "tree-ish" that defines the comparison reference. If ``None``, + ``to_treeeish`` must not be ``None`` (see its documentation for + details). + to_treeish: + Git "tree-ish" that defines the comparison target. If ``None``, + ``from_treeish`` must not be ``None``, and that tree-ish will be + compared against the worktree. (see its documentation for + details). If ``from_treeish`` is ``None``, the given tree-ish is + compared to its immediate parents (see ``git diff-tree`` documentation + for details). + recursive: {'repository', 'submodules', 'no'}, optional + Behavior for recursion into subtrees. By default (``repository``), + all trees within the repository underneath ``path``) are reported, + but no tree within submodules. With ``submodules``, recursion includes + any submodule that is present. If ``no``, only direct children + are reported on. + find_renames: int, optional + If given, this defines the similarity threshold for detecting renames + (see ``git diff-{index,tree} --find-renames``). By default, no rename + detection is done and reported items never have the ``rename`` status. + Instead, a renames would be reported as a deletion and an addition. + find_copied: int, optional + If given, this defines the similarity threshold for detecting copies + (see ``git diff-{index,tree} --find-copies``). By default, no copy + detection is done and reported items never have the ``copy`` status. + Instead, a copy would be reported as addition. + This option always implies the use of the ``--find-copies-harder`` + Git option that enables reporting of copy sources, even when they + have not been modified in the same change. This is a very expensive + operation for large projects, so use it with caution. + yield_tree_items: {'submodules', 'directories', 'all', None}, optional + Whether to yield an item on type of subtree that will also be recursed + into. For example, a submodule item, when submodule recursion is + enabled. 
When disabled, subtree items (directories, submodules) + will still be reported whenever there is no recursion into them. + For example, submodule items are reported when + ``recursive='repository``, even when ``yield_tree_items=None``. + eval_submodule_state: {"no", "commit", "full"}, optional + If 'full' (default), the state of a submodule is evaluated by + considering all modifications ('--ignore-submodules=none'). + If 'commit', the modification check is restricted to comparing the + submodule's "HEAD" commit to the one recorded in the superdataset + ('--ignore-submodules=dirty'). If 'no', the state of the subdataset is + not evaluated ('--ignore-submodules=all'). + pathspecs: optional + Patterns used to limit results to particular paths. Any pathspecs + supported by Git can be used and are passed to the underlying ``git + ls-files`` queries. Pathspecs are also supported for recursive reporting + on submodules. In such a case, the results match those of individual + queries with analog pathspecs on the respective submodules (Git itself + does not support pathspecs for submodule-recursive operations). For + example, a ``submodule`` recursion with a pathspec ``*.jpg`` will yield + reports on all JPG files in all submodules, even though a submodule path + itself does not match ``*.jpg``. On the other hand, a pathspec + ``submoddir/*.jpg`` will only report on JPG files in the submodule at + ``submoddir/``, but on all JPG files in that submodule. + As of version 1.5, the pathspec support for submodule recursion is + preliminary and results should be carefully investigated. + + Yields + ------ + :class:`GitDiffItem` + The ``name`` and ``prev_name`` attributes of an item are a ``str`` with + the corresponding (relative) path, as reported by Git + (in POSIX conventions). + """ + # we force-convert to Path to give us the piece of mind we want. + # The docs already ask for that, but it is easy to + # forget/ignore and leads to non-obvious errors. 
Running this once is + # a cheap safety net + path = Path(path) + _pathspecs = GitPathSpecs(pathspecs) + processed_submodules: set[PurePosixPath] = set() + + for item in _iter_gitdiff( + path=path, + from_treeish=from_treeish, + to_treeish=to_treeish, + recursive=recursive, + find_renames=find_renames, + find_copies=find_copies, + yield_tree_items=yield_tree_items, + eval_submodule_state=eval_submodule_state, + pathspecs=_pathspecs, + ): + # exclude non-submodules, or a submodule that was found at + # the root path -- which would indicate that the submodule + # itself it not around, only its record in the parent + if ( + recursive == 'submodules' + and item.gittype == GitTreeItemType.submodule + and item.relpath != PurePosixPath('.') + ): + # mark as processed immediately, independent of whether anything + # need to be reported + processed_submodules.add(item.relpath) + yield item + + # we may need to loop over the (remaining) submodules, because + # with pathspecs there is a chance that a given pathspec set did not + # match a submodule (directly) that could have content that matches a + # pathspec + if not (recursive == 'submodules' and _pathspecs): + return + + for item in _iter_gitdiff( + path=path, + from_treeish=from_treeish, + to_treeish=to_treeish, + # no need to double-recurse, we just need to discover all + # submodules in the diff unconstrained by pathspecs + recursive='repository', + find_renames=None, + find_copies=None, + yield_tree_items=None, + # we need to look at the recorded commit to get submodule + # reports at all + eval_submodule_state='commit', + pathspecs=GitPathSpecs(None), + ): + if ( + item.gittype != GitTreeItemType.submodule + or item.relpath in processed_submodules + ): + # not a submodule or already reported on + continue + + yield from _yield_from_submodule( + basepath=path, + subm=item, + to_treeish=to_treeish, + recursive=recursive, + yield_tree_items=yield_tree_items, + find_renames=find_renames, + find_copies=find_copies, + 
eval_submodule_state=eval_submodule_state, + pathspecs=_pathspecs, + ) + + +def _iter_gitdiff( + path: Path, + from_treeish: str | None, + to_treeish: str | None, + *, + recursive: str, + find_renames: int | None, + find_copies: int | None, + yield_tree_items: str | None, + eval_submodule_state: str, + pathspecs: GitPathSpecs, +) -> Generator[GitDiffItem, None, None]: + cmd, cmd_args = _build_cmd( + from_treeish=from_treeish, + to_treeish=to_treeish, + recursive=recursive, + find_renames=find_renames, + find_copies=find_copies, + yield_tree_items=yield_tree_items, + eval_submodule_state=eval_submodule_state, + pathspecs=pathspecs, + ) + + if cmd == 'diff-index': + # when we compare to the index, we need a refresh run to not have + # something like plain mtime changes trigger modification reports + # https://github.com/datalad/datalad-next/issues/639 + call_git( + [ + 'update-index', + # must come first, we recurse ourselves + '--ignore-submodules', + # we want to continue the refresh when the index need updating + '-q', + '--refresh', + ], + cwd=path, + ) + + # when do we need to condense subdir reports into a single dir-report + reported_dirs: set[str] = set() + _single_dir = (cmd == 'diff-index') and recursive == 'no' + # diff-tree reports the compared tree when no from is given, we need + # to skip that output below + skip_first = (cmd == 'diff-tree') and from_treeish is None + pending_props = None + for line in chain(iter_gitcmd_zlines(path, cmd, *cmd_args), [None]): + if skip_first: + skip_first = False + continue + if pending_props: + if line is not None: + pending_props.append(line) + if pending_props[4][0] in ('C', 'R'): + # for copies and renames we expect a second path. 
+ # unless we are already on the end marker, in which + # case this is already the missing + continue + yield from _yield_diff_item( + cwd=path, + single_dir=_single_dir, + spec=pending_props, + reported_dirs=reported_dirs, + from_treeish=from_treeish, + to_treeish=to_treeish, + recursive=recursive, + find_renames=find_renames, + find_copies=find_copies, + yield_tree_items=yield_tree_items, + eval_submodule_state=eval_submodule_state, + pathspecs=pathspecs, + ) + pending_props = None + elif line is None: + # this is the end marker, nothing to do + pass + elif line.startswith(':'): + pending_props = line[1:].split(' ') + else: # pragma: no cover + msg = 'we should not get here, unexpected diff output' + raise RuntimeError(msg) + + +def _build_cmd( + *, + from_treeish: str | None, + to_treeish: str | None, + recursive: str, + find_renames: int | None, + find_copies: int | None, + yield_tree_items: str | None, + eval_submodule_state: str, + pathspecs: GitPathSpecs, +) -> tuple[str, list[str]]: + # from : to : description + # --------------------------- + # HEAD : None : compare to worktree, not with the index (diff-index) + # HEAD~2 : HEAD : compare trees (diff-tree) + # None : HEAD~2 : compare tree with its parents (diff-tree) + # None : None : exception + + common_args: list[str] = [ + '--no-rename-empty', + # ignore changes above CWD + '--relative', + '--raw', + ] + if find_renames is not None: + common_args.append(f'--find-renames={find_renames}%') + if find_copies is not None: + common_args.append(f'--find-copies={find_copies}%') + # if someone wants to look for copies, we actually look + # for copies. This is expensive, but IMHO is the one + # thing that makes this useful + # TODO: possibly we only want to enable this when + # find_copies==100 (exact copies), based on the assumption + # that this is cheaper than reading all file content. 
+ # but if that is actually true remains to be tested + common_args.append('--find-copies-harder') + + if eval_submodule_state == 'no': + common_args.append('--ignore-submodules=all') + elif eval_submodule_state == 'commit': + common_args.append('--ignore-submodules=dirty') + elif eval_submodule_state == 'full': + common_args.append('--ignore-submodules=none') + else: + msg = f'unknown submodule evaluation mode {eval_submodule_state!r}' + raise ValueError(msg) + + if from_treeish is None and to_treeish is None: + msg = 'either `from_treeish` or `to_treeish` must not be None' + raise ValueError(msg) + if to_treeish is None: + if TYPE_CHECKING: + assert from_treeish is not None + cmd = 'diff-index' + cmd_args = [*common_args, from_treeish] + else: + # diff NOT against the working tree + cmd = 'diff-tree' + cmd_args = [*common_args] + if recursive in ('repository', 'submodules'): + cmd_args.append('-r') + if yield_tree_items in ('all', 'directories'): + cmd_args.append('-t') + if from_treeish is None: + cmd_args.append(to_treeish) + else: + # two tree-ishes given + cmd_args.extend((from_treeish, to_treeish)) + + # add disambiguation marker for pathspec. + # even if we do not pass any, we get simpler error messages from Git + cmd_args.append('--') + + cmd_args.extend(pathspecs.arglist()) + return cmd, cmd_args + + +def _get_diff_item(spec: list[str]) -> GitDiffItem: + # this helper decodes the git-diff-tree/index raw output format to + # a GitDiffItem + non_rename_spec_length = 6 + + prev_gittype = git_mode_type_map.get(spec[0], None) + gittype = git_mode_type_map.get(spec[1], None) + + prev_gitsha = None if spec[2] == 40 * '0' else spec[2] + gitsha = None if spec[3] == 40 * '0' else spec[3] + + # first char is status code + status = git_diffstatus_map[spec[4][0]] + percentage: int | None = None + if len(spec[4]) > 1: + percentage = int(spec[4][1:]) + + modification_types: tuple[GitContainerModificationType, ...] 
| None = None + if status == GitDiffStatus.addition: + # this is an addition, we want `relpath` in the right place + relpath = PurePosixPath(spec[5]) + prev_relpath = None + if gitsha is None: + modification_types = (GitContainerModificationType.modified_content,) + else: + prev_relpath = PurePosixPath(spec[5]) + relpath = PurePosixPath( + spec[6] if len(spec) > non_rename_spec_length else spec[5] + ) + + return GitDiffItem( + relpath=relpath, + prev_relpath=prev_relpath, + gitsha=gitsha, + prev_gitsha=prev_gitsha, + gittype=gittype, + prev_gittype=prev_gittype, + status=status, + percentage=percentage, + modification_types=modification_types, + ) + + +def _yield_diff_item( + *, + cwd: Path, + recursive: str, + from_treeish: str | None, + to_treeish: str | None, + spec: list, + single_dir: bool, + reported_dirs: set, + yield_tree_items: str | None, + find_renames: int | None, + find_copies: int | None, + eval_submodule_state: str, + pathspecs: GitPathSpecs, +) -> Generator[GitDiffItem, None, None]: + item = _get_diff_item(spec) + + # the number of path parts that indicate an item inside a subdir + min_subdir_path_parts = 2 + + if single_dir: + if TYPE_CHECKING: + # we can only get here when diff-index ran, in which case + # to_treeish was None and consequently from_treeish can never + # be None at this point + assert from_treeish is not None + # handle the special case of reporting only on the 1st-level + # containing directory of an item. 
+ relpath = item.relpath or item.prev_relpath + if TYPE_CHECKING: + # we cannot have items that have no relpath whatsoever + assert relpath is not None + # we decide on mangling the actual report to be on the containing + # directory only, or to withhold it entirely + if len(relpath.parts) < min_subdir_path_parts: + # nothing in a subdirectory + yield item + return + dname = relpath.parts[0] + if dname in reported_dirs: + # nothing else todo, we already reported + return + + reported_dirs.add(dname) + yield _mangle_item_for_singledir( + PurePosixPath(dname), + from_treeish, + cwd, + ) + return + + if item.gittype != GitTreeItemType.submodule: + # any non-submodule item can be yielded now and we are done here + yield item + return + + # this is about a present submodule. + # sadly, we cannot give details for modification types other than addition. + # depending on --ignore-submodules a range of situations + # could be the case + if item.status == GitDiffStatus.modification and item.gitsha is None: + # in 'git diff-index' speak the submodule is "out-of-sync" with + # the index: this happens when there are new commits + item = replace( + item, + modification_types=( + *(item.modification_types or ()), + GitContainerModificationType.new_commits, + ), + ) + + if recursive != 'submodules': + # no submodule recursion, we can yield this submodule item + # directly + yield item + return + + if yield_tree_items in ('all', 'submodules'): + # we are instructed to yield this submodule item, but if we are going + # to recurse into it, continuing to use the item instance for + # queries. 
Hence we yield a copy here to avoid data corruption + # TODO: we have frozen items now, no need for a deppcopy anymore + yield deepcopy(item) if recursive == 'submodules' else item + + if recursive != 'submodules': + return + + yield from _yield_from_submodule( + basepath=cwd, + subm=item, + to_treeish=to_treeish, + recursive=recursive, + yield_tree_items=yield_tree_items, + find_renames=find_renames, + find_copies=find_copies, + eval_submodule_state=eval_submodule_state, + pathspecs=pathspecs, + ) + + +def _yield_from_submodule( + *, + basepath: Path, + subm: GitDiffItem, + to_treeish: str | None, + recursive: str, + yield_tree_items: str | None, + find_renames: int | None, + find_copies: int | None, + eval_submodule_state: str, + pathspecs: GitPathSpecs, +) -> Generator[GitDiffItem, None, None]: + # I believe we need no protection against absent submodules. + # The only way they can appear here is a reported modification. + # The only modification that is possible with an absent submodule + # is a deletion. And that would cause the item.gittype to be None + # -- a condition that is caught above + subm_relpath = PurePosixPath(subm.relpath) + subm_pathspecs = pathspecs + if pathspecs: + # recode pathspecs to match the submodule scope + try: + subm_pathspecs = pathspecs.for_subdir(subm_relpath) + except ValueError: + # not a single pathspec could be translated, there is + # no chance for a match, we can stop here + return + for i in iter_gitdiff( + basepath / subm_relpath, + # we never want to pass None here + # if `prev_gitsha` is None, it means that the + # submodule record is new, and we want its full + # content reported. Passing None, however, + # would only report the change to the current + # state. 
+ from_treeish=subm.prev_gitsha or PRE_INIT_COMMIT_SHA, + # when comparing the parent to the worktree, we + # also want to compare any children to the worktree + to_treeish=None if to_treeish is None else subm.gitsha, + # pass on the common args + recursive=recursive, + yield_tree_items=yield_tree_items, + find_renames=find_renames, + find_copies=find_copies, + eval_submodule_state=eval_submodule_state, + pathspecs=subm_pathspecs, + ): + # prepend any item relpath with the parent items + # relpath + yield replace( + i, + relpath=None if i.relpath is None else subm.relpath / i.relpath, + prev_relpath=None + if i.prev_relpath is None + else subm.relpath / i.prev_relpath, + ) + + +def _mangle_item_for_singledir( + dname: PurePosixPath, + from_treeish: str, + cwd: Path, +) -> GitDiffItem: + # at this point we have a change report on subdirectory content + # we only get here when comparing `from_treeish` to the worktree. + prev_gitsha = None + # on error `dname` is not known in `from_treeish` + with suppress(CommandError): + prev_gitsha = call_git_oneline( + ['rev-parse', '-q', f'{from_treeish}:./{dname}'], + cwd=cwd, + ) + + return GitDiffItem( + relpath=dname, + # if we have a previous gitsha, we know that the relpath was valid in + # `from_treeish` too + prev_relpath=dname if prev_gitsha else None, + # non-committed change -> no SHA (this ignored the index, + # like we do elsewhere too) + gitsha=None, + prev_gitsha=prev_gitsha, + # this is only ever about a directory + gittype=GitTreeItemType.directory, + # it would require more calls to figure out the mode and infer + # a possible type change. For now, we do not go there + prev_gittype=None, + status=GitDiffStatus.modification + if prev_gitsha + # the was nothing with this relpath in `from_treeish`, but now + # it exists. We compare to the worktree, but not any untracked + # content -- this means that we likely compare across multiple + # states and the directory become tracked after `from_treeish`. 
+ # let's call it an addition + else GitDiffStatus.addition, + ) diff --git a/datalad_core/iter_collections/tests/test_itergitdiff.py b/datalad_core/iter_collections/tests/test_itergitdiff.py new file mode 100644 index 0000000..78e41b9 --- /dev/null +++ b/datalad_core/iter_collections/tests/test_itergitdiff.py @@ -0,0 +1,360 @@ +import shutil +from pathlib import PurePosixPath + +import pytest + +from datalad_core.consts import PRE_INIT_COMMIT_SHA +from datalad_core.repo import Worktree +from datalad_core.runners import call_git +from datalad_core.tests import ( + call_git_addcommit, + create_submodule, + rmtree, +) + +from ..gitdiff import ( + GitDiffStatus, + GitTreeItemType, + iter_gitdiff, +) + + +def test_iter_gitdiff_invalid(): + with pytest.raises(ValueError): + # no meaningful comparison + list(iter_gitdiff('.', None, None)) + with pytest.raises(ValueError): + # unsupported eval mode + list(iter_gitdiff('.', None, None, eval_submodule_state='weird')) + + +def test_iter_gitdiff_basic(gitrepo): + # we use two distinct content blobs below, hardcode sha here + # for readability + empty_sha = 'e69de29bb2d1d6434b8b29ae775ad8c2e48c5391' + content = '123' + content_sha = 'd800886d9c86731ae5c4a62b0b77c437015e00d2' + # clean dataset, no items + assert list(iter_gitdiff(gitrepo, PRE_INIT_COMMIT_SHA, None)) == [] + testpath = gitrepo / 'sub' / 'test' + testpath.parent.mkdir() + testpath.touch() + # dataset with untracked file, no items + assert list(iter_gitdiff(gitrepo, PRE_INIT_COMMIT_SHA, None)) == [] + call_git_addcommit(gitrepo) + # clean dataset again, no items, compared to HEAD + assert list(iter_gitdiff(gitrepo, 'HEAD', None)) == [] + # added file + diff = list(iter_gitdiff(gitrepo, PRE_INIT_COMMIT_SHA, 'HEAD')) + assert len(diff) == 1 + di = diff[0] + assert di.status == GitDiffStatus.addition + assert di.relpath == PurePosixPath('sub/test') + assert di.prev_relpath is di.prev_gitsha is di.prev_gittype is None + assert di.gitsha == empty_sha + assert 
di.gittype == GitTreeItemType.file + # modified file + testpath.write_text(content) + diff = list(iter_gitdiff(gitrepo, 'HEAD', None)) + assert len(diff) == 1 + di = diff[0] + # labeled as modified + assert di.status == GitDiffStatus.modification + # the name is POSIX + assert di.relpath == di.prev_relpath == PurePosixPath('sub/test') + # unstaged modification reports no shasum + assert di.gitsha is None + assert di.prev_gitsha == empty_sha + assert di.gittype == di.prev_gittype == GitTreeItemType.file + # make clean + call_git_addcommit(gitrepo) + moved_testpath = testpath.parent / 'moved_test' + testpath.rename(moved_testpath) + # renamed file, unstaged, reported as deletion, we do not see the addition + # yet (untracked) + diff = list(iter_gitdiff(gitrepo, 'HEAD', None)) + assert len(diff) == 1 + di = diff[0] + assert di.status == GitDiffStatus.deletion + assert di.relpath == di.prev_relpath == PurePosixPath('sub/test') + assert di.prev_gitsha == content_sha + assert di.prev_gittype == GitTreeItemType.file + assert di.gitsha is di.gittype is None + # make clean + call_git_addcommit(gitrepo) + # now we can look at the rename + diff = list(iter_gitdiff(gitrepo, 'HEAD~1', 'HEAD', find_renames=100)) + assert len(diff) == 1 + di = diff[0] + assert di.status == GitDiffStatus.rename + assert di.relpath == PurePosixPath('sub/moved_test') + assert di.prev_relpath == PurePosixPath('sub/test') + assert di.gitsha == di.prev_gitsha == content_sha + assert di.prev_gittype is di.gittype is GitTreeItemType.file + assert di.percentage == 100 + # now a copy + shutil.copyfile(moved_testpath, testpath) + call_git_addcommit(gitrepo) + diff = list(iter_gitdiff(gitrepo, 'HEAD~1', 'HEAD', find_copies=100)) + assert len(diff) == 1 + di = diff[0] + assert di.status == GitDiffStatus.copy + assert di.relpath == PurePosixPath('sub/test') + assert di.prev_relpath == PurePosixPath('sub/moved_test') + assert di.gitsha == di.prev_gitsha == content_sha + assert di.percentage == 100 + # now 
replace file with submodule + testpath.unlink() + # we must safe to appease datalad's content collision detection + call_git_addcommit(gitrepo) + # intermediate smoke test for describing a single tree (diff from parents) + diff = list(iter_gitdiff(gitrepo, None, 'HEAD')) + assert len(diff) == 1 + assert diff[0].status == GitDiffStatus.deletion + # now cause typechange + testpath.mkdir() + Worktree.init_at(testpath) + (testpath / '.gitkeep').touch() + call_git_addcommit(testpath) + call_git( + ['submodule', 'add', f'./{testpath}', f'{testpath.relative_to(gitrepo)}'], + cwd=gitrepo, + capture_output=True, + ) + call_git_addcommit(gitrepo) + diff = list(iter_gitdiff( + gitrepo, + # because we have an intermediate safe, compare to two states + # back + 'HEAD~2', 'HEAD', + )) + assert len(diff) == 2 + # let's ignore the uninteresting .gitmodules addition for further tests + di = [i for i in diff if i.relpath != PurePosixPath('.gitmodules')][0] + assert di.status == GitDiffStatus.typechange + assert di.relpath == di.prev_relpath == PurePosixPath('sub/test') + assert di.gitsha != di.prev_gitsha + assert di.prev_gitsha == content_sha + assert di.prev_gittype == GitTreeItemType.file + assert di.gittype == GitTreeItemType.submodule + + +def test_iter_gitdiff_nonroot(gitrepo): + # all tests are concerned with running not in the dataset root + root = gitrepo + nonroot = root / 'sub' + nonroot.mkdir() + + # nothing to report, no problem + assert list(iter_gitdiff(nonroot, PRE_INIT_COMMIT_SHA, None)) == [] + # change above CWD is not reported + (root / 'rootfile').touch() + call_git_addcommit(gitrepo) + assert list(iter_gitdiff(nonroot, PRE_INIT_COMMIT_SHA, 'HEAD')) == [] + # check worktree modification detection too + (root / 'rootfile').write_text('some') + assert list(iter_gitdiff(nonroot, 'HEAD', None)) == [] + # and now test that reporting is relative to + # CWD + (nonroot / 'nonrootfile').touch() + call_git_addcommit(gitrepo) + assert list(iter_gitdiff(nonroot, 'HEAD~1', 
'HEAD'))[0].relpath == PurePosixPath('nonrootfile') + (nonroot / 'nonrootfile').write_text('other') + assert list(iter_gitdiff(nonroot, 'HEAD~1', 'HEAD'))[0].relpath == PurePosixPath('nonrootfile') + + +def test_iter_gitdiff_nonrec(gitrepo): + subdir = gitrepo / 'sub' + subdir.mkdir() + for fn in ('f1.txt', 'f2.txt'): + (subdir / fn).touch() + call_git_addcommit(gitrepo) + diff = list(iter_gitdiff(gitrepo, PRE_INIT_COMMIT_SHA, 'HEAD', recursive='no')) + assert len(diff) == 1 + di = diff[0] + assert di.relpath == PurePosixPath('sub') + assert di.gittype == GitTreeItemType.directory + assert di.status == GitDiffStatus.addition + di_tree = di + # same behavior for a worktree modification + for fn in ('f1.txt', 'f2.txt'): + (subdir / fn).write_text('modified') + diff = list(iter_gitdiff(gitrepo, PRE_INIT_COMMIT_SHA, None, recursive='no')) + assert len(diff) == 1 + di = diff[0] + # these are identical to the diff-tree based report + for p in ('relpath', 'gittype', 'prev_gitsha', 'prev_gittype'): + assert getattr(di, p) == getattr(di_tree, p) + # and there are different + # not staged, no gitsha + assert di.gitsha is None + # it does no type inference for the previous state (expensive) + assert di.prev_gittype is None + + # when the directory existed in the from-state it becomes a + # modification + diff = list(iter_gitdiff(gitrepo, 'HEAD', None, recursive='no')) + assert len(diff) == 1 + diff[0].status == GitDiffStatus.modification + + # now remove the subdir + rmtree(subdir) + diff = list(iter_gitdiff(gitrepo, 'HEAD', None, recursive='no')) + assert len(diff) == 1 + # it still reports a modification, even though the directory is empty/gone. + # it would require a filesystem STAT to detect a deletion, and a further + # type investigation in `from_treeish` to detect a type change. 
def test_iter_gitdiff_typechange_issue6791(gitrepo):
    """Report a typechange when a committed submodule is replaced by a file.

    Verifies that we can handle the problem described in
    https://github.com/datalad/datalad/issues/6791:
    a subdataset is wiped out (uncommitted) and replaced by a file.
    """
    test_relpath = PurePosixPath('test')
    test_path = gitrepo / test_relpath
    create_submodule(gitrepo, test_relpath)
    # commit the submodule addition
    call_git_addcommit(gitrepo)
    # remove the submodule from the worktree and put a file in its place,
    # without committing this change
    rmtree(test_path)
    test_path.touch()
    diff = list(iter_gitdiff(gitrepo, 'HEAD', None))
    assert len(diff) == 1
    di = diff[0]
    assert di.status == GitDiffStatus.typechange
    assert di.relpath == di.prev_relpath == test_relpath
    # unstaged change, hence no gitsha is reported
    assert di.gitsha is None
    assert di.prev_gittype == GitTreeItemType.submodule
    assert di.gittype == GitTreeItemType.file


def test_iter_gitdiff_rec(gitrepo):
    """Check `iter_gitdiff()` with recursion into a single submodule.

    Covers commit-to-commit and commit-to-worktree comparisons, with and
    without yielding the submodule item itself, and the case of a
    submodule that is recorded but missing from the worktree.
    """
    subm_relpath = PurePosixPath('subds')
    subm_wt = create_submodule(gitrepo, subm_relpath)
    call_git_addcommit(gitrepo)

    diff = list(
        iter_gitdiff(gitrepo, PRE_INIT_COMMIT_SHA, 'HEAD', recursive='submodules')
    )
    # exactly two items are reported
    assert len(diff) == 2
    # the entire submodule is new and the first one, so everything
    # is an addition
    assert all(i.status == GitDiffStatus.addition for i in diff)
    # only files, no submodule record, by default
    assert all(i.gittype == GitTreeItemType.file for i in diff)

    # when we ask for it, we get the submodule item too
    diff_w_sm = list(iter_gitdiff(gitrepo, PRE_INIT_COMMIT_SHA, 'HEAD',
                                  recursive='submodules',
                                  yield_tree_items='submodules'))
    assert len(diff) + 1 == len(diff_w_sm)
    assert any(i.relpath == subm_relpath and i.gittype == GitTreeItemType.submodule
               for i in diff_w_sm)

    # smoke test for an all-clean diff against the worktrees
    assert list(iter_gitdiff(gitrepo, 'HEAD', None, recursive='submodules')) == []

    # make subdataset record modified: commit a new file inside the
    # submodule, but do not commit in the superdataset
    subm_test_relpath = subm_relpath / 'file'
    subm_test_path = gitrepo / subm_test_relpath
    subm_test_path.touch()
    call_git_addcommit(subm_wt.path)
    diff = list(iter_gitdiff(gitrepo, 'HEAD', None, recursive='submodules'))
    assert len(diff) == 1
    di = diff[0]
    assert di.relpath == subm_test_relpath
    assert di.status == GitDiffStatus.addition
    # now with submodule item
    diff_w_sm = list(iter_gitdiff(gitrepo, 'HEAD', None,
                                  recursive='submodules',
                                  yield_tree_items='all'))
    assert len(diff_w_sm) == 2
    di = diff_w_sm[0]
    # the submodule item is always first
    assert di.relpath == subm_relpath
    assert di.gittype == GitTreeItemType.submodule
    assert di.status == GitDiffStatus.modification
    assert diff_w_sm[1] == diff[0]

    # save the whole hierarchy
    call_git_addcommit(gitrepo)
    # we get the exact same change report via the diff to HEAD~1:HEAD
    assert diff == list(
        iter_gitdiff(gitrepo, 'HEAD~1', 'HEAD', recursive='submodules')
    )

    # modify a tracked file in the subdataset
    subm_test_path.write_text('123')
    diff_w_sm = list(iter_gitdiff(gitrepo, 'HEAD', None,
                                  recursive='submodules',
                                  yield_tree_items='all'))
    # same report for the submodule (and it is first again)
    assert diff_w_sm[0].relpath == subm_relpath
    assert diff_w_sm[0].gittype == GitTreeItemType.submodule
    assert diff_w_sm[0].status == GitDiffStatus.modification
    # but this time the file is not an addition but a modification
    assert diff_w_sm[1].relpath == subm_test_relpath
    assert diff_w_sm[1].status == GitDiffStatus.modification

    # force-wipe the subdataset, and create a condition where the subdataset
    # is expected but missing
    rmtree(subm_wt.path)
    diff = list(iter_gitdiff(gitrepo, 'HEAD', None))
    assert len(diff) == 1
    di = diff[0]
    assert di.relpath == subm_relpath
    assert di.status == GitDiffStatus.deletion
    # if we now run with recursion, we get the exact same result, the absent
    # submodule is a subtree that we do not recurse into, hence the report
    # is only on the tree itself
    assert diff == list(iter_gitdiff(gitrepo, 'HEAD', None, recursive='submodules'))
    # use the opportunity to check equality of recursive='all' for this case
    assert diff == list(iter_gitdiff(gitrepo, 'HEAD', None, recursive='all'))


def test_iter_gitdiff_multilvl_rec(gitrepo):
    """Check recursion through two nested submodule levels.

    First without a pathspec (submodule items requested), then with a
    glob pathspec that matches files inside the submodules but not the
    submodules themselves.
    """
    # for uniformity with the coming submodules
    (gitrepo / '.gitkeep').touch()
    s1_relpath = PurePosixPath('sublvl1')
    s1_wt = create_submodule(gitrepo, s1_relpath)
    s2_relpath = PurePosixPath('sublvl2')
    # the second-level submodule is only needed on disk, its return
    # value is not used
    create_submodule(s1_wt.path, s2_relpath)
    # commit bottom-up: first the level-1 submodule, then the root
    call_git_addcommit(s1_wt.path)
    call_git_addcommit(gitrepo)

    diff = list(iter_gitdiff(
        gitrepo, PRE_INIT_COMMIT_SHA, 'HEAD',
        # check that we get full repo content from all submodules
        recursive='submodules',
        # check that the container item flags are passed into the
        # recursion properly
        yield_tree_items='submodules',
    ))
    # both submodule levels are reported as submodule items
    for relpath in (s1_relpath, s1_relpath / s2_relpath):
        assert any(
            d.relpath == relpath and d.gittype == GitTreeItemType.submodule
            for d in diff)
    # a `.gitkeep` file is reported for the root and for each submodule
    for base in (
        PurePosixPath('.'),
        s1_relpath,
        s1_relpath / s2_relpath,
    ):
        assert any(
            d.relpath == base / '.gitkeep'
            and d.gittype == GitTreeItemType.file
            for d in diff
        )

    # try with very simple pathspec constraint, where the pathspec
    # itself does not match the submodules that contain the
    # matches
    diff = list(iter_gitdiff(
        gitrepo, PRE_INIT_COMMIT_SHA, 'HEAD',
        pathspecs=[':(glob)**/.gitkeep'],
        recursive='submodules',
    ))
    assert len(diff) == 3
    assert all(
        d.relpath.name == '.gitkeep' and d.gittype == GitTreeItemType.file
        for d in diff
    )
# TODO: Could be `StrEnum`, came with PY3.11
class GitDiffStatus(Enum):
    """Status labels for diff items

    The value of each member is identical to its name.
    """

    addition = 'addition'
    copy = 'copy'
    deletion = 'deletion'
    modification = 'modification'
    rename = 'rename'
    typechange = 'typechange'
    unmerged = 'unmerged'
    unknown = 'unknown'
    # not defined by git, but added locally; AKA "untracked"
    other = 'other'


# lookup of a `GitDiffStatus` member by single-letter status code;
# 'O' belongs to the locally added `other` status
git_diffstatus_map = {
    letter: GitDiffStatus[member_name]
    for letter, member_name in (
        ('A', 'addition'),
        ('C', 'copy'),
        ('D', 'deletion'),
        ('M', 'modification'),
        ('R', 'rename'),
        ('T', 'typechange'),
        ('U', 'unmerged'),
        ('X', 'unknown'),
        ('O', 'other'),
    )
}


# TODO: Could be `StrEnum`, came with PY3.11
class GitContainerModificationType(Enum):
    """Modification type labels for container items

    Member values are space-separated, human-readable strings.
    NOTE(review): presumably these mirror the labels git prints for a
    modified submodule -- confirm against the code that uses this enum.
    """

    new_commits = 'new commits'
    untracked_content = 'untracked content'
    modified_content = 'modified content'