Skip to content

Commit

Permalink
checkout: save_link: reuse metadata instead of collecting mtimes again (
Browse files Browse the repository at this point in the history
  • Loading branch information
skshetry authored Sep 25, 2024
1 parent b2a8512 commit 58262ed
Show file tree
Hide file tree
Showing 5 changed files with 172 additions and 7 deletions.
26 changes: 23 additions & 3 deletions src/dvc_data/hashfile/checkout.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from dvc_objects.fs.generic import test_links, transfer
from dvc_objects.fs.local import LocalFileSystem
from dvc_objects.fs.system import inode
from fsspec.callbacks import DEFAULT_CALLBACK

from dvc_data.fsutils import _localfs_info
Expand Down Expand Up @@ -267,6 +268,23 @@ def __call__(
raise LinkError(to_path) from exc


def _save_link(
path: str,
fs: "FileSystem",
diff: DiffResult,
updated_mtimes: dict[str, float],
state: "StateBase",
) -> None:
if not isinstance(fs, LocalFileSystem):
return

from .utils import _get_mtime_from_changes

mtime = _get_mtime_from_changes(path, fs, diff, updated_mtimes)
ino = inode(path)
return state.set_link(path, ino, mtime)


def _checkout( # noqa: C901
diff: DiffResult,
path: str,
Expand All @@ -279,6 +297,8 @@ def _checkout( # noqa: C901
prompt: Optional[Callable[[str], bool]] = None,
):
if not diff:
if relink and state is not None:
_save_link(path, fs, diff, {}, state)
return

links = test_links(cache.cache_types, cache.fs, cache.path, fs, path)
Expand All @@ -294,6 +314,7 @@ def _checkout( # noqa: C901
failed = []
hashes_to_update: list[tuple[str, HashInfo, dict]] = []
is_local_fs = isinstance(fs, LocalFileSystem)
updated_mtimes = {}
for change in chain(diff.added, diff.modified):
entry_path = fs.join(path, *change.new.key) if change.new.key != ROOT else path
assert change.new.oid
Expand All @@ -319,9 +340,11 @@ def _checkout( # noqa: C901
if is_local_fs:
info = _localfs_info(entry_path)
hashes_to_update.append((entry_path, change.new.oid, info))
updated_mtimes[entry_path] = info["mtime"]

if state is not None:
state.save_many(hashes_to_update, fs)
_save_link(path, fs, diff, updated_mtimes, state)

if failed:
raise CheckoutError(failed)
Expand Down Expand Up @@ -379,9 +402,6 @@ def checkout( # noqa: PLR0913
except CheckoutError as exc:
failed.extend(exc.paths)

if (diff or relink) and state:
state.save_link(path, fs)

if failed or not diff:
if progress_callback and obj:
progress_callback.relative_update(len(obj))
Expand Down
11 changes: 10 additions & 1 deletion src/dvc_data/hashfile/state.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,10 @@ def get_many(
) -> Iterator[Union[tuple[str, None, None], tuple[str, "Meta", "HashInfo"]]]:
pass

@abstractmethod
def set_link(self, path, inode, mtime):
pass

@abstractmethod
def save_link(self, path, fs):
pass
Expand Down Expand Up @@ -102,6 +106,9 @@ def get_many(
) -> Iterator[Union[tuple[str, None, None], tuple[str, "Meta", "HashInfo"]]]:
yield from zip_longest(items, [], [])

def set_link(self, path, inode, mtime):
pass

def save_link(self, path, fs):
pass

Expand Down Expand Up @@ -281,8 +288,10 @@ def save_link(self, path, fs):
return

inode = get_inode(path)
relative_path = relpath(path, self.root_dir)
return self.set_link(path, inode, mtime)

def set_link(self, path, inode, mtime):
relative_path = relpath(path, self.root_dir)
with self.links as ref:
ref[relative_path] = (inode, mtime)

Expand Down
53 changes: 50 additions & 3 deletions src/dvc_data/hashfile/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,19 @@
from dvc_objects.fs.base import AnyFSPath, FileSystem

from ._ignore import Ignore
from .diff import DiffResult


def to_nanoseconds(ts: float) -> int:
return round(ts * 1_000_000_000)


def _tokenize_mtimes(files_mtimes: dict[str, float]) -> str:
data = json.dumps(files_mtimes, sort_keys=True).encode("utf-8")
digest = hashlib.md5(data) # noqa: S324
return digest.hexdigest()


def get_mtime_and_size(
path: "AnyFSPath", fs: "FileSystem", ignore: Optional["Ignore"] = None
) -> tuple[str, int]:
Expand Down Expand Up @@ -43,7 +50,47 @@ def get_mtime_and_size(

# We track file changes and moves, which cannot be detected with simply
# max(mtime(f) for f in non_ignored_files)
hasher = hashlib.md5() # noqa: S324
hasher.update(json.dumps(files_mtimes, sort_keys=True).encode("utf-8"))
mtime = hasher.hexdigest()
mtime = _tokenize_mtimes(files_mtimes)
return mtime, size


def _get_mtime_from_changes(
path: str,
fs: "FileSystem",
diff: "DiffResult",
updated_mtimes: dict[str, float],
) -> str:
from .diff import ROOT

fs_info = _localfs_info(path)
if fs_info["type"] == "file":
return str(to_nanoseconds(fs_info["mtime"]))

mtimes: dict[str, float] = {}
mtimes.update(updated_mtimes)

sep = fs.sep

for change in diff.unchanged:
key = change.old.key
if key == ROOT:
continue

entry_path = sep.join((path, *key))
if entry_path in mtimes:
continue
meta = change.old.meta
mtime = meta.mtime if meta is not None else None
if mtime is None:
try:
stats = _localfs_info(entry_path)
except OSError as exc:
# NOTE: broken symlink case.
if exc.errno != errno.ENOENT:
raise
continue
mtime = stats["mtime"]
assert mtime is not None
mtimes[entry_path] = mtime

return _tokenize_mtimes(mtimes)
78 changes: 78 additions & 0 deletions tests/hashfile/test_checkout.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,24 @@
import os
from copy import deepcopy
from functools import partial
from os.path import realpath, samefile
from pathlib import Path

import pytest
from attrs import evolve
from dvc_objects.fs.generic import transfer
from dvc_objects.fs.local import LocalFileSystem, localfs
from dvc_objects.fs.system import inode

from dvc_data.hashfile import checkout as checkout_mod
from dvc_data.hashfile.build import build
from dvc_data.hashfile.checkout import LinkError, _determine_files_to_relink, checkout
from dvc_data.hashfile.db import HashFileDB
from dvc_data.hashfile.db.local import LocalHashFileDB
from dvc_data.hashfile.diff import Change, DiffResult, TreeEntry
from dvc_data.hashfile.state import State
from dvc_data.hashfile.transfer import transfer as otransfer
from dvc_data.hashfile.utils import get_mtime_and_size


@pytest.mark.parametrize("cache_type", ["copy", "hardlink", "symlink", "reflink"])
Expand Down Expand Up @@ -209,3 +214,76 @@ def test_recheckout_old_obj(tmp_path, relink):

assert (tmp_path / "dir" / "foo").read_text() == "foo"
assert (tmp_path / "dir" / "bar").read_text() == "bar"


def get_inode_and_mtime(path):
return inode(path), get_mtime_and_size(os.fspath(path), localfs)[0]


def test_checkout_save_link_dir(request, tmp_path):
fs = LocalFileSystem()
state = State(tmp_path, tmp_dir=tmp_path / "tmp")
request.addfinalizer(state.close)

cache = HashFileDB(fs, str(tmp_path), state=state)

directory = tmp_path / "dir"
directory.mkdir()
(directory / "foo").write_text("foo", encoding="utf-8")
(directory / "bar").write_text("bar", encoding="utf-8")

staging, _, obj = build(cache, os.fspath(directory), fs, "md5")
otransfer(staging, cache, {obj.hash_info}, shallow=False)
chkout = partial(checkout, os.fspath(directory), fs, obj, cache=cache, state=state)

chkout()
assert "dir" not in state.links

chkout(relink=True)
assert state.links["dir"] == get_inode_and_mtime(directory)

# modify file
(directory / "foo").write_text("food", encoding="utf-8")
chkout(force=True)
assert state.links["dir"] == get_inode_and_mtime(directory)

# remove file
(directory / "bar").unlink()
chkout()
assert state.links["dir"] == get_inode_and_mtime(directory)

# add file
(directory / "foobar").write_text("foobar", encoding="utf-8")
chkout(force=True)
assert state.links["dir"] == get_inode_and_mtime(directory)


def test_checkout_save_link_file(request, tmp_path):
fs = LocalFileSystem()
state = State(tmp_path, tmp_dir=tmp_path / "tmp")
request.addfinalizer(state.close)

cache = HashFileDB(fs, os.fspath(tmp_path), state=state)

file = tmp_path / "foo"
file.write_text("foo", encoding="utf-8")

staging, _, obj = build(cache, os.fspath(file), fs, "md5")
otransfer(staging, cache, {obj.hash_info}, shallow=False)
chkout = partial(checkout, os.fspath(file), fs, obj, cache=cache, state=state)

chkout()
assert "foo" not in state.links

chkout(relink=True)
assert state.links["foo"] == get_inode_and_mtime(file)

# modify file
file.write_text("food", encoding="utf-8")
chkout(force=True)
assert state.links["foo"] == get_inode_and_mtime(file)

# remove file
file.unlink()
chkout()
assert state.links["foo"] == get_inode_and_mtime(file)
11 changes: 11 additions & 0 deletions tests/hashfile/test_state.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,11 @@ def test_state_many(tmp_path, state: State):
]


def test_set_link(tmp_path, state):
state.set_link(tmp_path / "foo", 42, "mtime")
assert state.links["foo"] == (42, "mtime")


def test_state_noop(tmp_path):
state = StateNoop()
fs = LocalFileSystem()
Expand All @@ -217,6 +222,12 @@ def test_state_noop(tmp_path):
("bar", None, None),
]

state.set_link(tmp_path / "foo", 42, "mtime")
assert state.get_unused_links([], fs) == []

state.save_link(tmp_path / "foo", fs)
assert state.get_unused_links([], fs) == []


def test_links(tmp_path, state: State):
foo, bar = tmp_path / "foo", tmp_path / "bar"
Expand Down

0 comments on commit 58262ed

Please sign in to comment.