Skip to content

Commit

Permalink
add: do not delete stage files before add (#6239)
Browse files Browse the repository at this point in the history
* add: do not delete stage files before add

Because of the way we collect stages and cache them, we were not able to
collect them for the `add` without removing them from the workspace.
As doing so, we'd have two same/similar stages - one collected from the
workspace and the other just created from the `dvc add` in-memory.

This would raise errors during graph checks, so we started to delete them
and reset them (which is very recently, see #2886 and #3349).

By deleting the file before we even do any checks, we are making DVC
fragile, and results in data loss for the users with even simple mistakes.
This should make it more reliable and robust.

And, recently, we have started to keep state of a lot of things, that by
resetting them on each stage, we waste a lot of performance, especially
on gitignores. We cache the dulwich's IgnoreManager, which when resetted
too many times, will waste a lot of our time just collecting them again
next time (see #6227).

It's hard to say how much this improves, as this very much depends on
no. of gitignores in the repo (which can be assumed to be quite in number
for a dvc repo) and the amount of files that we are adding
(eg: `-R` adding a large directory).

On a directory with 10,000 files (in a datadet-registry repo),
creating stages on `dvc add -R` went from 64 files/sec to 1.1k files/sec.

* add tests

* make the test more specific
  • Loading branch information
skshetry authored Jul 3, 2021
1 parent 07f5f6b commit 8d5f9ab
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 6 deletions.
9 changes: 4 additions & 5 deletions dvc/repo/add.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,8 +115,11 @@ def add( # noqa: C901
**kwargs,
)

# remove existing stages that are to-be replaced with these
# new stages for the graph checks.
old_stages = set(repo.stages) - set(stages)
try:
repo.check_modified_graph(stages)
repo.check_modified_graph(stages, list(old_stages))
except OverlappingOutputPathsError as exc:
msg = (
"Cannot add '{out}', because it is overlapping with other "
Expand Down Expand Up @@ -250,7 +253,6 @@ def _create_stages(
transfer=False,
**kwargs,
):
from dvc.dvcfile import Dvcfile
from dvc.stage import Stage, create_stage, restore_meta

expanded_targets = glob_targets(targets, glob=glob)
Expand All @@ -276,12 +278,9 @@ def _create_stages(
external=external,
)
restore_meta(stage)
Dvcfile(repo, stage.path).remove()
if desc:
stage.outs[0].desc = desc

repo._reset() # pylint: disable=protected-access

if not stage:
if pbar is not None:
pbar.total -= 1
Expand Down
45 changes: 44 additions & 1 deletion tests/func/test_add.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,11 @@
from dvc.hash_info import HashInfo
from dvc.main import main
from dvc.objects.db import ODBManager
from dvc.output import OutputAlreadyTrackedError, OutputIsStageFileError
from dvc.output import (
OutputAlreadyTrackedError,
OutputDoesNotExistError,
OutputIsStageFileError,
)
from dvc.stage import Stage
from dvc.stage.exceptions import (
StageExternalOutputsError,
Expand Down Expand Up @@ -1190,3 +1194,42 @@ def test_add_ignored(tmp_dir, scm, dvc):
assert str(exc.value) == ("bad DVC file name '{}' is git-ignored.").format(
os.path.join("dir", "subdir.dvc")
)


def test_add_on_not_existing_file_should_not_remove_stage_file(tmp_dir, dvc):
(stage,) = tmp_dir.dvc_gen("foo", "foo")
(tmp_dir / "foo").unlink()
dvcfile_contents = (tmp_dir / stage.path).read_text()

with pytest.raises(OutputDoesNotExistError):
dvc.add("foo")
assert (tmp_dir / "foo.dvc").exists()
assert (tmp_dir / stage.path).read_text() == dvcfile_contents


@pytest.mark.parametrize(
"target",
[
"dvc.repo.Repo.check_modified_graph",
"dvc.stage.Stage.save",
"dvc.stage.Stage.commit",
],
)
def test_add_does_not_remove_stage_file_on_failure(
tmp_dir, dvc, mocker, target
):
(stage,) = tmp_dir.dvc_gen("foo", "foo")
tmp_dir.gen("foo", "foobar") # update file
dvcfile_contents = (tmp_dir / stage.path).read_text()

exc_msg = f"raising error from mocked '{target}'"
mocker.patch(
target,
side_effect=DvcException(exc_msg),
)

with pytest.raises(DvcException) as exc_info:
dvc.add("foo")
assert str(exc_info.value) == exc_msg
assert (tmp_dir / "foo.dvc").exists()
assert (tmp_dir / stage.path).read_text() == dvcfile_contents

0 comments on commit 8d5f9ab

Please sign in to comment.