From 67bdfcf6f97d0750c225d14fbdd6bed4fbd1b8b2 Mon Sep 17 00:00:00 2001 From: David de la Iglesia Castro Date: Fri, 5 Aug 2022 14:38:57 +0200 Subject: [PATCH] repo: Add `ignore_revs`. Support to write a list of line-separated revisions to `.dvc/ignore_revs`. If `.dvc/ignore_revs` exists, parse the revisions and use them in `brancher` to skip. Allows to skip broken commits in `gc`, `pull` and `push`. Closes #5037 Closes #5066 Closes #7585 --- dvc/repo/__init__.py | 12 ++++++++++++ dvc/repo/brancher.py | 4 +++- tests/conftest.py | 14 ++++++++++++++ tests/func/test_data_cloud.py | 31 +++++++++++++++++++++++++++++++ tests/func/test_gc.py | 23 ++++++++++++++++++++++- tests/unit/repo/test_repo.py | 27 +++++++++++++++++++++++++++ 6 files changed, 109 insertions(+), 2 deletions(-) diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py index eb63e41bd6..b36a682428 100644 --- a/dvc/repo/__init__.py +++ b/dvc/repo/__init__.py @@ -381,6 +381,7 @@ def _ignore(self): def brancher(self, *args, **kwargs): from dvc.repo.brancher import brancher + kwargs["ignore_revs"] = self.ignore_revs return brancher(self, *args, **kwargs) def used_objs( @@ -448,6 +449,17 @@ def used_objs( def stages(self): # obsolete, only for backward-compatibility return self.index.stages + @property + def ignore_revs_file(self): + return self.fs.path.join(self.dvc_dir, "ignore_revs") + + @property + def ignore_revs(self): + if self.fs.exists(self.ignore_revs_file): + with self.fs.open(self.ignore_revs_file) as f: + return set(f.read().strip().splitlines()) + return set() + def find_outs_by_path(self, path, outs=None, recursive=False, strict=True): # using `outs_graph` to ensure graph checks are run outs = outs or self.index.outs_graph diff --git a/dvc/repo/brancher.py b/dvc/repo/brancher.py index 14f27ec12f..37dfd20451 100644 --- a/dvc/repo/brancher.py +++ b/dvc/repo/brancher.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Set from dvc.scm import iter_revs @@ -13,6 +13,7 @@ def brancher( # noqa: E302 commit_date: Optional[str] = None, sha_only=False, num=1, + ignore_revs: Optional[Set[str]] = None, ): """Generator that iterates over specified revisions. @@ -71,6 +72,7 @@ def brancher( # noqa: E302 all_experiments=all_experiments, commit_date=commit_date, num=num, + ignore_revs=ignore_revs, ) try: diff --git a/tests/conftest.py b/tests/conftest.py index 0dffd7eeef..3bca810297 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -250,3 +250,17 @@ def run( return stage return run + + +@pytest.fixture +def broken_rev(tmp_dir, scm, dvc): + tmp_dir.gen("params.yaml", "foo: 1") + dvc.run(cmd="echo ${foo}", name="foo") + + scm.add(["dvc.yaml", "dvc.lock"]) + scm.commit("init broken") + _broken_rev = scm.get_rev() + + scm.add(["params.yaml"]) + scm.commit("fixed") + return _broken_rev diff --git a/tests/func/test_data_cloud.py b/tests/func/test_data_cloud.py index 192e350f73..497757703c 100644 --- a/tests/func/test_data_cloud.py +++ b/tests/func/test_data_cloud.py @@ -7,6 +7,7 @@ import dvc_data from dvc.cli import main +from dvc.exceptions import DvcException from dvc.external_repo import clean_repos from dvc.stage.exceptions import StageNotFound from dvc.testing.test_remote import ( # noqa, pylint: disable=unused-import @@ -461,6 +462,36 @@ def test_push_pull_all(tmp_dir, scm, dvc, local_remote, key, expected): assert dvc.pull(**{key: True})["fetched"] == expected +def test_push_pull_ignore_revs(tmp_dir, scm, dvc, local_remote, broken_rev): + tmp_dir.dvc_gen({"foo": "foo"}, commit="first") + scm.tag("v1") + dvc.remove("foo.dvc") + tmp_dir.dvc_gen({"bar": "bar"}, commit="second") + scm.tag("v2") + with tmp_dir.branch("branch", new=True): + dvc.remove("bar.dvc") + tmp_dir.dvc_gen({"baz": "baz"}, commit="branch") + + with pytest.raises(DvcException): + dvc.push(all_commits=True) + + with dvc.fs.open(dvc.ignore_revs_file, "w") as f: + f.write(broken_rev) + + assert dvc.push(all_commits=True) == 3 + + remove(dvc.ignore_revs_file) + clean(["foo", "bar", "baz"], dvc) + + with pytest.raises(DvcException): + dvc.pull(all_commits=True) + + with dvc.fs.open(dvc.ignore_revs_file, "w") as f: + f.write(broken_rev) + + assert dvc.pull(all_commits=True)["fetched"] == 3 + + def test_push_pull_fetch_pipeline_stages(tmp_dir, dvc, run_copy, local_remote): tmp_dir.dvc_gen("foo", "foo") run_copy("foo", "bar", no_commit=True, name="copy-foo-bar") diff --git a/tests/func/test_gc.py b/tests/func/test_gc.py index 27a2e9f6a4..f757b20685 100644 --- a/tests/func/test_gc.py +++ b/tests/func/test_gc.py @@ -7,7 +7,7 @@ from git import Repo from dvc.cli import main -from dvc.exceptions import CollectCacheError +from dvc.exceptions import CollectCacheError, DvcException from dvc.fs import LocalFileSystem from dvc.repo import Repo as DvcRepo from dvc.utils.fs import remove @@ -412,3 +412,24 @@ def test_gc_rev_num(tmp_dir, scm, dvc): assert not cache.exists() else: assert cache.read_text() == str(i) + + +def test_gc_ignore_revs(tmp_dir, dvc, broken_rev): + """Covers #5037 and #7585""" + uncommitted = tmp_dir.dvc_gen("testfile", "uncommitted") + uncommitted_hash = uncommitted[0].outs[0].hash_info.value + tmp_dir.dvc_gen("testfile", "committed", commit="add testfile") + + cache = tmp_dir / ".dvc" / "cache" + uncommitted_cache = cache / uncommitted_hash[:2] / uncommitted_hash[2:] + + with pytest.raises(DvcException, match="Could not find 'foo'"): + dvc.gc(all_commits=True) + + assert uncommitted_cache.exists() + + with dvc.fs.open(dvc.ignore_revs_file, "w") as f: + f.write(broken_rev) + dvc.gc(all_commits=True) + + assert not uncommitted_cache.exists() diff --git a/tests/unit/repo/test_repo.py b/tests/unit/repo/test_repo.py index efb1ead44f..6a20e61ec9 100644 --- a/tests/unit/repo/test_repo.py +++ b/tests/unit/repo/test_repo.py @@ -59,6 +59,33 @@ def test_used_objs(tmp_dir, dvc, path): assert used == expected +def test_used_objs_ignore_revs(tmp_dir, scm, dvc): + def _get_used_values(dvc): + used_obj_ids = list(dvc.used_objs(all_commits=True).values())[0] + return {o.value for o in used_obj_ids} + + obj_ids = {} + revs = {} + for i in range(3): + o = tmp_dir.dvc_gen("foo", str(i), commit=str(i)) + obj_ids[i] = o[0].outs[0].hash_info.value + revs[i] = scm.get_rev() + + assert set(obj_ids.values()) == _get_used_values(dvc) + + with dvc.fs.open(dvc.ignore_revs_file, "w") as f: + f.write(revs[0]) + + expected = {obj_ids[1], obj_ids[2]} + assert expected == _get_used_values(dvc) + + with dvc.fs.open(dvc.ignore_revs_file, "w") as f: + f.write(f"{revs[0]}\n{revs[1]}") + + expected = {obj_ids[2]} + assert expected == _get_used_values(dvc) + + def test_locked(mocker): repo = mocker.MagicMock() repo._lock_depth = 0