Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

dvc.yaml: make persist/cache bool flags in outs/metrics #3804

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 20 additions & 2 deletions dvc/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@

from dvc import dependency, output
from dvc.output import CHECKSUMS_SCHEMA
from dvc.stage.params import OutputParams, StageParams
from dvc.output.base import BaseOutput
from dvc.stage.params import StageParams

STAGES = "stages"
SINGLE_STAGE_SCHEMA = {
Expand Down Expand Up @@ -34,7 +35,24 @@
Optional(StageParams.PARAM_LOCKED): bool,
Optional(StageParams.PARAM_META): object,
Optional(StageParams.PARAM_ALWAYS_CHANGED): bool,
**{Optional(p.value): [str] for p in OutputParams},
Optional(StageParams.PARAM_OUTS): [
Any(
str,
{
str: {
Optional(BaseOutput.PARAM_CACHE): bool,
Optional(BaseOutput.PARAM_PERSIST): bool,
}
},
)
],
Optional(StageParams.PARAM_OUTS_NO_CACHE): [str],
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Being a bit restrictive here and not allowing additional flags (e.g. persist=True) for such fields.

Optional(StageParams.PARAM_METRICS): [
Any(str, {str: {Optional(BaseOutput.PARAM_CACHE): bool}})
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not supporting persist for metrics because it is the way we currently generate it during the serialization. Plus we don't really have a CLI option for it. But seems like persistent metrics does make sense, especially when we talk about plots that might accumulate data from run to run. But let's wait for someone to ask for it.

Note that in old dvc-files that(persistent metrics) was supported if user set metric: True, persist=True by hand.

],
Optional(StageParams.PARAM_METRICS_NO_CACHE): [str],
Optional(StageParams.PARAM_OUTS_PERSIST): [str],
Optional(StageParams.PARAM_OUTS_PERSIST_NO_CACHE): [str],
}
}
MULTI_STAGE_SCHEMA = {STAGES: SINGLE_PIPELINE_STAGE_SCHEMA}
Expand Down
15 changes: 9 additions & 6 deletions dvc/serialize.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,17 @@
def _get_outs(stage: "PipelineStage"):
outs_bucket = {}
for o in sort_by_path(stage.outs):
bucket_key = ["metrics"] if o.metric else ["outs"]
key = "metrics" if o.metric else "outs"

if not o.metric and o.persist:
bucket_key += ["persist"]
extra = {}
if o.persist:
extra["persist"] = True
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just matching existing style in this method, so using plain strings. Maybe we should discuss strings vs constants one more time, to see how people feel about it these days.

if not o.use_cache:
bucket_key += ["no_cache"]
key = "_".join(bucket_key)
outs_bucket[key] = outs_bucket.get(key, []) + [o.def_path]
extra["cache"] = False

value = {o.def_path: extra} if extra else o.def_path

outs_bucket[key] = outs_bucket.get(key, []) + [value]
return [(key, outs_bucket[key]) for key in sorted(outs_bucket.keys())]


Expand Down
17 changes: 6 additions & 11 deletions dvc/stage/params.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,3 @@
from enum import Enum


class StageParams:
PARAM_MD5 = "md5"
PARAM_CMD = "cmd"
Expand All @@ -12,11 +9,9 @@ class StageParams:
PARAM_ALWAYS_CHANGED = "always_changed"
PARAM_PARAMS = "params"


class OutputParams(Enum):
PERSIST = "outs_persist"
PERSIST_NO_CACHE = "outs_persist_no_cache"
METRICS_NO_CACHE = "metrics_no_cache"
METRICS = "metrics"
NO_CACHE = "outs_no_cache"
OUTS = "outs"
PARAM_OUTS_PERSIST = "outs_persist"
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Btw, might be worth renaming it to persist instead of outs_persist. Looks a lot nicer that way.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

may be in this case we can make it a flag within the outs item?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@shcheklein It is that already :) But we also keep plain format where it mimics CLI options. So I wonder if we should make it persist and --persist in dvc run.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yep, sounds good to me! never liked these JAVA-style - --outs-enterprisy-long-name-include-everything :)

PARAM_OUTS_PERSIST_NO_CACHE = "outs_persist_no_cache"
PARAM_METRICS_NO_CACHE = "metrics_no_cache"
PARAM_METRICS = "metrics"
PARAM_OUTS_NO_CACHE = "outs_no_cache"
PARAM_OUTS = "outs"
35 changes: 26 additions & 9 deletions dvc/stage/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
StagePathNotFoundError,
StagePathOutsideError,
)
from .params import OutputParams


def check_stage_path(repo, path, is_wdir=False):
Expand All @@ -39,15 +38,33 @@ def check_stage_path(repo, path, is_wdir=False):
def fill_stage_outputs(stage, **kwargs):
assert not stage.outs

keys = [
"outs_persist",
"outs_persist_no_cache",
"metrics_no_cache",
"metrics",
"outs_no_cache",
"outs",
]

stage.outs = []
for key in (p.value for p in OutputParams):
stage.outs += output.loads_from(
stage,
kwargs.get(key, []),
use_cache="no_cache" not in key,
persist="persist" in key,
metric="metrics" in key,
)
for key in keys:
for entry in kwargs.get(key, []):
if isinstance(entry, dict):
assert len(entry) == 1
path, extra = list(entry.items())[0]
stage.outs += output.loadd_from(
stage, [{"path": path, **extra}]
)
continue

stage.outs += output.loads_from(
stage,
[entry],
use_cache="no_cache" not in key,
persist="persist" in key,
metric="metrics" in key,
)


def fill_stage_dependencies(stage, deps=None, erepo=None, params=None):
Expand Down
2 changes: 1 addition & 1 deletion tests/func/test_run_multistage.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ def test_run_dump_on_multistage(tmp_dir, dvc):
"copy-foo-foo2": {
"cmd": "cp foo foo2",
"deps": ["foo"],
"outs_persist": ["foo2"],
"outs": [{"foo2": {"persist": True}}],
"always_changed": True,
"wdir": "dir",
},
Expand Down