fix(example-get-started): track the whole eval dir for simplicity (#251)

iterative · Sep 9, 2023 · 5aee50b · 5aee50b
1 parent a9c9a65
commit 5aee50b
Show file tree

Hide file tree

Showing 5 changed files with 31 additions and 27 deletions.
diff --git a/example-get-started/README.md b/example-get-started/README.md
@@ -124,7 +124,7 @@ repositories and/or use different remote types.
   have prefixes or suffixes to distinguish them.
 - `OPT_REMOTE="public-s3"` - (default `public-s3`). Other options: `private-s3`,
   `private-http`, `private-ssh`, etc.
-- `OPT_DVC_TRACKED_METRICS='false'` - (default `false`). Either we should use
+- `OPT_DVC_TRACKED_METRICS='true'` - (default `true`). Whether we should use
   DVC to also track all metric and plot files (e.g. to test that Studio can get
   plots from the remote storage).
 - `OPT_REGISTER_MODELS='false'` - (default `true`). Use the `gto` to register

diff --git a/example-get-started/code/.github/workflows/cml.yaml b/example-get-started/code/.github/workflows/cml.yaml
@@ -27,6 +27,7 @@ jobs:
             git fetch origin main:main
           fi
 
+          dvc pull eval
           dvc plots diff $PREVIOUS_REF workspace \
             --show-vega --targets ROC | json5 > vega.json
           vl2svg vega.json roc.svg
@@ -39,11 +40,9 @@ jobs:
             --show-vega --targets Confusion-Matrix | json5 > vega.json
           vl2svg vega.json confusion.svg
 
-          dvc pull eval/plots/images
           cp eval/plots/images/importance.png importance_workspace.png
 
           git checkout $PREVIOUS_REF -- dvc.lock
-          dvc pull eval/plots/images
           cp eval/plots/images/importance.png importance_previous.png
 
           dvc_report=$(dvc exp diff $PREVIOUS_REF --md)

diff --git a/example-get-started/code/README.md b/example-get-started/code/README.md
@@ -57,12 +57,14 @@ $ dvc pull
 
 ## Running in your environment
 
-Run [`dvc repro`](https://man.dvc.org/repro) to reproduce the
-[pipeline](https://dvc.org/doc/commands-reference/pipeline):
+Run [`dvc exp run`](https://man.dvc.org/exp/run) to reproduce the
+[pipeline](https://dvc.org/doc/user-guide/pipelines) and create a new
+[experiment](https://dvc.org/doc/user-guide/experiment-management).
 
 ```console
-$ dvc repro
-Data and pipelines are up to date.
+$ dvc exp run
+Ran experiment(s): rapid-cane
+Experiment results have been applied to your workspace.
 ```
 
 If you'd like to test commands like [`dvc push`](https://man.dvc.org/push),
@@ -151,20 +153,20 @@ $ tree
 ├── dvc.lock
 ├── dvc.yaml              # <-- DVC pipeline file
 ├── eval
-│   ├── importance.png    # <-- Feature importance plot
-│   ├── live
-│   │   ├── metrics.json  # <-- Binary classifier final metrics (e.g. AUC)
-│   │   └── plots         # <-- Data points for ROC, confusion matrix
-│   │       └── sklearn
-│   │           ├── cm
-│   │           │   ├── test.json
-│   │           │   └── train.json
-│   │           └── roc
-│   │               ├── test.json
-│   │               └── train.json
-│   └── prc               # <-- Data points for custom PRC
-│       ├── test.json
-│       └── train.json
+│   ├── metrics.json      # <-- Binary classifier final metrics (e.g. AUC)
+│   └── plots             
+│       ├── images
+│       │   └── importance.png    # <-- Feature importance plot
+│       └── sklearn       # <-- Data points for ROC, PRC, confusion matrix
+│           ├── cm
+│           │   ├── test.json
+│           │   └── train.json
+│           ├── prc
+│           │   ├── test.json
+│           │   └── train.json
+│           └── roc
+│               ├── test.json
+│               └── train.json
 ├── model.pkl             # <-- Trained model file
 ├── params.yaml           # <-- Parameters file
 └── src                   # <-- Source code to run the pipeline stages

diff --git a/example-get-started/code/src/evaluate.py b/example-get-started/code/src/evaluate.py
@@ -100,7 +100,7 @@ def main():
         test, _ = pickle.load(fd)
 
     # Evaluate train and test datasets.
-    with Live(EVAL_PATH, cache_images=True, dvcyaml=False) as live:
+    with Live(EVAL_PATH, dvcyaml=False, report=None) as live:
         evaluate(model, train, "train", live, save_path=EVAL_PATH)
         evaluate(model, test, "test", live, save_path=EVAL_PATH)
 

diff --git a/example-get-started/generate.sh b/example-get-started/generate.sh
@@ -21,7 +21,7 @@ OPT_TAGS='true' # Default true.
 # Default "public-s3". Other options: "private-s3", "private-http", "private-ssh", etc.
 # See the details below in the `init_remote_storage` and in the README.
 OPT_REMOTE='public-s3'
-OPT_DVC_TRACKED_METRICS='false' # Default false.
+OPT_DVC_TRACKED_METRICS='true' # Default true.
 OPT_REGISTER_MODELS='true' # Default true.
 OPT_MODEL_NAME='text-classification' # Default "text-classification".
 OPT_TAG_MODELS='true' # Default true.
@@ -301,12 +301,11 @@ EOF
 
   if [ $OPT_DVC_TRACKED_METRICS == "true" ]; then
     dvc stage add -n evaluate \
-      -d src/evaluate.py -d model.pkl -d data/features \
-      -o eval/metrics.json -o eval/plots \
+      -d src/evaluate.py -d model.pkl -d data/features -o eval \
       python src/evaluate.py model.pkl data/features
   else
     dvc stage add -n evaluate \
-      -d src/evaluate.py -d model.pkl -d data/features \
+      -d src/evaluate.py -d model.pkl -d data/features -O eval \
       python src/evaluate.py model.pkl data/features
   fi
 
@@ -334,7 +333,11 @@ plots:
 - eval/plots/images/importance.png" >> dvc.yaml
 
   dvc repro
-  git add .gitignore dvc.yaml dvc.lock eval
+  if [ $OPT_DVC_TRACKED_METRICS == "true" ]; then
+    git add .gitignore dvc.yaml dvc.lock
+  else
+    git add .gitignore dvc.yaml dvc.lock eval
+  fi
   tick
   git commit -am "${COMMIT_PREFIX}Create evaluation stage"
   create_tag "8-dvclive-eval${GIT_TAG_SUFFIX}" "DVCLive evaluation stage created."