Update example-dvc-experiments with dvc exp init and confusion matrix (#97)
iterative · Feb 2, 2022 · ac76204 · ac76204
1 parent 41750c6
commit ac76204
Show file tree

Hide file tree

Showing 5 changed files with 147 additions and 61 deletions.
diff --git a/example-dvc-experiments/code/requirements-macos.txt b/example-dvc-experiments/code/requirements-macos.txt
@@ -0,0 +1,5 @@
+dvc[all]>=2.9
+tensorflow-macos>=2.6,<2.7
+ruamel.yaml>=0.17,<0.18
+imageio>=2.9,<3
+dvclive>=0.4
diff --git a/example-dvc-experiments/code/requirements.txt b/example-dvc-experiments/code/requirements.txt
@@ -1,3 +1,5 @@
+dvc[all]>=2.9
 tensorflow>=2.5,<2.6
 ruamel.yaml>=0.17,<0.18
 imageio>=2.9,<3
+dvclive>=0.4
diff --git a/example-dvc-experiments/code/src/train.py b/example-dvc-experiments/code/src/train.py
@@ -3,11 +3,13 @@
 os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # or any {'0', '1', '2'}
 import tensorflow as tf
 import numpy as np
-from util import load_params, read_labeled_images
+from util import load_params, read_labeled_images, label_from_path, read_dataset, create_image_matrix
 import json
+import tarfile
+import imageio
+from dvclive.keras import DvcLiveCallback
 
-INPUT_DIR = "data/images"
-RESUME_PREVIOUS_MODEL = False
+DATASET_FILE = "data/images.tar.gz"
 OUTPUT_DIR = "models"
 
 METRICS_FILE = "metrics.json"
@@ -67,14 +69,10 @@ def history_to_csv(history):
 
 def main():
     params = load_params()
-    m = get_model()
+    m = get_model(conv_units=params['model']['conv_units'])
     m.summary()
 
-    training_images, training_labels = read_labeled_images(
-        os.path.join(INPUT_DIR, 'train/'))
-    testing_images, testing_labels = read_labeled_images(
-        os.path.join(INPUT_DIR, 'test/')
-    )
+    training_images, training_labels, testing_images, testing_labels = read_dataset(DATASET_FILE)
 
     assert training_images.shape[0] + testing_images.shape[0] == 70000
     assert training_labels.shape[0] + testing_labels.shape[0] == 70000
@@ -100,14 +98,9 @@ def main():
         epochs=params["train"]["epochs"],
         verbose=1,
         validation_data=(x_valid, y_valid),
+        callbacks=[DvcLiveCallback(model_file=f"{OUTPUT_DIR}/model.h5")],
     )
 
-    with open("logs.csv", "w") as f:
-        f.write(history_to_csv(history))
-
-    model_file = os.path.join(OUTPUT_DIR, "model.h5")
-    m.save(model_file)
-
     metrics_dict = m.evaluate(
         testing_images,
         testing_labels,
@@ -118,6 +111,25 @@ def main():
     with open(METRICS_FILE, "w") as f:
         f.write(json.dumps(metrics_dict))
 
+    misclassified = {}
+
+    # predictions for the confusion matrix
+    y_prob = m.predict(x_valid)
+    y_pred = y_prob.argmax(axis=-1)
+    os.makedirs("plots")
+    with open("plots/confusion.csv", "w") as f:
+        f.write("actual,predicted\n")
+        sx = y_valid.shape[0]
+        for i in range(sx):
+            actual=y_valid[i].argmax()
+            predicted=y_pred[i]
+            f.write(f"{actual},{predicted}\n")
+            misclassified[(actual, predicted)] = x_valid[i]
+
+
+    # find misclassified examples and generate a confusion table image
+    confusion_out = create_image_matrix(misclassified)
+    imageio.imwrite("plots/confusion.png", confusion_out)
 
 if __name__ == "__main__":
     main()
diff --git a/example-dvc-experiments/code/src/util.py b/example-dvc-experiments/code/src/util.py
@@ -3,6 +3,98 @@
 import os
 from imageio import imread
 
+def label_from_path(filepath):
+    """Return the ``(section, label)`` pair encoded in an image path.
+
+    The section name and the numeric label are read from the two directory
+    components directly above the file name, so
+    ``"images/test/3/00177.png"`` yields ``("test", 3)``. Counting from the
+    file-name end keeps the result the same when the path carries a
+    different prefix (for example ``"./images/test/3/00177.png"``).
+
+    Raises:
+        IndexError: if the path has fewer than three ``/``-separated parts.
+        ValueError: if the label directory name is not an integer.
+    """
+    elements = filepath.split('/')
+    # elements[-1] is the file name; its parent is the label and its
+    # grandparent is the section.
+    return (elements[-3], int(elements[-2]))
+
+def read_dataset(dataset_path):
+    """Load the image dataset stored in a gzip-compressed tar archive.
+
+    Every regular file in the archive is decoded as an image. A file is put
+    in the training set when ``label_from_path`` reports its section as
+    ``"train"``; files from any other section go to the testing set.
+
+    Args:
+        dataset_path: path of the ``.tar.gz`` file to read.
+
+    Returns:
+        A 4-tuple of numpy arrays ``(training_images, training_labels,
+        testing_images, testing_labels)``. Image arrays are ``uint8`` with
+        shape ``(count, 28, 28)``; label arrays are ``uint8`` with shape
+        ``(count,)``.
+    """
+    # Imported locally so this function does not rely on the module's import
+    # block providing ``tarfile``.
+    import tarfile
+
+    training, testing = [], []
+    print(f"Reading dataset from {dataset_path}")
+    # The context managers close the archive and each extracted member even
+    # when decoding an image fails.
+    with tarfile.open(name=dataset_path, mode='r:gz') as ds:
+        for member in ds:
+            if not member.isfile():
+                continue
+            with ds.extractfile(member) as content:
+                # ``imread`` is the name this module imports from imageio.
+                image = imread(content)
+            imagesection, imagelabel = label_from_path(member.name)
+            if imagesection == "train":
+                training.append((imagelabel, image))
+            else:
+                testing.append((imagelabel, image))
+    training_len = len(training)
+    testing_len = len(testing)
+    print(f"Read {training_len} training images and {testing_len} testing images")
+    training_images, training_labels = _samples_to_arrays(training)
+    testing_images, testing_labels = _samples_to_arrays(testing)
+    return (training_images, training_labels, testing_images, testing_labels)
+
+
+def _samples_to_arrays(samples, image_shape=(28, 28)):
+    """Pack ``(label, image)`` pairs into uint8 image and label arrays."""
+    # we assume the images are 28x28 grayscale
+    # NOTE(review): assumes ``np`` (numpy) is imported at the top of this
+    # module; the import lines are outside the visible hunk - confirm.
+    images = np.zeros(shape=(len(samples), *image_shape), dtype="uint8")
+    labels = np.zeros(shape=(len(samples),), dtype="uint8")
+    for index, (label, image) in enumerate(samples):
+        images[index] = image
+        labels[index] = label
+    return images, labels
+
+
+def create_image_matrix(cells):
+    """Render example images keyed by ``(i, j)`` into one grid image.
+
+    ``cells`` is a dictionary mapping ``(i, j)`` keys to 28x28 arrays. The
+    result is a ``uint8`` array with 3 channels and shape
+    ``((max(i) + 2) * 30, (max(j) + 2) * 30, 3)``, filled with 240 where
+    nothing is drawn. Each image sits inside a 30x30 frame with a 1-pixel
+    margin. The first row and first column of frames hold the axis labels,
+    which are the diagonal examples ``cells[(k, k)]`` where present. An
+    off-diagonal cell ``(i, j)`` is drawn in frame row ``i + 1``, frame
+    column ``j + 1``; diagonal cells are not drawn inside the grid.
+
+    Pixels are computed as ``(1 - image) * color``.
+    NOTE(review): presumably images are scaled to [0, 1] - confirm against
+    the caller.
+    """
+
+    # Start from 0 so an empty dictionary still yields a valid canvas.
+    max_i = max([0] + [i for (i, _) in cells])
+    max_j = max([0] + [j for (_, j) in cells])
+
+    frame_size = 30
+    image_shape = (28, 28)
+    incorrect_color = np.array((255, 100, 100), dtype="uint8")
+    label_color = np.array((100, 100, 240), dtype="uint8")
+
+    # Every frame leaves a 1-pixel margin on each side of the image.
+    assert (frame_size - 2, frame_size - 2) == image_shape
+
+    def frame(index):
+        """Return the pixel slice covered by the image in frame ``index``."""
+        return slice(index * frame_size + 1, (index + 1) * frame_size - 1)
+
+    def paint(rows, cols, image, color):
+        # The trailing axis broadcasts the image over the 3 color channels.
+        out_matrix[rows, cols, :] = (1 - image)[..., np.newaxis] * color
+
+    # out_matrix contains examples in the axes
+
+    out_matrix = np.ones(shape=((max_i+2) * frame_size, (max_j+2) * frame_size, 3), dtype="uint8") * 240
+    print(f"out_matrix: {out_matrix.shape}")
+
+    ## put axis labels
+
+    for i in range(max_i+1):
+        if (i, i) in cells:
+            image = cells[(i, i)]
+            # Same example on the left edge and on the top edge.
+            paint(frame(i + 1), frame(0), image, label_color)
+            paint(frame(0), frame(i + 1), image, label_color)
+
+    for (i, j), image in cells.items():
+        assert image.shape == image_shape
+        if i != j:
+            paint(frame(i + 1), frame(j + 1), image, incorrect_color)
+
+    return out_matrix
+
 
 def get_images_from_directory(directory):
     image_file_extensions = [".png", ".jpg", ".bmp"]

diff --git a/example-dvc-experiments/generate.bash b/example-dvc-experiments/generate.bash
@@ -40,30 +40,6 @@ mkdir -p "${REPO_ROOT}"
 pushd "${REPO_ROOT}"
 
 
-add_main_pipeline() {
-
-    dvc stage add -n extract \
-        -d data/images.tar.gz \
-        --outs-no-cache data/images/ \
-        tar -xzf data/images.tar.gz --directory data
-    # The following is not added automatically as we use --no-cache
-
-    echo "/images/" >> data/.gitignore
-
-    mkdir -p models
-
-    dvc stage add -n train \
-                -d data/images/ \
-                -d src/train.py \
-                -p model.conv_units \
-                -p train.epochs \
-                --outs models/model.h5 \
-                --plots-no-cache logs.csv \
-                --metrics-no-cache metrics.json \
-                python3 src/train.py
-
-}
-
 export REPO_PATH="${REPO_ROOT}/${PROJECT_NAME}"
 
 mkdir -p "$REPO_PATH"
@@ -73,7 +49,7 @@ virtualenv -p python3 .venv
 export VIRTUAL_ENV_DISABLE_PROMPT=true
 source .venv/bin/activate
 echo '.venv/' > .gitignore
-pip install 'dvc[all]'
+pip install git+https://github.com/iterative/dvc.git 'dvc[all]'
 
 git init
 git checkout -b main
@@ -87,37 +63,33 @@ git tag "git-init"
 
 cp -r "${HERE}"/code/src .
 cp "${HERE}"/code/requirements.txt .
+cp "${HERE}"/code/requirements-macos.txt .
 cp "${HERE}"/code/params.yaml .
-pip install -r "${REPO_PATH}"/requirements.txt
+if [[ $(uname -s) == 'Darwin' ]] ; then
+    pip install -r "${REPO_PATH}"/requirements-macos.txt
+else
+    pip install -r "${REPO_PATH}"/requirements.txt
+fi
 tag_tick
 git add .
-git commit -m "Added source and params"
+git commit -m "Added requirements.txt, source code and params"
 git tag "source-code"
 
 test -d data/ || mkdir -p data/
 dvc get https://github.com/iterative/dataset-registry \
-        fashion-mnist/images.tar.gz -o data/images.tar.gz
+        mnist/images.tar.gz -o data/images.tar.gz
 
 dvc init
 
-tag_tick
-git add .dvc
-git commit -m "Initialized DVC"
-git tag "dvc-init"
-
+dvc exp init python3 src/train.py
+## `dvc exp init` doesn't track data/, so add the dataset manually
 dvc add data/images.tar.gz
 tag_tick
-git add data/images.tar.gz.dvc data/.gitignore
-git commit -m "Added Fashion-MNIST images in tar.gz format"
-git tag "added-data"
-
-tag_tick
-add_main_pipeline
-git add dvc.yaml data/.gitignore models/.gitignore
-git commit -m "Added experiments pipeline"
-git tag "created-pipeline"
+git add .
+git commit -m "added .dvc, initialized experiment and added data"
+git status
+git tag "dvc-exp-init-run"
 
-tag_tick
 # Remote active on this env only, for writing to HTTP redirect below.
 dvc remote add --default --local storage s3://dvc-public/remote/example-dvc-experiments
 dvc remote add --default storage https://remote.dvc.org/example-dvc-experiments
@@ -127,10 +99,10 @@ git tag "configured-remote"
 
 git tag "get-started"
 
-# dvc exp run is not suitable for the first run due to missing file warnings
-dvc repro
+dvc exp run
 tag_tick
-git add models/.gitignore data/.gitignore dvc.lock logs.csv metrics.json
+git status
+git add .
 git commit -m "Baseline experiment run"
 git tag "baseline-experiment"
 
@@ -158,6 +130,9 @@ set -veux
 
 pushd ${REPO_PATH}
 
+# Use the DVC that was installed into the venv during generation.
+source .venv/bin/activate
+
 dvc remote add --force --default storage s3://dvc-public/remote/${PROJECT_NAME}/
 dvc push