Skip to content

Commit

Permalink
Update example-dvc-experiments with dvc exp init and confusion ma…
Browse files Browse the repository at this point in the history
…trix (#97)
  • Loading branch information
iesahin authored Feb 2, 2022
1 parent 41750c6 commit ac76204
Show file tree
Hide file tree
Showing 5 changed files with 147 additions and 61 deletions.
5 changes: 5 additions & 0 deletions example-dvc-experiments/code/requirements-macos.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
dvc[all]>=2.9
tensorflow-macos>=2.6,<2.7
ruamel.yaml>=0.17,<0.18
imageio>=2.9,<3
dvclive>=0.4
2 changes: 2 additions & 0 deletions example-dvc-experiments/code/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
dvc[all]>=2.9
tensorflow>=2.5,<2.6
ruamel.yaml>=0.17,<0.18
imageio>=2.9,<3
dvclive>=0.4
42 changes: 27 additions & 15 deletions example-dvc-experiments/code/src/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # or any {'0', '1', '2'}
import tensorflow as tf
import numpy as np
from util import load_params, read_labeled_images
from util import load_params, read_labeled_images, label_from_path, read_dataset, create_image_matrix
import json
import tarfile
import imageio
from dvclive.keras import DvcLiveCallback

INPUT_DIR = "data/images"
RESUME_PREVIOUS_MODEL = False
DATASET_FILE = "data/images.tar.gz"
OUTPUT_DIR = "models"

METRICS_FILE = "metrics.json"
Expand Down Expand Up @@ -67,14 +69,10 @@ def history_to_csv(history):

def main():
params = load_params()
m = get_model()
m = get_model(conv_units=params['model']['conv_units'])
m.summary()

training_images, training_labels = read_labeled_images(
os.path.join(INPUT_DIR, 'train/'))
testing_images, testing_labels = read_labeled_images(
os.path.join(INPUT_DIR, 'test/')
)
training_images, training_labels, testing_images, testing_labels = read_dataset(DATASET_FILE)

assert training_images.shape[0] + testing_images.shape[0] == 70000
assert training_labels.shape[0] + testing_labels.shape[0] == 70000
Expand All @@ -100,14 +98,9 @@ def main():
epochs=params["train"]["epochs"],
verbose=1,
validation_data=(x_valid, y_valid),
callbacks=[DvcLiveCallback(model_file=f"{OUTPUT_DIR}/model.h5")],
)

with open("logs.csv", "w") as f:
f.write(history_to_csv(history))

model_file = os.path.join(OUTPUT_DIR, "model.h5")
m.save(model_file)

metrics_dict = m.evaluate(
testing_images,
testing_labels,
Expand All @@ -118,6 +111,25 @@ def main():
with open(METRICS_FILE, "w") as f:
f.write(json.dumps(metrics_dict))

misclassified = {}

# predictions for the confusion matrix
y_prob = m.predict(x_valid)
y_pred = y_prob.argmax(axis=-1)
os.makedirs("plots")
with open("plots/confusion.csv", "w") as f:
f.write("actual,predicted\n")
sx = y_valid.shape[0]
for i in range(sx):
actual=y_valid[i].argmax()
predicted=y_pred[i]
f.write(f"{actual},{predicted}\n")
misclassified[(actual, predicted)] = x_valid[i]


# find misclassified examples and generate a confusion table image
confusion_out = create_image_matrix(misclassified)
imageio.imwrite("plots/confusion.png", confusion_out)

if __name__ == "__main__":
main()
92 changes: 92 additions & 0 deletions example-dvc-experiments/code/src/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,98 @@
import os
from imageio import imread

def label_from_path(filepath):
    """Return the (section, label) pair encoded in an image path.

    The path is split on "/"; the second component is the section name and
    the third is the numeric label. For example,
    "images/test/3/00177.png" yields ("test", 3).
    """
    parts = filepath.split('/')
    section = parts[1]
    label = int(parts[2])
    return (section, label)

def _labeled_pairs_to_arrays(pairs, image_shape):
    """Pack a list of (label, image) pairs into (images, labels) uint8 arrays."""
    count = len(pairs)
    images = np.ndarray(shape=(count,) + tuple(image_shape), dtype="uint8")
    labels = np.zeros(shape=(count), dtype="uint8")
    for index, (label, image) in enumerate(pairs):
        images[index] = image
        labels[index] = label
    return images, labels


def read_dataset(dataset_path):
    """Read the dataset from a tar.gz archive.

    Every regular file in the archive is decoded as an image and classified
    with label_from_path(): members whose section is "train" go to the
    training set, all other members go to the testing set.

    Args:
        dataset_path: path of the gzip-compressed tar archive.

    Returns:
        A 4-tuple of numpy uint8 arrays
        (training_images, training_labels, testing_images, testing_labels).
        Image arrays have shape (count, 28, 28); label arrays have shape
        (count,).
    """
    # Imported locally so the function does not rely on the surrounding file
    # binding the module name at the top.
    import tarfile

    training, testing = [], []
    print(f"Reading dataset from {dataset_path}")
    # The context manager closes the archive even when decoding a member fails.
    with tarfile.open(name=dataset_path, mode='r:gz') as ds:
        for member in ds:
            # Skip directories, links and other non-regular entries.
            if not member.isfile():
                continue
            content = ds.extractfile(member)
            # Uses the `imread` name this file imports from imageio; the
            # `imageio` module name itself is not bound by the visible imports.
            image = imread(content)
            imagesection, imagelabel = label_from_path(member.name)
            if imagesection == "train":
                training.append((imagelabel, image))
            else:
                testing.append((imagelabel, image))
    training_len = len(training)
    testing_len = len(testing)
    print(f"Read {training_len} training images and {testing_len} testing images")
    # we assume the images are 28x28 grayscale
    image_shape = (28, 28)
    testing_images, testing_labels = _labeled_pairs_to_arrays(testing, image_shape)
    training_images, training_labels = _labeled_pairs_to_arrays(training, image_shape)
    return (training_images, training_labels, testing_images, testing_labels)


def create_image_matrix(cells):
    """Render the images in ``cells`` onto a single RGB grid.

    ``cells`` is a dictionary mapping (i, j) keys to 28x28 arrays. The result
    is a uint8 array of shape ((max_i + 2) * 30, (max_j + 2) * 30, 3) filled
    with the background value 240, where max_i / max_j are the largest indices
    found in the keys (0 when there are none).

    The first row and first column of 30-pixel frames hold axis labels, drawn
    from the diagonal entries (i, i) when they exist. Every off-diagonal entry
    (i != j) is drawn in frame (i + 1, j + 1) using the "incorrect" colour;
    diagonal entries are not drawn inside the grid itself.
    """
    frame = 30
    expected_shape = (28, 28)
    palette_incorrect = np.array((255, 100, 100), dtype="uint8")
    palette_label = np.array((100, 100, 240), dtype="uint8")

    # Largest index along each axis; the leading 0 keeps the floor at zero.
    top_i = max([0] + [row for row, _ in cells])
    top_j = max([0] + [col for _, col in cells])

    # One extra frame per axis for the labels, plus one because indices are
    # zero-based.
    canvas = np.full(
        shape=((top_i + 2) * frame, (top_j + 2) * frame, 3),
        fill_value=240,
        dtype="uint8",
    )
    print(f"out_matrix: {canvas.shape}")

    # Axis labels: copy each diagonal example into the first column and the
    # first row of frames, leaving a one-pixel border inside the frame.
    for k in range(top_i + 1):
        if (k, k) not in cells:
            continue
        shade = 1 - cells[(k, k)]
        lo = (k + 1) * frame + 1
        hi = (k + 2) * frame - 1
        for channel, level in enumerate(palette_label):
            canvas[lo:hi, 1:frame - 1, channel] = shade * level
            canvas[1:frame - 1, lo:hi, channel] = shade * level

    # Grid body: only the off-diagonal cells are painted.
    for (row, col), image in cells.items():
        assert image.shape == expected_shape
        r0 = (row + 1) * frame + 1
        r1 = (row + 2) * frame - 1
        c0 = (col + 1) * frame + 1
        c1 = (col + 2) * frame - 1
        assert (r1 - r0, c1 - c0) == expected_shape
        print((row, col, r0, r1, c0, c1))
        print(canvas[r0:r1, c0:c1, :].shape)
        if row == col:
            continue
        shade = 1 - image
        for channel, level in enumerate(palette_incorrect):
            canvas[r0:r1, c0:c1, channel] = shade * level

    return canvas


def get_images_from_directory(directory):
image_file_extensions = [".png", ".jpg", ".bmp"]
Expand Down
67 changes: 21 additions & 46 deletions example-dvc-experiments/generate.bash
Original file line number Diff line number Diff line change
Expand Up @@ -40,30 +40,6 @@ mkdir -p "${REPO_ROOT}"
pushd "${REPO_ROOT}"


# Register the two-stage DVC pipeline (extract -> train) with `dvc stage add`.
# Takes no arguments. All paths are relative to the current directory.
add_main_pipeline() {

# Stage "extract": depends on the image archive and unpacks it into data/.
# The extracted data/images/ directory is declared with --outs-no-cache.
dvc stage add -n extract \
-d data/images.tar.gz \
--outs-no-cache data/images/ \
tar -xzf data/images.tar.gz --directory data
# The .gitignore entry below is not added automatically because the output
# above is declared with --outs-no-cache, so it is appended by hand.

echo "/images/" >> data/.gitignore

# Create the directory that will hold the train stage's model output.
mkdir -p models

# Stage "train": runs src/train.py. It depends on the extracted images and the
# script itself, reads the model.conv_units and train.epochs parameters, and
# declares the model file as an output, logs.csv as a plot and metrics.json as
# a metrics file (the latter two with the *-no-cache variants).
dvc stage add -n train \
-d data/images/ \
-d src/train.py \
-p model.conv_units \
-p train.epochs \
--outs models/model.h5 \
--plots-no-cache logs.csv \
--metrics-no-cache metrics.json \
python3 src/train.py

}

export REPO_PATH="${REPO_ROOT}/${PROJECT_NAME}"

mkdir -p "$REPO_PATH"
Expand All @@ -73,7 +49,7 @@ virtualenv -p python3 .venv
export VIRTUAL_ENV_DISABLE_PROMPT=true
source .venv/bin/activate
echo '.venv/' > .gitignore
pip install 'dvc[all]'
pip install git+https://github.com/iterative/dvc.git 'dvc[all]'

git init
git checkout -b main
Expand All @@ -87,37 +63,33 @@ git tag "git-init"

cp -r "${HERE}"/code/src .
cp "${HERE}"/code/requirements.txt .
cp "${HERE}"/code/requirements-macos.txt .
cp "${HERE}"/code/params.yaml .
pip install -r "${REPO_PATH}"/requirements.txt
if [[ $(uname -s) == 'Darwin' ]] ; then
pip install -r "${REPO_PATH}"/requirements-macos.txt
else
pip install -r "${REPO_PATH}"/requirements.txt
fi
tag_tick
git add .
git commit -m "Added source and params"
git commit -m "Added requirements.txt, source code and params"
git tag "source-code"

test -d data/ || mkdir -p data/
dvc get https://github.com/iterative/dataset-registry \
fashion-mnist/images.tar.gz -o data/images.tar.gz
mnist/images.tar.gz -o data/images.tar.gz

dvc init

tag_tick
git add .dvc
git commit -m "Initialized DVC"
git tag "dvc-init"

dvc exp init python3 src/train.py
## it doesn't add data/ so adding it manually
dvc add data/images.tar.gz
tag_tick
git add data/images.tar.gz.dvc data/.gitignore
git commit -m "Added Fashion-MNIST images in tar.gz format"
git tag "added-data"

tag_tick
add_main_pipeline
git add dvc.yaml data/.gitignore models/.gitignore
git commit -m "Added experiments pipeline"
git tag "created-pipeline"
git add .
git commit -m "added .dvc, initialized experiment and added data"
git status
git tag "dvc-exp-init-run"

tag_tick
# Remote active on this env only, for writing to HTTP redirect below.
dvc remote add --default --local storage s3://dvc-public/remote/example-dvc-experiments
dvc remote add --default storage https://remote.dvc.org/example-dvc-experiments
Expand All @@ -127,10 +99,10 @@ git tag "configured-remote"

git tag "get-started"

# dvc exp run is not suitable for the first run due to missing file warnings
dvc repro
dvc exp run
tag_tick
git add models/.gitignore data/.gitignore dvc.lock logs.csv metrics.json
git status
git add .
git commit -m "Baseline experiment run"
git tag "baseline-experiment"

Expand Down Expand Up @@ -158,6 +130,9 @@ set -veux
pushd ${REPO_PATH}
# We use DVC we installed to the venv in generation.
source .venv/bin/activate
dvc remote add --force --default storage s3://dvc-public/remote/${PROJECT_NAME}/
dvc push
Expand Down

0 comments on commit ac76204

Please sign in to comment.