Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Upgrade PyTorch version to v1.13.0 #2082

Merged
merged 2 commits into from
Jan 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Pytorch=1.11.0, cuda=11.6.0
# Ref: https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel_22-08.html#rel_22-08
FROM nvcr.io/nvidia/pytorch:22.02-py3
# We need to use the nvcr.io/nvidia/pytorch image as a base image to support both linux/amd64 and linux/arm64 platforms.
# PyTorch=1.13.0, cuda=11.8.0
# Ref: https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-22-11.html#rel-22-11
FROM nvcr.io/nvidia/pytorch:22.11-py3

ENV TARGET_DIR /opt/darts-cnn-cifar10

Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
torch==1.11.0
torchvision==0.12.0
torch==1.13.1
torchvision==0.14.1
Pillow>=9.1.1
7 changes: 4 additions & 3 deletions examples/v1beta1/trial-images/pytorch-mnist/Dockerfile.gpu
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Pytorch=1.11.0, cuda=11.6.0
# Ref: https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel_22-08.html#rel_22-08
FROM nvcr.io/nvidia/pytorch:22.02-py3
# We need to use the nvcr.io/nvidia/pytorch image as a base image to support both linux/amd64 and linux/arm64 platforms.
# PyTorch=1.13.0, cuda=11.8.0
# Ref: https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-22-11.html#rel-22-11
FROM nvcr.io/nvidia/pytorch:22.11-py3

ADD examples/v1beta1/trial-images/pytorch-mnist /opt/pytorch-mnist

Expand Down
4 changes: 2 additions & 2 deletions examples/v1beta1/trial-images/pytorch-mnist/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
cloudml-hypertune==0.1.0.dev6
torch==1.11.0
torchvision==0.12.0
torch==1.13.1
torchvision==0.14.1
Pillow>=9.1.1
37 changes: 12 additions & 25 deletions test/e2e/v1beta1/scripts/gh-actions/build-load.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,10 @@
set -o errexit
set -o pipefail
set -o nounset
cd "$(dirname "$0")"

pushd .
cd "$(dirname "$0")/../../../../.."
trap popd EXIT

TRIAL_IMAGES=${1:-""}
EXPERIMENTS=${2:-""}
Expand Down Expand Up @@ -48,14 +51,7 @@ _build_containers() {
done

echo -e "\nBuilding $CONTAINER_NAME image with $DOCKERFILE...\n"
docker buildx build --platform "$(uname -m)" --load -t "$REGISTRY/$CONTAINER_NAME:$TAG" -f "../../../../../$DOCKERFILE" ../../../../../
}

_load_minikube_cluster() {
CONTAINER_NAME=${1:-"katib-controller"}

echo -e "\n\nLoading $CONTAINER_NAME image...\n\n"
minikube image load "$REGISTRY/$CONTAINER_NAME:$TAG"
DOCKER_BUILDKIT=1 minikube image build --build-opt platform=linux/amd64 --all -t "$REGISTRY/$CONTAINER_NAME:$TAG" -f "$DOCKERFILE" .
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To avoid OOM errors in CI, we must build container images using minikube instead of docker.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the difference here?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The build succeeded, but the tests take longer.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@johnugeorge With `minikube image build`, the images are built directly by the container runtime inside the minikube cluster, so we no longer need to load them into the cluster afterwards.

Copy link
Member Author

@tenzen-y tenzen-y Jan 16, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The build succeeded, but the tests take longer.

@johnugeorge As far as I can see, it seems to be faster.

2023-01-17 3 13 46

}

_install_tools() {
Expand All @@ -66,11 +62,6 @@ _install_tools() {
fi
}

cleanup_build_cache() {
echo -e "\nCleanup Build Cache...\n"
docker builder prune
}

run() {
CONTAINER_NAME=${1:-"katib-controller"}
DOCKERFILE=${2:-"$CMD_PREFIX/katib-controller/$VERSION/Dockerfile"}
Expand All @@ -85,10 +76,10 @@ run() {
# Search for Suggestion Images required for Trial.
for exp_name in "${EXPERIMENT_ARRAY[@]}"; do

exp_path=$(find ../../../../../examples/v1beta1 -name "${exp_name}.yaml")
exp_path=$(find examples/v1beta1 -name "${exp_name}.yaml")
algorithm_name="$(yq eval '.spec.algorithm.algorithmName' "$exp_path")"

suggestion_image_name="$(yq eval '.data.suggestion' ../../../../../manifests/v1beta1/components/controller/katib-config.yaml |
suggestion_image_name="$(yq eval '.data.suggestion' manifests/v1beta1/components/controller/katib-config.yaml |
algorithm_name=$algorithm_name yq eval '.[env(algorithm_name)].image' | cut -d: -f1)"
suggestion_name="$(basename "$suggestion_image_name")"

Expand All @@ -99,7 +90,6 @@ run() {
for s in "${suggestions[@]}"; do
if [ "$s" == "$CONTAINER_NAME" ]; then
_build_containers "$CONTAINER_NAME" "$DOCKERFILE"
_load_minikube_cluster "$CONTAINER_NAME"
break
fi
done
Expand All @@ -112,10 +102,10 @@ run() {
# Search for EarlyStopping Images required for Trial.
for exp_name in "${EXPERIMENT_ARRAY[@]}"; do

exp_path=$(find ../../../../../examples/v1beta1 -name "${exp_name}.yaml")
exp_path=$(find examples/v1beta1 -name "${exp_name}.yaml")
algorithm_name="$(yq eval '.spec.earlyStopping.algorithmName' "$exp_path")"

earlystopping_image_name="$(yq eval '.data.early-stopping' ../../../../../manifests/v1beta1/components/controller/katib-config.yaml |
earlystopping_image_name="$(yq eval '.data.early-stopping' manifests/v1beta1/components/controller/katib-config.yaml |
algorithm_name=$algorithm_name yq eval '.[env(algorithm_name)].image' | cut -d: -f1)"
earlystopping_name="$(basename "$earlystopping_image_name")"

Expand All @@ -126,15 +116,13 @@ run() {
for e in "${earlystoppings[@]}"; do
if [ "$e" == "$CONTAINER_NAME" ]; then
_build_containers "$CONTAINER_NAME" "$DOCKERFILE"
_load_minikube_cluster "$CONTAINER_NAME"
break
fi
done

# Others
else
_build_containers "$CONTAINER_NAME" "$DOCKERFILE"
_load_minikube_cluster "$CONTAINER_NAME"
fi
}

Expand All @@ -153,7 +141,6 @@ fi
run "cert-generator" "$CMD_PREFIX/cert-generator/$VERSION/Dockerfile"
run "file-metrics-collector" "$CMD_PREFIX/metricscollector/$VERSION/file-metricscollector/Dockerfile"
run "tfevent-metrics-collector" "$CMD_PREFIX/metricscollector/$VERSION/tfevent-metricscollector/Dockerfile"
cleanup_build_cache

# Suggestion images
echo -e "\nBuilding suggestion images..."
Expand All @@ -165,18 +152,18 @@ run "suggestion-optuna" "$CMD_PREFIX/suggestion/optuna/$VERSION/Dockerfile"
run "suggestion-pbt" "$CMD_PREFIX/suggestion/pbt/$VERSION/Dockerfile"
run "suggestion-enas" "$CMD_PREFIX/suggestion/nas/enas/$VERSION/Dockerfile"
run "suggestion-darts" "$CMD_PREFIX/suggestion/nas/darts/$VERSION/Dockerfile"
cleanup_build_cache

# Early stopping images
echo -e "\nBuilding early stopping images...\n"
run "earlystopping-medianstop" "$CMD_PREFIX/earlystopping/medianstop/$VERSION/Dockerfile"
cleanup_build_cache

# Training container images
echo -e "\nBuilding training container images..."
for name in "${TRIAL_IMAGE_ARRAY[@]}"; do
run "$name" "examples/$VERSION/trial-images/$name/Dockerfile"
done
cleanup_build_cache

echo -e "\nCleanup Build Cache...\n"
docker buildx prune -f

echo -e "\nAll Katib images with ${TAG} tag have been built successfully!\n"