-
Notifications
You must be signed in to change notification settings - Fork 45
/
buildspec.yml
176 lines (160 loc) · 9.94 KB
/
buildspec.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
version: 0.2
env:
  variables:
    # Coach images: the toolkit version selects coach/docker/<version>/ and is
    # embedded in the image tag; the framework version selects the base image.
    COACH_MXNET_TOOLKIT_VERSION: '0.11.0'
    COACH_MXNET_FRAMEWORK_VERSION: '1.3.0'
    COACH_TF_TOOLKIT_VERSION: '0.11.1'
    COACH_TF_FRAMEWORK_VERSION: '1.12.0'

    # Ray images: same scheme, rooted at ray/docker/<version>/.
    # NOTE(review): "TOOKIT" looks like a typo for "TOOLKIT". The build phase
    # references the variable under this exact spelling, so any rename has to
    # be done in both places at once.
    RAY_TF_TOOKIT_VERSION: '0.6.5'
    RAY_TF_FRAMEWORK_VERSION: '1.12.0'

    # Instance types passed to the SageMaker tests. The GPU type, with its
    # "ml." prefix removed, is also used for the remote EC2 test instance.
    CPU_INSTANCE_TYPE: 'ml.c4.xlarge'
    GPU_INSTANCE_TYPE: 'ml.p2.xlarge'

    # Python major version; becomes the "py<N>" part of base and built tags.
    PY_VERSION: '3'

    # ECR repository (in the caller's account) that receives the test images.
    ECR_REPO: 'sagemaker-test'
    # Repository name handed to remote-test for the GPU integration runs.
    GITHUB_REPO: 'sagemaker-rl-container'
    # Account whose ECR registry hosts the framework base images.
    PROD_ACCOUNT: '520713654638'

    # Setup script for the remote GPU host. SETUP_CMDS is single-quoted so the
    # "\n" sequences stay literal here; printf expands them when the build
    # writes the text to SETUP_FILE.
    SETUP_FILE: 'setup_cmds.sh'
    SETUP_CMDS: '#!/bin/bash\npip install --upgrade pip\npip install -U -e .'
phases:
  pre_build:
    commands:
      - start-dockerd
      # Values reused by later commands: the caller's account id, the
      # pre-production image URI in that account, and the digits extracted
      # from CODEBUILD_SOURCE_VERSION (used as the PR number).
      - |
        ACCOUNT=$(aws sts get-caller-identity --query 'Account' --output text)
        PREPROD_IMAGE="$ACCOUNT.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$ECR_REPO"
        PR_NUM=$(echo $CODEBUILD_SOURCE_VERSION | grep -o '[0-9]\+')
      # keep ssh connection alive when communicating with remote ec2 server during integ test
      # largest connection idle time allowed: 10 seconds * 300 attempts = 50 minutes
      - |
        echo ' ServerAliveInterval 10' >> ~/.ssh/config
        echo ' ServerAliveCountMax 300' >> ~/.ssh/config

  build:
    commands:
      # install
      - echo "install"
      - pip3 install -U -e .
      # Update awscli for compatibility with the latest botocore version that breaks it
      # https://github.com/boto/boto3/issues/2596
      - pip3 install --upgrade awscli

      # launch remote gpu instance
      - echo "launch remote gpu instance"
      # The EC2 instance type is the SageMaker GPU type without its "ml." prefix.
      - |
        prefix='ml.'
        instance_type=${GPU_INSTANCE_TYPE#"$prefix"}
      - create-key-pair
      - launch-ec2-instance --instance-type $instance_type --ami-name dlami-ubuntu

      # Log docker in to the registry that hosts the framework base images.
      # NOTE(review): `aws ecr get-login` exists only in AWS CLI v1 (it was
      # removed in v2). When migrating, switch to
      # `aws ecr get-login-password | docker login --username AWS --password-stdin <registry>`.
      - $(aws ecr get-login --no-include-email --region $AWS_DEFAULT_REGION --registry-ids $PROD_ACCOUNT)
      # BUILD_ID is CODEBUILD_BUILD_ID with ':' replaced by '-' so it can be
      # embedded in an image tag.
      - |
        MXNET_IMAGE="$PROD_ACCOUNT.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/sagemaker-mxnet"
        TF_IMAGE="$PROD_ACCOUNT.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/sagemaker-tensorflow-scriptmode"
        BUILD_ID="$(echo $CODEBUILD_BUILD_ID | sed -e 's/:/-/g')"

      # pull cpu base images
      - echo "pull cpu base images"
      # The Ray TF base image is pulled separately only when its framework
      # version differs from the Coach TF one.
      - |
        COACH_MXNET_CPU_BASE_TAG="$COACH_MXNET_FRAMEWORK_VERSION-cpu-py$PY_VERSION"
        docker pull $MXNET_IMAGE:$COACH_MXNET_CPU_BASE_TAG
        COACH_TF_CPU_BASE_TAG="$COACH_TF_FRAMEWORK_VERSION-cpu-py$PY_VERSION"
        docker pull $TF_IMAGE:$COACH_TF_CPU_BASE_TAG
        if [ "$RAY_TF_FRAMEWORK_VERSION" != "$COACH_TF_FRAMEWORK_VERSION" ]; then
          RAY_TF_CPU_BASE_TAG="$RAY_TF_FRAMEWORK_VERSION-cpu-py$PY_VERSION"
          docker pull $TF_IMAGE:$RAY_TF_CPU_BASE_TAG
        fi

      # pull gpu base images
      - echo "pull gpu base images"
      - |
        COACH_MXNET_GPU_BASE_TAG="$COACH_MXNET_FRAMEWORK_VERSION-gpu-py$PY_VERSION"
        docker pull $MXNET_IMAGE:$COACH_MXNET_GPU_BASE_TAG
        COACH_TF_GPU_BASE_TAG="$COACH_TF_FRAMEWORK_VERSION-gpu-py$PY_VERSION"
        docker pull $TF_IMAGE:$COACH_TF_GPU_BASE_TAG
        if [ "$RAY_TF_FRAMEWORK_VERSION" != "$COACH_TF_FRAMEWORK_VERSION" ]; then
          RAY_TF_GPU_BASE_TAG="$RAY_TF_FRAMEWORK_VERSION-gpu-py$PY_VERSION"
          docker pull $TF_IMAGE:$RAY_TF_GPU_BASE_TAG
        fi

      # build cpu images
      - echo "build cpu images"
      # Tags have the form <toolkit>-<toolkit version>-<framework>-cpu-py<N>-<BUILD_ID>.
      - |
        COACH_MXNET_CPU_TAG="coach-$COACH_MXNET_TOOLKIT_VERSION-mxnet-cpu-py$PY_VERSION-$BUILD_ID"
        docker build -t $PREPROD_IMAGE:$COACH_MXNET_CPU_TAG -f coach/docker/$COACH_MXNET_TOOLKIT_VERSION/Dockerfile.mxnet --build-arg processor=cpu .
        COACH_TF_CPU_TAG="coach-$COACH_TF_TOOLKIT_VERSION-tf-cpu-py$PY_VERSION-$BUILD_ID"
        docker build -t $PREPROD_IMAGE:$COACH_TF_CPU_TAG -f coach/docker/$COACH_TF_TOOLKIT_VERSION/Dockerfile.tf --build-arg processor=cpu .
        RAY_TF_CPU_TAG="ray-$RAY_TF_TOOKIT_VERSION-tf-cpu-py$PY_VERSION-$BUILD_ID"
        docker build -t $PREPROD_IMAGE:$RAY_TF_CPU_TAG -f ray/docker/$RAY_TF_TOOKIT_VERSION/Dockerfile.tf --build-arg processor=cpu .

      # push cpu images to ecr
      - echo "push cpu images to ecr"
      # Log in again, this time to the caller's own registry, before pushing.
      - |
        $(aws ecr get-login --registry-ids $ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION)
        docker push $PREPROD_IMAGE:$COACH_MXNET_CPU_TAG
        docker push $PREPROD_IMAGE:$COACH_TF_CPU_TAG
        docker push $PREPROD_IMAGE:$RAY_TF_CPU_TAG

      # run cpu integration tests
      - echo "run cpu integration tests"
      # Tests run only when has-matching-changes reports changes for one of
      # the listed patterns.
      - |
        if has-matching-changes "test/" "tests/" "src/*.py" "coach/*" "ray/*" "buildspec.yml"; then
          pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $COACH_MXNET_CPU_TAG --framework mxnet --toolkit coach --processor cpu
          pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $COACH_TF_CPU_TAG --framework tensorflow --toolkit coach --processor cpu
          pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --tag $RAY_TF_CPU_TAG --framework tensorflow --toolkit ray --processor cpu
        else
          echo "skipping cpu integration tests"
        fi

      # build gpu images
      - echo "build gpu images"
      - |
        COACH_MXNET_GPU_TAG="coach-$COACH_MXNET_TOOLKIT_VERSION-mxnet-gpu-py$PY_VERSION-$BUILD_ID"
        docker build -t $PREPROD_IMAGE:$COACH_MXNET_GPU_TAG -f coach/docker/$COACH_MXNET_TOOLKIT_VERSION/Dockerfile.mxnet --build-arg processor=gpu .
        COACH_TF_GPU_TAG="coach-$COACH_TF_TOOLKIT_VERSION-tf-gpu-py$PY_VERSION-$BUILD_ID"
        docker build -t $PREPROD_IMAGE:$COACH_TF_GPU_TAG -f coach/docker/$COACH_TF_TOOLKIT_VERSION/Dockerfile.tf --build-arg processor=gpu .
        RAY_TF_GPU_TAG="ray-$RAY_TF_TOOKIT_VERSION-tf-gpu-py$PY_VERSION-$BUILD_ID"
        docker build -t $PREPROD_IMAGE:$RAY_TF_GPU_TAG -f ray/docker/$RAY_TF_TOOKIT_VERSION/Dockerfile.tf --build-arg processor=gpu .

      # push gpu images to ecr
      - echo "push gpu images to ecr"
      - |
        $(aws ecr get-login --registry-ids $ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION)
        docker push $PREPROD_IMAGE:$COACH_MXNET_GPU_TAG
        docker push $PREPROD_IMAGE:$COACH_TF_GPU_TAG
        docker push $PREPROD_IMAGE:$RAY_TF_GPU_TAG

      # run gpu integration tests
      - echo "run gpu integration tests"
      # Each pytest command is handed to remote-test. Only the first call
      # passes the setup file; the following calls use --skip-setup.
      - |
        if has-matching-changes "test/" "tests/" "src/*.py" "coach/*" "ray/*" "buildspec.yml"; then
          printf "$SETUP_CMDS" > $SETUP_FILE
          cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --toolkit coach --framework mxnet --docker-base-name $PREPROD_IMAGE --tag $COACH_MXNET_GPU_TAG --processor gpu"
          remote-test --github-repo $GITHUB_REPO --test-cmd "$cmd" --setup-file $SETUP_FILE --pr-number $PR_NUM
          cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --toolkit coach --framework tensorflow --docker-base-name $PREPROD_IMAGE --tag $COACH_TF_GPU_TAG --processor gpu"
          remote-test --github-repo $GITHUB_REPO --test-cmd "$cmd" --pr-number $PR_NUM --skip-setup
          cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --toolkit ray --framework tensorflow --docker-base-name $PREPROD_IMAGE --tag $RAY_TF_GPU_TAG --processor gpu"
          remote-test --github-repo $GITHUB_REPO --test-cmd "$cmd" --pr-number $PR_NUM --skip-setup
        else
          echo "skipping gpu integration tests"
        fi

      # run cpu sagemaker tests
      - echo "run cpu sagemaker tests"
      # "coach/*" is listed next to "docker/*" so that changes under coach/
      # (where the coach Dockerfiles built above live) also trigger these
      # tests, matching the gating of the local integration tests.
      # NOTE(review): assumes has-matching-changes matches patterns from the
      # repository root - confirm against the helper.
      - |
        if has-matching-changes "test/" "tests/" "src/*.py" "docker/*" "coach/*" "ray/*" "buildspec.yml"; then
          pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --tag $COACH_MXNET_CPU_TAG --framework mxnet --toolkit coach --instance-type $CPU_INSTANCE_TYPE
          pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --tag $COACH_TF_CPU_TAG --framework tensorflow --toolkit coach --instance-type $CPU_INSTANCE_TYPE
          pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --tag $RAY_TF_CPU_TAG --framework tensorflow --toolkit ray --instance-type $CPU_INSTANCE_TYPE
        else
          echo "skipping cpu sagemaker tests"
        fi

      # run gpu sagemaker tests
      - echo "run gpu sagemaker tests"
      # Gated on the same patterns as the cpu sagemaker tests.
      - |
        if has-matching-changes "test/" "tests/" "src/*.py" "docker/*" "coach/*" "ray/*" "buildspec.yml"; then
          pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --tag $COACH_MXNET_GPU_TAG --framework mxnet --toolkit coach --instance-type $GPU_INSTANCE_TYPE
          pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --tag $COACH_TF_GPU_TAG --framework tensorflow --toolkit coach --instance-type $GPU_INSTANCE_TYPE
          pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --tag $RAY_TF_GPU_TAG --framework tensorflow --toolkit ray --instance-type $GPU_INSTANCE_TYPE
        else
          echo "skipping gpu sagemaker tests"
        fi

    finally:
      # shut down remote gpu instance
      - cleanup-gpu-instances
      - cleanup-key-pairs
      # remove ecr image
      # A tag variable is still empty if the build stopped before the matching
      # image was built; those are skipped so batch-delete-image is never
      # called with an empty imageTag.
      - |
        for tag in "$COACH_MXNET_CPU_TAG" "$COACH_TF_CPU_TAG" "$RAY_TF_CPU_TAG" "$COACH_MXNET_GPU_TAG" "$COACH_TF_GPU_TAG" "$RAY_TF_GPU_TAG"; do
          if [ -n "$tag" ]; then
            aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$tag
          fi
        done