-
Notifications
You must be signed in to change notification settings - Fork 17
/
nvidia-efa-ml-al2022.yml
147 lines (147 loc) · 7.96 KB
/
nvidia-efa-ml-al2022.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
{
"variables": {
"region": "us-east-1",
"flag": "al2022-base",
"subnet_id": "subnet-baff22e5",
"security_groupids": "sg-053996a563511a3c6,sg-050407dfb1c555723",
"build_ami": "ami-0b45a5b26d1df8ed7",
"efa_pkg": "aws-efa-installer-latest.tar.gz",
"intel_mkl_version": "intel-mkl-2020.0-088",
"nvidia_version": "515.86.01",
"cuda_version": "cuda-toolkit-11-6",
"cudnn_version": "libcudnn8",
"nccl_version": "v2.12.7-1"
},
"builders": [{
"type": "amazon-ebs",
"region": "{{user `region`}}",
"source_ami": "{{user `build_ami`}}",
"run_tags": {
"Name": "packer-gpu-processor_{{user `flag`}}"
},
"subnet_id": "{{user `subnet_id`}}",
"security_group_ids": "{{user `security_groupids`}}",
"instance_type":"g5.2xlarge",
"ssh_username": "ec2-user",
"ami_name": "aws_{{user `flag`}}_{{user `nvidia_version`}}-{{timestamp}}",
"launch_block_device_mappings":[{
"delete_on_termination": true,
"device_name": "/dev/xvda",
"volume_size": 100,
"throughput": 1000,
"iops": 10000,
"volume_type": "gp3"
}]
}],
"provisioners": [{
"type": "shell",
"expect_disconnect": true,
"inline": [
"sudo dnf update -y",
"sudo dnf install amazon-cloudwatch-agent python3 yum-utils rsync ldconfig cmake dkms mdadm git htop hwloc kernel-tools rpm-build rpmdevtools numactl parallel pigz wget kernel-devel kernel-headers make check check-devel -y",
"sudo dnf groupinstall 'Development Tools' -y",
"curl https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py && sudo python3 /tmp/get-pip.py",
"uname -r",
"echo 'blacklist nouveau' | sudo tee /etc/modprobe.d/nvidia-graphics-drivers.conf",
"echo 'blacklist lbm-nouveau' | sudo tee -a /etc/modprobe.d/nvidia-graphics-drivers.conf",
"echo 'alias nouveau off' | sudo tee -a /etc/modprobe.d/nvidia-graphics-drivers.conf",
"echo 'alias lbm-nouveau off' | sudo tee -a /etc/modprobe.d/nvidia-graphics-drivers.conf",
"sudo shutdown -r now"]
},
{
"type": "shell",
"inline": [
"echo 'net.core.default_qdisc = fq' | sudo tee -a /etc/sysctl.conf",
"echo 'net.ipv4.tcp_congestion_control = bbr' | sudo tee -a /etc/sysctl.conf",
"echo 'net.ipv4.tcp_timestamps = 0' | sudo tee -a /etc/sysctl.conf",
"echo 'net.core.rmem_max = 67108864' | sudo tee -a /etc/sysctl.conf",
"echo 'net.core.wmem_max = 67108864' | sudo tee -a /etc/sysctl.conf",
"echo 'net.ipv4.tcp_rmem = 4096 87380 67108864' | sudo tee -a /etc/sysctl.conf",
"echo 'net.ipv4.tcp_wmem = 4096 65536 67108864' | sudo tee -a /etc/sysctl.conf",
"sudo sysctl -p"]
},
{
"type": "shell",
"inline": [
"uname -r",
"wget -O /tmp/awscli2.zip https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip",
"cd /tmp && sudo unzip /tmp/awscli2.zip",
"sudo /tmp/aws/install",
"aws configure set default.s3.max_concurrent_requests 100",
"aws configure set default.s3.max_queue_size 10000",
"aws configure set default.s3.multipart_threshold 64MB",
"aws configure set default.s3.multipart_chunksize 16MB"]
},
{
"type": "shell",
"inline_shebang": "/bin/bash -xe",
"inline": [
"echo ' StrictHostKeyChecking no' | sudo tee -a /etc/ssh/ssh_config",
"echo ' HostbasedAuthentication no' | sudo tee -a /etc/ssh/ssh_config",
"echo ' CheckHostIP no' | sudo tee -a /etc/ssh/ssh_config"]
},
{
"type": "shell",
"inline_shebang": "/bin/bash -xe",
"inline": [
"cd /tmp",
"sudo dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/fedora35/x86_64/cuda-fedora35.repo",
"sudo wget -O /tmp/NVIDIA-Linux-driver.run 'https://us.download.nvidia.com/tesla/{{user `nvidia_version`}}/NVIDIA-Linux-x86_64-{{user `nvidia_version`}}.run'",
"sudo sh /tmp/NVIDIA-Linux-driver.run -q -a --ui=none",
"sudo dnf install {{user `cuda_version`}} -y",
"sudo curl -O https://developer.download.nvidia.com/compute/nvidia-driver/redist/fabricmanager/linux-x86_64/fabricmanager-linux-x86_64-{{user `nvidia_version`}}-archive.tar.xz",
"sudo tar xf fabricmanager-linux-x86_64-{{user `nvidia_version`}}-archive.tar.xz -C /tmp",
"sudo rsync -al /tmp/fabricmanager-linux-x86_64-{{user `nvidia_version`}}-archive/ /usr/ --exclude LICENSE",
"sudo mv /usr/systemd/nvidia-fabricmanager.service /usr/lib/systemd/system",
"sudo systemctl enable nvidia-fabricmanager",
"echo 'options nvidia NVreg_EnableGpuFirmware=0' | sudo tee /etc/modprobe.d/nvidia-gsp.conf",
"echo -e '#!/bin/sh\nexport LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64\nexport PATH=$PATH:/usr/local/cuda/bin' | sudo tee /etc/profile.d/cuda.sh",
"echo -e '#!/bin/bash\nnvidia-smi -pm 1' | sudo tee /etc/rc.local",
"sudo chmod +x /etc/rc.local"]
},
{
"type": "shell",
"inline_shebang": "/bin/bash -xe",
"inline": [
"sudo dnf install docker -y",
"curl -s -L https://nvidia.github.io/nvidia-docker/centos8/nvidia-docker.repo | sudo tee /etc/yum.repos.d/nvidia-docker.repo",
"sudo dnf install -y nvidia-container-toolkit",
"sudo sed -i 's/^OPTIONS/#&/' /etc/sysconfig/docker",
"echo -e '{\"default-ulimits\":{\"memlock\":{\"Name\":\"memlock\",\"Soft\":-1,\"Hard\":-1}},\"default-runtime\":\"nvidia\",\"runtimes\":{\"nvidia\":{\"path\":\"nvidia-container-runtime\",\"runtimeArgs\":[]}}}' | sudo tee /etc/docker/daemon.json",
"sudo systemctl restart docker",
"sudo usermod -aG docker ec2-user"]
},
{
"type": "shell",
"inline_shebang": "/bin/bash -xe",
"inline": [
"sudo mkdir -p /opt/aws",
"wget -O /tmp/aws-gpu-boost-clock.sh 'https://github.com/aws-samples/aws-efa-nccl-baseami-pipeline/raw/master/nvidia-efa-ami_base/boost/aws-gpu-boost-clock.sh'",
"wget -O /tmp/aws-gpu-boost-clock.service 'https://github.com/aws-samples/aws-efa-nccl-baseami-pipeline/raw/master/nvidia-efa-ami_base/boost/aws-gpu-boost-clock.service'",
"sudo mv /tmp/aws-gpu-boost-clock.sh /opt/aws/ && sudo chmod +x /opt/aws/aws-gpu-boost-clock.sh",
"sudo mv /tmp/aws-gpu-boost-clock.service /lib/systemd/system",
"sudo systemctl enable aws-gpu-boost-clock.service && sudo systemctl start aws-gpu-boost-clock.service"]
},
{
"type": "shell",
"inline_shebang": "/bin/bash -xe",
"inline": [
"sudo /usr/local/bin/pip3 install boto3",
"sudo mkdir -p /opt/aws",
"sudo wget -O /tmp/cwa.rpm 'https://s3.amazonaws.com/amazoncloudwatch-agent/amazon_linux/amd64/latest/amazon-cloudwatch-agent.rpm'",
"sudo rpm -iUhv /tmp/cwa.rpm",
"git clone https://github.com/aws-samples/aws-efa-nccl-baseami-pipeline.git /tmp/aws-efa-nccl-baseami",
"sudo mv /tmp/aws-efa-nccl-baseami/nvidia-efa-ami_base/cloudwatch /opt/aws/",
"sudo mv /opt/aws/cloudwatch/aws-hw-monitor.service /lib/systemd/system",
"echo -e '#!/bin/sh\n' | sudo tee /opt/aws/cloudwatch/aws-cloudwatch-wrapper.sh",
"echo -e '/usr/bin/python3 /opt/aws/cloudwatch/nvidia/aws-hwaccel-event-parser.py &' | sudo tee -a /opt/aws/cloudwatch/aws-cloudwatch-wrapper.sh",
"echo -e '/usr/bin/python3 /opt/aws/cloudwatch/nvidia/accel-to-cw.py /opt/aws/cloudwatch/nvidia/nvidia-exporter >> /dev/null 2>&1 &\n' | sudo tee -a /opt/aws/cloudwatch/aws-cloudwatch-wrapper.sh",
"echo -e '/usr/bin/python3 /opt/aws/cloudwatch/efa/efa-to-cw.py /opt/aws/cloudwatch/efa/efa-exporter >> /dev/null 2>&1 &\n' | sudo tee -a /opt/aws/cloudwatch/aws-cloudwatch-wrapper.sh",
"sudo chmod +x /opt/aws/cloudwatch/aws-cloudwatch-wrapper.sh",
"sudo cp /opt/aws/cloudwatch/nvidia/cwa-config.json /opt/aws/amazon-cloudwatch-agent/bin/config.json",
"sudo /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -c file:/opt/aws/amazon-cloudwatch-agent/bin/config.json -s",
"sudo systemctl enable aws-hw-monitor.service",
"sudo systemctl restart amazon-cloudwatch-agent.service"]
}
]
}