Updates NVIDIA drivers installation (Azure#2219)
* updated NVIDIA drivers installation

* linting engine.go

* update GPU doc
wbuchwalter authored and Terje Torkelsen committed Mar 15, 2018
1 parent 2815ad3 commit b8d07c1
Showing 2 changed files with 43 additions and 35 deletions.
24 changes: 5 additions & 19 deletions docs/kubernetes/gpu.md

````diff
@@ -19,7 +19,7 @@ If `alpha.kubernetes.io/nvidia-gpu` is `0` and you just created the cluster, you
 When running a GPU container, you will need to specify how many GPUs you want to use. If you don't specify a GPU count, Kubernetes will assume you don't require any, and will not map the device into the container.
 You will also need to mount the drivers from the host (the Kubernetes agent) into the container.
 
-On the host, the drivers are installed under `/usr/lib/nvidia-384`.
+On the host, the drivers are installed under `/usr/local/nvidia`.
 
 Here is an example template running TensorFlow:
 
@@ -45,27 +45,13 @@ spec:
       limits:
         alpha.kubernetes.io/nvidia-gpu: 1
     volumeMounts:
-    - mountPath: /usr/local/nvidia/bin
-      name: bin
-    - mountPath: /usr/local/nvidia/lib64
-      name: lib
-    - mountPath: /usr/lib/x86_64-linux-gnu/libcuda.so.1
-      name: libcuda
+    - name: nvidia
+      mountPath: /usr/local/nvidia
   volumes:
-  - name: bin
+  - name: nvidia
     hostPath:
-      path: /usr/lib/nvidia-384/bin
-  - name: lib
-    hostPath:
-      path: /usr/lib/nvidia-384
-  - name: libcuda
-    hostPath:
-      path: /usr/lib/x86_64-linux-gnu/libcuda.so.1
+      path: /usr/local/nvidia
 ```
 
 We specify `alpha.kubernetes.io/nvidia-gpu: 1` in the resources limits, and we mount the drivers from the host into the container.
-
-Some libraries, such as `libcuda.so`, are installed under `/usr/lib/x86_64-linux-gnu` on the host; you might need to mount them separately, as shown above, depending on your needs.
````

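Pulling the added lines of the diff together, a complete manifest would look roughly like the sketch below. Only the resource limit, the `nvidia` volume name, and the `/usr/local/nvidia` paths come from the commit; the pod name and image are illustrative placeholders.

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: gpu-test                              # hypothetical name
spec:
  containers:
  - name: tensorflow
    image: tensorflow/tensorflow:latest-gpu   # illustrative image, not from the commit
    resources:
      limits:
        alpha.kubernetes.io/nvidia-gpu: 1     # request one GPU device
    volumeMounts:
    - name: nvidia
      mountPath: /usr/local/nvidia            # single mount replaces the old bin/lib/libcuda trio
  volumes:
  - name: nvidia
    hostPath:
      path: /usr/local/nvidia                 # drivers installed here on the agent host
```

With everything under one host path, the pod spec no longer needs separate mounts for binaries, libraries, and `libcuda.so.1`.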
54 changes: 38 additions & 16 deletions pkg/acsengine/engine.go

```diff
@@ -1748,20 +1748,42 @@ func getPackageGUID(orchestratorType string, orchestratorVersion string, masterC
 func getGPUDriversInstallScript(profile *api.AgentPoolProfile) string {
 
 	// latest version of the drivers. Later this parameter could be bubbled up so that users can choose specific driver versions.
-	dv := "384"
+	dv := "384.111"
+	dest := "/usr/local/nvidia"
 
 	/*
 		First we remove the nouveau drivers, which are the open source drivers for NVIDIA cards. Nouveau is installed on NV Series VMs by default.
 		Then we add the graphics-drivers ppa repository and get the proprietary drivers from there.
 		We also install needed dependencies.
 	*/
-	ppaScript := fmt.Sprintf(`- rmmod nouveau
+	installScript := fmt.Sprintf(`- rmmod nouveau
 - sh -c "echo \"blacklist nouveau\" >> /etc/modprobe.d/blacklist.conf"
 - update-initramfs -u
-- sudo add-apt-repository -y ppa:graphics-drivers
-- sudo apt-get update
-- sudo apt-get install -y nvidia-%s
-- sudo nvidia-smi
-- sudo systemctl restart kubelet`, dv)
+- sudo apt-get update && sudo apt-get install -y linux-headers-$(uname -r) gcc make
+- mkdir -p %s
+- cd %s`, dest, dest)
+
+	/*
+		Download the .run file from NVIDIA.
+		NVIDIA libraries are always installed in /usr/lib/x86_64-linux-gnu, and there is no option in the run file to change this.
+		Instead we use OverlayFS to move the newly installed libraries under /usr/local/nvidia/lib64.
+	*/
+	installScript += fmt.Sprintf(`
+- curl -fLS https://us.download.nvidia.com/tesla/%s/NVIDIA-Linux-x86_64-%s.run -o nvidia-drivers-%s
+- mkdir -p lib64 overlay-workdir
+- sudo mount -t overlay -o lowerdir=/usr/lib/x86_64-linux-gnu,upperdir=lib64,workdir=overlay-workdir none /usr/lib/x86_64-linux-gnu`, dv, dv, dv)
+
+	/*
+		Install the drivers and update /etc/ld.so.conf.d/nvidia.conf, which will make the libraries discoverable through $LD_LIBRARY_PATH.
+		Run nvidia-smi to test the installation, unmount the overlay, and restart kubelet (GPUs are only discovered when kubelet starts).
+	*/
+	installScript += fmt.Sprintf(`
+- sudo sh nvidia-drivers-%s --silent --accept-license --no-drm --utility-prefix="%s" --opengl-prefix="%s"
+- echo "%s" > /etc/ld.so.conf.d/nvidia.conf
+- sudo ldconfig
+- sudo umount /usr/lib/x86_64-linux-gnu
+- sudo nvidia-modprobe -u -c0
+- sudo %s/bin/nvidia-smi
+- sudo systemctl restart kubelet`, dv, dest, dest, fmt.Sprintf("%s/lib64", dest), dest)
 
 	// We don't have an agreement in place with NVIDIA to provide the drivers on every sku. For these VMs we simply log a warning message.
 	na := getGPUDriversNotInstalledWarningMessage(profile.VMSize)
@@ -1770,14 +1792,14 @@ func getGPUDriversInstallScript(profile *api.AgentPoolProfile) string {
 	   that we have an agreement with NVIDIA for this specific GPU. Otherwise use the warning message.
 	*/
 	dm := map[string]string{
-		"Standard_NC6":   ppaScript,
-		"Standard_NC12":  ppaScript,
-		"Standard_NC24":  ppaScript,
-		"Standard_NC24r": ppaScript,
-		"Standard_NV6":   ppaScript,
-		"Standard_NV12":  ppaScript,
-		"Standard_NV24":  ppaScript,
-		"Standard_NV24r": ppaScript,
+		"Standard_NC6":   installScript,
+		"Standard_NC12":  installScript,
+		"Standard_NC24":  installScript,
+		"Standard_NC24r": installScript,
+		"Standard_NV6":   installScript,
+		"Standard_NV12":  installScript,
+		"Standard_NV24":  installScript,
+		"Standard_NV24r": installScript,
 		"Standard_NC6_v2":  na,
 		"Standard_NC12_v2": na,
 		"Standard_NC24_v2": na,
```

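Substituting `dv = "384.111"` and `dest = "/usr/local/nvidia"` into the `fmt.Sprintf` calls above, the generated fragment comes out approximately as below, assuming it is injected into a cloud-init `runcmd` list. This is a sketch to show the OverlayFS trick, not output captured from the code.

```yaml
runcmd:
  - mkdir -p /usr/local/nvidia
  - cd /usr/local/nvidia
  - curl -fLS https://us.download.nvidia.com/tesla/384.111/NVIDIA-Linux-x86_64-384.111.run -o nvidia-drivers-384.111
  # lib64 becomes the overlay upperdir: anything the installer writes into
  # /usr/lib/x86_64-linux-gnu is captured under /usr/local/nvidia/lib64
  - mkdir -p lib64 overlay-workdir
  - sudo mount -t overlay -o lowerdir=/usr/lib/x86_64-linux-gnu,upperdir=lib64,workdir=overlay-workdir none /usr/lib/x86_64-linux-gnu
  - sudo sh nvidia-drivers-384.111 --silent --accept-license --no-drm --utility-prefix="/usr/local/nvidia" --opengl-prefix="/usr/local/nvidia"
  - echo "/usr/local/nvidia/lib64" > /etc/ld.so.conf.d/nvidia.conf
  - sudo ldconfig
  # drop the overlay; the captured libraries remain in /usr/local/nvidia/lib64
  - sudo umount /usr/lib/x86_64-linux-gnu
  - sudo systemctl restart kubelet
```

The overlay mount is what lets the stock NVIDIA installer, which hard-codes `/usr/lib/x86_64-linux-gnu`, deposit its libraries into a host path that pods can mount wholesale.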
