diff --git a/docs/kubernetes/gpu.md b/docs/kubernetes/gpu.md
index 9c0591ebd1..232b203039 100644
--- a/docs/kubernetes/gpu.md
+++ b/docs/kubernetes/gpu.md
@@ -19,7 +19,7 @@ If `alpha.kubernetes.io/nvidia-gpu` is `0` and you just created the cluster, you
 When running a GPU container, you will need to specify how many GPU you want to use. If you don't specify a GPU count, kubernetes will asumme you don't require any, and will not map the device into the container.
 You will also need to mount the drivers from the host (the kubernetes agent) into the container.
-On the host, the drivers are installed under `/usr/lib/nvidia-384`.
+On the host, the drivers are installed under `/usr/local/nvidia`.

 Here is an example template running TensorFlow:

@@ -45,27 +45,13 @@ spec:
       limits:
         alpha.kubernetes.io/nvidia-gpu: 1
     volumeMounts:
-    - mountPath: /usr/local/nvidia/bin
-      name: bin
-    - mountPath: /usr/local/nvidia/lib64
-      name: lib
-    - mountPath: /usr/lib/x86_64-linux-gnu/libcuda.so.1
-      name: libcuda
+    - name: nvidia
+      mountPath: /usr/local/nvidia
   volumes:
-  - name: bin
-    hostPath:
-      path: /usr/lib/nvidia-384/bin
-  - name: lib
-    hostPath:
-      path: /usr/lib/nvidia-384
-  - name: libcuda
-    hostPath:
-      path: /usr/lib/x86_64-linux-gnu/libcuda.so.1
+  - name: nvidia
+    hostPath:
+      path: /usr/local/nvidia
 ```

 We specify `alpha.kubernetes.io/nvidia-gpu: 1` in the resources limits, and we mount the drivers from the host into the container.
-Some libraries, such as `libcuda.so` are installed under `/usr/lib/x86_64-linux-gnu` on the host, you might need to mount them separatly as shown above based on your needs.
-
-
-
diff --git a/pkg/acsengine/engine.go b/pkg/acsengine/engine.go
index faced27983..d8aed4c7e7 100644
--- a/pkg/acsengine/engine.go
+++ b/pkg/acsengine/engine.go
@@ -1671,20 +1671,42 @@ func getPackageGUID(orchestratorType string, orchestratorVersion string, masterC
 func getGPUDriversInstallScript(profile *api.AgentPoolProfile) string {
     // latest version of the drivers. Later this parameter could be bubbled up so that users can choose specific driver versions.
-    dv := "384"
+    dv := "384.111"
+    dest := "/usr/local/nvidia"
     /*
         First we remove the nouveau drivers, which are the open source drivers for NVIDIA cards. Nouveau is installed on NV Series VMs by default.
-        Then we add the graphics-drivers ppa repository and get the proprietary drivers from there.
+        We also install needed dependencies.
     */
-    ppaScript := fmt.Sprintf(`- rmmod nouveau
+    installScript := fmt.Sprintf(`- rmmod nouveau
 - sh -c "echo \"blacklist nouveau\" >> /etc/modprobe.d/blacklist.conf"
 - update-initramfs -u
-- sudo add-apt-repository -y ppa:graphics-drivers
-- sudo apt-get update
-- sudo apt-get install -y nvidia-%s
-- sudo nvidia-smi
-- sudo systemctl restart kubelet`, dv)
+- sudo apt-get update && sudo apt-get install -y linux-headers-$(uname -r) gcc make
+- mkdir -p %s
+- cd %s`, dest, dest)
+
+    /*
+        Download the .run file from NVIDIA.
+        Nvidia libraries are always installed in /usr/lib/x86_64-linux-gnu, and there is no option in the run file to change this.
+        Instead we use overlayfs to move the newly installed libraries under /usr/local/nvidia/lib64.
+    */
+    installScript += fmt.Sprintf(`
+- curl -fLS https://us.download.nvidia.com/tesla/%s/NVIDIA-Linux-x86_64-%s.run -o nvidia-drivers-%s
+- mkdir -p lib64 overlay-workdir
+- sudo mount -t overlay -o lowerdir=/usr/lib/x86_64-linux-gnu,upperdir=lib64,workdir=overlay-workdir none /usr/lib/x86_64-linux-gnu`, dv, dv, dv)
+
+    /*
+        Install the drivers and update /etc/ld.so.conf.d/nvidia.conf, which makes the libraries discoverable by the dynamic linker (via ldconfig).
+        Run nvidia-smi to test the installation, unmount overlayfs and restart kubelet (GPUs are only discovered when kubelet starts).
+    */
+    installScript += fmt.Sprintf(`
+- sudo sh nvidia-drivers-%s --silent --accept-license --no-drm --utility-prefix="%s" --opengl-prefix="%s"
+- echo "%s" > /etc/ld.so.conf.d/nvidia.conf
+- sudo ldconfig
+- sudo umount /usr/lib/x86_64-linux-gnu
+- sudo nvidia-modprobe -u -c0
+- sudo %s/bin/nvidia-smi
+- sudo systemctl restart kubelet`, dv, dest, dest, fmt.Sprintf("%s/lib64", dest), dest)

     // We don't have an agreement in place with NVIDIA to provide the drivers on every sku. For this VMs we simply log a warning message.
     na := getGPUDriversNotInstalledWarningMessage(profile.VMSize)
@@ -1693,14 +1715,14 @@ func getGPUDriversInstallScript(profile *api.AgentPoolProfile) string {
        that we have an agreement with NVIDIA for this specific gpu. Otherwise use the warning message.
     */
     dm := map[string]string{
-        "Standard_NC6":   ppaScript,
-        "Standard_NC12":  ppaScript,
-        "Standard_NC24":  ppaScript,
-        "Standard_NC24r": ppaScript,
-        "Standard_NV6":   ppaScript,
-        "Standard_NV12":  ppaScript,
-        "Standard_NV24":  ppaScript,
-        "Standard_NV24r": ppaScript,
+        "Standard_NC6":   installScript,
+        "Standard_NC12":  installScript,
+        "Standard_NC24":  installScript,
+        "Standard_NC24r": installScript,
+        "Standard_NV6":   installScript,
+        "Standard_NV12":  installScript,
+        "Standard_NV24":  installScript,
+        "Standard_NV24r": installScript,
         "Standard_NC6_v2":  na,
         "Standard_NC12_v2": na,
         "Standard_NC24_v2": na,
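For reviewers who want to sanity-check the generated cloud-init commands, a minimal regression-test sketch could look like the following. It assumes the test sits in the `acsengine` package next to `engine.go`; the test name, the `Standard_NC6` choice, and the `pkg/api` import path are illustrative, while `getGPUDriversInstallScript` and `api.AgentPoolProfile` come from this change.

```go
package acsengine

import (
    "strings"
    "testing"

    "github.com/Azure/acs-engine/pkg/api" // assumed import path for the api package
)

// Sketch only: checks that the script generated for a supported GPU sku stages
// the drivers under /usr/local/nvidia via an overlay mount instead of the old
// PPA-based install.
func TestGPUDriversInstallScriptUsesOverlay(t *testing.T) {
    profile := &api.AgentPoolProfile{VMSize: "Standard_NC6"}
    script := getGPUDriversInstallScript(profile)

    for _, want := range []string{
        "mount -t overlay",
        "/usr/local/nvidia",
        "nvidia-smi",
        "systemctl restart kubelet",
    } {
        if !strings.Contains(script, want) {
            t.Errorf("install script is missing %q", want)
        }
    }
    if strings.Contains(script, "ppa:graphics-drivers") {
        t.Error("install script still references the graphics-drivers PPA")
    }
}
```

A check like this would also catch regressions in the SKU-to-script map, since any VM size that falls back to the warning message would fail the overlay-mount assertion.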