@@ -1671,20 +1671,42 @@ func getPackageGUID(orchestratorType string, orchestratorVersion string, masterC
1671
1671
func getGPUDriversInstallScript (profile * api.AgentPoolProfile ) string {
1672
1672
1673
1673
// latest version of the drivers. Later this parameter could be bubbled up so that users can choose specific driver versions.
1674
- dv := "384"
1674
+ dv := "384.111"
1675
+ dest := "/usr/local/nvidia"
1675
1676
1676
1677
/*
1677
1678
First we remove the nouveau drivers, which are the open source drivers for NVIDIA cards. Nouveau is installed on NV Series VMs by default.
1678
- Then we add the graphics-drivers ppa repository and get the proprietary drivers from there .
1679
+ We also install the needed dependencies.
1679
1680
*/
1680
- ppaScript := fmt .Sprintf (`- rmmod nouveau
1681
+ installScript := fmt .Sprintf (`- rmmod nouveau
1681
1682
- sh -c "echo \"blacklist nouveau\" >> /etc/modprobe.d/blacklist.conf"
1682
1683
- update-initramfs -u
1683
- - sudo add-apt-repository -y ppa:graphics-drivers
1684
- - sudo apt-get update
1685
- - sudo apt-get install -y nvidia-%s
1686
- - sudo nvidia-smi
1687
- - sudo systemctl restart kubelet` , dv )
1684
+ - sudo apt-get update && sudo apt-get install -y linux-headers-$(uname -r) gcc make
1685
+ - mkdir -p %s
1686
+ - cd %s` , dest , dest )
1687
+
1688
+ /*
1689
+ Download the .run file from NVIDIA.
1690
+ NVIDIA libraries are always installed in /usr/lib/x86_64-linux-gnu, and there is no option in the run file to change this.
1691
+ Instead we use Overlayfs to move the newly installed libraries under /usr/local/nvidia/lib64
1692
+ */
1693
+ installScript += fmt .Sprintf (`
1694
+ - curl -fLS https://us.download.nvidia.com/tesla/%s/NVIDIA-Linux-x86_64-%s.run -o nvidia-drivers-%s
1695
+ - mkdir -p lib64 overlay-workdir
1696
+ - sudo mount -t overlay -o lowerdir=/usr/lib/x86_64-linux-gnu,upperdir=lib64,workdir=overlay-workdir none /usr/lib/x86_64-linux-gnu` , dv , dv , dv )
1697
+
1698
+ /*
1699
+ Install the drivers and update /etc/ld.so.conf.d/nvidia.conf which, once ldconfig has run, makes the libraries discoverable by the dynamic linker without having to set $LD_LIBRARY_PATH.
1700
+ Unmount overlayfs, run nvidia-smi to test the installation, and restart kubelet (GPUs are only discovered when kubelet starts).
1701
+ */
1702
+ installScript += fmt .Sprintf (`
1703
+ - sudo sh nvidia-drivers-%s --silent --accept-license --no-drm --utility-prefix="%s" --opengl-prefix="%s"
1704
+ - echo "%s" > /etc/ld.so.conf.d/nvidia.conf
1705
+ - sudo ldconfig
1706
+ - sudo umount /usr/lib/x86_64-linux-gnu
1707
+ - sudo nvidia-modprobe -u -c0
1708
+ - sudo %s/bin/nvidia-smi
1709
+ - sudo systemctl restart kubelet` , dv , dest , dest , fmt .Sprintf ("%s/lib64" , dest ), dest )
1688
1710
1689
1711
// We don't have an agreement in place with NVIDIA to provide the drivers on every SKU. For these VMs we simply log a warning message.
1690
1712
na := getGPUDriversNotInstalledWarningMessage (profile .VMSize )
@@ -1693,14 +1715,14 @@ func getGPUDriversInstallScript(profile *api.AgentPoolProfile) string {
1693
1715
that we have an agreement with NVIDIA for this specific gpu. Otherwise use the warning message.
1694
1716
*/
1695
1717
dm := map [string ]string {
1696
- "Standard_NC6" : ppaScript ,
1697
- "Standard_NC12" : ppaScript ,
1698
- "Standard_NC24" : ppaScript ,
1699
- "Standard_NC24r" : ppaScript ,
1700
- "Standard_NV6" : ppaScript ,
1701
- "Standard_NV12" : ppaScript ,
1702
- "Standard_NV24" : ppaScript ,
1703
- "Standard_NV24r" : ppaScript ,
1718
+ "Standard_NC6" : installScript ,
1719
+ "Standard_NC12" : installScript ,
1720
+ "Standard_NC24" : installScript ,
1721
+ "Standard_NC24r" : installScript ,
1722
+ "Standard_NV6" : installScript ,
1723
+ "Standard_NV12" : installScript ,
1724
+ "Standard_NV24" : installScript ,
1725
+ "Standard_NV24r" : installScript ,
1704
1726
"Standard_NC6_v2" : na ,
1705
1727
"Standard_NC12_v2" : na ,
1706
1728
"Standard_NC24_v2" : na ,
0 commit comments