NVIDIA · dholt · May 14, 2026 · May 14, 2026 · May 14, 2026 · May 15, 2026
diff --git a/config.example/group_vars/all.yml b/config.example/group_vars/all.yml
@@ -140,6 +140,10 @@ gpu_clock_lock: "1507,1507"
 
 # Debugging var: force install NVIDIA driver even if GPU not detected
 nvidia_driver_force_install: false
+# Default NVIDIA driver branch for generic host driver installs (on Ubuntu, nvidia_driver_ubuntu_branch takes precedence when set).
+nvidia_driver_branch: "580"
+# Set to true on Ubuntu systems whose GPUs require the NVIDIA open kernel modules (selects the "-open" driver packages).
+nvidia_driver_ubuntu_use_open_kernel_modules: false
 
 
 ################################################################################
@@ -249,15 +253,22 @@ maas_adminusers:
 maas_dns_domain: 'deepops.local'
 maas_region_controller: '192.168.1.1'
 maas_region_controller_url: 'http://{{ maas_region_controller }}:5240/MAAS'
-maas_repo: 'ppa:maas/2.8'
+maas_repo: 'ppa:maas/3.5'
 
 # Defines if maas user should generate ssh keys
 # Usable for remote KVM/libvirt power actions
 maas_setup_user: false
 
 maas_single_node_install: true
 
-maas_kvm: false
+maas_kvm_management: false
+
+# Avoid installing python-libmaas via pip: it shadows the packaged MAAS CLI
+# with /usr/local/bin/maas, which breaks DeepOps' MAAS operational workflow.
+maas_python_reqs:
+  - jinja2
+  - oauth
+  - pyyaml
 
 ################################################################################
 # NVIDIA Datacenter GPU Manager                                                #
@@ -305,4 +316,3 @@ standalone_container_registry_port: "5000"
 ngc_ready_cuda_container: "nvcr.io/nvidia/cuda:12.4.1-base-ubuntu22.04"
 ngc_ready_pytorch: "nvcr.io/nvidia/pytorch:24.04-py3"
 ngc_ready_tensorflow: "nvcr.io/nvidia/tensorflow:24.04-tf2-py3"
-
diff --git a/docs/deepops/configuration.md b/docs/deepops/configuration.md
@@ -62,7 +62,7 @@ my-cluster-compute-02      ansible_host=10.0.0.3
 (Note that, by default, DeepOps will set the hostname of these machines to match the inventory hostname!
 If you don't want this, you can set `deepops_set_hostname: false` using the instructions in [the next section](#modifying-ansible-variables).)
 
-The example DeepOps inventory also includes groups for the different components of Kubernetes clusters (`kube-master`, `etcd`, and `kube-node`),
+The example DeepOps inventory also includes groups for the different components of Kubernetes clusters (`kube_control_plane`, `etcd`, and `kube_node`),
 and groups for the different components of Slurm clusters (`slurm-master` and `slurm-node`).
 These groups are used by DeepOps to determine which playbooks run on which nodes for each type of cluster,
 and you should add nodes to these groups based on how you want to lay out your cluster.

diff --git a/docs/deepops/update-deepops.md b/docs/deepops/update-deepops.md
@@ -160,6 +160,8 @@ In particular,
 
 Additionally, please note that Kubespray can only upgrade between one minor version of Kubernetes at a time.
 This means that you may need to upgrade multiple times between your current version and your desired version of Kubernetes.
+Other cluster components managed by Kubespray may have similar staged-upgrade requirements.
+For example, the network plugin version installed by an older DeepOps release may need to be upgraded through an intermediate DeepOps/Kubespray release before a newer Kubespray release will accept it.
 
 For example, to upgrade from Kubernetes version 1.19.9 and 1.21.1, you might use a workflow like this:
 
@@ -267,7 +269,7 @@ DeepOps offers the option to configure each of the necessary NVIDIA components i
 
 ##### Updating the NVIDIA driver
 
-**Important**: Note that upgrading the NVIDIA driver will reboot the node, unless you set `nvidia_driver_skip_reboot` to false.
+**Important**: Note that upgrading the NVIDIA driver will reboot the node, unless you set `nvidia_driver_skip_reboot` to true.
 If you are using MIG-enabled GPUs ensure that your MIG configuration is persistent by using the [nvidia-mig-manager systemd](https://github.com/NVIDIA/mig-parted/tree/master/deployments/systemd) service
 or the [nvidia-mig-manager Kubernetes GPU Operator-included DaemonSet](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/gpu-operator-mig.html).
 
@@ -277,7 +279,7 @@ To update the driver on a DGX system, we recommend following the instructions in
 
 ###### On Ubuntu
 
-On Ubuntu, the default behavior in DeepOps is to use the LTS release branch distributed through the Ubuntu repositories. In this mode, the driver is generally pinned to a particular release branch such as 450 or 470.
+On Ubuntu, the default behavior in DeepOps is to use the LTS release branch distributed through the Ubuntu repositories. In this mode, the driver is generally pinned to a particular release branch such as 580.
 
 To upgrade to the latest driver within your current release branch, run:
 
@@ -288,7 +290,13 @@ ansible-playbook playbooks/nvidia-software/nvidia-driver.yml -e nvidia_driver_pa
 To upgrade the driver to a new release branch, set the following parameter in your DeepOps configuration:
 
 ```bash
-nvidia_driver_ubuntu_branch: "470"
+nvidia_driver_ubuntu_branch: "580"
+```
+
+Some newer GPUs require NVIDIA open kernel modules. To install the Ubuntu open kernel module packages for the selected branch, set:
+
+```bash
+nvidia_driver_ubuntu_use_open_kernel_modules: true
 ```
 
 Then run:
@@ -433,15 +441,15 @@ Note that this can take a long time, as we download and build Slurm from source
 
 #### Updating the NVIDIA driver
 
-**Important**: Note that upgrading the NVIDIA driver will reboot the node, unless you set `nvidia_driver_skip_reboot` to false.
+**Important**: Note that upgrading the NVIDIA driver will reboot the node, unless you set `nvidia_driver_skip_reboot` to true.
 
 ##### On DGX
 
 To update the driver on a DGX system, we recommend following the instructions in the DGX User Guide.
 
 ##### On Ubuntu
 
-On Ubuntu, the default behavior in DeepOps is to use the LTS release branch distributed through the Ubuntu repositories. In this mode, the driver is generally pinned to a particular release branch such as 450 or 470.
+On Ubuntu, the default behavior in DeepOps is to use the LTS release branch distributed through the Ubuntu repositories. In this mode, the driver is generally pinned to a particular release branch such as 580.
 
 To upgrade to the latest driver within your current release branch, run:
 
@@ -452,7 +460,13 @@ ansible-playbook playbooks/nvidia-software/nvidia-driver.yml -e nvidia_driver_pa
 To upgrade the driver to a new release branch, set the following parameter in your DeepOps configuration:
 
 ```bash
-nvidia_driver_ubuntu_branch: "510"
+nvidia_driver_ubuntu_branch: "580"
+```
+
+Some newer GPUs require NVIDIA open kernel modules. To install the Ubuntu open kernel module packages for the selected branch, set:
+
+```bash
+nvidia_driver_ubuntu_use_open_kernel_modules: true
 ```
 
 Then run:

diff --git a/docs/k8s-cluster/README.md b/docs/k8s-cluster/README.md
@@ -84,7 +84,7 @@ Instructions for deploying a GPU cluster with Kubernetes
    # NOTE: If SSH requires a password, add: `-k`
    # NOTE: If sudo on remote machine requires a password, add: `-K`
    # NOTE: If SSH user is different than current user, add: `-u ubuntu`
-   ansible-playbook -l k8s-cluster playbooks/k8s-cluster.yml
+   ansible-playbook -l k8s_cluster playbooks/k8s-cluster.yml
    ```
 
    More information on Kubespray can be found in the official [Getting Started Guide](https://github.com/kubernetes-sigs/kubespray/blob/master/docs/getting-started.md)
@@ -123,7 +123,7 @@ Run the following script to create an administrative user and print out the dash
 
 #### NFS Client Provisioner
 
-The default behavior of DeepOps is to setup an NFS server on the first `kube-master` node. This temporary NFS server is used by the `nfs-client-provisioner` which is installed as the default StorageClass of a standard DeepOps deployment.
+The default behavior of DeepOps is to setup an NFS server on the first `kube_control_plane` node. This temporary NFS server is used by the `nfs-client-provisioner` which is installed as the default StorageClass of a standard DeepOps deployment.
 
 To use an existing nfs server server update the `k8s_nfs_server` and `k8s_nfs_export_path` variables in `config/group_vars/k8s-cluster.yml` and set the `k8s_deploy_nfs_server` to false in `config/group_vars/k8s-cluster.yml`. Additionally, the `k8s_nfs_mkdir` variable can be set to `false` if the export directory is already configured on the server.
 
@@ -165,7 +165,7 @@ Deploy NetApp Astra Trident for services that require persistent storage (such a
    # NOTE: If SSH requires a password, add: `-k`
    # NOTE: If sudo on remote machine requires a password, add: `-K`
    # NOTE: If SSH user is different than current user, add: `-u ubuntu`
-   ansible-playbook -l k8s-cluster playbooks/k8s-cluster/netapp-trident.yml
+   ansible-playbook -l k8s_cluster playbooks/k8s-cluster/netapp-trident.yml
    ```
 
 3. Verify that Astra Trident is running.
@@ -207,9 +207,9 @@ delete  Legacy positional argument for delete. Same as -d flag.
 
 The services can be reached from the following addresses:
 
-- Grafana: http://\<kube-master\>:30200
-- Prometheus: http://\<kube-master\>:30500
-- Alertmanager: http://\<kube-master\>:30400
+- Grafana: http://\<kube_control_plane\>:30200
+- Prometheus: http://\<kube_control_plane\>:30500
+- Alertmanager: http://\<kube_control_plane\>:30400
 
 We deploy our monitoring services using the [prometheus-operator](https://github.com/prometheus-operator/prometheus-operator) project.
 For documentation on configuring and managing the monitoring services, please see the [prometheus-operator user guides](https://github.com/prometheus-operator/prometheus-operator/tree/master/Documentation/user-guides).
@@ -234,7 +234,7 @@ Follow the [ELK logging Guide](logging.md) to setup logging in the cluster.
 
 The service can be reached from the following address:
 
-- Kibana: http://\<kube-master\>:30700
+- Kibana: http://\<kube_control_plane\>:30700
 
 ### Container Registry
 
@@ -264,15 +264,15 @@ DeepOps uses [Kubespray](https://github.com/kubernetes-sigs/kubespray) to deploy
 
 ### Adding Nodes
 
-To add K8s nodes, modify the `config/inventory` file to include the new nodes under `[all]`. Then list the nodes as relevant under the `[kube-master]`, `[etcd]`, and `[kube-node]` sections. For example, if adding a new master node, list it under kube-master and etcd. A new worker node would go under kube-node.
+To add K8s nodes, modify the `config/inventory` file to include the new nodes under `[all]`. Then list the nodes as relevant under the `[kube_control_plane]`, `[etcd]`, and `[kube_node]` sections. For example, if adding a new control-plane node, list it under `kube_control_plane` and `etcd`. A new worker node would go under `kube_node`.
 
 Then run the Kubespray `scale.yml` playbook...
 
 ```bash
 # NOTE: If SSH requires a password, add: `-k`
 # NOTE: If sudo on remote machine requires a password, add: `-K`
 # NOTE: If SSH user is different than current user, add: `-u ubuntu`
-ansible-playbook -l k8s-cluster submodules/kubespray/scale.yml
+ansible-playbook -l k8s_cluster submodules/kubespray/scale.yml
 ```
 
 More information on this topic may be found in the [Kubespray docs](https://github.com/kubernetes-sigs/kubespray/blob/master/docs/getting-started.md#adding-nodes).

diff --git a/docs/k8s-cluster/kubeflow.md b/docs/k8s-cluster/kubeflow.md
@@ -43,7 +43,7 @@ A local checkout of the [Kubeflow manifests](https://github.com/kubeflow/manifes
 
 The services can be reached from the following address:
 
-- Kubeflow: http://\<kube-master\>:31380
+- Kubeflow: http://\<kube_control_plane\>:31380
 
 ## Login information
 

diff --git a/docs/k8s-cluster/nvidia-network-operator.md b/docs/k8s-cluster/nvidia-network-operator.md
@@ -83,9 +83,9 @@ This playbook is developed and tested in following environments:
   gpu01      ansible_host=192.168.2.11
   gpu02      ansible_host=192.168.3.11
   ...
-  [kube-master]
+  [kube_control_plane]
   mgmt01
-  [kube-node]
+  [kube_node]
   gpu01
   gpu02
   ```
@@ -123,7 +123,7 @@ This playbook is developed and tested in following environments:
   # NOTE: If SSH requires a password, add: `-k`
   # NOTE: If sudo on remote machine requires a password, add: `-K`
   # NOTE: If SSH user is different than current user, add: `-u ubuntu`
-  ansible-playbook -l k8s-cluster playbooks/k8s-cluster.yml
+  ansible-playbook -l k8s_cluster playbooks/k8s-cluster.yml
   ```
 
   Please refer to [DeepOps Kubernetes Deployment Guidehere](https://github.com/NVIDIA/deepops/blob/master/docs/kubernetes-cluster.md) for more information.

diff --git a/docs/k8s-cluster/roce-perf-k8s.md b/docs/k8s-cluster/roce-perf-k8s.md
@@ -142,7 +142,7 @@ add switch PFC, ECN configuration
 
    ```bash
    # Modify the Ansible inventory file
-   # Especially the 'all', 'kube-master', 'etcd', 'kube-node' and 'k8s-cluster' sections
+   # Especially the 'all', 'kube_control_plane', 'etcd', 'kube_node' and 'k8s_cluster' sections
    vi config/inventory
    ```
 
@@ -159,10 +159,10 @@ add switch PFC, ECN configuration
    gpu02      ansible_host=192.168.2.11
    ...
 
-   [kube-master]
+   [kube_control_plane]
    mgmt01
 
-   [kube-node]
+   [kube_node]
    gpu01
    gpu02
 
@@ -203,7 +203,7 @@ add switch PFC, ECN configuration
    # NOTE: If SSH requires a password, add: `-k`
    # NOTE: If sudo on remote machine requires a password, add: `-K`
    # NOTE: If SSH user is different than current user, add: `-u ubuntu`
-   ansible-playbook -l k8s-cluster playbooks/k8s-cluster.yml
+   ansible-playbook -l k8s_cluster playbooks/k8s-cluster.yml
    ```
 
    Please refer to [DeepOps Kubernetes Deployment Guidehere](https://github.com/NVIDIA/deepops/blob/master/docs/kubernetes-cluster.md) for more information.
@@ -252,7 +252,7 @@ add switch PFC, ECN configuration
    Run following script to deploy SRIOV RoCE functions:
 
    ```bash
-   nvidia@mgmt01:~/deepops_0322$ ansible-playbook -l k8s-cluster playbooks/k8s-cluster/roce.yaml
+   nvidia@mgmt01:~/deepops_0322$ ansible-playbook -l k8s_cluster playbooks/k8s-cluster/roce.yaml
    ```
 
    If using a different username and SSH key-based authentication haven't set up, try to use `-u <user> -k -K` when you run the script.

diff --git a/docs/k8s-cluster/roce_backend.md b/docs/k8s-cluster/roce_backend.md
@@ -106,7 +106,7 @@ The Role installing following components:
 ## Role deployment
 
 ```bash
-ansible-playbook -l k8s-cluster playbooks/k8s-cluster/roce.yaml
+ansible-playbook -l k8s_cluster playbooks/k8s-cluster/roce.yaml
 ```
 
 ## License

diff --git a/docs/pxe/maas.md b/docs/pxe/maas.md
@@ -258,8 +258,8 @@ only need to tag leaf groups.
 
 | Tag | Ansible Group | Used By |
 |-----|--------------|---------|
-| `kube-master` | `[kube-master]` | K8s control plane |
-| `kube-node` | `[kube-node]` | K8s worker nodes |
+| `kube_control_plane` | `[kube_control_plane]` | K8s control plane |
+| `kube_node` | `[kube_node]` | K8s worker nodes |
 | `slurm-master` | `[slurm-master]` | Slurm head node |
 | `slurm-node` | `[slurm-node]` | Slurm compute nodes |
 | `slurm-nfs` | `[slurm-nfs]` | Slurm NFS server |
@@ -276,8 +276,8 @@ ansible-playbook -i scripts/maas_inventory.py playbooks/slurm-cluster.yml
 
 # Later, retag for K8s
 maas admin tag update-nodes slurm-master remove=<vm01_system_id>
-maas admin tag update-nodes kube-master add=<vm01_system_id>
-maas admin tag update-nodes kube-node add=<vm02_system_id> add=<vm03_system_id>
+maas admin tag update-nodes kube_control_plane add=<vm01_system_id>
+maas admin tag update-nodes kube_node add=<vm02_system_id> add=<vm03_system_id>
 
 # Run K8s deployment
 ansible-playbook -i scripts/maas_inventory.py playbooks/k8s-cluster.yml

diff --git a/playbooks/k8s-cluster.yml b/playbooks/k8s-cluster.yml
@@ -282,7 +282,7 @@
     ansible_become: no
   tasks:
     - name: Install Helm on admin node
-      command: "sh {{ playbook_dir }}/../scripts/k8s/install_helm.sh"
+      command: "bash {{ playbook_dir }}/../scripts/k8s/install_helm.sh"
       delegate_to: localhost
     - name: Globally update the deprecated "stable" helm repo
       command: "/usr/local/bin/helm repo add 'stable' 'https://charts.helm.sh/stable' --force-update"

diff --git a/playbooks/k8s-cluster/nfs-client-provisioner.yml b/playbooks/k8s-cluster/nfs-client-provisioner.yml
@@ -22,7 +22,7 @@
     include_role:
       name: nfs
     vars:
-    - nfs_is_server: yes
+      nfs_is_server: yes
     when: k8s_deploy_nfs_server
 
 - hosts: "k8s_cluster"

diff --git a/playbooks/nvidia-software/nvidia-cuda.yml b/playbooks/nvidia-software/nvidia-cuda.yml
@@ -13,6 +13,10 @@
       include_role:
         name: facts
 
+    - name: configure Ubuntu NVIDIA driver packages
+      include_tasks: tasks/nvidia-driver-ubuntu-packages.yml
+      when: ansible_distribution == 'Ubuntu'
+
     - name: install nvidia driver
       include_role:
         name: nvidia.nvidia_driver

diff --git a/playbooks/nvidia-software/nvidia-driver.yml b/playbooks/nvidia-software/nvidia-driver.yml
@@ -14,6 +14,10 @@
       include_role:
         name: facts
 
+    - name: configure Ubuntu NVIDIA driver packages
+      include_tasks: tasks/nvidia-driver-ubuntu-packages.yml
+      when: ansible_distribution == 'Ubuntu'
+
     - name: install nvidia driver
       include_role:
         name: nvidia.nvidia_driver

diff --git a/playbooks/nvidia-software/tasks/nvidia-driver-ubuntu-packages.yml b/playbooks/nvidia-software/tasks/nvidia-driver-ubuntu-packages.yml
@@ -0,0 +1,23 @@
+---
+# Ubuntu-only helper: the nvidia-driver.yml and nvidia-cuda.yml playbooks include
+# this file before running the nvidia.nvidia_driver role.
+#
+# When nvidia_driver_ubuntu_use_open_kernel_modules is true, set
+# nvidia_driver_ubuntu_packages to the "-open" package variants for the selected
+# branch. When it is false or unset, the task is skipped and the variable is left
+# untouched.
+- name: Select Ubuntu NVIDIA open kernel module packages
+  set_fact:
+    # nvidia-utils is listed without the "-open" suffix.
+    nvidia_driver_ubuntu_packages:
+      - "nvidia-headless-{{ _nvidia_driver_ubuntu_branch }}{{ _nvidia_driver_ubuntu_packages_suffix }}-open"
+      - "nvidia-utils-{{ _nvidia_driver_ubuntu_branch }}{{ _nvidia_driver_ubuntu_packages_suffix }}"
+      - "nvidia-headless-no-dkms-{{ _nvidia_driver_ubuntu_branch }}{{ _nvidia_driver_ubuntu_packages_suffix }}-open"
+      - "nvidia-kernel-source-{{ _nvidia_driver_ubuntu_branch }}{{ _nvidia_driver_ubuntu_packages_suffix }}-open"
+  vars:
+    # Branch lookup order: nvidia_driver_ubuntu_branch, then nvidia_driver_branch,
+    # then '515'.
+    _nvidia_driver_ubuntu_branch: "{{ nvidia_driver_ubuntu_branch | default(nvidia_driver_branch | default('515')) }}"
+    # Package-name suffix; '-server' when nvidia_driver_ubuntu_packages_suffix is unset.
+    _nvidia_driver_ubuntu_packages_suffix: "{{ nvidia_driver_ubuntu_packages_suffix | default('-server') }}"
+  when: nvidia_driver_ubuntu_use_open_kernel_modules | default(false) | bool
diff --git a/roles/alertmanager/defaults/main.yml b/roles/alertmanager/defaults/main.yml
@@ -1,6 +1,6 @@
 alertmanager_config_dir: /etc/alertmanager
 alertmanager_config_src: templates/alertmanager.yml.j2
-alertmanager_container: "prom/alertmanager:v0.23.0"
+alertmanager_container: "prom/alertmanager:v0.32.1"
 alertmanager_svc_name: "docker.alertmanager.service"
 alertmanager_docker_volume_name: "deepops_alertmanager_metrics"
 alertmanager_state: started

diff --git a/roles/grafana/defaults/main.yml b/roles/grafana/defaults/main.yml
@@ -2,7 +2,7 @@ grafana_config_dir: /etc/grafana
 grafana_config_template: templates/grafana.ini.j2
 grafana_data_dir: /var/lib/grafana
 grafana_user_id: 472
-grafana_container: "grafana/grafana:8.5.10"
+grafana_container: "grafana/grafana:13.0.1"
 grafana_svc_name: "docker.grafana.service"
 grafana_state: started
 grafana_enabled: yes

diff --git a/roles/netapp-trident/README.md b/roles/netapp-trident/README.md
@@ -41,7 +41,7 @@ Example A:
 Example B:
 
     - name: "Deploy NetApp Trident"
-      hosts: kube-master
+      hosts: kube_control_plane
       become: true
       become_method: sudo
       roles:
@@ -50,7 +50,7 @@ Example B:
 Example C:
 
     - name: "Deploy NetApp Trident"
-      hosts: kube-master
+      hosts: kube_control_plane
       become: true
       become_method: sudo
       vars_files:
@@ -88,7 +88,7 @@ all:
       ip: 192.168.1.215
       access_ip: 192.168.1.215
   children:
-    kube-master:
+    kube_control_plane:
       hosts:
         mgmt01:
         mgmt02:

diff --git a/roles/nginx-docker-registry-cache/defaults/main.yml b/roles/nginx-docker-registry-cache/defaults/main.yml
@@ -1,6 +1,6 @@
 ---
 nginx_docker_cache_name: "deepops-nginx-docker-cache"
-nginx_docker_cache_image: "rpardini/docker-registry-proxy:0.6.4"
+nginx_docker_cache_image: "rpardini/docker-registry-proxy:0.6.5"
 
 nginx_docker_cache_mirror_path: "/opt/deepops/nginx-docker-cache/mirror"
 nginx_docker_cache_ca_path: "/opt/deepops/nginx-docker-cache/ca"