diff --git a/config.example/group_vars/all.yml b/config.example/group_vars/all.yml index 6b69f98ba..b7de1ed73 100644 --- a/config.example/group_vars/all.yml +++ b/config.example/group_vars/all.yml @@ -140,6 +140,10 @@ gpu_clock_lock: "1507,1507" # Debugging var: force install NVIDIA driver even if GPU not detected nvidia_driver_force_install: false +# Default NVIDIA driver branch for generic host driver installs. +nvidia_driver_branch: "580" +# Set true on Ubuntu systems that require NVIDIA open kernel modules. +nvidia_driver_ubuntu_use_open_kernel_modules: false ################################################################################ @@ -249,7 +253,7 @@ maas_adminusers: maas_dns_domain: 'deepops.local' maas_region_controller: '192.168.1.1' maas_region_controller_url: 'http://{{ maas_region_controller }}:5240/MAAS' -maas_repo: 'ppa:maas/2.8' +maas_repo: 'ppa:maas/3.5' # Defines if maas user should generate ssh keys # Usable for remote KVM/libvirt power actions @@ -257,7 +261,14 @@ maas_setup_user: false maas_single_node_install: true -maas_kvm: false +maas_kvm_management: false + +# Avoid installing python-libmaas via pip: it shadows the packaged MAAS CLI +# with /usr/local/bin/maas, which breaks DeepOps' MAAS operational workflow. +maas_python_reqs: + - jinja2 + - oauth + - pyyaml ################################################################################ # NVIDIA Datacenter GPU Manager # @@ -305,4 +316,3 @@ standalone_container_registry_port: "5000" ngc_ready_cuda_container: "nvcr.io/nvidia/cuda:12.4.1-base-ubuntu22.04" ngc_ready_pytorch: "nvcr.io/nvidia/pytorch:24.04-py3" ngc_ready_tensorflow: "nvcr.io/nvidia/tensorflow:24.04-tf2-py3" - diff --git a/docs/deepops/configuration.md b/docs/deepops/configuration.md index 70c1475e6..3884b8137 100644 --- a/docs/deepops/configuration.md +++ b/docs/deepops/configuration.md @@ -62,7 +62,7 @@ my-cluster-compute-02 ansible_host=10.0.0.3 (Note that, by default, DeepOps will set the hostname of these machines to match the inventory hostname! If you don't want this, you can set `deepops_set_hostname: false` using the instructions in [the next section](#modifying-ansible-variables).) -The example DeepOps inventory also includes groups for the different components of Kubernetes clusters (`kube-master`, `etcd`, and `kube-node`), +The example DeepOps inventory also includes groups for the different components of Kubernetes clusters (`kube_control_plane`, `etcd`, and `kube_node`), and groups for the different components of Slurm clusters (`slurm-master` and `slurm-node`). These groups are used by DeepOps to determine which playbooks run on which nodes for each type of cluster, and you should add nodes to these groups based on how you want to lay out your cluster. diff --git a/docs/deepops/update-deepops.md b/docs/deepops/update-deepops.md index 3e081c18c..a1f7af047 100644 --- a/docs/deepops/update-deepops.md +++ b/docs/deepops/update-deepops.md @@ -160,6 +160,8 @@ In particular, Additionally, please note that Kubespray can only upgrade between one minor version of Kubernetes at a time. This means that you may need to upgrade multiple times between your current version and your desired version of Kubernetes. +Other cluster components managed by Kubespray may have similar staged-upgrade requirements. +For example, the network plugin version installed by an older DeepOps release may need to be upgraded through an intermediate DeepOps/Kubespray release before a newer Kubespray release will accept it. For example, to upgrade from Kubernetes version 1.19.9 and 1.21.1, you might use a workflow like this: @@ -267,7 +269,7 @@ DeepOps offers the option to configure each of the necessary NVIDIA components i ##### Updating the NVIDIA driver -**Important**: Note that upgrading the NVIDIA driver will reboot the node, unless you set `nvidia_driver_skip_reboot` to false. +**Important**: Note that upgrading the NVIDIA driver will reboot the node, unless you set `nvidia_driver_skip_reboot` to true. If you are using MIG-enabled GPUs ensure that your MIG configuration is persistent by using the [nvidia-mig-manager systemd](https://github.com/NVIDIA/mig-parted/tree/master/deployments/systemd) service or the [nvidia-mig-manager Kubernetes GPU Operator-included DaemonSet](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/gpu-operator-mig.html). @@ -277,7 +279,7 @@ To update the driver on a DGX system, we recommend following the instructions in ###### On Ubuntu -On Ubuntu, the default behavior in DeepOps is to use the LTS release branch distributed through the Ubuntu repositories. In this mode, the driver is generally pinned to a particular release branch such as 450 or 470. +On Ubuntu, the default behavior in DeepOps is to use the LTS release branch distributed through the Ubuntu repositories. In this mode, the driver is generally pinned to a particular release branch such as 580. To upgrade to the latest driver within your current release branch, run: @@ -288,7 +290,13 @@ ansible-playbook playbooks/nvidia-software/nvidia-driver.yml -e nvidia_driver_pa To upgrade the driver to a new release branch, set the following parameter in your DeepOps configuration: ```bash -nvidia_driver_ubuntu_branch: "470" +nvidia_driver_ubuntu_branch: "580" +``` + +Some newer GPUs require NVIDIA open kernel modules. To install the Ubuntu open kernel module packages for the selected branch, set: + +```bash +nvidia_driver_ubuntu_use_open_kernel_modules: true ``` Then run: @@ -433,7 +441,7 @@ Note that this can take a long time, as we download and build Slurm from source #### Updating the NVIDIA driver -**Important**: Note that upgrading the NVIDIA driver will reboot the node, unless you set `nvidia_driver_skip_reboot` to false. +**Important**: Note that upgrading the NVIDIA driver will reboot the node, unless you set `nvidia_driver_skip_reboot` to true. ##### On DGX @@ -441,7 +449,7 @@ To update the driver on a DGX system, we recommend following the instructions in ##### On Ubuntu -On Ubuntu, the default behavior in DeepOps is to use the LTS release branch distributed through the Ubuntu repositories. In this mode, the driver is generally pinned to a particular release branch such as 450 or 470. +On Ubuntu, the default behavior in DeepOps is to use the LTS release branch distributed through the Ubuntu repositories. In this mode, the driver is generally pinned to a particular release branch such as 580. To upgrade to the latest driver within your current release branch, run: @@ -452,7 +460,13 @@ ansible-playbook playbooks/nvidia-software/nvidia-driver.yml -e nvidia_driver_pa To upgrade the driver to a new release branch, set the following parameter in your DeepOps configuration: ```bash -nvidia_driver_ubuntu_branch: "510" +nvidia_driver_ubuntu_branch: "580" +``` + +Some newer GPUs require NVIDIA open kernel modules. To install the Ubuntu open kernel module packages for the selected branch, set: + +```bash +nvidia_driver_ubuntu_use_open_kernel_modules: true ``` Then run: diff --git a/docs/k8s-cluster/README.md b/docs/k8s-cluster/README.md index ee6bfe66c..b7f1e32b1 100644 --- a/docs/k8s-cluster/README.md +++ b/docs/k8s-cluster/README.md @@ -84,7 +84,7 @@ Instructions for deploying a GPU cluster with Kubernetes # NOTE: If SSH requires a password, add: `-k` # NOTE: If sudo on remote machine requires a password, add: `-K` # NOTE: If SSH user is different than current user, add: `-u ubuntu` - ansible-playbook -l k8s-cluster playbooks/k8s-cluster.yml + ansible-playbook -l k8s_cluster playbooks/k8s-cluster.yml ``` More information on Kubespray can be found in the official [Getting Started Guide](https://github.com/kubernetes-sigs/kubespray/blob/master/docs/getting-started.md) @@ -123,7 +123,7 @@ Run the following script to create an administrative user and print out the dash #### NFS Client Provisioner -The default behavior of DeepOps is to setup an NFS server on the first `kube-master` node. This temporary NFS server is used by the `nfs-client-provisioner` which is installed as the default StorageClass of a standard DeepOps deployment. +The default behavior of DeepOps is to setup an NFS server on the first `kube_control_plane` node. This temporary NFS server is used by the `nfs-client-provisioner` which is installed as the default StorageClass of a standard DeepOps deployment. To use an existing nfs server server update the `k8s_nfs_server` and `k8s_nfs_export_path` variables in `config/group_vars/k8s-cluster.yml` and set the `k8s_deploy_nfs_server` to false in `config/group_vars/k8s-cluster.yml`. Additionally, the `k8s_nfs_mkdir` variable can be set to `false` if the export directory is already configured on the server. @@ -165,7 +165,7 @@ Deploy NetApp Astra Trident for services that require persistent storage (such a # NOTE: If SSH requires a password, add: `-k` # NOTE: If sudo on remote machine requires a password, add: `-K` # NOTE: If SSH user is different than current user, add: `-u ubuntu` - ansible-playbook -l k8s-cluster playbooks/k8s-cluster/netapp-trident.yml + ansible-playbook -l k8s_cluster playbooks/k8s-cluster/netapp-trident.yml ``` 3. Verify that Astra Trident is running. @@ -207,9 +207,9 @@ delete Legacy positional argument for delete. Same as -d flag. The services can be reached from the following addresses: -- Grafana: http://\:30200 -- Prometheus: http://\:30500 -- Alertmanager: http://\:30400 +- Grafana: http://\:30200 +- Prometheus: http://\:30500 +- Alertmanager: http://\:30400 We deploy our monitoring services using the [prometheus-operator](https://github.com/prometheus-operator/prometheus-operator) project. For documentation on configuring and managing the monitoring services, please see the [prometheus-operator user guides](https://github.com/prometheus-operator/prometheus-operator/tree/master/Documentation/user-guides). @@ -234,7 +234,7 @@ Follow the [ELK logging Guide](logging.md) to setup logging in the cluster. The service can be reached from the following address: -- Kibana: http://\:30700 +- Kibana: http://\:30700 ### Container Registry @@ -264,7 +264,7 @@ DeepOps uses [Kubespray](https://github.com/kubernetes-sigs/kubespray) to deploy ### Adding Nodes -To add K8s nodes, modify the `config/inventory` file to include the new nodes under `[all]`. Then list the nodes as relevant under the `[kube-master]`, `[etcd]`, and `[kube-node]` sections. For example, if adding a new master node, list it under kube-master and etcd. A new worker node would go under kube-node. +To add K8s nodes, modify the `config/inventory` file to include the new nodes under `[all]`. Then list the nodes as relevant under the `[kube_control_plane]`, `[etcd]`, and `[kube_node]` sections. For example, if adding a new control-plane node, list it under `kube_control_plane` and `etcd`. A new worker node would go under `kube_node`. Then run the Kubespray `scale.yml` playbook... @@ -272,7 +272,7 @@ Then run the Kubespray `scale.yml` playbook... # NOTE: If SSH requires a password, add: `-k` # NOTE: If sudo on remote machine requires a password, add: `-K` # NOTE: If SSH user is different than current user, add: `-u ubuntu` -ansible-playbook -l k8s-cluster submodules/kubespray/scale.yml +ansible-playbook -l k8s_cluster submodules/kubespray/scale.yml ``` More information on this topic may be found in the [Kubespray docs](https://github.com/kubernetes-sigs/kubespray/blob/master/docs/getting-started.md#adding-nodes). diff --git a/docs/k8s-cluster/kubeflow.md b/docs/k8s-cluster/kubeflow.md index 02394fc86..59f4d6797 100644 --- a/docs/k8s-cluster/kubeflow.md +++ b/docs/k8s-cluster/kubeflow.md @@ -43,7 +43,7 @@ A local checkout of the [Kubeflow manifests](https://github.com/kubeflow/manifes The services can be reached from the following address: -- Kubeflow: http://\:31380 +- Kubeflow: http://\:31380 ## Login information diff --git a/docs/k8s-cluster/nvidia-network-operator.md b/docs/k8s-cluster/nvidia-network-operator.md index 972211aa3..3817c14f5 100644 --- a/docs/k8s-cluster/nvidia-network-operator.md +++ b/docs/k8s-cluster/nvidia-network-operator.md @@ -83,9 +83,9 @@ This playbook is developed and tested in following environments: gpu01 ansible_host=192.168.2.11 gpu02 ansible_host=192.168.3.11 ... - [kube-master] + [kube_control_plane] mgmt01 - [kube-node] + [kube_node] gpu01 gpu02 ``` @@ -123,7 +123,7 @@ This playbook is developed and tested in following environments: # NOTE: If SSH requires a password, add: `-k` # NOTE: If sudo on remote machine requires a password, add: `-K` # NOTE: If SSH user is different than current user, add: `-u ubuntu` - ansible-playbook -l k8s-cluster playbooks/k8s-cluster.yml + ansible-playbook -l k8s_cluster playbooks/k8s-cluster.yml ``` Please refer to [DeepOps Kubernetes Deployment Guidehere](https://github.com/NVIDIA/deepops/blob/master/docs/kubernetes-cluster.md) for more information. diff --git a/docs/k8s-cluster/roce-perf-k8s.md b/docs/k8s-cluster/roce-perf-k8s.md index b80a6f41c..6f5d12c1b 100644 --- a/docs/k8s-cluster/roce-perf-k8s.md +++ b/docs/k8s-cluster/roce-perf-k8s.md @@ -142,7 +142,7 @@ add switch PFC, ECN configuration ```bash # Modify the Ansible inventory file - # Especially the 'all', 'kube-master', 'etcd', 'kube-node' and 'k8s-cluster' sections + # Especially the 'all', 'kube_control_plane', 'etcd', 'kube_node' and 'k8s_cluster' sections vi config/inventory ``` @@ -159,10 +159,10 @@ add switch PFC, ECN configuration gpu02 ansible_host=192.168.2.11 ... - [kube-master] + [kube_control_plane] mgmt01 - [kube-node] + [kube_node] gpu01 gpu02 @@ -203,7 +203,7 @@ add switch PFC, ECN configuration # NOTE: If SSH requires a password, add: `-k` # NOTE: If sudo on remote machine requires a password, add: `-K` # NOTE: If SSH user is different than current user, add: `-u ubuntu` - ansible-playbook -l k8s-cluster playbooks/k8s-cluster.yml + ansible-playbook -l k8s_cluster playbooks/k8s-cluster.yml ``` Please refer to [DeepOps Kubernetes Deployment Guidehere](https://github.com/NVIDIA/deepops/blob/master/docs/kubernetes-cluster.md) for more information. @@ -252,7 +252,7 @@ add switch PFC, ECN configuration Run following script to deploy SRIOV RoCE functions: ```bash - nvidia@mgmt01:~/deepops_0322$ ansible-playbook -l k8s-cluster playbooks/k8s-cluster/roce.yaml + nvidia@mgmt01:~/deepops_0322$ ansible-playbook -l k8s_cluster playbooks/k8s-cluster/roce.yaml ``` If using a different username and SSH key-based authentication haven't set up, try to use `-u -k -K` when you run the script. diff --git a/docs/k8s-cluster/roce_backend.md b/docs/k8s-cluster/roce_backend.md index 254007217..aa5c9dba6 100644 --- a/docs/k8s-cluster/roce_backend.md +++ b/docs/k8s-cluster/roce_backend.md @@ -106,7 +106,7 @@ The Role installing following components: ## Role deployment ```bash -ansible-playbook -l k8s-cluster playbooks/k8s-cluster/roce.yaml +ansible-playbook -l k8s_cluster playbooks/k8s-cluster/roce.yaml ``` ## License diff --git a/docs/pxe/maas.md b/docs/pxe/maas.md index ca95671c9..1deb6dad1 100644 --- a/docs/pxe/maas.md +++ b/docs/pxe/maas.md @@ -258,8 +258,8 @@ only need to tag leaf groups. | Tag | Ansible Group | Used By | |-----|--------------|---------| -| `kube-master` | `[kube-master]` | K8s control plane | -| `kube-node` | `[kube-node]` | K8s worker nodes | +| `kube_control_plane` | `[kube_control_plane]` | K8s control plane | +| `kube_node` | `[kube_node]` | K8s worker nodes | | `slurm-master` | `[slurm-master]` | Slurm head node | | `slurm-node` | `[slurm-node]` | Slurm compute nodes | | `slurm-nfs` | `[slurm-nfs]` | Slurm NFS server | @@ -276,8 +276,8 @@ ansible-playbook -i scripts/maas_inventory.py playbooks/slurm-cluster.yml # Later, retag for K8s maas admin tag update-nodes slurm-master remove= -maas admin tag update-nodes kube-master add= -maas admin tag update-nodes kube-node add= add= +maas admin tag update-nodes kube_control_plane add= +maas admin tag update-nodes kube_node add= add= # Run K8s deployment ansible-playbook -i scripts/maas_inventory.py playbooks/k8s-cluster.yml diff --git a/playbooks/k8s-cluster.yml b/playbooks/k8s-cluster.yml index 742eed77d..983e9c750 100644 --- a/playbooks/k8s-cluster.yml +++ b/playbooks/k8s-cluster.yml @@ -282,7 +282,7 @@ ansible_become: no tasks: - name: Install Helm on admin node - command: "sh {{ playbook_dir }}/../scripts/k8s/install_helm.sh" + command: "bash {{ playbook_dir }}/../scripts/k8s/install_helm.sh" delegate_to: localhost - name: Globally update the deprecated "stable" helm repo command: "/usr/local/bin/helm repo add 'stable' 'https://charts.helm.sh/stable' --force-update" diff --git a/playbooks/k8s-cluster/nfs-client-provisioner.yml b/playbooks/k8s-cluster/nfs-client-provisioner.yml index 93592e85b..200f27f99 100644 --- a/playbooks/k8s-cluster/nfs-client-provisioner.yml +++ b/playbooks/k8s-cluster/nfs-client-provisioner.yml @@ -22,7 +22,7 @@ include_role: name: nfs vars: - - nfs_is_server: yes + nfs_is_server: yes when: k8s_deploy_nfs_server - hosts: "k8s_cluster" diff --git a/playbooks/nvidia-software/nvidia-cuda.yml b/playbooks/nvidia-software/nvidia-cuda.yml index f7e704e2e..7945f671c 100644 --- a/playbooks/nvidia-software/nvidia-cuda.yml +++ b/playbooks/nvidia-software/nvidia-cuda.yml @@ -13,6 +13,10 @@ include_role: name: facts + - name: configure Ubuntu NVIDIA driver packages + include_tasks: tasks/nvidia-driver-ubuntu-packages.yml + when: ansible_distribution == 'Ubuntu' + - name: install nvidia driver include_role: name: nvidia.nvidia_driver diff --git a/playbooks/nvidia-software/nvidia-driver.yml b/playbooks/nvidia-software/nvidia-driver.yml index c9a05b687..c445e5574 100644 --- a/playbooks/nvidia-software/nvidia-driver.yml +++ b/playbooks/nvidia-software/nvidia-driver.yml @@ -14,6 +14,10 @@ include_role: name: facts + - name: configure Ubuntu NVIDIA driver packages + include_tasks: tasks/nvidia-driver-ubuntu-packages.yml + when: ansible_distribution == 'Ubuntu' + - name: install nvidia driver include_role: name: nvidia.nvidia_driver diff --git a/playbooks/nvidia-software/tasks/nvidia-driver-ubuntu-packages.yml b/playbooks/nvidia-software/tasks/nvidia-driver-ubuntu-packages.yml new file mode 100644 index 000000000..1a987eac4 --- /dev/null +++ b/playbooks/nvidia-software/tasks/nvidia-driver-ubuntu-packages.yml @@ -0,0 +1,12 @@ +--- +- name: Select Ubuntu NVIDIA open kernel module packages + set_fact: + nvidia_driver_ubuntu_packages: + - "nvidia-headless-{{ _nvidia_driver_ubuntu_branch }}{{ _nvidia_driver_ubuntu_packages_suffix }}-open" + - "nvidia-utils-{{ _nvidia_driver_ubuntu_branch }}{{ _nvidia_driver_ubuntu_packages_suffix }}" + - "nvidia-headless-no-dkms-{{ _nvidia_driver_ubuntu_branch }}{{ _nvidia_driver_ubuntu_packages_suffix }}-open" + - "nvidia-kernel-source-{{ _nvidia_driver_ubuntu_branch }}{{ _nvidia_driver_ubuntu_packages_suffix }}-open" + vars: + _nvidia_driver_ubuntu_branch: "{{ nvidia_driver_ubuntu_branch | default(nvidia_driver_branch | default('515')) }}" + _nvidia_driver_ubuntu_packages_suffix: "{{ nvidia_driver_ubuntu_packages_suffix | default('-server') }}" + when: nvidia_driver_ubuntu_use_open_kernel_modules | default(false) | bool diff --git a/roles/alertmanager/defaults/main.yml b/roles/alertmanager/defaults/main.yml index bcbed59c7..8bfdbc49d 100644 --- a/roles/alertmanager/defaults/main.yml +++ b/roles/alertmanager/defaults/main.yml @@ -1,6 +1,6 @@ alertmanager_config_dir: /etc/alertmanager alertmanager_config_src: templates/alertmanager.yml.j2 -alertmanager_container: "prom/alertmanager:v0.23.0" +alertmanager_container: "prom/alertmanager:v0.32.1" alertmanager_svc_name: "docker.alertmanager.service" alertmanager_docker_volume_name: "deepops_alertmanager_metrics" alertmanager_state: started diff --git a/roles/grafana/defaults/main.yml b/roles/grafana/defaults/main.yml index 5156ce0c6..920a37943 100644 --- a/roles/grafana/defaults/main.yml +++ b/roles/grafana/defaults/main.yml @@ -2,7 +2,7 @@ grafana_config_dir: /etc/grafana grafana_config_template: templates/grafana.ini.j2 grafana_data_dir: /var/lib/grafana grafana_user_id: 472 -grafana_container: "grafana/grafana:8.5.10" +grafana_container: "grafana/grafana:13.0.1" grafana_svc_name: "docker.grafana.service" grafana_state: started grafana_enabled: yes diff --git a/roles/netapp-trident/README.md b/roles/netapp-trident/README.md index 87b5557c0..7911d097b 100644 --- a/roles/netapp-trident/README.md +++ b/roles/netapp-trident/README.md @@ -41,7 +41,7 @@ Example A: Example B: - name: "Deploy NetApp Trident" - hosts: kube-master + hosts: kube_control_plane become: true become_method: sudo roles: @@ -50,7 +50,7 @@ Example B: Example C: - name: "Deploy NetApp Trident" - hosts: kube-master + hosts: kube_control_plane become: true become_method: sudo vars_files: @@ -88,7 +88,7 @@ all: ip: 192.168.1.215 access_ip: 192.168.1.215 children: - kube-master: + kube_control_plane: hosts: mgmt01: mgmt02: diff --git a/roles/nginx-docker-registry-cache/defaults/main.yml b/roles/nginx-docker-registry-cache/defaults/main.yml index dba99e9ac..6824bc7d5 100644 --- a/roles/nginx-docker-registry-cache/defaults/main.yml +++ b/roles/nginx-docker-registry-cache/defaults/main.yml @@ -1,6 +1,6 @@ --- nginx_docker_cache_name: "deepops-nginx-docker-cache" -nginx_docker_cache_image: "rpardini/docker-registry-proxy:0.6.4" +nginx_docker_cache_image: "rpardini/docker-registry-proxy:0.6.5" nginx_docker_cache_mirror_path: "/opt/deepops/nginx-docker-cache/mirror" nginx_docker_cache_ca_path: "/opt/deepops/nginx-docker-cache/ca" diff --git a/roles/nvidia-gpu-operator/defaults/main.yml b/roles/nvidia-gpu-operator/defaults/main.yml index 0cd7dad47..ebecc5d60 100644 --- a/roles/nvidia-gpu-operator/defaults/main.yml +++ b/roles/nvidia-gpu-operator/defaults/main.yml @@ -12,7 +12,7 @@ gpu_operator_nvaie_helm_repo: "https://helm.ngc.nvidia.com/nvaie" gpu_operator_nvaie_chart_name: "nvaie/gpu-operator" # NVAIE GPU Operator may require different version, check NGC enterprise collection. -gpu_operator_chart_version: "v23.3.2" +gpu_operator_chart_version: "v26.3.1" k8s_gpu_mig_strategy: "mixed" @@ -33,7 +33,7 @@ gpu_operator_grid_config_dir: "{{ deepops_dir }}/gpu_operator" # Defaults from https://github.com/NVIDIA/gpu-operator/blob/master/deployments/gpu-operator/values.yaml gpu_operator_default_runtime: "containerd" gpu_operator_driver_registry: "nvcr.io/nvidia" -gpu_operator_driver_version: "525.105.17" +gpu_operator_driver_version: "580.126.20" # This enables/disables NVAIE gpu_operator_nvaie_enable: false diff --git a/roles/nvidia-k8s-gpu-device-plugin/defaults/main.yml b/roles/nvidia-k8s-gpu-device-plugin/defaults/main.yml index a1214ec17..021d9a3f2 100644 --- a/roles/nvidia-k8s-gpu-device-plugin/defaults/main.yml +++ b/roles/nvidia-k8s-gpu-device-plugin/defaults/main.yml @@ -2,6 +2,6 @@ k8s_gpu_plugin_helm_repo: "https://nvidia.github.io/k8s-device-plugin" k8s_gpu_plugin_chart_name: "nvdp/nvidia-device-plugin" k8s_gpu_plugin_release_name: "nvidia-device-plugin" -k8s_gpu_plugin_chart_version: "0.14.0" +k8s_gpu_plugin_chart_version: "0.19.1" k8s_gpu_plugin_init_error: "false" k8s_gpu_mig_strategy: "mixed" diff --git a/roles/nvidia-k8s-gpu-feature-discovery/defaults/main.yml b/roles/nvidia-k8s-gpu-feature-discovery/defaults/main.yml index d1cced31e..77111b3ff 100644 --- a/roles/nvidia-k8s-gpu-feature-discovery/defaults/main.yml +++ b/roles/nvidia-k8s-gpu-feature-discovery/defaults/main.yml @@ -1,6 +1,6 @@ # Vars needed to install feature discovery -k8s_gpu_feature_discovery_helm_repo: "https://nvidia.github.io/gpu-feature-discovery" +k8s_gpu_feature_discovery_helm_repo: "https://nvidia.github.io/k8s-device-plugin" k8s_gpu_feature_discovery_chart_name: "nvgfd/gpu-feature-discovery" k8s_gpu_feature_discovery_release_name: "gpu-feature-discovery" -k8s_gpu_feature_discovery_chart_version: "0.8.0" +k8s_gpu_feature_discovery_chart_version: "0.19.1" k8s_gpu_mig_strategy: "mixed" diff --git a/roles/nvidia-mig-manager/defaults/main.yml b/roles/nvidia-mig-manager/defaults/main.yml index 1139c4608..80d0c60f4 100644 --- a/roles/nvidia-mig-manager/defaults/main.yml +++ b/roles/nvidia-mig-manager/defaults/main.yml @@ -1,3 +1,3 @@ --- -mig_manager_url_deb: https://github.com/NVIDIA/mig-parted/releases/download/v0.4.2/nvidia-mig-manager_0.4.2-1_amd64.deb -mig_manager_url_rpm: https://github.com/NVIDIA/mig-parted/releases/download/v0.4.2/nvidia-mig-manager-0.4.2-1.x86_64.rpm +mig_manager_url_deb: https://github.com/NVIDIA/mig-parted/releases/download/v0.14.1/nvidia-mig-manager_0.14.1-1_amd64.deb +mig_manager_url_rpm: https://github.com/NVIDIA/mig-parted/releases/download/v0.14.1/nvidia-mig-manager-0.14.1-1.x86_64.rpm diff --git a/roles/nvidia-network-operator/tasks/main.yaml b/roles/nvidia-network-operator/tasks/main.yaml index d2c9e409e..09e7fa8dc 100644 --- a/roles/nvidia-network-operator/tasks/main.yaml +++ b/roles/nvidia-network-operator/tasks/main.yaml @@ -4,7 +4,7 @@ - name: label the nodes # noqa command-instead-of-shell shell: kubectl label --overwrite nodes {{ item }} node-role.kubernetes.io/worker= - with_items: "{{ groups['kube-node'] }}" + with_items: "{{ groups['kube_node'] }}" changed_when: false ## required as the DeepOps openshift role doesn't work @@ -20,7 +20,7 @@ - name: Deploy network operator helm chart kubernetes.core.helm: name: network-operator - release_namespace: network-operator + release_namespace: "{{ nvidia_network_operator_namespace }}" chart_version: "{{ nvidia_network_operator_version }}" chart_ref: mellanox/network-operator create_namespace: true @@ -28,7 +28,41 @@ wait: true values: "{{ lookup('template', 'values.yaml') | from_yaml }}" -- name: Create network node poliy +- name: Create NicClusterPolicy + kubernetes.core.k8s: + state: present + definition: "{{ lookup('template', 'nicclusterpolicy.yaml') | from_yaml }}" + run_once: true + +- name: Wait for NVIDIA IPAM CRD + kubernetes.core.k8s_info: + api_version: apiextensions.k8s.io/v1 + kind: CustomResourceDefinition + name: ippools.nv-ipam.nvidia.com + register: nvidia_ipam_crd + until: nvidia_ipam_crd.resources | length > 0 + retries: 30 + delay: 10 + when: nvidia_network_operator_ipam_type == "nv-ipam" + run_once: true + +- name: Create NVIDIA IPAM pools + kubernetes.core.k8s: + state: present + definition: + apiVersion: nv-ipam.nvidia.com/v1alpha1 + kind: IPPool + metadata: + name: "{{ item.ip_pool_name | default(item.res_name) }}" + namespace: "{{ nvidia_network_operator_namespace }}" + spec: + subnet: "{{ item.ip_addr }}" + perNodeBlockSize: "{{ nvidia_network_operator_ipam_per_node_block_size | int }}" + with_items: "{{ intf_resources }}" + when: nvidia_network_operator_ipam_type == "nv-ipam" + run_once: true + +- name: Create network node policy include_tasks: sriovnetworknodepolicy.yaml with_items: "{{ intf_resources }}" diff --git a/roles/nvidia-network-operator/tasks/sriovibnetwork.yaml b/roles/nvidia-network-operator/tasks/sriovibnetwork.yaml index fe6626fc7..ae08cde6b 100644 --- a/roles/nvidia-network-operator/tasks/sriovibnetwork.yaml +++ b/roles/nvidia-network-operator/tasks/sriovibnetwork.yaml @@ -9,14 +9,8 @@ networkNamespace: "default" ipam: | { - "type": "whereabouts", - "datastore": "kubernetes", - "kubernetes": { - "kubeconfig": "/etc/cni/net.d/whereabouts.d/whereabouts.kubeconfig" - }, - "range": "{{ item.ip_addr }}", - "log_file": "/var/log/whereabouts.log", - "log_level": "info" + "type": "{{ nvidia_network_operator_ipam_type }}", + "poolName": "{{ item.ip_pool_name | default(item.res_name) }}" } k8s: state: present diff --git a/roles/nvidia-network-operator/tasks/sriovnetworknodepolicy.yaml b/roles/nvidia-network-operator/tasks/sriovnetworknodepolicy.yaml index c41f16bcd..9cecfdbb1 100644 --- a/roles/nvidia-network-operator/tasks/sriovnetworknodepolicy.yaml +++ b/roles/nvidia-network-operator/tasks/sriovnetworknodepolicy.yaml @@ -7,7 +7,7 @@ deviceType: netdevice mtu: {{ mtu |int }} nodeSelector: - feature.node.kubernetes.io/network-sriov.capable: "true" + feature.node.kubernetes.io/pci-{{ vendor_id }}.present: "true" nicSelector: vendor: "{{ vendor_id }}" pfNames: ["{{ item.pf_name }}"] diff --git a/roles/nvidia-network-operator/templates/nicclusterpolicy.yaml b/roles/nvidia-network-operator/templates/nicclusterpolicy.yaml new file mode 100644 index 000000000..81834316b --- /dev/null +++ b/roles/nvidia-network-operator/templates/nicclusterpolicy.yaml @@ -0,0 +1,22 @@ +--- +apiVersion: mellanox.com/v1alpha1 +kind: NicClusterPolicy +metadata: + name: nic-cluster-policy +spec: + secondaryNetwork: + cniPlugins: + image: plugins + repository: nvcr.io/nvidia/mellanox + version: {{ nvidia_network_operator_image_tag }} + multus: + image: multus-cni + repository: nvcr.io/nvidia/mellanox + version: {{ nvidia_network_operator_image_tag }} +{% if nvidia_network_operator_ipam_type == "nv-ipam" %} + nvIpam: + image: nvidia-k8s-ipam + repository: nvcr.io/nvidia/mellanox + version: {{ nvidia_network_operator_image_tag }} + enableWebhook: false +{% endif %} diff --git a/roles/nvidia-network-operator/templates/values.yaml b/roles/nvidia-network-operator/templates/values.yaml index 9de7fbab2..4b63ac7b5 100644 --- a/roles/nvidia-network-operator/templates/values.yaml +++ b/roles/nvidia-network-operator/templates/values.yaml @@ -7,21 +7,3 @@ nfd: enabled: true sriovNetworkOperator: enabled: true - -# NicClusterPolicy CR values: -deployCR: true -ofedDriver: - deploy: false -rdmaSharedDevicePlugin: - deploy: false -sriovDevicePlugin: - deploy: false - -secondaryNetwork: - deploy: true - multus: - deploy: true - cniPlugins: - deploy: true - ipamPlugin: - deploy: true diff --git a/roles/nvidia-network-operator/vars/main.yaml b/roles/nvidia-network-operator/vars/main.yaml index 2d3322081..7be8ee253 100644 --- a/roles/nvidia-network-operator/vars/main.yaml +++ b/roles/nvidia-network-operator/vars/main.yaml @@ -6,8 +6,12 @@ # if_name must match k8s network annotation name # -nvidia_network_operator_version: "1.2.0" -nvidia_network_operator_url: "https://mellanox.github.io/network-operator" +nvidia_network_operator_version: "26.1.1" +nvidia_network_operator_image_tag: "network-operator-v{{ nvidia_network_operator_version }}" +nvidia_network_operator_namespace: "network-operator" +nvidia_network_operator_ipam_type: "nv-ipam" +nvidia_network_operator_ipam_per_node_block_size: "{{ num_vf }}" +nvidia_network_operator_url: "https://helm.ngc.nvidia.com/nvidia" mpi_operator_version: "v2beta1" mpi_raw_url: "https://raw.githubusercontent.com/kubeflow/mpi-operator/master/deploy/v2beta1" diff --git a/roles/prometheus-node-exporter/defaults/main.yml b/roles/prometheus-node-exporter/defaults/main.yml index 9d6116c8e..925f4baee 100644 --- a/roles/prometheus-node-exporter/defaults/main.yml +++ b/roles/prometheus-node-exporter/defaults/main.yml @@ -1,4 +1,4 @@ -node_exporter_container: "quay.io/prometheus/node-exporter:v1.3.1" +node_exporter_container: "quay.io/prometheus/node-exporter:v1.11.1" node_exporter_prom_dir: "/run/prometheus" node_exporter_svc_name: "docker.node-exporter.service" node_exporter_state: started diff --git a/roles/prometheus/defaults/main.yml b/roles/prometheus/defaults/main.yml index 6e7afbd21..7349da1b3 100644 --- a/roles/prometheus/defaults/main.yml +++ b/roles/prometheus/defaults/main.yml @@ -1,7 +1,7 @@ prometheus_config_dir: /etc/prometheus prometheus_config_src: templates/prometheus.yml.j2 prometheus_alert_rules_src: templates/alert_rules.yml.j2 -prometheus_container: "prom/prometheus:v2.37.0" +prometheus_container: "prom/prometheus:v3.11.3" prometheus_svc_name: "docker.prometheus.service" prometheus_docker_volume_name: "deepops_prometheus_metrics" prometheus_state: started diff --git a/roles/requirements.yml b/roles/requirements.yml index b6a0795aa..77f97ef7b 100644 --- a/roles/requirements.yml +++ b/roles/requirements.yml @@ -56,9 +56,9 @@ roles: - src: robertdebock.kibana version: "1.2.6" -- src: https://github.com/DeepOps/ansible-maas.git +- src: https://github.com/mrlesmithjr/ansible-maas.git name: ansible-maas - version: '632fe9bd1e048b9abb717621dc2d76b19614327b' + version: '178a999c9bfc979ef32c42f4f59c034664df10d0' - src: https://github.com/DeepOps/ansible-role-chrony name: DeepOps.chrony diff --git a/roles/slurm/defaults/main.yml b/roles/slurm/defaults/main.yml index 4573301db..735ae16ee 100644 --- a/roles/slurm/defaults/main.yml +++ b/roles/slurm/defaults/main.yml @@ -7,7 +7,7 @@ hwloc_build_dir: /opt/deepops/build/hwloc pmix_build_dir: /opt/deepops/build/pmix slurm_workflow_build: yes -slurm_version: "23.02.4" +slurm_version: "25.11.6" slurm_src_url: "https://download.schedmd.com/slurm/slurm-{{ slurm_version }}.tar.bz2" slurm_build_make_clean: no slurm_build_dir_cleanup: no diff --git a/roles/slurm/tasks/controller.yml b/roles/slurm/tasks/controller.yml index 733e32823..8ebe48fac 100644 --- a/roles/slurm/tasks/controller.yml +++ b/roles/slurm/tasks/controller.yml @@ -149,16 +149,30 @@ - name: create account command: sacctmgr -i add account compute-account Description="Compute Accounts" Organization="Prestige" register: create_account_result - failed_when: "create_account_result.rc != 0 and 'Nothing new added' not in create_account_result.stdout" - changed_when: "'Nothing new added' not in create_account_result.stdout" + failed_when: > + create_account_result.rc != 0 and + 'Nothing new added' not in create_account_result.stdout and + 'Nothing added' not in create_account_result.stdout and + 'Already existing' not in create_account_result.stdout + changed_when: > + 'Nothing new added' not in create_account_result.stdout and + 'Nothing added' not in create_account_result.stdout and + 'Already existing' not in create_account_result.stdout environment: PATH: '{{ slurm_install_prefix }}/bin:{{ ansible_env.PATH }}' - name: create users command: sacctmgr -i create user {{ item }} account=compute-account adminlevel=None register: create_user_result - failed_when: "create_user_result.rc != 0 and 'Nothing new added' not in create_user_result.stdout" - changed_when: "'Nothing new added' not in create_user_result.stdout" + failed_when: > + create_user_result.rc != 0 and + 'Nothing new added' not in create_user_result.stdout and + 'Nothing added' not in create_user_result.stdout and + 'Already existing' not in create_user_result.stdout + changed_when: > + 'Nothing new added' not in create_user_result.stdout and + 'Nothing added' not in create_user_result.stdout and + 'Already existing' not in create_user_result.stdout with_items: - "{{ user }}" environment: diff --git a/roles/slurm/tasks/login-compute-setup.yml b/roles/slurm/tasks/login-compute-setup.yml index 69c2769a9..2644c9bc3 100644 --- a/roles/slurm/tasks/login-compute-setup.yml +++ b/roles/slurm/tasks/login-compute-setup.yml @@ -9,6 +9,7 @@ systemctl set-property sshd.service DeviceAllow="/dev/nvidiactl" fi args: + executable: /bin/bash creates: "{{ '/etc/systemd/system.control/sshd.service.d/50-DeviceAllow.conf' \ if ansible_os_family == 'RedHat' else \ '/etc/systemd/system.control/ssh.service.d/50-DeviceAllow.conf' }}" diff --git a/roles/spack/defaults/main.yml b/roles/spack/defaults/main.yml index 26540b781..76e356c63 100644 --- a/roles/spack/defaults/main.yml +++ b/roles/spack/defaults/main.yml @@ -1,7 +1,7 @@ --- spack_repo: "https://github.com/spack/spack.git" spack_install_dir: "/sw/spack" -spack_version: "v0.18.1" +spack_version: "v1.1.1" spack_user: "root" spack_group: "root" diff --git a/roles/standalone-container-registry/defaults/main.yml b/roles/standalone-container-registry/defaults/main.yml index 383f6e9e4..562524707 100644 --- a/roles/standalone-container-registry/defaults/main.yml +++ b/roles/standalone-container-registry/defaults/main.yml @@ -2,7 +2,7 @@ epel_package: "https://dl.fedoraproject.org/pub/epel/epel-release-latest-{{ ansible_distribution_major_version }}.noarch.rpm" epel_key_url: "https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-{{ ansible_distribution_major_version }}" -standalone_container_registry_image: "registry:2.8" +standalone_container_registry_image: "registry:3.1.1" standalone_container_registry_port: "5000" standalone_container_registry_name: "deepops-registry" diff --git a/scripts/k8s/deploy_ingress.sh b/scripts/k8s/deploy_ingress.sh index f311f4d84..b1ad2143e 100755 --- a/scripts/k8s/deploy_ingress.sh +++ b/scripts/k8s/deploy_ingress.sh @@ -9,7 +9,7 @@ ROOT_DIR="${SCRIPT_DIR}/../.." source ${ROOT_DIR}/scripts/common.sh HELM_CHARTS_REPO_INGRESS="${HELM_CHARTS_REPO_INGRESS:-https://kubernetes.github.io/ingress-nginx}" -HELM_INGRESS_CHART_VERSION="${HELM_INGRESS_CHART_VERSION:-4.2.1}" +HELM_INGRESS_CHART_VERSION="${HELM_INGRESS_CHART_VERSION:-4.15.1}" # HELM_INGRESS_CONFIG, defaults below based on presence of metallb ${SCRIPT_DIR}/k8s/install_helm.sh diff --git a/scripts/k8s/deploy_monitoring.sh b/scripts/k8s/deploy_monitoring.sh index d73fdc5f5..05a5851a7 100755 --- a/scripts/k8s/deploy_monitoring.sh +++ b/scripts/k8s/deploy_monitoring.sh @@ -24,7 +24,7 @@ if [ ! -d "${DEEPOPS_CONFIG_DIR}" ]; then fi HELM_CHARTS_REPO_PROMETHEUS="${HELM_CHARTS_REPO_PROMETHEUS:-https://prometheus-community.github.io/helm-charts}" -HELM_PROMETHEUS_CHART_VERSION="${HELM_PROMETHEUS_CHART_VERSION:-39.5.0}" +HELM_PROMETHEUS_CHART_VERSION="${HELM_PROMETHEUS_CHART_VERSION:-85.0.3}" ingress_name="ingress-nginx" PROMETHEUS_YAML_CONFIG="${PROMETHEUS_YAML_CONFIG:-${DEEPOPS_CONFIG_DIR}/helm/monitoring.yml}" diff --git a/scripts/k8s/install_helm.sh b/scripts/k8s/install_helm.sh index 1efd217de..b3c2bb4e7 100755 --- a/scripts/k8s/install_helm.sh +++ b/scripts/k8s/install_helm.sh @@ -35,7 +35,7 @@ case "$ID" in ;; esac -helm_version=$(helm version --short) +helm_version=$(helm version --short 2>/dev/null || true) helm_min_installed=$(echo -e "${HELM_MINIMUM_VERSION}\n${helm_version}"| sort -V | head -n 1) if [ "${HELM_MINIMUM_VERSION}" != "${helm_min_installed}" ]; then if [ "${helm_version}" != "" ]; then @@ -49,5 +49,7 @@ if [ "${HELM_MINIMUM_VERSION}" != "${helm_min_installed}" ]; then HELM_INSTALL_DIR=${HELM_INSTALL_DIR} DESIRED_VERSION=v3.17.1 /var/tmp/get_helm.sh # Should match: config/group_vars/k8s-cluster.yml:helm_version: fi +sudo chmod 0755 "${HELM_INSTALL_DIR}/helm" + # Display the helm version for better debug helm version diff --git a/submodules/kubespray b/submodules/kubespray index f4ccdb5e7..1c9add489 160000 --- a/submodules/kubespray +++ b/submodules/kubespray @@ -1 +1 @@ -Subproject commit f4ccdb5e72395eaf9f3444056ebd1a6625ddb89a +Subproject commit 1c9add48975060f45396b34d8e022c30d7f80dab diff --git a/virtual/vars_files/virt_k8s.yml b/virtual/vars_files/virt_k8s.yml index c393e9a3e..089f52add 100644 --- a/virtual/vars_files/virt_k8s.yml +++ b/virtual/vars_files/virt_k8s.yml @@ -1,3 +1,3 @@ --- container_registry_persistence_enabled: false -rsyslog_client_tcp_host: "{{ groups['kube-master'][0] }}" +rsyslog_client_tcp_host: "{{ groups['kube_control_plane'][0] }}" diff --git a/virtual/virtual_inventory b/virtual/virtual_inventory index 71e0b72ba..d5a39bb8d 100644 --- a/virtual/virtual_inventory +++ b/virtual/virtual_inventory @@ -15,19 +15,19 @@ virtual-gpu01 ansible_host=10.0.0.6 ip=10.0.0.6 ###### # KUBERNETES ###### -[kube-master] +[kube_control_plane] virtual-mgmt01 [etcd] virtual-mgmt01 -[kube-node] +[kube_node] virtual-mgmt01 virtual-gpu01 -[k8s-cluster:children] -kube-master -kube-node +[k8s_cluster:children] +kube_control_plane +kube_node ###### # SLURM diff --git a/virtual/virtual_inventory_full b/virtual/virtual_inventory_full index f639e0e78..1f1c70b27 100644 --- a/virtual/virtual_inventory_full +++ b/virtual/virtual_inventory_full @@ -19,7 +19,7 @@ virtual-gpu02 ansible_host=10.0.0.7 ip=10.0.0.7 ###### # KUBERNETES ###### -[kube-master] +[kube_control_plane] virtual-mgmt01 virtual-mgmt02 virtual-mgmt03 @@ -29,16 +29,16 @@ virtual-mgmt01 virtual-mgmt02 virtual-mgmt03 -[kube-node] +[kube_node] virtual-mgmt01 virtual-mgmt02 virtual-mgmt03 virtual-gpu01 virtual-gpu02 -[k8s-cluster:children] -kube-master -kube-node +[k8s_cluster:children] +kube_control_plane +kube_node ###### # SLURM