From 021be8aced9e3b52f760b7e709e412989a807815 Mon Sep 17 00:00:00 2001 From: Shiva Kumar Date: Wed, 6 May 2026 13:55:32 +0530 Subject: [PATCH 1/2] Precompiled: Ubuntu26.04 driver container support Signed-off-by: Shiva Kumar (SW-CLOUD) --- .common-ci.yml | 19 + .github/workflows/precompiled.yaml | 38 +- .gitlab-ci.yml | 9 + .nvidia-ci.yml | 47 +++ Makefile | 25 +- base/Dockerfile | 36 ++ multi-arch.mk | 1 + tests/holodeck_ubuntu26.04.yaml | 34 ++ ubuntu26.04/precompiled/Dockerfile | 62 +++ ubuntu26.04/precompiled/local-repo.sh | 118 ++++++ ubuntu26.04/precompiled/nvidia-driver | 566 ++++++++++++++++++++++++++ 11 files changed, 942 insertions(+), 13 deletions(-) create mode 100644 tests/holodeck_ubuntu26.04.yaml create mode 100644 ubuntu26.04/precompiled/Dockerfile create mode 100755 ubuntu26.04/precompiled/local-repo.sh create mode 100755 ubuntu26.04/precompiled/nvidia-driver diff --git a/.common-ci.yml b/.common-ci.yml index cff2a09c9..1a81f71ef 100644 --- a/.common-ci.yml +++ b/.common-ci.yml @@ -110,6 +110,14 @@ trigger-pipeline: KERNEL_FLAVOR: [aws, azure, azure-fde, generic, nvidia, oracle] LTS_KERNEL: ["6.8"] +# Define the matrix of precompiled jobs that can be run in parallel for ubuntu26.04 +.driver-versions-precompiled-ubuntu26.04: + parallel: + matrix: + - DRIVER_BRANCH: [580] + KERNEL_FLAVOR: [aws, azure, azure-fde, generic, nvidia, oracle] + LTS_KERNEL: ["6.14"] + .dist-ubuntu22.04: variables: DIST: ubuntu22.04 @@ -411,3 +419,14 @@ release:staging-precompiled-ubuntu24.04: - .release:staging-precompiled needs: - image-precompiled-ubuntu24.04 + +# Precompiled Ubuntu26.04 release +release:staging-precompiled-ubuntu26.04: + variables: + DIST: signed_ubuntu26.04 + BASE_TARGET: questing + extends: + - .driver-versions-precompiled-ubuntu26.04 + - .release:staging-precompiled + needs: + - image-precompiled-ubuntu26.04 diff --git a/.github/workflows/precompiled.yaml b/.github/workflows/precompiled.yaml index dadc11975..517baf2fc 100644 --- 
a/.github/workflows/precompiled.yaml +++ b/.github/workflows/precompiled.yaml @@ -49,12 +49,12 @@ jobs: echo "kernel_flavors=$kernel_flavors_json" >> $GITHUB_OUTPUT # get ubuntu distributions - DIST=("ubuntu22.04" "ubuntu24.04") + DIST=("ubuntu22.04" "ubuntu24.04" "ubuntu26.04") dist_json=$(printf '%s\n' "${DIST[@]}" | jq -R . | jq -cs .) echo "dist=$dist_json" >> $GITHUB_OUTPUT # LTS_KERNEL setup - LTS_KERNEL=("5.15" "6.8") + LTS_KERNEL=("5.15" "6.8" "6.14") lts_kernel_json=$(printf '%s\n' "${LTS_KERNEL[@]}" | jq -R . | jq -cs .) echo "lts_kernel=$lts_kernel_json" >> $GITHUB_OUTPUT @@ -70,8 +70,14 @@ jobs: exclude: - dist: ubuntu24.04 driver_branch: 535 + - dist: ubuntu26.04 + driver_branch: 535 - lts_kernel: 5.15 dist: ubuntu24.04 + - lts_kernel: 5.15 + dist: ubuntu26.04 + - lts_kernel: 6.8 + dist: ubuntu26.04 - flavor: azure-fde dist: ubuntu22.04 steps: @@ -113,6 +119,8 @@ jobs: BASE_TARGET="jammy" elif [[ "${{ matrix.dist }}" == "ubuntu24.04" ]]; then BASE_TARGET="noble" + elif [[ "${{ matrix.dist }}" == "ubuntu26.04" ]]; then + BASE_TARGET="questing" fi make DRIVER_BRANCH=${{ matrix.driver_branch }} KERNEL_FLAVOR=${{ matrix.flavor }} LTS_KERNEL=${LTS_KERNEL} build-base-${BASE_TARGET} @@ -143,6 +151,8 @@ jobs: BASE_TARGET="jammy" elif [[ "${{ matrix.dist }}" == "ubuntu24.04" ]]; then BASE_TARGET="noble" + elif [[ "${{ matrix.dist }}" == "ubuntu26.04" ]]; then + BASE_TARGET="questing" fi tar -cvf kernel-version-${{ matrix.driver_branch }}-${KERNEL_VERSION}-${DIST}.tar kernel_version.txt docker save "${PRIVATE_REGISTRY}/nvidia/driver:base-${BASE_TARGET}-${LTS_KERNEL}-${{ matrix.flavor }}-${{ matrix.driver_branch }}" \ @@ -183,6 +193,10 @@ jobs: exclude: - lts_kernel: 5.15 dist: ubuntu24.04 + - lts_kernel: 5.15 + dist: ubuntu26.04 + - lts_kernel: 6.8 + dist: ubuntu26.04 needs: - precompiled-build-image - set-driver-version-matrix @@ -214,8 +228,8 @@ jobs: driver_branch_json='${{ needs.set-driver-version-matrix.outputs.driver_branch }}' DRIVER_BRANCHES=($(echo 
"$driver_branch_json" | jq -r '.[]')) - # remove 535 driver branch for ubuntu24.04 - if [ "$DIST" == "ubuntu24.04" ]; then + # remove 535 driver branch for newer Ubuntu precompiled distros + if [[ "$DIST" == "ubuntu24.04" || "$DIST" == "ubuntu26.04" ]]; then DRIVER_BRANCHES=($(for branch in "${DRIVER_BRANCHES[@]}"; do [[ $branch != "535" ]] && echo "$branch" done)) @@ -401,8 +415,8 @@ jobs: rc=0 # for precompiled driver we are setting driver branch as driver version DRIVER_BRANCHES=(${{ env.DRIVER_BRANCHES }}) - # remove 535 driver branch for ubuntu24.04 - if [ "$DIST" == "ubuntu24.04" ]; then + # remove 535 driver branch for newer Ubuntu precompiled distros + if [[ "$DIST" == "ubuntu24.04" || "$DIST" == "ubuntu26.04" ]]; then DRIVER_BRANCHES=($(for branch in "${DRIVER_BRANCHES[@]}"; do [[ $branch != "535" ]] && echo "$branch" done)) @@ -457,14 +471,14 @@ jobs: echo "PRIVATE_REGISTRY=ghcr.io" >> $GITHUB_ENV - name: Download base image artifact - if: ${{ ! (matrix.driver_branch == 535 && contains(matrix.kernel_version, 'ubuntu24.04')) }} + if: ${{ ! (matrix.driver_branch == 535 && (contains(matrix.kernel_version, 'ubuntu24.04') || contains(matrix.kernel_version, 'ubuntu26.04'))) }} uses: actions/download-artifact@v8 with: name: base-images-${{ matrix.driver_branch }}-${{ matrix.kernel_version }} path: ./ - name: Publish base image - if: ${{ ! (matrix.driver_branch == 535 && contains(matrix.kernel_version, 'ubuntu24.04')) }} + if: ${{ ! 
(matrix.driver_branch == 535 && (contains(matrix.kernel_version, 'ubuntu24.04') || contains(matrix.kernel_version, 'ubuntu26.04'))) }} run: | LTS_KERNEL=$(echo "${{ matrix.kernel_version }}" | sed -E 's/^([0-9]+\.[0-9]+)\..*/\1/') KERNEL_FLAVOR=$(echo "${{ matrix.kernel_version }}" | sed -E 's/^[0-9]+\.[0-9]+\.[0-9]+-[0-9]+-(.*)-ubuntu[0-9]+\.[0-9]+$/\1/') @@ -473,6 +487,8 @@ jobs: BASE_TARGET="jammy" elif [[ "${DIST}" == "ubuntu24.04" ]]; then BASE_TARGET="noble" + elif [[ "${DIST}" == "ubuntu26.04" ]]; then + BASE_TARGET="questing" fi image_path="./base-images-${{ matrix.driver_branch }}-${{ matrix.kernel_version }}.tar" echo "uploading $image_path" @@ -484,14 +500,14 @@ jobs: fi - name: Download built image artifact - if: ${{ ! (matrix.driver_branch == 535 && contains(matrix.kernel_version, 'ubuntu24.04')) }} + if: ${{ ! (matrix.driver_branch == 535 && (contains(matrix.kernel_version, 'ubuntu24.04') || contains(matrix.kernel_version, 'ubuntu26.04'))) }} uses: actions/download-artifact@v8 with: name: driver-images-${{ matrix.driver_branch }}-${{ matrix.kernel_version }} path: ./ - name: Publish image - if: ${{ ! (matrix.driver_branch == 535 && contains(matrix.kernel_version, 'ubuntu24.04')) }} + if: ${{ ! (matrix.driver_branch == 535 && (contains(matrix.kernel_version, 'ubuntu24.04') || contains(matrix.kernel_version, 'ubuntu26.04'))) }} run: | image_path="./driver-images-${{ matrix.driver_branch }}-${{ matrix.kernel_version }}.tar" echo "uploading $image_path" @@ -503,7 +519,7 @@ jobs: fi - name: Slack notification - if: ${{ ! (matrix.driver_branch == 535 && contains(matrix.kernel_version, 'ubuntu24.04')) && github.ref == 'refs/heads/main' }} + if: ${{ ! 
(matrix.driver_branch == 535 && (contains(matrix.kernel_version, 'ubuntu24.04') || contains(matrix.kernel_version, 'ubuntu26.04'))) && github.ref == 'refs/heads/main' }} uses: slackapi/slack-github-action@v3.0.3 with: token: ${{ secrets.SLACK_BOT_TOKEN }} diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 15db285a4..01a368592 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -137,3 +137,12 @@ image-precompiled-ubuntu24.04: extends: - .driver-versions-precompiled-ubuntu24.04 - .image-build-precompiled + +image-precompiled-ubuntu26.04: + variables: + DIST: signed_ubuntu26.04 + BASE_TARGET: questing + CVE_UPDATES: "curl libc6" + extends: + - .driver-versions-precompiled-ubuntu26.04 + - .image-build-precompiled diff --git a/.nvidia-ci.yml b/.nvidia-ci.yml index 6703a64f7..56e5aa87e 100644 --- a/.nvidia-ci.yml +++ b/.nvidia-ci.yml @@ -112,6 +112,20 @@ image-precompiled-ubuntu24.04: - .driver-versions-precompiled-ubuntu24.04 - .image-pull-generic +image-precompiled-ubuntu26.04: + variables: + DIST: signed_ubuntu26.04 + BASE_TARGET: questing + PRECOMPILED: "true" + CVE_UPDATES: "curl libc6" + rules: + - if: $CI_PIPELINE_SOURCE == "schedule" + when: delayed + start_in: 30 minutes + extends: + - .driver-versions-precompiled-ubuntu26.04 + - .image-pull-generic + .image-pull-ubuntu22.04: # Perform for each DRIVER_VERSION extends: @@ -271,6 +285,18 @@ image-rocky9: - !reference [.scan-rules-common, rules] - !reference [.precompiled-rules, rules] +.scan-precompiled-ubuntu26.04: + variables: + DIST: signed_ubuntu26.04 + BASE_TARGET: questing + PRECOMPILED: "true" + extends: + - .driver-versions-precompiled-ubuntu26.04 + - .scan-generic + rules: + - !reference [.scan-rules-common, rules] + - !reference [.precompiled-rules, rules] + .scan-precompiled-ubuntu22.04: variables: DIST: signed_ubuntu22.04 @@ -324,6 +350,15 @@ scan-precompiled-ubuntu24.04-amd64: needs: - image-precompiled-ubuntu24.04 +scan-precompiled-ubuntu26.04-amd64: + variables: + PLATFORM: linux/amd64 + extends: + 
- .scan-precompiled-ubuntu26.04 + - .platform-amd64 + needs: + - image-precompiled-ubuntu26.04 + scan-precompiled-ubuntu22.04-amd64: variables: PLATFORM: linux/amd64 @@ -434,6 +469,18 @@ release:ngc-precompiled-ubuntu24.04: rules: - !reference [.precompiled-rules, rules] +release:ngc-precompiled-ubuntu26.04: + variables: + DIST: signed_ubuntu26.04 + BASE_TARGET: questing + PRECOMPILED: "true" + extends: + - .driver-versions-precompiled-ubuntu26.04 + - .release-generic + - .release:ngc-variables + rules: + - !reference [.precompiled-rules, rules] + release:ngc-precompiled-ubuntu22.04: variables: DIST: signed_ubuntu22.04 diff --git a/Makefile b/Makefile index 0d71239d0..d7e077875 100644 --- a/Makefile +++ b/Makefile @@ -54,10 +54,10 @@ OUT_IMAGE_TAG = $(OUT_IMAGE_VERSION)-$(OUT_DIST) OUT_IMAGE = $(OUT_IMAGE_NAME):$(OUT_IMAGE_TAG) ##### Public rules ##### -DISTRIBUTIONS := ubuntu22.04 ubuntu24.04 signed_ubuntu22.04 signed_ubuntu24.04 rhel8 rhel9 rhel10 rocky9 precompiled_rhcos +DISTRIBUTIONS := ubuntu22.04 ubuntu24.04 signed_ubuntu22.04 signed_ubuntu24.04 signed_ubuntu26.04 rhel8 rhel9 rhel10 rocky9 precompiled_rhcos RHCOS_VERSIONS := rhcos4.14 rhcos4.15 rhcos4.16 rhcos4.17 rhcos4.18 rhel9.6 PUSH_TARGETS := $(patsubst %, push-%, $(DISTRIBUTIONS)) -BASE_FROM := noble jammy focal +BASE_FROM := questing noble jammy focal PUSH_TARGETS := $(patsubst %, push-%, $(DISTRIBUTIONS)) VGPU_GUEST_DRIVER_PUSH_TARGETS := $(patsubst %, push-vgpuguest-%, $(DISTRIBUTIONS) $(RHCOS_VERSIONS)) VGPU_HOST_DRIVER_PUSH_TARGETS := $(patsubst %, push-vgpuhost-%, $(DISTRIBUTIONS) $(RHCOS_VERSIONS)) @@ -98,6 +98,10 @@ pull-signed_ubuntu24.04%: DIST = ubuntu24.04 pull-signed_ubuntu24.04%: DRIVER_TAG = $(DRIVER_BRANCH) pull-signed_ubuntu24.04%: IMAGE_TAG = $(DRIVER_BRANCH)-$(KERNEL_VERSION)-$(DIST) +pull-signed_ubuntu26.04%: DIST = ubuntu26.04 +pull-signed_ubuntu26.04%: DRIVER_TAG = $(DRIVER_BRANCH) +pull-signed_ubuntu26.04%: IMAGE_TAG = $(DRIVER_BRANCH)-$(KERNEL_VERSION)-$(DIST) + PLATFORM ?= 
linux/amd64 $(DRIVER_PULL_TARGETS): pull-%: $(DOCKER) pull "--platform=$(PLATFORM)" "$(IMAGE)" @@ -116,6 +120,10 @@ archive-signed_ubuntu24.04%: DIST = ubuntu24.04 archive-signed_ubuntu24.04%: DRIVER_TAG = $(DRIVER_BRANCH) archive-signed_ubuntu24.04%: IMAGE_TAG = $(if $(VERSION),$(VERSION)-)$(DRIVER_BRANCH)-$(KERNEL_VERSION)-$(DIST) +archive-signed_ubuntu26.04%: DIST = ubuntu26.04 +archive-signed_ubuntu26.04%: DRIVER_TAG = $(DRIVER_BRANCH) +archive-signed_ubuntu26.04%: IMAGE_TAG = $(if $(VERSION),$(VERSION)-)$(DRIVER_BRANCH)-$(KERNEL_VERSION)-$(DIST) + $(DRIVER_ARCHIVE_TARGETS): archive-%: $(DOCKER) save "$(IMAGE)" -o "archive.tar" @@ -139,6 +147,11 @@ push-signed_ubuntu24.04%: DRIVER_TAG = $(DRIVER_BRANCH) push-signed_ubuntu24.04%: IMAGE_TAG = $(if $(VERSION),$(VERSION)-)$(DRIVER_BRANCH)-$(KERNEL_VERSION)-$(DIST) push-signed_ubuntu24.04%: OUT_IMAGE_TAG = $(if $(VERSION),$(VERSION)-)$(DRIVER_BRANCH)-$(KERNEL_VERSION)-$(DIST) +push-signed_ubuntu26.04%: DIST = ubuntu26.04 +push-signed_ubuntu26.04%: DRIVER_TAG = $(DRIVER_BRANCH) +push-signed_ubuntu26.04%: IMAGE_TAG = $(if $(VERSION),$(VERSION)-)$(DRIVER_BRANCH)-$(KERNEL_VERSION)-$(DIST) +push-signed_ubuntu26.04%: OUT_IMAGE_TAG = $(if $(VERSION),$(VERSION)-)$(DRIVER_BRANCH)-$(KERNEL_VERSION)-$(DIST) + # $(DRIVER_BUILD_TARGETS) is in the form of build-$(DIST)-$(DRIVER_VERSION) # Parse the target to set the required variables. build-%: DIST = $(word 2,$(subst -, ,$@)) @@ -185,6 +198,14 @@ build-signed_ubuntu24.04%: DRIVER_TAG = $(DRIVER_BRANCH) build-signed_ubuntu24.04%: IMAGE_TAG = $(if $(VERSION),$(VERSION)-)$(DRIVER_BRANCH)-$(KERNEL_VERSION)-$(DIST) build-signed_ubuntu24.04%: DOCKER_BUILD_ARGS = --build-arg KERNEL_VERSION="$(KERNEL_VERSION)" +# ubuntu26.04 Precompiled Driver +build-signed_ubuntu26.04%: DIST = ubuntu26.04 +build-signed_ubuntu26.04%: SUBDIR = . 
+build-signed_ubuntu26.04%: DOCKERFILE = $(CURDIR)/ubuntu26.04/precompiled/Dockerfile +build-signed_ubuntu26.04%: DRIVER_TAG = $(DRIVER_BRANCH) +build-signed_ubuntu26.04%: IMAGE_TAG = $(if $(VERSION),$(VERSION)-)$(DRIVER_BRANCH)-$(KERNEL_VERSION)-$(DIST) +build-signed_ubuntu26.04%: DOCKER_BUILD_ARGS = --build-arg KERNEL_VERSION="$(KERNEL_VERSION)" + # base is an image used to poll Canonical for the latest kernel version # LTS_KERNEL must be defined in the environment when invoking this target. LTS_KERNEL ?= "" diff --git a/base/Dockerfile b/base/Dockerfile index 123eae7fd..7fe297bb0 100644 --- a/base/Dockerfile +++ b/base/Dockerfile @@ -1,3 +1,39 @@ +# Ubuntu 26.04 +FROM ubuntu:questing-20260410 AS questing + +SHELL ["/bin/bash", "-c"] + +ARG DRIVER_BRANCH +ARG KERNEL_FLAVOR +ARG LTS_KERNEL +ENV DRIVER_BRANCH=${DRIVER_BRANCH} +ENV KERNEL_FLAVOR=${KERNEL_FLAVOR} +ENV LTS_KERNEL=${LTS_KERNEL} + +RUN rm -f /etc/apt/sources.list.d/cuda* + +RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections + +ENV NVIDIA_VISIBLE_DEVICES=void + +RUN apt-get update && apt-get install -y --no-install-recommends \ + apt-utils git curl && \ + rm -rf /var/lib/apt/lists/* + +RUN echo "deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ questing main universe" > /etc/apt/sources.list && \ + echo "deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ questing-updates main universe" >> /etc/apt/sources.list && \ + echo "deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ questing-security main universe" >> /etc/apt/sources.list && \ + echo "deb [arch=amd64] http://us.archive.ubuntu.com/ubuntu questing-updates main restricted" >> /etc/apt/sources.list && \ + echo "deb [arch=amd64] http://us.archive.ubuntu.com/ubuntu questing-security main restricted" >> /etc/apt/sources.list && \ + usermod -o -u 0 -g 0 _apt + +COPY generate-ci-config /usr/local/bin/generate-ci-config + +RUN chmod +x /usr/local/bin/generate-ci-config && \ + generate-ci-config + +ENTRYPOINT 
["/usr/bin/sleep","1000"] + # Ubuntu 24.04 FROM nvcr.io/nvidia/cuda:13.2.1-base-ubuntu24.04 AS noble diff --git a/multi-arch.mk b/multi-arch.mk index d5201eb70..c0f1e5cea 100644 --- a/multi-arch.mk +++ b/multi-arch.mk @@ -26,3 +26,4 @@ $(DRIVER_PUSH_TARGETS): push-%: # No multi-arch support for the following distributions build-signed_ubuntu22.04%: DOCKER_BUILD_PLATFORM_OPTIONS = --platform=linux/amd64 build-signed_ubuntu24.04%: DOCKER_BUILD_PLATFORM_OPTIONS = --platform=linux/amd64 +build-signed_ubuntu26.04%: DOCKER_BUILD_PLATFORM_OPTIONS = --platform=linux/amd64 diff --git a/tests/holodeck_ubuntu26.04.yaml b/tests/holodeck_ubuntu26.04.yaml new file mode 100644 index 000000000..bcf2599ce --- /dev/null +++ b/tests/holodeck_ubuntu26.04.yaml @@ -0,0 +1,34 @@ +apiVersion: holodeck.nvidia.com/v1alpha1 +kind: Environment +metadata: + name: HOLODECK_NAME + description: "end-to-end test infrastructure" +spec: + provider: aws + auth: + keyName: cnt-ci + privateKey: HOLODECK_PRIVATE_KEY + instance: + type: g4dn.xlarge + region: us-west-1 + ingressIpRanges: + - 18.190.12.32/32 + - 3.143.46.93/32 + - 52.15.119.136/32 + - 35.155.108.162/32 + - 35.162.190.51/32 + - 54.201.61.24/32 + - 52.24.205.48/32 + - 44.235.4.62/32 + - 44.230.241.223/32 + os: ubuntu-26.04 + image: + architecture: amd64 + containerRuntime: + install: true + name: containerd + kubernetes: + install: true + installer: kubeadm + version: v1.33.0 + crictlVersion: v1.33.0 diff --git a/ubuntu26.04/precompiled/Dockerfile b/ubuntu26.04/precompiled/Dockerfile new file mode 100644 index 000000000..77c22bd4a --- /dev/null +++ b/ubuntu26.04/precompiled/Dockerfile @@ -0,0 +1,62 @@ +ARG BASE_IMAGE=ubuntu:questing-20260410 +FROM ${BASE_IMAGE} + +ENV DEBIAN_FRONTEND=noninteractive + +ARG BASE_URL=https://us.download.nvidia.com/tesla +ARG TARGETARCH +ENV TARGETARCH=$TARGETARCH +ARG DRIVER_BRANCH=580 +ENV DRIVER_BRANCH=$DRIVER_BRANCH +ARG DRIVER_VERSION=580.126.20 +ENV DRIVER_VERSION=$DRIVER_VERSION + +ARG 
KERNEL_VERSION=6.14.0-15-generic +ENV KERNEL_VERSION=$KERNEL_VERSION + +ENV NVIDIA_VISIBLE_DEVICES=void + +RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections + +RUN dpkg --add-architecture i386 && \ + apt-get update && apt-get install -y --no-install-recommends \ + apt-utils \ + build-essential \ + ca-certificates \ + curl \ + kmod \ + file \ + libelf-dev \ + libglvnd-dev \ + pkg-config && \ + rm -rf /var/lib/apt/lists/* + +# Fetch GPG keys for CUDA repo +RUN rm -f /etc/apt/sources.list.d/cuda* && \ + curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2604/x86_64/cuda-keyring_1.1-1_all.deb -o cuda-keyring_1.1-1_all.deb && \ + dpkg -i cuda-keyring_1.1-1_all.deb && \ + rm -f cuda-keyring_1.1-1_all.deb + +RUN usermod -o -u 0 -g 0 _apt + +# Install / upgrade packages here that are required to resolve CVEs +ARG CVE_UPDATES +RUN if [ -n "${CVE_UPDATES}" ]; then \ + apt-get update && apt-get --only-upgrade -y install ${CVE_UPDATES} && \ + rm -rf /var/lib/apt/lists/*; \ + fi + +COPY ubuntu26.04/precompiled/nvidia-driver /usr/local/bin + +ADD ubuntu26.04/precompiled/local-repo.sh /tmp + +RUN mkdir -p /usr/local/repos && \ + /tmp/local-repo.sh download_driver_package_deps && \ + /tmp/local-repo.sh build_local_apt_repo && \ + /tmp/local-repo.sh fetch_nvidia_installer && \ + # Remove all other ubuntu apt sources to ensure we only pull from the local apt repo + rm /etc/apt/sources.list.d/* + +WORKDIR /drivers + +ENTRYPOINT ["nvidia-driver", "init"] diff --git a/ubuntu26.04/precompiled/local-repo.sh b/ubuntu26.04/precompiled/local-repo.sh new file mode 100755 index 000000000..48e18453d --- /dev/null +++ b/ubuntu26.04/precompiled/local-repo.sh @@ -0,0 +1,118 @@ +#!/usr/bin/env bash + +set -eu + +LOCAL_REPO_DIR=/usr/local/repos +DRIVER_ARCH=${TARGETARCH/amd64/x86_64} && DRIVER_ARCH=${DRIVER_ARCH/arm64/aarch64} +DRIVER_RUN_FILE=NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION + +download_apt_with_dep () { + local package_name="$1" + 
local package_version + if [ $# -gt 1 ] && [ -n "$2" ]; then + package_version="$2" + apt-get download "${package_name}=${package_version}" + else + apt-get download "${package_name}" + fi + + dependent_pkgs=$(apt-cache depends --recurse --no-recommends --no-suggests --no-conflicts --no-breaks --no-replaces --no-enhances "$package_name" | grep "^\w" | grep -v "$package_name" | sort -u) + if [ -n "$dependent_pkgs" ]; then + apt-get download $dependent_pkgs + fi +} + +nvlink5_pkgs_download() { + if [ "$DRIVER_BRANCH" -ge "570" ]; then + download_apt_with_dep nvlsm + download_apt_with_dep infiniband-diags + fi +} + +nvsdm_download() { + if [ "$TARGETARCH" = "amd64" ]; then + if [ "$DRIVER_BRANCH" -ge "580" ]; then + download_apt_with_dep libnvsdm ${DRIVER_VERSION}* + elif [ "$DRIVER_BRANCH" -ge "560" ]; then + download_apt_with_dep libnvsdm-${DRIVER_BRANCH} ${DRIVER_VERSION}* + fi + fi +} + +fabricmanager_download() { + if [ "$DRIVER_BRANCH" -ge "580" ]; then + download_apt_with_dep nvidia-fabricmanager ${DRIVER_VERSION}* + else + download_apt_with_dep nvidia-fabricmanager-${DRIVER_BRANCH} ${DRIVER_VERSION}* + fi +} + +nscq_download() { + if [ "$DRIVER_BRANCH" -ge "580" ]; then + download_apt_with_dep libnvidia-nscq ${DRIVER_VERSION}* + else + download_apt_with_dep libnvidia-nscq-${DRIVER_BRANCH} ${DRIVER_VERSION}* + fi +} + +imex_download() { + if [ "$DRIVER_BRANCH" -ge "580" ]; then + download_apt_with_dep nvidia-imex ${DRIVER_VERSION}* + elif [ "$DRIVER_BRANCH" -ge "550" ]; then + download_apt_with_dep nvidia-imex-${DRIVER_BRANCH} ${DRIVER_VERSION}* + fi +} + +download_driver_package_deps () { + apt-get update + pushd ${LOCAL_REPO_DIR} + + download_apt_with_dep linux-objects-nvidia-${DRIVER_BRANCH}-server-${KERNEL_VERSION} + download_apt_with_dep linux-signatures-nvidia-${KERNEL_VERSION} + download_apt_with_dep linux-modules-nvidia-${DRIVER_BRANCH}-server-${KERNEL_VERSION} + download_apt_with_dep linux-modules-nvidia-${DRIVER_BRANCH}-server-open-${KERNEL_VERSION} + 
download_apt_with_dep nvidia-utils-${DRIVER_BRANCH}-server + download_apt_with_dep nvidia-headless-no-dkms-${DRIVER_BRANCH}-server + download_apt_with_dep libnvidia-decode-${DRIVER_BRANCH}-server + download_apt_with_dep libnvidia-extra-${DRIVER_BRANCH}-server + download_apt_with_dep libnvidia-encode-${DRIVER_BRANCH}-server + download_apt_with_dep libnvidia-fbc1-${DRIVER_BRANCH}-server + download_apt_with_dep libnvidia-gl-${DRIVER_BRANCH}-server + + fabricmanager_download + nscq_download + nvlink5_pkgs_download + imex_download + nvsdm_download + + ls -al . + popd +} + +build_local_apt_repo () { + pushd ${LOCAL_REPO_DIR} + dpkg-scanpackages . /dev/null | gzip -9c | tee Packages.gz > /dev/null + echo "deb [trusted=yes] file:${LOCAL_REPO_DIR} ./" > /etc/apt/sources.list + popd + apt-get update +} + +fetch_nvidia_installer () { + curl -fSsl -O $BASE_URL/$DRIVER_VERSION/$DRIVER_RUN_FILE.run + chmod +x $DRIVER_RUN_FILE.run + sh $DRIVER_RUN_FILE.run -x + mv $DRIVER_RUN_FILE/nvidia-installer /usr/bin/ + rm -rf $DRIVER_RUN_FILE + rm $DRIVER_RUN_FILE.run +} + +if [ "$1" = "download_driver_package_deps" ]; then + download_driver_package_deps +elif [ "$1" = "build_local_apt_repo" ]; then + build_local_apt_repo +elif [ "$1" = "fetch_nvidia_installer" ]; then + fetch_nvidia_installer +else + echo "Unknown function: $1" + exit 1 +fi diff --git a/ubuntu26.04/precompiled/nvidia-driver b/ubuntu26.04/precompiled/nvidia-driver new file mode 100755 index 000000000..eb887b2d8 --- /dev/null +++ b/ubuntu26.04/precompiled/nvidia-driver @@ -0,0 +1,566 @@ +#! /bin/bash +# Copyright (c) 2018-2022, NVIDIA CORPORATION. All rights reserved. 
+ +set -eu + +KERNEL_VERSION=$(uname -r) +RUN_DIR=/run/nvidia +PID_FILE=${RUN_DIR}/${0##*/}.pid +DRIVER_BRANCH=${DRIVER_BRANCH:?"Missing driver version"} +GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}" +USE_HOST_MOFED="${USE_HOST_MOFED:-false}" +NVIDIA_MODULE_PARAMS=() +NVIDIA_UVM_MODULE_PARAMS=() +NVIDIA_MODESET_MODULE_PARAMS=() +NVIDIA_PEERMEM_MODULE_PARAMS=() +TARGETARCH=${TARGETARCH:?"Missing TARGETARCH env"} +KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE:-auto} +MODPROBE_CONFIG_DIR="/etc/modprobe.d" + + +fabricmanager_install() { + local fabricmanager_package_name + if [ "$DRIVER_BRANCH" -ge "580" ]; then + fabricmanager_package_name=nvidia-fabricmanager + else + fabricmanager_package_name=nvidia-fabricmanager-${DRIVER_BRANCH} + fi + apt-get install -y --no-install-recommends ${fabricmanager_package_name}=${DRIVER_VERSION}* + apt-mark hold ${fabricmanager_package_name} +} + +nscq_install() { + local nscq_package_name + if [ "$DRIVER_BRANCH" -ge "580" ]; then + nscq_package_name=libnvidia-nscq + else + nscq_package_name=libnvidia-nscq-${DRIVER_BRANCH} + fi + apt-get install -y --no-install-recommends ${nscq_package_name}=${DRIVER_VERSION}* + apt-mark hold ${nscq_package_name} +} + +imex_install() { + local imex_package_name + if [ "$DRIVER_BRANCH" -ge "580" ]; then + imex_package_name=nvidia-imex + elif [ "$DRIVER_BRANCH" -ge "550" ]; then + imex_package_name=nvidia-imex-${DRIVER_BRANCH} + else + return 0 + fi + apt-get install -y --no-install-recommends ${imex_package_name}=${DRIVER_VERSION}* + apt-mark hold ${imex_package_name} +} + +nvlink5_pkgs_install() { + if [ "$DRIVER_BRANCH" -ge "570" ]; then + apt-get install -y --no-install-recommends nvlsm + apt-get install -y --no-install-recommends infiniband-diags + fi +} + +# libnvsdm packages are not available for arm64 +nvsdm_install() { + local nvsdm_package_name + if [ "$TARGETARCH" = "amd64" ]; then + if [ "$DRIVER_BRANCH" -ge "580" ]; then + nvsdm_package_name=libnvsdm + elif [ "$DRIVER_BRANCH" -ge 
"560" ]; then + nvsdm_package_name=libnvsdm-${DRIVER_BRANCH} + else + return 0 + fi + apt-get install -y --no-install-recommends ${nvsdm_package_name}=${DRIVER_VERSION}* + apt-mark hold ${nvsdm_package_name} + fi +} + +_update_package_cache() { + if [ "${PACKAGE_TAG:-}" != "builtin" ]; then + echo "Updating the package cache..." + if ! apt-get -qq update; then + echo "ERROR: Failed to update package cache. "\ + "Ensure that the cluster can access the proper networks." + exit 1 + fi + fi +} + +_assert_nvswitch_system() { + [ -d /proc/driver/nvidia-nvswitch/devices ] || return 1 + if [ -z "$(ls -A /proc/driver/nvidia-nvswitch/devices)" ]; then + return 1 + fi + return 0 +} + +_assert_nvlink5_system() ( + for dir in /sys/class/infiniband/*/device; do + # Define the path to the VPD file + vpd_file="$dir/vpd" + + # Check if the VPD file exists + if [ -f "$vpd_file" ]; then + # Search for 'SW_MNG' in the VPD file + if grep -q "SW_MNG" "$vpd_file"; then + echo "Detected NVLink5+ system" + return 0 + fi + fi + done + return 1 +) + +_ensure_nvlink5_prerequisites() ( + until lsmod | grep mlx5_core > /dev/null 2>&1 && lsmod | grep ib_umad > /dev/null 2>&1; + do + echo "waiting for the mlx5_core and ib_umad kernel modules to be loaded" + sleep 10 + done +) + +# Check if mellanox devices are present +_mellanox_devices_present() { + devices_found=0 + for dev in /sys/bus/pci/devices/*; do + read vendor < $dev/vendor + if [ "$vendor" = "0x15b3" ]; then + echo "Mellanox device found at $(basename $dev)" + return 0 + fi + done + echo "No Mellanox devices were found..." + return 1 +} + +_gpu_direct_rdma_enabled() { + if [ "${GPU_DIRECT_RDMA_ENABLED}" = "true" ]; then + # check if mellanox cards are present + if _mellanox_devices_present; then + return 0 + fi + fi + return 1 +} + +# For each kernel module configuration file mounted into the container, +# parse the file contents and extract the custom module parameters that +# are to be passed as input to 'modprobe'. 
+# +# Assumptions: +# - Configuration files are named .conf (i.e. nvidia.conf, nvidia-uvm.conf). +# - Configuration files are mounted inside the container at /drivers. +# - Each line in the file contains at least one parameter, where parameters on the same line +# are space delimited. It is up to the user to properly format the file to ensure +# the correct set of parameters are passed to 'modprobe'. +_get_module_params() { + local base_path="/drivers" + # nvidia + if [ -f "${base_path}/nvidia.conf" ]; then + while IFS="" read -r param || [ -n "$param" ]; do + NVIDIA_MODULE_PARAMS+=("$param") + done <"${base_path}/nvidia.conf" + echo "Module parameters provided for nvidia: ${NVIDIA_MODULE_PARAMS[@]}" + fi + # nvidia-uvm + if [ -f "${base_path}/nvidia-uvm.conf" ]; then + while IFS="" read -r param || [ -n "$param" ]; do + NVIDIA_UVM_MODULE_PARAMS+=("$param") + done <"${base_path}/nvidia-uvm.conf" + echo "Module parameters provided for nvidia-uvm: ${NVIDIA_UVM_MODULE_PARAMS[@]}" + fi + # nvidia-modeset + if [ -f "${base_path}/nvidia-modeset.conf" ]; then + while IFS="" read -r param || [ -n "$param" ]; do + NVIDIA_MODESET_MODULE_PARAMS+=("$param") + done <"${base_path}/nvidia-modeset.conf" + echo "Module parameters provided for nvidia-modeset: ${NVIDIA_MODESET_MODULE_PARAMS[@]}" + fi + # nvidia-peermem + if [ -f "${base_path}/nvidia-peermem.conf" ]; then + while IFS="" read -r param || [ -n "$param" ]; do + NVIDIA_PEERMEM_MODULE_PARAMS+=("$param") + done <"${base_path}/nvidia-peermem.conf" + echo "Module parameters provided for nvidia-peermem: ${NVIDIA_PEERMEM_MODULE_PARAMS[@]}" + fi +} + +_create_module_params_conf() { + echo "Parsing kernel module parameters..." 
+ _get_module_params + + if [ ${#NVIDIA_MODULE_PARAMS[@]} -gt 0 ]; then + echo "Configuring nvidia module parameters in ${MODPROBE_CONFIG_DIR}/nvidia.conf" + echo "options nvidia ${NVIDIA_MODULE_PARAMS[@]}" > ${MODPROBE_CONFIG_DIR}/nvidia.conf + fi + if [ ${#NVIDIA_UVM_MODULE_PARAMS[@]} -gt 0 ]; then + echo "Configuring nvidia-uvm module parameters in ${MODPROBE_CONFIG_DIR}/nvidia-uvm.conf" + echo "options nvidia-uvm ${NVIDIA_UVM_MODULE_PARAMS[@]}" > ${MODPROBE_CONFIG_DIR}/nvidia-uvm.conf + fi + if [ ${#NVIDIA_MODESET_MODULE_PARAMS[@]} -gt 0 ]; then + echo "Configuring nvidia-modeset module parameters in ${MODPROBE_CONFIG_DIR}/nvidia-modeset.conf" + echo "options nvidia-modeset ${NVIDIA_MODESET_MODULE_PARAMS[@]}" > ${MODPROBE_CONFIG_DIR}/nvidia-modeset.conf + fi + if [ ${#NVIDIA_PEERMEM_MODULE_PARAMS[@]} -gt 0 ]; then + echo "Configuring nvidia-peermem module parameters in ${MODPROBE_CONFIG_DIR}/nvidia-peermem.conf" + echo "options nvidia-peermem ${NVIDIA_PEERMEM_MODULE_PARAMS[@]}" > ${MODPROBE_CONFIG_DIR}/nvidia-peermem.conf + fi +} + +# Load the kernel modules and start persistenced. +_load_driver() { + local nv_fw_search_path="$RUN_DIR/driver/lib/firmware" + local set_fw_path="true" + local fw_path_config_file="/sys/module/firmware_class/parameters/path" + for param in "${NVIDIA_MODULE_PARAMS[@]}"; do + if [[ "$param" == "NVreg_EnableGpuFirmware=0" ]]; then + set_fw_path="false" + fi + done + + if [[ "$set_fw_path" == "true" ]]; then + echo "Configuring the following firmware search path in '$fw_path_config_file': $nv_fw_search_path" + if [[ ! -z $(grep '[^[:space:]]' $fw_path_config_file) ]]; then + echo "WARNING: A search path is already configured in $fw_path_config_file" + echo " Retaining the current configuration" + else + echo -n "$nv_fw_search_path" > $fw_path_config_file || echo "WARNING: Failed to configure firmware search path" + fi + fi + + echo "Loading ipmi and i2c_core kernel modules..." 
+ modprobe -a i2c_core ipmi_msghandler ipmi_devintf + + echo "Loading NVIDIA driver kernel modules..." + set -o xtrace +o nounset + modprobe nvidia + modprobe nvidia-uvm + modprobe nvidia-modeset + set +o xtrace -o nounset + + + if _gpu_direct_rdma_enabled; then + echo "Loading NVIDIA Peer Memory kernel module..." + set -o xtrace +o nounset + modprobe nvidia-peermem + set +o xtrace -o nounset + fi + + echo "Starting NVIDIA persistence daemon..." + nvidia-persistenced --persistence-mode + + DRIVER_VERSION=$(nvidia-smi -q | grep "Driver Version" | awk -F: '{print $2}' | xargs) + if _assert_nvlink5_system; then + _ensure_nvlink5_prerequisites || return 1 + + echo "Installing NVIDIA fabric manager, libnvsdm and nvlsm packages..." + nvlink5_pkgs_install + fabricmanager_install + nvsdm_install + imex_install + + echo "Starting NVIDIA fabric manager daemon for NVLink5+..." + + fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg + fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid + nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf + nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid + /usr/bin/nvidia-fabricmanager-start.sh --mode start \ + --fm-config-file $fm_config_file \ + --fm-pid-file $fm_pid_file \ + --nvlsm-config-file $nvlsm_config_file \ + --nvlsm-pid-file $nvlsm_pid_file + + # If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches + elif _assert_nvswitch_system; then + echo "Installing NVIDIA fabric manager and libnvidia NSCQ packages..." + fabricmanager_install + nscq_install + imex_install + + echo "Starting NVIDIA fabric manager daemon..." + nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg + fi + + return 0 +} + +# Stop persistenced and unload the kernel modules if they are currently loaded. 
+# Stop the NVIDIA userspace daemons (persistenced, fabric manager, NVLink SM)
+# and unload the driver kernel modules. Returns 1 without unloading anything
+# if a daemon refuses to exit or the core 'nvidia' module is still in use.
+_unload_driver() {
+    local rmmod_args=()
+    local nvidia_deps=0
+    local nvidia_refs=0
+    local nvidia_uvm_refs=0
+    local nvidia_modeset_refs=0
+    local nvidia_peermem_refs=0
+
+    # Stop nvidia-persistenced if its pid file exists; SIGTERM then poll
+    # up to ~5s (50 x 0.1s) for the process to go away.
+    if [ -f /var/run/nvidia-persistenced/nvidia-persistenced.pid ]; then
+        echo "Stopping NVIDIA persistence daemon..."
+        local pid=$(< /var/run/nvidia-persistenced/nvidia-persistenced.pid)
+
+        kill -SIGTERM "${pid}"
+        for i in $(seq 1 50); do
+            kill -0 "${pid}" 2> /dev/null || break
+            sleep 0.1
+        done
+        if [ $i -eq 50 ]; then
+            echo "Could not stop NVIDIA persistence daemon" >&2
+            return 1
+        fi
+    fi
+
+    # Same SIGTERM-and-poll pattern for the fabric manager daemon.
+    if [ -f /var/run/nvidia-fabricmanager/nv-fabricmanager.pid ]; then
+        echo "Stopping NVIDIA fabric manager daemon..."
+        local pid=$(< /var/run/nvidia-fabricmanager/nv-fabricmanager.pid)
+
+        kill -SIGTERM "${pid}"
+        for i in $(seq 1 50); do
+            kill -0 "${pid}" 2> /dev/null || break
+            sleep 0.1
+        done
+        if [ $i -eq 50 ]; then
+            echo "Could not stop NVIDIA fabric manager daemon" >&2
+            return 1
+        fi
+    fi
+
+    # ...and for the NVLink Subnet Manager daemon (NVLink5+ systems).
+    if [ -f /var/run/nvidia-fabricmanager/nvlsm.pid ]; then
+        echo "Stopping NVLink Subnet Manager daemon..."
+        local pid=$(< /var/run/nvidia-fabricmanager/nvlsm.pid)
+
+        kill -SIGTERM "${pid}"
+        for i in $(seq 1 50); do
+            kill -0 "${pid}" 2> /dev/null || break
+            sleep 0.1
+        done
+        if [ $i -eq 50 ]; then
+            echo "Could not stop NVLink Subnet Manager daemon" >&2
+            return 1
+        fi
+    fi
+
+    echo "Unloading NVIDIA driver kernel modules..."
+    # Collect the loaded nvidia modules in unload order (dependents before
+    # the core module) and count how many of them pin the core 'nvidia'
+    # module. NOTE(review): only nvidia_refs/nvidia_deps are consumed below;
+    # the per-module *_refs captures appear unused — presumably kept for
+    # debugging. Also nvidia_drm_refs is not declared local — confirm intent.
+    if [ -f /sys/module/nvidia_drm/refcnt ]; then
+        nvidia_drm_refs=$(< /sys/module/nvidia_drm/refcnt)
+        rmmod_args+=("nvidia-drm")
+        ((++nvidia_deps))
+    fi
+    if [ -f /sys/module/nvidia_modeset/refcnt ]; then
+        nvidia_modeset_refs=$(< /sys/module/nvidia_modeset/refcnt)
+        rmmod_args+=("nvidia-modeset")
+        ((++nvidia_deps))
+    fi
+    if [ -f /sys/module/nvidia_uvm/refcnt ]; then
+        nvidia_uvm_refs=$(< /sys/module/nvidia_uvm/refcnt)
+        rmmod_args+=("nvidia-uvm")
+        ((++nvidia_deps))
+    fi
+    if [ -f /sys/module/nvidia_peermem/refcnt ]; then
+        nvidia_peermem_refs=$(< /sys/module/nvidia_peermem/refcnt)
+        rmmod_args+=("nvidia-peermem")
+        ((++nvidia_deps))
+    fi
+    if [ -f /sys/module/nvidia/refcnt ]; then
+        nvidia_refs=$(< /sys/module/nvidia/refcnt)
+        rmmod_args+=("nvidia")
+    fi
+    # The core module is busy if it holds more references than its dependent
+    # modules account for, i.e. some process still has the device open.
+    if [ ${nvidia_refs} -gt ${nvidia_deps} ]; then
+        # run lsmod to debug module usage
+        lsmod | grep nvidia
+        echo "Could not unload NVIDIA driver kernel modules, driver is in use" >&2
+        return 1
+    fi
+
+    if [ ${#rmmod_args[@]} -gt 0 ]; then
+        rmmod ${rmmod_args[@]}
+    fi
+    return 0
+}
+
+# Fallback module-type selection keyed off the driver branch alone:
+# branches before 560 default to the proprietary modules, 560+ to open.
+_resolve_kernel_type_from_driver_branch() {
+    [[ "${DRIVER_BRANCH}" -lt 560 ]] && KERNEL_TYPE=kernel || KERNEL_TYPE=kernel-open
+}
+
+# _resolve_kernel_type determines which kernel module type, open or proprietary, to install.
+# This function assumes that the nvidia-installer binary is in the PATH, so this function
+# should only be invoked after the userspace driver components have been installed.
+#
+# KERNEL_MODULE_TYPE is the frontend interface that users can use to configure which module
+# to install. Valid values for KERNEL_MODULE_TYPE are 'auto' (default), 'open', and 'proprietary'.
+# When 'auto' is configured, we use the nvidia-installer to recommend the module type to install.
+# Resolve KERNEL_TYPE ('kernel' or 'kernel-open') from KERNEL_MODULE_TYPE.
+# Returns 1 on an unrecognized KERNEL_MODULE_TYPE value.
+_resolve_kernel_type() {
+    if [ "${KERNEL_MODULE_TYPE}" == "proprietary" ]; then
+        KERNEL_TYPE=kernel
+    elif [ "${KERNEL_MODULE_TYPE}" == "open" ]; then
+        KERNEL_TYPE=kernel-open
+    elif [ "${KERNEL_MODULE_TYPE}" == "auto" ]; then
+        # Ask nvidia-installer for a recommendation; fall back to the
+        # branch-based heuristic if the query fails.
+        kernel_module_type=$(nvidia-installer --print-recommended-kernel-module-type 2> /dev/null)
+        if [ $? -ne 0 ]; then
+            echo "failed to retrieve the recommended kernel module type from nvidia-installer, falling back to using the driver branch"
+            _resolve_kernel_type_from_driver_branch
+            return 0
+        fi
+        [[ "${kernel_module_type}" == "open" ]] && KERNEL_TYPE=kernel-open || KERNEL_TYPE=kernel
+    else
+        echo "invalid value for the KERNEL_MODULE_TYPE variable: ${KERNEL_MODULE_TYPE}"
+        return 1
+    fi
+}
+
+# Link and install the kernel modules from precompiled packages.
+# Installs the userspace driver stack, then the Canonical-signed precompiled
+# kernel module packages matching KERNEL_VERSION and KERNEL_TYPE.
+_install_driver() {
+    # Install necessary driver userspace packages
+    apt-get install -y --no-install-recommends \
+        nvidia-utils-${DRIVER_BRANCH}-server \
+        nvidia-headless-no-dkms-${DRIVER_BRANCH}-server \
+        libnvidia-decode-${DRIVER_BRANCH}-server \
+        libnvidia-extra-${DRIVER_BRANCH}-server \
+        libnvidia-encode-${DRIVER_BRANCH}-server \
+        libnvidia-fbc1-${DRIVER_BRANCH}-server \
+        libnvidia-gl-${DRIVER_BRANCH}-server
+
+    # Now install the precompiled kernel module packages signed by Canonical
+    if [ "$KERNEL_TYPE" = "kernel-open" ]; then
+        echo "Installing Open NVIDIA driver kernel modules..."
+        apt-get install --no-install-recommends -y \
+            linux-signatures-nvidia-${KERNEL_VERSION} \
+            linux-modules-nvidia-${DRIVER_BRANCH}-server-open-${KERNEL_VERSION}
+    else
+        echo "Installing Closed NVIDIA driver kernel modules..."
+        apt-get install --no-install-recommends -y \
+            linux-objects-nvidia-${DRIVER_BRANCH}-server-${KERNEL_VERSION} \
+            linux-signatures-nvidia-${KERNEL_VERSION} \
+            linux-modules-nvidia-${DRIVER_BRANCH}-server-${KERNEL_VERSION}
+    fi
+}
+
+# Mount the driver rootfs into the run directory with the exception of sysfs.
+_mount_rootfs() {
+    echo "Mounting NVIDIA driver rootfs..."
+    # Keep /sys out of the rbind below by making it runbindable+private.
+    mount --make-runbindable /sys
+    mount --make-private /sys
+    mkdir -p ${RUN_DIR}/driver
+    mount --rbind / ${RUN_DIR}/driver
+}
+
+# Unmount the driver rootfs from the run directory.
+_unmount_rootfs() {
+    echo "Unmounting NVIDIA driver rootfs..."
+    # Lazy, recursive unmount only if the mount is actually present.
+    if findmnt -r -o TARGET | grep "${RUN_DIR}/driver" > /dev/null; then
+        umount -l -R ${RUN_DIR}/driver
+    fi
+}
+
+# Main entrypoint: take the single-instance lock, clean up any previously
+# loaded driver, install and load the precompiled driver, expose the rootfs
+# under RUN_DIR/driver, then block forever waiting for a shutdown signal.
+init() {
+    # Determine the kernel module type
+    _resolve_kernel_type || exit 1
+
+    echo -e "\n========== NVIDIA Software Installer ==========\n"
+    echo -e "Starting installation of NVIDIA driver branch ${DRIVER_BRANCH} for Linux kernel version ${KERNEL_VERSION}\n"
+
+    # fd 3 on PID_FILE doubles as a single-instance lock.
+    exec 3> ${PID_FILE}
+    if ! flock -n 3; then
+        echo "An instance of the NVIDIA driver is already running, aborting"
+        exit 1
+    fi
+    echo $$ >&3
+
+    trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM
+    trap "_shutdown" EXIT
+
+    _unload_driver || exit 1
+    _unmount_rootfs
+
+    # NOTE(review): _update_package_cache and _create_module_params_conf are
+    # defined elsewhere in this script (outside this chunk).
+    _update_package_cache
+
+    _create_module_params_conf
+    _install_driver
+    _load_driver || exit 1
+    _mount_rootfs
+
+    # Park on a background 'sleep infinity'; signals re-enter _shutdown and
+    # kill the sleeper, while spurious wait wake-ups just loop.
+    echo "Done, now waiting for signal"
+    sleep infinity &
+    trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM
+    trap - EXIT
+    while true; do wait $! || continue; done
+    exit 0
+}
+
+# Tear down the driver: unload modules, unmount the rootfs, remove the pid
+# file. Returns non-zero (leaving state intact) if the driver is still in use.
+_shutdown() {
+    if _unload_driver; then
+        _unmount_rootfs
+        rm -f ${PID_FILE}
+        return 0
+    fi
+    return 1
+}
+
+# Wait for MOFED drivers to be loaded and load nvidia-peermem whenever it gets unloaded during MOFED driver updates
+reload_nvidia_peermem() {
+    if [ "$USE_HOST_MOFED" = "true" ]; then
+        # Host-provided MOFED: poll lsmod for mlx5_core plus the nvidia module.
+        until lsmod | grep mlx5_core > /dev/null 2>&1 && [ -f /sys/module/nvidia/refcnt ];
+        do
+            echo "waiting for mellanox ofed and nvidia drivers to be installed"
+            sleep 10
+        done
+    else
+        # use driver readiness flag created by MOFED container
+        until [ -f /run/mellanox/drivers/.driver-ready ] && [ -f /sys/module/nvidia/refcnt ];
+        do
+            echo "waiting for mellanox ofed and nvidia drivers to be installed"
+            sleep 10
+        done
+    fi
+    # get any parameters provided for nvidia-peermem
+    # NOTE(review): _get_module_params is defined elsewhere in this script and
+    # presumably populates NVIDIA_PEERMEM_MODULE_PARAMS — confirm.
+    _get_module_params && set +o nounset
+    if chroot /run/nvidia/driver modprobe nvidia-peermem "${NVIDIA_PEERMEM_MODULE_PARAMS[@]}"; then
+        if [ -f /sys/module/nvidia_peermem/refcnt ]; then
+            echo "successfully loaded nvidia-peermem module, now waiting for signal"
+            sleep inf
+            trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM
+        fi
+    fi
+    echo "failed to load nvidia-peermem module"
+    exit 1
+}
+
+# probe by gpu-operator for liveness/startup checks for nvidia-peermem module to be loaded when MOFED drivers are ready
+probe_nvidia_peermem() {
+    if lsmod | grep mlx5_core > /dev/null 2>&1; then
+        if [ ! -f /sys/module/nvidia_peermem/refcnt ]; then
+            echo "nvidia-peermem module is not loaded"
+            return 1
+        fi
+    else
+        # Succeed while MOFED is absent so the probe does not kill the pod.
+        echo "MOFED drivers are not ready, skipping probe to avoid container restarts..."
+ fi + return 0 +} + +usage() { + cat >&2 < Date: Wed, 6 May 2026 14:35:11 +0530 Subject: [PATCH 2/2] Precompiled: Ubuntu26.04 driver container support Signed-off-by: Shiva Kumar (SW-CLOUD) --- .github/workflows/precompiled.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/precompiled.yaml b/.github/workflows/precompiled.yaml index 517baf2fc..d878aa07f 100644 --- a/.github/workflows/precompiled.yaml +++ b/.github/workflows/precompiled.yaml @@ -39,7 +39,7 @@ jobs: id: extract_driver_branch run: | # get driver_branch - DRIVER_BRANCH=("535" "580") + DRIVER_BRANCH=("535" "580" "595") driver_branch_json=$(printf '%s\n' "${DRIVER_BRANCH[@]}" | jq -R . | jq -cs .) echo "driver_branch=$driver_branch_json" >> $GITHUB_OUTPUT @@ -72,6 +72,8 @@ jobs: driver_branch: 535 - dist: ubuntu26.04 driver_branch: 535 + - dist: ubuntu26.04 + driver_branch: 580 - lts_kernel: 5.15 dist: ubuntu24.04 - lts_kernel: 5.15