From 193d269c9fc2cf37b83b6d4cb68d02d283649c2d Mon Sep 17 00:00:00 2001
From: Ganeshkumar Ashokavardhanan <aganeshkumar@microsoft.com>
Date: Sun, 31 May 2026 11:55:49 -0700
Subject: [PATCH] feat: add build-only and install-skip-build modes to support
 VHD-prebuilt kernel module

Split the host-side driver install into two phases so the NVIDIA kernel module
can be DKMS-compiled into the VHD at image build time and the boot-time install
can skip straight to device init:

- install.sh: refactor into build_kernel_module() (compile + stage userspace
  libs, no device access) and device_init() (modprobe, nvidia-smi, fabric
  manager, containerd config, udev). Add AKSGPU_BUILD_ONLY and
  AKSGPU_SKIP_KERNEL_BUILD modes, an overlay cleanup trap, and a dkms-marker
  (/opt/azure/aks-gpu/dkms-marker) recording kernel, driver_version,
  driver_kind and arch so the consumer (AgentBaker CSE) can validate an exact
  match before taking the skip-build fast path.
- entrypoint.sh: add build-only and install-skip-build actions and pass the
  mode through to the host via nsenter. The default install action is
  unchanged.

This is the aks-gpu half of the AgentBaker change that prebuilds the GPU kernel
module into the VHD to reduce node provisioning time. Secure Boot module
signing and GPU e2e validation are still required.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 entrypoint.sh |  35 ++++++--
 install.sh    | 245 ++++++++++++++++++++++++++++++++------------------
 2 files changed, 189 insertions(+), 91 deletions(-)

diff --git a/entrypoint.sh b/entrypoint.sh
index c880826..646d481 100755
--- a/entrypoint.sh
+++ b/entrypoint.sh
@@ -23,11 +23,30 @@ if [[ "${1}" == "copy" ]]; then
     exit 0
 fi
 
-if [[ "${1}" == "install" ]]; then
-    echo "copying gpu cache files"
-    cp -a /opt/gpu/. /mnt/gpu/
-    echo "copied successfully!"
-fi
+# Map the requested action to the install mode passed to install.sh.
+#   install            -> full compile + device init (legacy behaviour)
+#   build-only         -> compile/cache the kernel module only (VHD build, no GPU)
+#   install-skip-build -> device init only, reusing the module prebuilt into the VHD
+GPU_INSTALL_MODE_ENV=""
+case "${1}" in
+    install)
+        echo "copying gpu cache files"
+        cp -a /opt/gpu/. /mnt/gpu/
+        echo "copied successfully!"
+        ;;
+    build-only)
+        echo "copying gpu cache files (build-only)"
+        cp -a /opt/gpu/. /mnt/gpu/
+        echo "copied successfully!"
+        GPU_INSTALL_MODE_ENV="AKSGPU_BUILD_ONLY=1"
+        ;;
+    install-skip-build)
+        echo "copying gpu cache files (install-skip-build)"
+        cp -a /opt/gpu/. /mnt/gpu/
+        echo "copied successfully!"
+        GPU_INSTALL_MODE_ENV="AKSGPU_SKIP_KERNEL_BUILD=1"
+        ;;
+esac
 
 ACTION_FILE="/opt/actions/install.sh"
 
@@ -46,7 +65,11 @@ cp -R /opt/actions/. /mnt/actions
 
 echo "Executing nsenter"
 
-nsenter -t 1 -m bash "${ACTION_FILE}"
+if [[ -n "${GPU_INSTALL_MODE_ENV}" ]]; then
+    nsenter -t 1 -m env "${GPU_INSTALL_MODE_ENV}" bash "${ACTION_FILE}"
+else
+    nsenter -t 1 -m bash "${ACTION_FILE}"
+fi
 RESULT="${PIPESTATUS[0]}"
 
 if [ $RESULT -eq 0 ]; then
diff --git a/install.sh b/install.sh
index 26aede0..4ba7125 100644
--- a/install.sh
+++ b/install.sh
@@ -7,18 +7,54 @@ source /opt/gpu/package_manager_helpers.sh
 trap 'PS4="+ "' exit
 PS4='+ $(date -u -I"seconds" | cut -c1-19) '
 
+# Install mode flags (set by entrypoint.sh based on the requested action):
+#   AKSGPU_BUILD_ONLY=1        -> compile/cache the kernel module + userspace libs only.
+#                                 Runs on a GPU-less host (e.g. the Packer VHD builder).
+#                                 Skips every device-dependent step (modprobe, nvidia-smi,
+#                                 fabric manager, persistence) and writes a marker.
+#   AKSGPU_SKIP_KERNEL_BUILD=1 -> the kernel module + libs were prebuilt into the VHD for
+#                                 this exact kernel+driver; skip recompilation and only run
+#                                 the device-dependent steps at node boot.
+#   (neither set)              -> legacy behaviour: full compile + device init in one shot.
+AKSGPU_BUILD_ONLY="${AKSGPU_BUILD_ONLY:-0}"
+AKSGPU_SKIP_KERNEL_BUILD="${AKSGPU_SKIP_KERNEL_BUILD:-0}"
+
+# Host-side marker describing what was baked into the VHD at build time. AgentBaker reads
+# this (plus its own image-digest record) to decide whether the boot-time fast path is safe.
+DKMS_MARKER_FILE="/opt/azure/aks-gpu/dkms-marker"
+
 KERNEL_NAME=$(uname -r)
 LOG_FILE_NAME="/var/log/nvidia-installer-$(date +%s).log"
 ARCH=$(uname -m)
 
-set +euo pipefail
-open_devices="$(lsof /dev/nvidia* 2>/dev/null)"
-echo "Open devices: $open_devices"
-
-open_gridd="$(lsof /usr/bin/nvidia-gridd 2>/dev/null)"
-echo "Open gridd: $open_gridd"
-
-set -euo pipefail
+# Track overlay/tmpfs state so a build-time exit can never leave dangling mounts in the VHD.
+OVERLAY_MOUNTED=0
+cleanup_overlay() {
+    set +e
+    if [ "${OVERLAY_MOUNTED}" = "1" ]; then
+        umount -l "/usr/lib/${ARCH}-linux-gnu" || true
+        umount /tmp/overlay || true
+        rm -r /tmp/overlay || true
+        OVERLAY_MOUNTED=0
+    fi
+    set -e
+}
+trap cleanup_overlay EXIT
+
+resolve_runfile() {
+    if [[ "${DRIVER_KIND}" == "cuda" ]]; then
+        RUNFILE="NVIDIA-Linux-${ARCH}-${DRIVER_VERSION}"
+    elif [[ "${DRIVER_KIND}" == "grid" ]]; then
+        if [[ "${ARCH}" != "x86_64" ]]; then
+            echo "GRID driver is only supported on x86_64 architecture"
+            exit 1
+        fi
+        RUNFILE="NVIDIA-Linux-x86_64-${DRIVER_VERSION}-grid-azure"
+    else
+        echo "Invalid driver kind: ${DRIVER_KIND}"
+        exit 1
+    fi
+}
 
 # install cached nvidia debian packages for container runtime compatibility
 install_cached_nvidia_packages() {
@@ -27,87 +63,126 @@ for apt_package in $NVIDIA_PACKAGES; do
 done
 }
 
-use_package_manager_with_retries wait_for_dpkg_lock install_cached_nvidia_packages 10 3
-
-# blacklist nouveau driver, nvidia driver dependency
-cp /opt/gpu/blacklist-nouveau.conf /etc/modprobe.d/blacklist-nouveau.conf
-update-initramfs -u
-
-# clean up lingering files from previous install
-set +e
-umount -l /usr/lib/$(uname -m)-linux-gnu || true
-umount -l /tmp/overlay || true
-rm -r /tmp/overlay || true
-set -e
-
-# set up overlayfs to change install location of nvidia libs from /usr/lib/$ARCH-linux-gnu to /usr/local/nvidia
-# add an extra layer of indirection via tmpfs because it's not possible to have an overlayfs on an overlayfs (i.e., inside a container)
-mkdir /tmp/overlay
-mount -t tmpfs tmpfs /tmp/overlay
-mkdir /tmp/overlay/{workdir,lib64}
-mkdir -p ${GPU_DEST}/lib64
-mount -t overlay overlay -o lowerdir=/usr/lib/$(uname -m)-linux-gnu,upperdir=/tmp/overlay/lib64,workdir=/tmp/overlay/workdir /usr/lib/$(uname -m)-linux-gnu
-
-if [[ "${DRIVER_KIND}" == "cuda" ]]; then
-    RUNFILE="NVIDIA-Linux-$(uname -m)-${DRIVER_VERSION}"
-elif [[ "${DRIVER_KIND}" == "grid" ]]; then
-    if [[ $(uname -m) != "x86_64" ]]; then
-        echo "GRID driver is only supported on x86_64 architecture"
-        exit 1
-    fi
-    RUNFILE="NVIDIA-Linux-x86_64-${DRIVER_VERSION}-grid-azure"
-else
-    echo "Invalid driver kind: ${DRIVER_KIND}"
-    exit 1
-fi
+install_nvidia_container_toolkit() {
+    use_package_manager_with_retries wait_for_dpkg_lock install_cached_nvidia_packages 10 3
+}
 
-# install nvidia drivers
-pushd /opt/gpu
-/opt/gpu/${RUNFILE}/nvidia-installer -s -k=$KERNEL_NAME --log-file-name=${LOG_FILE_NAME} -a --no-drm --dkms
-nvidia-smi
-popd
-
-# move nvidia libs to correct location from temporary overlayfs
-cp -a /tmp/overlay/lib64 ${GPU_DEST}/lib64
-
-# configure system to know about nvidia lib paths
-echo "${GPU_DEST}/lib64" > /etc/ld.so.conf.d/nvidia.conf
-ldconfig 
-
-# unmount, cleanup
-set +e
-umount -l /usr/lib/$(uname -m)-linux-gnu
-umount /tmp/overlay
-rm -r /tmp/overlay
-set -e
-
-# validate that nvidia driver is working
-dkms status
-nvidia-modprobe -u -c0
-
-# configure persistence daemon
-# decreases latency for later driver loads
-# reduces nvidia-smi invocation time 10x from 30 to 2 sec 
-# notable on large VM sizes with multiple GPUs
-# especially when nvidia-smi process is in CPU cgroup
-cp -r /usr/bin/lib64/lib64/* /usr/lib/$(uname -m)-linux-gnu/
-nvidia-smi
-
-# install fabricmanager for nvlink based systems
-if [[ "${DRIVER_KIND}" == "cuda" ]]; then
-    NVIDIA_FM_ARCH=$(uname -m)
-    if [ $NVIDIA_FM_ARCH = "arm64" ]; then
-        # NVIDIA uses the name "SBSA" for ARM64 platforms for the fabric manager. See https://en.wikipedia.org/wiki/Server_Base_System_Architecture
-        NVIDIA_FM_ARCH="sbsa"
+# build_kernel_module compiles the NVIDIA kernel module (the expensive step) and stages the
+# userspace libraries. It performs NO device access, so it is safe to run at VHD build time on
+# a host without a GPU.
+build_kernel_module() {
+    # blacklist nouveau driver, nvidia driver dependency
+    cp /opt/gpu/blacklist-nouveau.conf /etc/modprobe.d/blacklist-nouveau.conf
+    update-initramfs -u
+
+    # clean up lingering files from previous install
+    set +e
+    umount -l "/usr/lib/${ARCH}-linux-gnu" || true
+    umount -l /tmp/overlay || true
+    rm -r /tmp/overlay || true
+    set -e
+
+    # set up overlayfs to change install location of nvidia libs from /usr/lib/$ARCH-linux-gnu to /usr/local/nvidia
+    # add an extra layer of indirection via tmpfs because it's not possible to have an overlayfs on an overlayfs (i.e., inside a container)
+    mkdir /tmp/overlay
+    mount -t tmpfs tmpfs /tmp/overlay
+    mkdir /tmp/overlay/{workdir,lib64}
+    mkdir -p ${GPU_DEST}/lib64
+    mount -t overlay overlay -o lowerdir="/usr/lib/${ARCH}-linux-gnu",upperdir=/tmp/overlay/lib64,workdir=/tmp/overlay/workdir "/usr/lib/${ARCH}-linux-gnu"
+    OVERLAY_MOUNTED=1
+
+    resolve_runfile
+
+    # install nvidia drivers (DKMS build is the dominant cost we are hoisting to VHD build time)
+    pushd /opt/gpu
+    /opt/gpu/${RUNFILE}/nvidia-installer -s -k=$KERNEL_NAME --log-file-name=${LOG_FILE_NAME} -a --no-drm --dkms
+    popd
+
+    # move nvidia libs to correct location from temporary overlayfs
+    cp -a /tmp/overlay/lib64 ${GPU_DEST}/lib64
+
+    # configure system to know about nvidia lib paths
+    echo "${GPU_DEST}/lib64" > /etc/ld.so.conf.d/nvidia.conf
+    ldconfig
+
+    cleanup_overlay
+
+    # validate that the kernel module was built and registered (no device access required)
+    dkms status
+    modinfo -k "$KERNEL_NAME" nvidia
+}
+
+# device_init runs the steps that require the physical GPU and therefore must execute at node
+# boot, regardless of whether the kernel module was prebuilt into the VHD.
+device_init() {
+    nvidia-modprobe -u -c0
+
+    # configure persistence daemon
+    # decreases latency for later driver loads
+    # reduces nvidia-smi invocation time 10x from 30 to 2 sec
+    # notable on large VM sizes with multiple GPUs
+    # especially when nvidia-smi process is in CPU cgroup
+    cp -r /usr/bin/lib64/lib64/* "/usr/lib/${ARCH}-linux-gnu/"
+    nvidia-smi
+
+    # install fabricmanager for nvlink based systems
+    if [[ "${DRIVER_KIND}" == "cuda" ]]; then
+        NVIDIA_FM_ARCH=$ARCH
+        if [ "$NVIDIA_FM_ARCH" = "arm64" ]; then
+            # NVIDIA uses the name "SBSA" for ARM64 platforms for the fabric manager. See https://en.wikipedia.org/wiki/Server_Base_System_Architecture
+            NVIDIA_FM_ARCH="sbsa"
+        fi
+        bash /opt/gpu/fabricmanager-linux-${NVIDIA_FM_ARCH}-${DRIVER_VERSION}/sbin/fm_run_package_installer.sh
     fi
-    bash /opt/gpu/fabricmanager-linux-${NVIDIA_FM_ARCH}-${DRIVER_VERSION}/sbin/fm_run_package_installer.sh
+
+    mkdir -p /etc/containerd/config.d
+    cp /opt/gpu/10-nvidia-runtime.toml /etc/containerd/config.d/10-nvidia-runtime.toml
+
+    mkdir -p "$(dirname /lib/udev/rules.d/71-nvidia-dev-char.rules)"
+    cp /opt/gpu/71-nvidia-char-dev.rules /lib/udev/rules.d/71-nvidia-dev-char.rules
+    /usr/bin/nvidia-ctk system create-dev-char-symlinks --create-all
+}
+
+write_dkms_marker() {
+    mkdir -p "$(dirname "${DKMS_MARKER_FILE}")"
+    cat > "${DKMS_MARKER_FILE}" <<EOF
+kernel=${KERNEL_NAME}
+driver_version=${DRIVER_VERSION}
+driver_kind=${DRIVER_KIND}
+arch=${ARCH}
+EOF
+}
+
+set +euo pipefail
+open_devices="$(lsof /dev/nvidia* 2>/dev/null)"
+echo "Open devices: $open_devices"
+
+open_gridd="$(lsof /usr/bin/nvidia-gridd 2>/dev/null)"
+echo "Open gridd: $open_gridd"
+set -euo pipefail
+
+if [ "${AKSGPU_BUILD_ONLY}" = "1" ]; then
+    # VHD build time: compile + cache only, no device access.
+    echo "aks-gpu: build-only mode (prebuilding kernel module for kernel ${KERNEL_NAME})"
+    build_kernel_module
+    write_dkms_marker
+    rm -r /opt/gpu
+    exit 0
 fi
 
-mkdir -p /etc/containerd/config.d
-cp /opt/gpu/10-nvidia-runtime.toml /etc/containerd/config.d/10-nvidia-runtime.toml
+install_nvidia_container_toolkit
+
+if [ "${AKSGPU_SKIP_KERNEL_BUILD}" = "1" ]; then
+    # Node boot, prebuilt module valid for this kernel+driver: skip recompilation, ensure the
+    # baked module is loadable, then run the device-dependent steps only.
+    echo "aks-gpu: skip-kernel-build mode (using module prebuilt in VHD for kernel ${KERNEL_NAME})"
+    ldconfig
+    dkms status
+    modinfo -k "$KERNEL_NAME" nvidia
+else
+    build_kernel_module
+fi
 
-mkdir -p "$(dirname /lib/udev/rules.d/71-nvidia-dev-char.rules)"
-cp /opt/gpu/71-nvidia-char-dev.rules /lib/udev/rules.d/71-nvidia-dev-char.rules
-/usr/bin/nvidia-ctk system create-dev-char-symlinks --create-all
+device_init
 
 rm -r /opt/gpu