From bfbbc87cd3a63e7d77c996244e63cf1ea2fe2b51 Mon Sep 17 00:00:00 2001
From: Ganeshkumar Ashokavardhanan
Date: Sun, 31 May 2026 11:55:40 -0700
Subject: [PATCH 1/2] feat: optionally prebuild NVIDIA GPU kernel module into VHD to cut provisioning time

On mainstream Ubuntu GPU SKUs the aks-gpu-cuda image is only pre-pulled
into the VHD; the expensive NVIDIA DKMS kernel-module compile +
update-initramfs runs on the host at first boot during CSE. This adds an
opt-in path to compile the module into the VHD at build time and skip
the boot-time build.

- cse_config.sh: add gpuPrebuiltModuleMatches() guard and a "fast path"
  in configGPUDrivers() that selects the install-skip-build action only
  when a baked module exactly matches the running kernel, driver
  version/kind, and driver image tag. Any mismatch (kernel drift, newer
  driver from CRP, GRID SKU, older VHD without the marker) falls back to
  today's full build, so correctness never depends on the fast path.
- install-dependencies.sh: add prebuildGPUKernelModule() which runs the
  aks-gpu container's build-only action during VHD build and records the
  driver image tag in the marker. Gated behind
  PREBUILD_GPU_KERNEL_MODULE and scoped to Ubuntu 22.04 amd64 (CUDA
  driver). Default off so existing builds are unchanged and it is only
  attempted on VHDs whose aks-gpu image supports the build-only action.
- cse_config_spec.sh: add shellspec coverage for
  gpuPrebuiltModuleMatches.

Requires a companion aks-gpu image change (build-only /
install-skip-build entrypoint actions) to be published before
PREBUILD_GPU_KERNEL_MODULE is enabled. Secure Boot module signing and
GPU e2e validation are still required.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../linux/cloud-init/artifacts/cse_config.sh | 34 +++++++++- .../cloud-init/artifacts/cse_config_spec.sh | 64 +++++++++++++++++++ vhdbuilder/packer/install-dependencies.sh | 45 +++++++++++++ 3 files changed, 142 insertions(+), 1 deletion(-) diff --git a/parts/linux/cloud-init/artifacts/cse_config.sh b/parts/linux/cloud-init/artifacts/cse_config.sh index 614bd74a054..9f82a9e2819 100755 --- a/parts/linux/cloud-init/artifacts/cse_config.sh +++ b/parts/linux/cloud-init/artifacts/cse_config.sh @@ -999,12 +999,44 @@ configAzurePolicyAddon() { sed -i "s||/subscriptions/$SUBSCRIPTION_ID/resourceGroups/$RESOURCE_GROUP|g" $AZURE_POLICY_ADDON_FILE } +gpuPrebuiltModuleMatches() { + # Returns 0 only when the VHD baked a kernel module that exactly matches what CSE is about + # to install: same running kernel, same driver version + kind, and the same driver image + # tag. Any mismatch (kernel drift, newer driver from CRP, GRID SKU, older VHD without the + # marker) returns non-zero so configGPUDrivers falls back to the full build. The fast path + # is therefore purely an optimization -- correctness never depends on it. + local marker_file="${GPU_PREBUILT_MARKER_FILE:-/opt/azure/aks-gpu/dkms-marker}" + [ -f "$marker_file" ] || return 1 + + local m_kernel m_version m_kind m_image + m_kernel=$(awk -F= '/^kernel=/{print $2; exit}' "$marker_file") + m_version=$(awk -F= '/^driver_version=/{print $2; exit}' "$marker_file") + m_kind=$(awk -F= '/^driver_kind=/{print $2; exit}' "$marker_file") + m_image=$(awk -F= '/^image_tag=/{print $2; exit}' "$marker_file") + + [ -n "$m_kernel" ] && [ "$m_kernel" = "$(uname -r)" ] || return 1 + [ -n "$m_version" ] && [ "$m_version" = "$GPU_DV" ] || return 1 + [ -n "$m_kind" ] && [ "$m_kind" = "$NVIDIA_GPU_DRIVER_TYPE" ] || return 1 + [ -n "$m_image" ] && [ "$m_image" = "$NVIDIA_DRIVER_IMAGE_TAG" ] || return 1 + + # The compiled module must actually resolve for the running kernel. 
+ modinfo -k "$(uname -r)" nvidia >/dev/null 2>&1 || return 1 + return 0 +} + configGPUDrivers() { if [ "$OS" = "$UBUNTU_OS_NAME" ]; then waitForContainerdReady || exit $ERR_GPU_DRIVERS_START_FAIL mkdir -p /opt/{actions,gpu} + # Fast path: when the VHD baked a kernel module matching this exact kernel + driver + + # image, skip the expensive boot-time DKMS compile and only run the device-init steps. + local gpu_install_action="install" + if gpuPrebuiltModuleMatches; then + echo "Prebuilt GPU kernel module matches running kernel/driver/image; using skip-build fast path" + gpu_install_action="install-skip-build" + fi ctr -n k8s.io image pull $NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG - retrycmd_if_failure 5 10 600 bash -c "$CTR_GPU_INSTALL_CMD $NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG gpuinstall /entrypoint.sh install" + retrycmd_if_failure 5 10 600 bash -c "$CTR_GPU_INSTALL_CMD $NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG gpuinstall /entrypoint.sh ${gpu_install_action}" ret=$? if [ "$ret" -ne 0 ]; then echo "Failed to install GPU driver, exiting..." diff --git a/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh b/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh index 5051528c554..27fecb2914e 100755 --- a/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh +++ b/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh @@ -1678,4 +1678,68 @@ SETUP_EOF The output should include "rm -f /opt/azure/containers/managed-gpu-experience.enabled" End End + + Describe 'gpuPrebuiltModuleMatches' + setup() { + GPU_PREBUILT_MARKER_FILE="$(mktemp)" + GPU_DV="535.230.02" + NVIDIA_GPU_DRIVER_TYPE="cuda" + NVIDIA_DRIVER_IMAGE_TAG="535.230.02-abc123" + } + cleanup() { + rm -f "$GPU_PREBUILT_MARKER_FILE" + } + BeforeEach 'setup' + AfterEach 'cleanup' + + uname() { echo "5.15.0-1078-azure"; } + modinfo() { return 0; } + + write_marker() { + cat > "$GPU_PREBUILT_MARKER_FILE" <&2 + exit 1 + fi + + if [ ! 
-f "$marker" ]; then + echo "Error: expected GPU prebuild marker ${marker} not found after build-only run" >&2 + exit 1 + fi + + # Bind the baked module to this exact driver image so CSE only fast-paths on an exact match. + { + echo "image_tag=${tag}" + echo "image=${ref}" + } >> "$marker" + + echo "GPU kernel module prebuilt into VHD:" >> ${VHD_LOGS_FILEPATH} + sed 's/^/ - /' "$marker" >> ${VHD_LOGS_FILEPATH} +} + # For Ubuntu, pre-pull the CUDA driver image if [ $OS = $UBUNTU_OS_NAME ] && [ "$(isARM64)" -ne 1 ]; then # No ARM64 SKU with GPU now gpu_action="copy" @@ -718,6 +753,16 @@ if [ $OS = $UBUNTU_OS_NAME ] && [ "$(isARM64)" -ne 1 ]; then # No ARM64 SKU wit cat << EOF >> ${VHD_LOGS_FILEPATH} - nvidia-cuda-driver=${NVIDIA_DRIVER_IMAGE_TAG} EOF + + # Optionally pre-compile the NVIDIA kernel module into the VHD so that node provisioning can + # skip the expensive boot-time DKMS build. Scoped to the most common GPU SKU (Ubuntu 22.04 + # amd64, CUDA driver). The module is bound to the shipped kernel; CSE only takes the fast + # path when the kernel, driver version/kind, and image tag all match, otherwise it falls back + # to a full build at boot. Gated behind PREBUILD_GPU_KERNEL_MODULE so it is only attempted on + # VHDs whose aks-gpu image supports the build-only action. + if [ "${PREBUILD_GPU_KERNEL_MODULE:-false}" = "true" ] && [ "${UBUNTU_RELEASE}" = "22.04" ]; then + prebuildGPUKernelModule "$NVIDIA_DRIVER_IMAGE" "$NVIDIA_DRIVER_IMAGE_TAG" + fi fi if grep -q "NVIDIA_GB" <<< "$FEATURE_FLAGS"; then From ce2d5db62b76cc6b2ef5e4f5b08f7c0456e42816 Mon Sep 17 00:00:00 2001 From: Ganeshkumar Ashokavardhanan Date: Mon, 1 Jun 2026 11:18:47 -0700 Subject: [PATCH 2/2] fix: assert builder kernel == shipped kernel before GPU module prebuild The GPU kernel module is DKMS-compiled against the builder's running kernel, but nodes boot the newest kernel baked into the image. 
If those differ, every node sees a marker kernel != uname -r, silently falls back to the boot-time build, and ships a useless prebuilt module. Fail the VHD build loudly when the running kernel isn't the newest installed, or when matching headers are absent, so the misorder is fixed instead of silently regressing the optimization. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- vhdbuilder/packer/install-dependencies.sh | 24 ++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index 85ebd3b986c..02e79ac78dc 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -697,8 +697,30 @@ prebuildGPUKernelModule() { local image="$1" tag="$2" local ref="${image}:${tag}" local marker="/opt/azure/aks-gpu/dkms-marker" + local running newest + running="$(uname -r)" + + # Guard: the module is DKMS-compiled against the builder's *running* kernel, but a node boots + # the newest kernel baked into the image. If those differ, every provisioned node would see a + # marker kernel != uname -r, silently fall back to the full boot-time build, and ship a useless + # (space-wasting, unsigned) prebuilt module. Fail the build loudly so it gets fixed instead of + # silently regressing the optimization. + newest="$(ls -1 /lib/modules 2>/dev/null | sort -V | tail -n1)" + if [ -z "$newest" ]; then + echo "Error: could not enumerate installed kernels under /lib/modules" >&2 + exit 1 + fi + if [ "$running" != "$newest" ]; then + echo "Error: running kernel ($running) is not the newest installed kernel ($newest); a GPU module prebuilt now would not match the kernel nodes boot. Run prebuildGPUKernelModule after the final kernel is installed and the builder has rebooted into it." >&2 + exit 1 + fi + # The DKMS build needs the matching kernel headers on the builder. + if [ ! 
-d "/lib/modules/${running}/build" ]; then + echo "Error: kernel headers for ${running} not found (/lib/modules/${running}/build missing); cannot prebuild the GPU kernel module." >&2 + exit 1 + fi - echo "Prebuilding GPU kernel module into VHD from ${ref} for kernel $(uname -r)" + echo "Prebuilding GPU kernel module into VHD from ${ref} for kernel ${running}" mkdir -p /opt/{actions,gpu} rm -f "$marker"