Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 33 additions & 1 deletion parts/linux/cloud-init/artifacts/cse_config.sh
Original file line number Diff line number Diff line change
Expand Up @@ -999,12 +999,44 @@ configAzurePolicyAddon() {
sed -i "s|<resourceId>|/subscriptions/$SUBSCRIPTION_ID/resourceGroups/$RESOURCE_GROUP|g" $AZURE_POLICY_ADDON_FILE
}

gpuPrebuiltModuleMatches() {
  # Decides whether the GPU kernel module recorded by the VHD marker can be reused as-is.
  #
  # Globals (read): GPU_PREBUILT_MARKER_FILE (optional override of the marker path),
  #                 GPU_DV, NVIDIA_GPU_DRIVER_TYPE, NVIDIA_DRIVER_IMAGE_TAG
  # Returns: 0 only when the marker exists, its kernel / driver_version / driver_kind /
  #          image_tag entries are all non-empty and equal `uname -r`, $GPU_DV,
  #          $NVIDIA_GPU_DRIVER_TYPE and $NVIDIA_DRIVER_IMAGE_TAG respectively, and modinfo
  #          resolves the nvidia module for the running kernel. Returns 1 in every other
  #          case (missing marker, missing field, any mismatch), so configGPUDrivers keeps
  #          the full build. The fast path is an optimization only.
  local marker_path="${GPU_PREBUILT_MARKER_FILE:-/opt/azure/aks-gpu/dkms-marker}"
  if [ ! -f "$marker_path" ]; then
    return 1
  fi

  # First "<key>=<value>" entry wins for each key; the value is the text between the
  # first and second '=' on that line.
  local baked_kernel baked_version baked_kind baked_image
  baked_kernel=$(awk -F= '/^kernel=/{print $2; exit}' "$marker_path")
  baked_version=$(awk -F= '/^driver_version=/{print $2; exit}' "$marker_path")
  baked_kind=$(awk -F= '/^driver_kind=/{print $2; exit}' "$marker_path")
  baked_image=$(awk -F= '/^image_tag=/{print $2; exit}' "$marker_path")

  local running_kernel
  running_kernel="$(uname -r)"

  # Each recorded value must be present and identical to what CSE is about to install.
  if [ -z "$baked_kernel" ] || [ "$baked_kernel" != "$running_kernel" ]; then
    return 1
  fi
  if [ -z "$baked_version" ] || [ "$baked_version" != "$GPU_DV" ]; then
    return 1
  fi
  if [ -z "$baked_kind" ] || [ "$baked_kind" != "$NVIDIA_GPU_DRIVER_TYPE" ]; then
    return 1
  fi
  if [ -z "$baked_image" ] || [ "$baked_image" != "$NVIDIA_DRIVER_IMAGE_TAG" ]; then
    return 1
  fi

  # A matching marker is not enough: the compiled module must resolve for this kernel.
  if ! modinfo -k "$running_kernel" nvidia >/dev/null 2>&1; then
    return 1
  fi
  return 0
}

configGPUDrivers() {
if [ "$OS" = "$UBUNTU_OS_NAME" ]; then
waitForContainerdReady || exit $ERR_GPU_DRIVERS_START_FAIL
mkdir -p /opt/{actions,gpu}
# Fast path: when the VHD baked a kernel module matching this exact kernel + driver +
# image, skip the expensive boot-time DKMS compile and only run the device-init steps.
local gpu_install_action="install"
if gpuPrebuiltModuleMatches; then
echo "Prebuilt GPU kernel module matches running kernel/driver/image; using skip-build fast path"
gpu_install_action="install-skip-build"
fi
ctr -n k8s.io image pull $NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG
retrycmd_if_failure 5 10 600 bash -c "$CTR_GPU_INSTALL_CMD $NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG gpuinstall /entrypoint.sh install"
retrycmd_if_failure 5 10 600 bash -c "$CTR_GPU_INSTALL_CMD $NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG gpuinstall /entrypoint.sh ${gpu_install_action}"
ret=$?
if [ "$ret" -ne 0 ]; then
echo "Failed to install GPU driver, exiting..."
Expand Down
64 changes: 64 additions & 0 deletions spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh
Original file line number Diff line number Diff line change
Expand Up @@ -1678,4 +1678,68 @@ SETUP_EOF
The output should include "rm -f /opt/azure/containers/managed-gpu-experience.enabled"
End
End

# Spec for gpuPrebuiltModuleMatches: the fast path may only be reported as usable when
# every marker field matches the values CSE is about to install and the module resolves.
# Each negative example changes exactly one input relative to the matching case.
Describe 'gpuPrebuiltModuleMatches'
  setup() {
    GPU_PREBUILT_MARKER_FILE="$(mktemp)"
    GPU_DV="535.230.02"
    NVIDIA_GPU_DRIVER_TYPE="cuda"
    NVIDIA_DRIVER_IMAGE_TAG="535.230.02-abc123"
  }
  cleanup() {
    rm -f "$GPU_PREBUILT_MARKER_FILE"
  }
  BeforeEach 'setup'
  AfterEach 'cleanup'

  # Stub the host probes so the examples do not depend on the machine running the spec.
  uname() { echo "5.15.0-1078-azure"; }
  modinfo() { return 0; }

  # write_marker <kernel> <driver_version> <driver_kind> <image_tag>
  # Written with printf rather than a heredoc so the marker content does not depend on
  # how this helper is indented.
  write_marker() {
    printf '%s\n' \
      "kernel=$1" \
      "driver_version=$2" \
      "driver_kind=$3" \
      "arch=amd64" \
      "image_tag=$4" > "$GPU_PREBUILT_MARKER_FILE"
  }

  It 'matches when kernel, driver version/kind, and image tag all align'
    write_marker "5.15.0-1078-azure" "535.230.02" "cuda" "535.230.02-abc123"
    When call gpuPrebuiltModuleMatches
    The status should be success
  End

  It 'does not match when the marker file is absent'
    rm -f "$GPU_PREBUILT_MARKER_FILE"
    When call gpuPrebuiltModuleMatches
    The status should be failure
  End

  It 'does not match when the marker file exists but is empty'
    : > "$GPU_PREBUILT_MARKER_FILE"
    When call gpuPrebuiltModuleMatches
    The status should be failure
  End

  It 'does not match on kernel drift'
    write_marker "5.15.0-1000-azure" "535.230.02" "cuda" "535.230.02-abc123"
    When call gpuPrebuiltModuleMatches
    The status should be failure
  End

  It 'does not match when the driver version differs'
    write_marker "5.15.0-1078-azure" "550.54.15" "cuda" "535.230.02-abc123"
    When call gpuPrebuiltModuleMatches
    The status should be failure
  End

  It 'does not match when the driver image tag differs'
    write_marker "5.15.0-1078-azure" "535.230.02" "cuda" "535.230.02-OLDSHA"
    When call gpuPrebuiltModuleMatches
    The status should be failure
  End

  It 'does not match for a different driver kind (e.g. grid)'
    write_marker "5.15.0-1078-azure" "535.230.02" "grid" "535.230.02-abc123"
    When call gpuPrebuiltModuleMatches
    The status should be failure
  End

  It 'does not match when the compiled module is not loadable'
    modinfo() { return 1; }
    write_marker "5.15.0-1078-azure" "535.230.02" "cuda" "535.230.02-abc123"
    When call gpuPrebuiltModuleMatches
    The status should be failure
  End
End
End
67 changes: 67 additions & 0 deletions vhdbuilder/packer/install-dependencies.sh
Original file line number Diff line number Diff line change
Expand Up @@ -688,6 +688,63 @@ while IFS= read -r imageToBePulled; do
fi
done <<< "$GPUContainerImages"

prebuildGPUKernelModule() {
  # Runs the aks-gpu container in build-only mode to DKMS-compile the NVIDIA kernel module and
  # stage userspace libs into the VHD, then records the driver image tag in the marker written
  # by the container's install.sh. No device access happens, so this is safe on the GPU-less
  # Packer builder. Must run after the final shipped kernel is in place (uname -r == VHD kernel)
  # and before kernel/header autoremove, otherwise every node would ship a mismatched module.
  #
  # Arguments: $1 - driver image repository, $2 - driver image tag (both required)
  # Globals:   VHD_LOGS_FILEPATH (appended to)
  # Exits 1 (does not return) on any precondition or build failure, so the VHD build fails
  # loudly rather than shipping an unusable prebuilt module.
  local image="$1" tag="$2"
  # Fail fast on missing arguments instead of discovering a bogus ":"-style image reference
  # only after the long retry loop below has run out.
  if [ -z "$image" ] || [ -z "$tag" ]; then
    echo "Error: prebuildGPUKernelModule requires a driver image and tag (got image='${image}', tag='${tag}')" >&2
    exit 1
  fi
  local ref="${image}:${tag}"
  local marker="/opt/azure/aks-gpu/dkms-marker"
  local running newest module_dir
  running="$(uname -r)"

  # Guard: the module is DKMS-compiled against the builder's *running* kernel, but a node boots
  # the newest kernel baked into the image. If those differ, every provisioned node would see a
  # marker kernel != uname -r, silently fall back to the full boot-time build, and ship a useless
  # (space-wasting, unsigned) prebuilt module. Fail the build loudly so it gets fixed instead of
  # silently regressing the optimization.
  # Kernels are enumerated with a glob rather than by parsing `ls`; only directories under
  # /lib/modules are considered. An unmatched glob yields nothing, leaving $newest empty.
  newest="$(
    for module_dir in /lib/modules/*/; do
      if [ -d "$module_dir" ]; then
        module_dir="${module_dir%/}"
        printf '%s\n' "${module_dir##*/}"
      fi
    done | sort -V | tail -n1
  )"
  if [ -z "$newest" ]; then
    echo "Error: could not enumerate installed kernels under /lib/modules" >&2
    exit 1
  fi
  if [ "$running" != "$newest" ]; then
    echo "Error: running kernel ($running) is not the newest installed kernel ($newest); a GPU module prebuilt now would not match the kernel nodes boot. Run prebuildGPUKernelModule after the final kernel is installed and the builder has rebooted into it." >&2
    exit 1
  fi
  # The DKMS build needs the matching kernel headers on the builder.
  if [ ! -d "/lib/modules/${running}/build" ]; then
    echo "Error: kernel headers for ${running} not found (/lib/modules/${running}/build missing); cannot prebuild the GPU kernel module." >&2
    exit 1
  fi

  echo "Prebuilding GPU kernel module into VHD from ${ref} for kernel ${running}"
  mkdir -p /opt/{actions,gpu}
  # Drop any stale marker so its presence afterwards proves this run produced it.
  rm -f "$marker"

  # image-fetcher already imported the image into the k8s.io containerd namespace.
  if ! retrycmd_if_failure 3 10 1200 bash -c "ctr -n k8s.io run --privileged --rm --net-host --with-ns pid:/proc/1/ns/pid --mount type=bind,src=/opt/gpu,dst=/mnt/gpu,options=rbind --mount type=bind,src=/opt/actions,dst=/mnt/actions,options=rbind ${ref} buildgpu /entrypoint.sh build-only"; then
    echo "Error: GPU kernel module prebuild (build-only) failed for ${ref}" >&2
    exit 1
  fi

  if [ ! -f "$marker" ]; then
    echo "Error: expected GPU prebuild marker ${marker} not found after build-only run" >&2
    exit 1
  fi

  # If the marker does not end in a newline, terminate its last line first; otherwise
  # "image_tag=" would be glued onto that line and CSE's "^image_tag=" lookup would never match.
  if [ -n "$(tail -c1 "$marker")" ]; then
    echo >> "$marker"
  fi

  # Bind the baked module to this exact driver image so CSE only fast-paths on an exact match.
  {
    echo "image_tag=${tag}"
    echo "image=${ref}"
  } >> "$marker"

  echo "GPU kernel module prebuilt into VHD:" >> "${VHD_LOGS_FILEPATH}"
  sed 's/^/ - /' "$marker" >> "${VHD_LOGS_FILEPATH}"
}

# For Ubuntu, pre-pull the CUDA driver image
if [ $OS = $UBUNTU_OS_NAME ] && [ "$(isARM64)" -ne 1 ]; then # No ARM64 SKU with GPU now
gpu_action="copy"
Expand Down Expand Up @@ -718,6 +775,16 @@ if [ $OS = $UBUNTU_OS_NAME ] && [ "$(isARM64)" -ne 1 ]; then # No ARM64 SKU wit
cat << EOF >> ${VHD_LOGS_FILEPATH}
- nvidia-cuda-driver=${NVIDIA_DRIVER_IMAGE_TAG}
EOF

# Optionally pre-compile the NVIDIA kernel module into the VHD so that node provisioning can
# skip the expensive boot-time DKMS build. Scoped to the most common GPU SKU (Ubuntu 22.04
# amd64, CUDA driver). The module is bound to the shipped kernel; CSE only takes the fast
# path when the kernel, driver version/kind, and image tag all match, otherwise it falls back
# to a full build at boot. Gated behind PREBUILD_GPU_KERNEL_MODULE so it is only attempted on
# VHDs whose aks-gpu image supports the build-only action.
if [ "${PREBUILD_GPU_KERNEL_MODULE:-false}" = "true" ] && [ "${UBUNTU_RELEASE}" = "22.04" ]; then
prebuildGPUKernelModule "$NVIDIA_DRIVER_IMAGE" "$NVIDIA_DRIVER_IMAGE_TAG"
fi
fi

if grep -q "NVIDIA_GB" <<< "$FEATURE_FLAGS"; then
Expand Down
Loading