Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 55 additions & 18 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,10 @@ ARG TARGETPLATFORM
ARG INSTALL_KV_CONNECTORS=false
ENV DEBIAN_FRONTEND=noninteractive

ARG GDRCOPY_CUDA_VERSION=12.8
# Keep in line with FINAL_BASE_IMAGE
ARG GDRCOPY_OS_VERSION=Ubuntu22_04

ARG DEADSNAKES_MIRROR_URL
ARG DEADSNAKES_GPGKEY_URL
ARG GET_PIP_URL
Expand Down Expand Up @@ -137,13 +141,14 @@ COPY requirements/common.txt requirements/common.txt
COPY requirements/cuda.txt requirements/cuda.txt
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/cuda.txt \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
--prerelease=allow

# cuda arch list used by torch
# can be useful for both `dev` and `test`
# explicitly set the list to avoid issues with torch 2.2
# see https://github.com/pytorch/pytorch/pull/123243
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0 12.0'
ARG torch_cuda_arch_list='7.5 8.0 8.9 9.0 10.0 12.0'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
#################### BASE BUILD IMAGE ####################

Expand Down Expand Up @@ -198,6 +203,10 @@ ARG TARGETPLATFORM

SHELL ["/bin/bash", "-c"]

ARG GDRCOPY_CUDA_VERSION=12.8
# Keep in line with FINAL_BASE_IMAGE
ARG GDRCOPY_OS_VERSION=Ubuntu22_04
Comment on lines +206 to +208
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

These ARG declarations for GDRCOPY_CUDA_VERSION and GDRCOPY_OS_VERSION are duplicated from lines 71-73. To improve maintainability and avoid potential inconsistencies where one is updated but not the other, consider defining these arguments globally before the first FROM statement.

For example:

ARG GDRCOPY_CUDA_VERSION=12.8
ARG GDRCOPY_OS_VERSION=Ubuntu22_04

...

FROM ... as base
ARG GDRCOPY_CUDA_VERSION
ARG GDRCOPY_OS_VERSION
...

FROM base as dev
ARG GDRCOPY_CUDA_VERSION
ARG GDRCOPY_OS_VERSION
...

This way, the default values are defined in a single place, reducing redundancy and risk of error.


ARG DEADSNAKES_MIRROR_URL
ARG DEADSNAKES_GPGKEY_URL
ARG GET_PIP_URL
Expand Down Expand Up @@ -273,7 +282,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
COPY --from=build /workspace/dist /aphrodite-workspace/dist
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system dist/*.whl --verbose \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
--prerelease=allow

# If we need to build FlashInfer wheel before its release:
# $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+
Expand All @@ -288,7 +298,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
# Install FlashInfer from source
ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
# Keep this in sync with "flashinfer" extra in setup.py
ARG FLASHINFER_GIT_REF="v0.2.14.post1"
ARG FLASHINFER_GIT_REF="v0.3.1"
# Flag to control whether to compile FlashInfer AOT kernels
# Set to "true" to enable AOT compilation:
# docker build --build-arg FLASHINFER_AOT_COMPILE=true ...
Expand All @@ -298,19 +308,32 @@ RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
git clone --depth 1 --recursive --shallow-submodules \
--branch ${FLASHINFER_GIT_REF} \
${FLASHINFER_GIT_REPO} flashinfer
# Exclude CUDA arches for older versions (11.x and 12.0-12.7)
# TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
if [[ "${CUDA_VERSION}" == 11.* ]]; then
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
else
# CUDA 12.8+ supports 10.0a and 12.0
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
fi
pushd flashinfer
if [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then
# Exclude CUDA arches for older versions (11.x and 12.0-12.7)
# TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
if [[ "${CUDA_VERSION}" == 11.* ]]; then
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
else
# CUDA 12.8+ supports 10.0a and 12.0
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
if [[ "${CUDA_VERSION}" == 12.8.* ]] && [ "$TARGETPLATFORM" = "linux/amd64" ]; then
# NOTE: To make new precompiled wheels, see tools/flashinfer-build.sh
echo "🏗️ Installing FlashInfer from pre-compiled wheel"
uv pip install --system https://wheels.vllm.ai/flashinfer-python/flashinfer_python-0.3.1-cp39-abi3-manylinux1_x86_64.whl \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
if [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then
# Download pre-compiled cubins
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
python3 -m flashinfer --download-cubin || echo "WARNING: Failed to download flashinfer cubins."
fi
elif [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then
echo "🏗️ Installing FlashInfer with AOT compilation for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
export FLASHINFER_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}"
# HACK: We need these to run flashinfer.aot before installing flashinfer, get from the package in the future
uv pip install --system cuda-python==$(echo $CUDA_VERSION | cut -d. -f1,2) pynvml==$(echo $CUDA_VERSION | cut -d. -f1) nvidia-nvshmem-cu$(echo $CUDA_VERSION | cut -d. -f1)
# Build AOT kernels
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
python3 -m flashinfer.aot
Expand Down Expand Up @@ -351,13 +374,27 @@ COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh
RUN --mount=type=cache,target=/root/.cache/uv \
APHRODITE_DOCKER_BUILD_CONTEXT=1 /tmp/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" ${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"}

# Install EP kernels(pplx-kernels and DeepEP), NixL
COPY tools/install_gdrcopy.sh install_gdrcopy.sh
RUN set -eux; \
case "${TARGETPLATFORM}" in \
linux/arm64) UUARCH="aarch64" ;; \
linux/amd64) UUARCH="x64" ;; \
*) echo "Unsupported TARGETPLATFORM: ${TARGETPLATFORM}" >&2; exit 1 ;; \
esac; \
./install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "${GDRCOPY_CUDA_VERSION}" "${UUARCH}"; \
rm ./install_gdrcopy.sh
Comment on lines +377 to +385
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The indentation of the COPY and RUN instructions here is misleading, as it suggests they are part of the preceding RUN command's shell script, but they are separate Dockerfile instructions. This should be fixed for clarity.

Additionally, using separate COPY and RUN commands for a temporary script adds an unnecessary layer to the Docker image because the file from the COPY layer is not removed from the final image, even with rm in the RUN layer. This can be optimized by using a single RUN command with a --mount=type=bind flag to make the script available temporarily without adding it to any image layers. This improves both clarity and image size.

RUN --mount=type=bind,source=tools/install_gdrcopy.sh,target=install_gdrcopy.sh \
    set -eux; \
    case "${TARGETPLATFORM}" in \
      linux/arm64) UUARCH="aarch64" ;; \
      linux/amd64) UUARCH="x64" ;; \
      *) echo "Unsupported TARGETPLATFORM: ${TARGETPLATFORM}" >&2; exit 1 ;; \
    esac; \
    ./install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "${GDRCOPY_CUDA_VERSION}" "${UUARCH}"


# Install EP kernels (pplx-kernels and DeepEP)
COPY tools/ep_kernels/install_python_libraries.sh install_python_libraries.sh
COPY tools/install_nixl.sh install_nixl.sh
ENV CUDA_HOME=/usr/local/cuda
RUN export TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-9.0a+PTX}" \
&& bash install_python_libraries.sh \
&& bash install_nixl.sh --force
&& bash install_python_libraries.sh

# CUDA image changed from /usr/local/nvidia to /usr/local/cuda in 12.8 but will
# return to /usr/local/nvidia in 13.0 to allow container providers to mount drivers
# consistently from the host (see https://github.com/vllm-project/vllm/issues/18859).
# Until then, add /usr/local/nvidia/lib64 before the image cuda path to allow override.
ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib64:${LD_LIBRARY_PATH}

#################### Aphrodite installation IMAGE ####################

Expand Down
63 changes: 63 additions & 0 deletions tools/flashinfer-build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#!/usr/bin/env bash
#
# Build FlashInfer with ahead-of-time (AOT) compiled kernels.
#
# Required environment:
#   FLASHINFER_GIT_REF  git branch or tag of FlashInfer to check out
#   CUDA_VERSION        CUDA toolkit version (e.g. 12.8.1); selects the
#                       target arch list and the uv torch backend
# Optional environment:
#   BUILD_WHEEL         "true" (default) builds a wheel into ./flashinfer-dist;
#                       any other value installs into the system interpreter
#
# The repository is cloned into ./flashinfer and removed again when the
# script exits, whether the build succeeded or not.

set -euxo pipefail

# FlashInfer configuration
readonly FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
# ":-" keeps `set -u` from aborting before the friendly messages below.
FLASHINFER_GIT_REF="${FLASHINFER_GIT_REF:-}"
CUDA_VERSION="${CUDA_VERSION:-}"
BUILD_WHEEL="${BUILD_WHEEL:-true}"

if [[ -z "${FLASHINFER_GIT_REF}" ]]; then
  echo "❌ FLASHINFER_GIT_REF must be specified" >&2
  exit 1
fi

if [[ -z "${CUDA_VERSION}" ]]; then
  echo "❌ CUDA_VERSION must be specified" >&2
  exit 1
fi

echo "🏗️ Building FlashInfer ${FLASHINFER_GIT_REF} for CUDA ${CUDA_VERSION}"

# Resolve the checkout location up front so cleanup works from any cwd.
src_dir="${PWD}/flashinfer"

# Clone FlashInfer
git clone --depth 1 --recursive --shallow-submodules \
  --branch "${FLASHINFER_GIT_REF}" \
  "${FLASHINFER_GIT_REPO}" "${src_dir}"

# Registered only after a successful clone, so a pre-existing ./flashinfer
# that made the clone fail is never deleted by this script.
trap 'rm -rf -- "${src_dir}"' EXIT

# Set CUDA arch list based on CUDA version
# Exclude CUDA arches for older versions (11.x and 12.0-12.7)
if [[ "${CUDA_VERSION}" == 11.* ]]; then
  FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
  FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
else
  # CUDA 12.8+ supports 10.0a and 12.0
  FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
fi

echo "🏗️ Building FlashInfer AOT for arches: ${FI_TORCH_CUDA_ARCH_LIST}"

pushd "${src_dir}"
# Make sure the wheel is built for the correct CUDA version.
# "12.8.1" -> "128": keep major.minor, then drop the dot. Assigned separately
# from `export` so a failing pipeline is not masked.
cuda_tag="$(printf '%s\n' "${CUDA_VERSION}" | cut -d. -f1,2 | tr -d '.')"
export UV_TORCH_BACKEND="cu${cuda_tag}"

# Build AOT kernels
export TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}"
export FLASHINFER_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}"
python3 -m flashinfer.aot

if [[ "${BUILD_WHEEL}" == "true" ]]; then
  # Build wheel for distribution
  uv build --no-build-isolation --wheel --out-dir ../flashinfer-dist .
  echo "✅ FlashInfer wheel built successfully in flashinfer-dist/"
else
  # Install directly (for Dockerfile)
  uv pip install --system --no-build-isolation --force-reinstall .
  echo "✅ FlashInfer installed successfully"
fi
popd

# Cleanup of the checkout is handled by the EXIT trap above.
57 changes: 57 additions & 0 deletions tools/install_gdrcopy.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#!/usr/bin/env bash
set -euo pipefail

# Download the libgdrapi .deb for the given OS / CUDA / arch combination from
# NVIDIA's gdrcopy redist directory and install it with apt-get.
#
# Usage: install_gdrcopy.sh <GDRCOPY_OS_VERSION> <GDRCOPY_CUDA_VERSION> <uuarch>
# uuarch must be "x64" or "aarch64"
# Optional: set GDRCOPY_VERSION to override the libgdrapi package version (default: 2.5.1-1)
# Requires: curl, apt-get, root privileges
if [[ $(id -u) -ne 0 ]]; then
  echo "Must be run as root" >&2
  exit 1
fi
if [[ $# -ne 3 ]]; then
  echo "Usage: $0 <GDRCOPY_OS_VERSION> <GDRCOPY_CUDA_VERSION> <uuarch(x64|aarch64)>" >&2
  exit 1
fi

OS_VER="$1"
CUDA_VER="$2"
UUARCH_RAW="$3"

# Normalize/validate arch. The download URL and the .deb file name use
# different spellings for the same architecture, so both are derived here.
case "${UUARCH_RAW,,}" in
  aarch64|arm64)
    URL_ARCH="aarch64"
    DEB_ARCH="arm64"
    ;;
  x64|x86_64|amd64)
    URL_ARCH="x64"
    DEB_ARCH="amd64"
    ;;
  *)
    echo "Unsupported uuarch: ${UUARCH_RAW}. Use 'x64' or 'aarch64'." >&2
    exit 1
    ;;
esac

# The URL path segment is lower-cased; the .deb name keeps the original case.
OS_VER_LOWER="${OS_VER,,}"
GDRCOPY_PKG_VER="${GDRCOPY_VERSION:-2.5.1-1}"

DEB_NAME="libgdrapi_${GDRCOPY_PKG_VER}_${DEB_ARCH}.${OS_VER}.deb"
BASE_URL="https://developer.download.nvidia.com/compute/redist/gdrcopy"
URL="${BASE_URL}/CUDA%20${CUDA_VER}/${OS_VER_LOWER}/${URL_ARCH}/${DEB_NAME}"

echo "Downloading: ${URL}"
# Deliberately not named TMPDIR: that variable is consulted by mktemp and by
# child processes, and overwriting it would redirect their temp files into a
# directory this script deletes on exit.
DOWNLOAD_DIR="$(mktemp -d)"
trap 'rm -rf -- "${DOWNLOAD_DIR}"' EXIT

DEB_PATH="${DOWNLOAD_DIR}/${DEB_NAME}"
curl -fSL "${URL}" -o "${DEB_PATH}"

export DEBIAN_FRONTEND=noninteractive
apt-get update
# mktemp yields an absolute path, which apt-get treats as a local package file.
apt-get install -y "${DEB_PATH}"
apt-get clean
rm -rf /var/lib/apt/lists/*

echo "Installed ${DEB_NAME}"
Loading