Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 28 additions & 18 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,19 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
driver_kind: ["cuda"]
include:
- config_key: cuda
image_repo: aks-gpu-cuda
- config_key: cuda_lts
image_repo: aks-gpu-cuda-lts
steps:
- uses: actions/checkout@v6
with:
fetch-depth: 0
- name: Load CUDA config
id: load_config
run: |
cuda_version=$(yq e '.cuda.version' driver_config.yml)
cuda_version=$(yq e '.${{ matrix.config_key }}.version' driver_config.yml)
echo "CUDA_VERSION=$cuda_version"
echo "cuda_version=$cuda_version" >> $GITHUB_OUTPUT
- name: Set up QEMU
Expand All @@ -39,9 +43,9 @@ jobs:
uses: actions/cache@v5
with:
path: /tmp/.buildx-cache
key: ${{ runner.os }}-buildx-${{ github.sha }}
key: ${{ runner.os }}-buildx-${{ matrix.image_repo }}-${{ github.sha }}
restore-keys: |
${{ runner.os }}-buildx-
${{ runner.os }}-buildx-${{ matrix.image_repo }}-
- name: Generate timestamp
id: timestamp
run: echo "timestamp=$(date +'%Y%m%d%H%M%S')" >> $GITHUB_OUTPUT
Expand All @@ -59,7 +63,7 @@ jobs:
set -x
echo "tag is: "
echo ${{ steps.semver.outputs.version }}
docker buildx build --platform linux/arm64/v8,linux/amd64 --build-arg DRIVER_KIND=${{ matrix.driver_kind }} --build-arg DRIVER_VERSION=${{ steps.load_config.outputs.cuda_version }} --cache-from=type=local,src=/tmp/.buildx-cache --cache-to=type=local,dest=/tmp/.buildx-cache-new --output=type=docker -t ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/aks-gpu-cuda:${{ steps.semver.outputs.version }} .
docker buildx build --platform linux/arm64/v8,linux/amd64 --build-arg DRIVER_KIND=cuda --build-arg DRIVER_VERSION=${{ steps.load_config.outputs.cuda_version }} --cache-from=type=local,src=/tmp/.buildx-cache --cache-to=type=local,dest=/tmp/.buildx-cache-new --output=type=docker -t ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/${{ matrix.image_repo }}:${{ steps.semver.outputs.version }} .
docker images
- name: Move cache
run: |
Expand All @@ -69,17 +73,19 @@ jobs:
runs-on: ubuntu-24.04-arm # see https://github.com/actions/partner-runner-images?tab=readme-ov-file#available-images
strategy:
matrix:
driver_kind: ["cuda"]
platform:
- linux/arm64
include:
- config_key: cuda
image_repo: aks-gpu-cuda-arm64
- config_key: cuda_lts
image_repo: aks-gpu-cuda-lts-arm64
steps:
- uses: actions/checkout@v6
with:
fetch-depth: 0
- name: Load CUDA config
id: load_config
run: |
cuda_version=$(yq e '.cuda.version' driver_config.yml)
cuda_version=$(yq e '.${{ matrix.config_key }}.version' driver_config.yml)
echo "CUDA_VERSION=$cuda_version"
echo "cuda_version=$cuda_version" >> $GITHUB_OUTPUT
- name: Set up Docker Buildx
Expand All @@ -88,9 +94,9 @@ jobs:
uses: actions/cache@v5
with:
path: /tmp/.buildx-cache
key: ${{ runner.os }}-buildx-${{ github.sha }}
key: ${{ runner.os }}-buildx-${{ matrix.image_repo }}-${{ github.sha }}
restore-keys: |
${{ runner.os }}-buildx-
${{ runner.os }}-buildx-${{ matrix.image_repo }}-
- name: Generate timestamp
id: timestamp
run: echo "timestamp=$(date +'%Y%m%d%H%M%S')" >> $GITHUB_OUTPUT
Expand All @@ -108,7 +114,7 @@ jobs:
set -x
echo "tag is: "
echo ${{ steps.semver.outputs.version }}
docker buildx build --build-arg DRIVER_KIND=${{ matrix.driver_kind }} --build-arg DRIVER_VERSION=${{ steps.load_config.outputs.cuda_version }} --cache-from=type=local,src=/tmp/.buildx-cache --cache-to=type=local,dest=/tmp/.buildx-cache-new --output=type=docker -t ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/aks-gpu-cuda:${{ steps.semver.outputs.version }} .
docker buildx build --build-arg DRIVER_KIND=cuda --build-arg DRIVER_VERSION=${{ steps.load_config.outputs.cuda_version }} --cache-from=type=local,src=/tmp/.buildx-cache --cache-to=type=local,dest=/tmp/.buildx-cache-new --output=type=docker -t ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/${{ matrix.image_repo }}:${{ steps.semver.outputs.version }} .
docker images
- name: Move cache
run: |
Expand All @@ -118,16 +124,20 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
driver_kind: ["grid"]
include:
- config_key: grid
image_repo: aks-gpu-grid
- config_key: grid_v20
image_repo: aks-gpu-grid-v20
steps:
- uses: actions/checkout@v6
with:
fetch-depth: 0
- name: Load GRID config
id: load_config
run: |
grid_version=$(yq e '.grid.version' driver_config.yml)
grid_url=$(yq e '.grid.url' driver_config.yml)
grid_version=$(yq e '.${{ matrix.config_key }}.version' driver_config.yml)
grid_url=$(yq e '.${{ matrix.config_key }}.url' driver_config.yml)
echo "GRID_VERSION=$grid_version"
echo "GRID_URL=$grid_url"
echo "grid_version=$grid_version" >> $GITHUB_OUTPUT
Expand All @@ -138,9 +148,9 @@ jobs:
uses: actions/cache@v5
with:
path: /tmp/.buildx-cache
key: ${{ runner.os }}-buildx-${{ matrix.driver_kind}}-${{ steps.load_config.outputs.grid_version }}-${{ github.sha }}
key: ${{ runner.os }}-buildx-${{ matrix.image_repo }}-${{ steps.load_config.outputs.grid_version }}-${{ github.sha }}
restore-keys: |
${{ runner.os }}-buildx-${{ matrix.driver_kind}}-${{ steps.load_config.outputs.grid_version }}
${{ runner.os }}-buildx-${{ matrix.image_repo }}-${{ steps.load_config.outputs.grid_version }}
- name: Generate timestamp
id: timestamp
run: echo "timestamp=$(date +'%Y%m%d%H%M%S')" >> $GITHUB_OUTPUT
Expand All @@ -158,7 +168,7 @@ jobs:
set -x
echo "tag is: "
echo ${{ steps.semver.outputs.version }}
docker buildx build --build-arg DRIVER_URL=${{ steps.load_config.outputs.grid_url }} --build-arg DRIVER_KIND=${{ matrix.driver_kind }} --build-arg DRIVER_VERSION=${{ steps.load_config.outputs.grid_version }} --cache-from=type=local,src=/tmp/.buildx-cache --cache-to=type=local,dest=/tmp/.buildx-cache-new --output=type=docker -t ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/aks-gpu-grid:${{ steps.semver.outputs.version }} .
docker buildx build --build-arg DRIVER_URL=${{ steps.load_config.outputs.grid_url }} --build-arg DRIVER_KIND=grid --build-arg DRIVER_VERSION=${{ steps.load_config.outputs.grid_version }} --cache-from=type=local,src=/tmp/.buildx-cache --cache-to=type=local,dest=/tmp/.buildx-cache-new --output=type=docker -t ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/${{ matrix.image_repo }}:${{ steps.semver.outputs.version }} .
docker images
- name: Move cache
run: |
Expand Down
52 changes: 31 additions & 21 deletions .github/workflows/main.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,19 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
driver_kind: ["cuda"]
include:
- config_key: cuda
image_repo: aks-gpu-cuda
- config_key: cuda_lts
image_repo: aks-gpu-cuda-lts
steps:
- uses: actions/checkout@v6
with:
fetch-depth: 0
- name: Load CUDA config
id: load_config
run: |
cuda_version=$(yq e '.cuda.version' driver_config.yml)
cuda_version=$(yq e '.${{ matrix.config_key }}.version' driver_config.yml)
echo "CUDA_VERSION=$cuda_version"
echo "cuda_version=$cuda_version" >> $GITHUB_OUTPUT
- name: Set up QEMU
Expand All @@ -42,9 +46,9 @@ jobs:
uses: actions/cache@v5
with:
path: /tmp/.buildx-cache
key: ${{ runner.os }}-buildx-${{ github.sha }}
key: ${{ runner.os }}-buildx-${{ matrix.image_repo }}-${{ github.sha }}
restore-keys: |
${{ runner.os }}-buildx-
${{ runner.os }}-buildx-${{ matrix.image_repo }}-
- name: Generate timestamp
id: timestamp
run: echo "timestamp=$(date +'%Y%m%d%H%M%S')" >> $GITHUB_OUTPUT
Expand All @@ -68,10 +72,10 @@ jobs:
set -x
echo "tag is: "
echo ${{ steps.semver.outputs.version }}
docker buildx build --platform linux/arm64/v8,linux/amd64 --build-arg DRIVER_KIND=${{ matrix.driver_kind }} --build-arg DRIVER_VERSION=${{ steps.load_config.outputs.cuda_version }} --cache-from=type=local,src=/tmp/.buildx-cache --cache-to=type=local,dest=/tmp/.buildx-cache-new --output=type=docker -t ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/aks-gpu-cuda:${{ steps.semver.outputs.version }} .
docker buildx build --platform linux/arm64/v8,linux/amd64 --build-arg DRIVER_KIND=cuda --build-arg DRIVER_VERSION=${{ steps.load_config.outputs.cuda_version }} --cache-from=type=local,src=/tmp/.buildx-cache --cache-to=type=local,dest=/tmp/.buildx-cache-new --output=type=docker -t ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/${{ matrix.image_repo }}:${{ steps.semver.outputs.version }} .
docker images
az acr login -n ${{ secrets.AZURE_REGISTRY_SERVER }}
docker push ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/aks-gpu-cuda:${{ steps.semver.outputs.version }}
docker push ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/${{ matrix.image_repo }}:${{ steps.semver.outputs.version }}
- name: Move cache
run: |
rm -rf /tmp/.buildx-cache
Expand All @@ -80,17 +84,19 @@ jobs:
runs-on: ubuntu-24.04-arm # see https://github.com/actions/partner-runner-images?tab=readme-ov-file#available-images
strategy:
matrix:
driver_kind: ["cuda"]
platform:
- linux/arm64
include:
- config_key: cuda
image_repo: aks-gpu-cuda-arm64
- config_key: cuda_lts
image_repo: aks-gpu-cuda-lts-arm64
steps:
- uses: actions/checkout@v6
with:
fetch-depth: 0
- name: Load CUDA config
id: load_config
run: |
cuda_version=$(yq e '.cuda.version' driver_config.yml)
cuda_version=$(yq e '.${{ matrix.config_key }}.version' driver_config.yml)
echo "CUDA_VERSION=$cuda_version"
echo "cuda_version=$cuda_version" >> $GITHUB_OUTPUT
- name: Set up Docker Buildx
Expand All @@ -99,9 +105,9 @@ jobs:
uses: actions/cache@v5
with:
path: /tmp/.buildx-cache
key: ${{ runner.os }}-buildx-${{ github.sha }}
key: ${{ runner.os }}-buildx-${{ matrix.image_repo }}-${{ github.sha }}
restore-keys: |
${{ runner.os }}-buildx-
${{ runner.os }}-buildx-${{ matrix.image_repo }}-
- name: Generate timestamp
id: timestamp
run: echo "timestamp=$(date +'%Y%m%d%H%M%S')" >> $GITHUB_OUTPUT
Expand All @@ -125,10 +131,10 @@ jobs:
set -x
echo "tag is: "
echo ${{ steps.semver.outputs.version }}
docker buildx build --build-arg DRIVER_KIND=${{ matrix.driver_kind }} --build-arg DRIVER_VERSION=${{ steps.load_config.outputs.cuda_version }} --cache-from=type=local,src=/tmp/.buildx-cache --cache-to=type=local,dest=/tmp/.buildx-cache-new --output=type=docker -t ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/aks-gpu-cuda-arm64:${{ steps.semver.outputs.version }} .
docker buildx build --build-arg DRIVER_KIND=cuda --build-arg DRIVER_VERSION=${{ steps.load_config.outputs.cuda_version }} --cache-from=type=local,src=/tmp/.buildx-cache --cache-to=type=local,dest=/tmp/.buildx-cache-new --output=type=docker -t ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/${{ matrix.image_repo }}:${{ steps.semver.outputs.version }} .
docker images
az acr login -n ${{ secrets.AZURE_REGISTRY_SERVER }}
docker push ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/aks-gpu-cuda-arm64:${{ steps.semver.outputs.version }}
docker push ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/${{ matrix.image_repo }}:${{ steps.semver.outputs.version }}
- name: Move cache
run: |
rm -rf /tmp/.buildx-cache
Expand All @@ -137,16 +143,20 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
driver_kind: ["grid"]
include:
- config_key: grid
image_repo: aks-gpu-grid
- config_key: grid_v20
image_repo: aks-gpu-grid-v20
steps:
- uses: actions/checkout@v6
with:
fetch-depth: 0
- name: Load GRID config
id: load_config
run: |
grid_version=$(yq e '.grid.version' driver_config.yml)
grid_url=$(yq e '.grid.url' driver_config.yml)
grid_version=$(yq e '.${{ matrix.config_key }}.version' driver_config.yml)
grid_url=$(yq e '.${{ matrix.config_key }}.url' driver_config.yml)
echo "GRID_VERSION=$grid_version"
echo "GRID_URL=$grid_url"
echo "grid_version=$grid_version" >> $GITHUB_OUTPUT
Expand All @@ -161,9 +171,9 @@ jobs:
uses: actions/cache@v5
with:
path: /tmp/.buildx-cache
key: ${{ runner.os }}-buildx-${{ matrix.driver_kind}}-${{ steps.load_config.outputs.grid_version }}-${{ github.sha }}
key: ${{ runner.os }}-buildx-${{ matrix.image_repo }}-${{ steps.load_config.outputs.grid_version }}-${{ github.sha }}
restore-keys: |
${{ runner.os }}-buildx-${{ matrix.driver_kind}}-${{ steps.load_config.outputs.grid_version }}
${{ runner.os }}-buildx-${{ matrix.image_repo }}-${{ steps.load_config.outputs.grid_version }}
- uses: paulhatch/semantic-version@v6.0.2
with:
bump_each_commit: false
Expand All @@ -184,10 +194,10 @@ jobs:
set -x
echo "tag is: "
echo ${{ steps.semver.outputs.version }}
docker buildx build --build-arg DRIVER_URL=${{ steps.load_config.outputs.grid_url }} --build-arg DRIVER_KIND=${{ matrix.driver_kind }} --build-arg DRIVER_VERSION=${{ steps.load_config.outputs.grid_version }} --cache-from=type=local,src=/tmp/.buildx-cache --cache-to=type=local,dest=/tmp/.buildx-cache-new --output=type=docker -t ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/aks-gpu-grid:${{ steps.semver.outputs.version }} .
docker buildx build --build-arg DRIVER_URL=${{ steps.load_config.outputs.grid_url }} --build-arg DRIVER_KIND=grid --build-arg DRIVER_VERSION=${{ steps.load_config.outputs.grid_version }} --cache-from=type=local,src=/tmp/.buildx-cache --cache-to=type=local,dest=/tmp/.buildx-cache-new --output=type=docker -t ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/${{ matrix.image_repo }}:${{ steps.semver.outputs.version }} .
docker images
az acr login -n ${{ secrets.AZURE_REGISTRY_SERVER }}
docker push ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/aks-gpu-grid:${{ steps.semver.outputs.version }}
docker push ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/${{ matrix.image_repo }}:${{ steps.semver.outputs.version }}
- name: Move cache
run: |
rm -rf /tmp/.buildx-cache
Expand Down
10 changes: 10 additions & 0 deletions driver_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,17 @@ cuda:
# renovate: datasource=custom.nvidia-driver depName=nvidia-cuda-driver versioning=loose
version: "595.71.05"

cuda_lts:
# NVIDIA R580 Long Term Support Branch (LTSB), supported through Aug 2028.
version: "580.159.04"

grid:
version: "570.211.01"
# We do not support GRID drivers on ARM64 architecture.
url: "https://download.microsoft.com/download/2a04ca6a-9eec-40d9-9564-9cdea1ab795f/NVIDIA-Linux-x86_64-570.211.01-grid-azure.run"

grid_v20:
version: "595.58.03"
# We do not support GRID drivers on ARM64 architecture.
# GRID v20 is required for RTX PRO 6000 Blackwell Server Edition v6 SKUs.
url: "https://download.microsoft.com/download/51239696-ec04-4c02-a6b3-1d9c608fb57c/NVIDIA-Linux-x86_64-595.58.03-grid-azure.run"
16 changes: 14 additions & 2 deletions justfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,30 @@ registry := "docker.io/alexeldeib"

default:

pushallcuda: (pushcuda)
pushallcuda: (pushcuda) (pushcudalts)

pushallgrid: (pushgrid)
pushallgrid: (pushgrid) (pushgridv20)

# Push the CUDA image; runs `buildcuda` first as a dependency.
# The pushed tag is `<.cuda.version from driver_config.yml>-cuda` in the `aks-gpu` repo under `registry`.
pushcuda: (buildcuda)
docker push {{ registry }}/aks-gpu:$(yq e '.cuda.version' driver_config.yml)-cuda

# Push the CUDA LTS image; runs `buildcudalts` first as a dependency.
# The pushed tag is `<.cuda_lts.version from driver_config.yml>-cuda-lts` in the `aks-gpu` repo under `registry`.
pushcudalts: (buildcudalts)
docker push {{ registry }}/aks-gpu:$(yq e '.cuda_lts.version' driver_config.yml)-cuda-lts

# Push the GRID image; runs `buildgrid` first as a dependency.
# The pushed tag is `<.grid.version from driver_config.yml>-grid` in the `aks-gpu` repo under `registry`.
pushgrid: (buildgrid)
docker push {{ registry }}/aks-gpu:$(yq e '.grid.version' driver_config.yml)-grid

# Push the GRID v20 image; runs `buildgridv20` first as a dependency.
# The pushed tag is `<.grid_v20.version from driver_config.yml>-grid-v20` in the `aks-gpu` repo under `registry`.
pushgridv20: (buildgridv20)
docker push {{ registry }}/aks-gpu:$(yq e '.grid_v20.version' driver_config.yml)-grid-v20

# Build the GRID driver image from Dockerfile with DRIVER_KIND=grid.
# DRIVER_URL and DRIVER_VERSION are read from the `.grid` entry of driver_config.yml.
# The tag uses the same `.grid.version` value (not a separate VERSION variable) so the
# image built here is exactly the one `pushgrid` pushes, matching the other build recipes.
buildgrid:
    docker build --build-arg DRIVER_URL=$(yq e '.grid.url' driver_config.yml) --build-arg DRIVER_KIND=grid --build-arg DRIVER_VERSION=$(yq e '.grid.version' driver_config.yml) -f Dockerfile -t {{ registry }}/aks-gpu:$(yq e '.grid.version' driver_config.yml)-grid .

# Build the GRID v20 driver image from Dockerfile with DRIVER_KIND=grid.
# DRIVER_URL and DRIVER_VERSION are read from the `.grid_v20` entry of driver_config.yml;
# the image is tagged `<.grid_v20.version>-grid-v20`, the same tag `pushgridv20` pushes.
buildgridv20:
docker build --build-arg DRIVER_URL=$(yq e '.grid_v20.url' driver_config.yml) --build-arg DRIVER_KIND=grid --build-arg DRIVER_VERSION=$(yq e '.grid_v20.version' driver_config.yml) -f Dockerfile -t {{ registry }}/aks-gpu:$(yq e '.grid_v20.version' driver_config.yml)-grid-v20 .

# Build the CUDA driver image from Dockerfile with DRIVER_KIND=cuda.
# DRIVER_VERSION is read from `.cuda.version` in driver_config.yml;
# the image is tagged `<.cuda.version>-cuda`, the same tag `pushcuda` pushes.
buildcuda:
docker build --build-arg DRIVER_KIND=cuda --build-arg DRIVER_VERSION=$(yq e '.cuda.version' driver_config.yml) -f Dockerfile -t {{ registry }}/aks-gpu:$(yq e '.cuda.version' driver_config.yml)-cuda .

# Build the CUDA LTS driver image from Dockerfile; DRIVER_KIND stays `cuda`, only the version differs.
# DRIVER_VERSION is read from `.cuda_lts.version` in driver_config.yml;
# the image is tagged `<.cuda_lts.version>-cuda-lts`, the same tag `pushcudalts` pushes.
buildcudalts:
docker build --build-arg DRIVER_KIND=cuda --build-arg DRIVER_VERSION=$(yq e '.cuda_lts.version' driver_config.yml) -f Dockerfile -t {{ registry }}/aks-gpu:$(yq e '.cuda_lts.version' driver_config.yml)-cuda-lts .
Loading