# Patch: build and publish additional driver variants (cuda_lts, grid_v20).
#
# - ci.yaml / main.yaml: each job's matrix now lists (config_key, image_repo)
#   pairs; config_key selects the driver_config.yml section read with yq and
#   image_repo names the image that is tagged (and, in main.yaml, pushed).
# - CUDA cache keys put a literal "-sha-" between image_repo and the commit
#   SHA. restore-keys match by prefix, so "...-buildx-aks-gpu-cuda-" would
#   otherwise also match caches saved for aks-gpu-cuda-lts and
#   aks-gpu-cuda-arm64.
# - driver_config.yml: adds the cuda_lts and grid_v20 sections.
# - justfile: adds build/push recipes for both variants; buildgrid now tags
#   with the yq-derived grid version (the tag pushgrid pushes) instead of
#   {{VERSION}}, which no justfile line shown in this patch assigns.
#
# NOTE(review): indentation of context lines was reconstructed by convention;
# if a hunk is rejected, retry with `git apply --ignore-whitespace`. The
# post-image blob ids on the `index` lines predate the review changes above.
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 2ae57bd..172f20a 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -10,7 +10,11 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        driver_kind: ["cuda"]
+        include:
+          - config_key: cuda
+            image_repo: aks-gpu-cuda
+          - config_key: cuda_lts
+            image_repo: aks-gpu-cuda-lts
     steps:
       - uses: actions/checkout@v6
         with:
@@ -18,7 +22,7 @@ jobs:
       - name: Load CUDA config
         id: load_config
         run: |
-          cuda_version=$(yq e '.cuda.version' driver_config.yml)
+          cuda_version=$(yq e '.${{ matrix.config_key }}.version' driver_config.yml)
           echo "CUDA_VERSION=$cuda_version"
           echo "cuda_version=$cuda_version" >> $GITHUB_OUTPUT
       - name: Set up QEMU
@@ -39,9 +43,9 @@ jobs:
         uses: actions/cache@v5
         with:
           path: /tmp/.buildx-cache
-          key: ${{ runner.os }}-buildx-${{ github.sha }}
+          key: ${{ runner.os }}-buildx-${{ matrix.image_repo }}-sha-${{ github.sha }}
           restore-keys: |
-            ${{ runner.os }}-buildx-
+            ${{ runner.os }}-buildx-${{ matrix.image_repo }}-sha-
       - name: Generate timestamp
         id: timestamp
         run: echo "timestamp=$(date +'%Y%m%d%H%M%S')" >> $GITHUB_OUTPUT
@@ -59,7 +63,7 @@ jobs:
           set -x
           echo "tag is: "
           echo ${{ steps.semver.outputs.version }}
-          docker buildx build --platform linux/arm64/v8,linux/amd64 --build-arg DRIVER_KIND=${{ matrix.driver_kind }} --build-arg DRIVER_VERSION=${{ steps.load_config.outputs.cuda_version }} --cache-from=type=local,src=/tmp/.buildx-cache --cache-to=type=local,dest=/tmp/.buildx-cache-new --output=type=docker -t ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/aks-gpu-cuda:${{ steps.semver.outputs.version }} .
+          docker buildx build --platform linux/arm64/v8,linux/amd64 --build-arg DRIVER_KIND=cuda --build-arg DRIVER_VERSION=${{ steps.load_config.outputs.cuda_version }} --cache-from=type=local,src=/tmp/.buildx-cache --cache-to=type=local,dest=/tmp/.buildx-cache-new --output=type=docker -t ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/${{ matrix.image_repo }}:${{ steps.semver.outputs.version }} .
           docker images
       - name: Move cache
         run: |
@@ -69,9 +73,11 @@ jobs:
     runs-on: ubuntu-24.04-arm # see https://github.com/actions/partner-runner-images?tab=readme-ov-file#available-images
     strategy:
       matrix:
-        driver_kind: ["cuda"]
-        platform:
-          - linux/arm64
+        include:
+          - config_key: cuda
+            image_repo: aks-gpu-cuda-arm64
+          - config_key: cuda_lts
+            image_repo: aks-gpu-cuda-lts-arm64
     steps:
       - uses: actions/checkout@v6
         with:
@@ -79,7 +85,7 @@ jobs:
       - name: Load CUDA config
         id: load_config
         run: |
-          cuda_version=$(yq e '.cuda.version' driver_config.yml)
+          cuda_version=$(yq e '.${{ matrix.config_key }}.version' driver_config.yml)
           echo "CUDA_VERSION=$cuda_version"
           echo "cuda_version=$cuda_version" >> $GITHUB_OUTPUT
       - name: Set up Docker Buildx
@@ -88,9 +94,9 @@ jobs:
         uses: actions/cache@v5
         with:
           path: /tmp/.buildx-cache
-          key: ${{ runner.os }}-buildx-${{ github.sha }}
+          key: ${{ runner.os }}-buildx-${{ matrix.image_repo }}-sha-${{ github.sha }}
           restore-keys: |
-            ${{ runner.os }}-buildx-
+            ${{ runner.os }}-buildx-${{ matrix.image_repo }}-sha-
       - name: Generate timestamp
         id: timestamp
         run: echo "timestamp=$(date +'%Y%m%d%H%M%S')" >> $GITHUB_OUTPUT
@@ -108,7 +114,7 @@ jobs:
           set -x
           echo "tag is: "
           echo ${{ steps.semver.outputs.version }}
-          docker buildx build --build-arg DRIVER_KIND=${{ matrix.driver_kind }} --build-arg DRIVER_VERSION=${{ steps.load_config.outputs.cuda_version }} --cache-from=type=local,src=/tmp/.buildx-cache --cache-to=type=local,dest=/tmp/.buildx-cache-new --output=type=docker -t ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/aks-gpu-cuda:${{ steps.semver.outputs.version }} .
+          docker buildx build --build-arg DRIVER_KIND=cuda --build-arg DRIVER_VERSION=${{ steps.load_config.outputs.cuda_version }} --cache-from=type=local,src=/tmp/.buildx-cache --cache-to=type=local,dest=/tmp/.buildx-cache-new --output=type=docker -t ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/${{ matrix.image_repo }}:${{ steps.semver.outputs.version }} .
           docker images
       - name: Move cache
         run: |
@@ -118,7 +124,11 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        driver_kind: ["grid"]
+        include:
+          - config_key: grid
+            image_repo: aks-gpu-grid
+          - config_key: grid_v20
+            image_repo: aks-gpu-grid-v20
     steps:
       - uses: actions/checkout@v6
         with:
@@ -126,8 +136,8 @@ jobs:
       - name: Load GRID config
         id: load_config
         run: |
-          grid_version=$(yq e '.grid.version' driver_config.yml)
-          grid_url=$(yq e '.grid.url' driver_config.yml)
+          grid_version=$(yq e '.${{ matrix.config_key }}.version' driver_config.yml)
+          grid_url=$(yq e '.${{ matrix.config_key }}.url' driver_config.yml)
           echo "GRID_VERSION=$grid_version"
           echo "GRID_URL=$grid_url"
           echo "grid_version=$grid_version" >> $GITHUB_OUTPUT
@@ -138,9 +148,9 @@ jobs:
         uses: actions/cache@v5
         with:
           path: /tmp/.buildx-cache
-          key: ${{ runner.os }}-buildx-${{ matrix.driver_kind}}-${{ steps.load_config.outputs.grid_version }}-${{ github.sha }}
+          key: ${{ runner.os }}-buildx-${{ matrix.image_repo }}-${{ steps.load_config.outputs.grid_version }}-${{ github.sha }}
           restore-keys: |
-            ${{ runner.os }}-buildx-${{ matrix.driver_kind}}-${{ steps.load_config.outputs.grid_version }}
+            ${{ runner.os }}-buildx-${{ matrix.image_repo }}-${{ steps.load_config.outputs.grid_version }}
       - name: Generate timestamp
         id: timestamp
         run: echo "timestamp=$(date +'%Y%m%d%H%M%S')" >> $GITHUB_OUTPUT
@@ -158,7 +168,7 @@ jobs:
           set -x
           echo "tag is: "
           echo ${{ steps.semver.outputs.version }}
-          docker buildx build --build-arg DRIVER_URL=${{ steps.load_config.outputs.grid_url }} --build-arg DRIVER_KIND=${{ matrix.driver_kind }} --build-arg DRIVER_VERSION=${{ steps.load_config.outputs.grid_version }} --cache-from=type=local,src=/tmp/.buildx-cache --cache-to=type=local,dest=/tmp/.buildx-cache-new --output=type=docker -t ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/aks-gpu-grid:${{ steps.semver.outputs.version }} .
+          docker buildx build --build-arg DRIVER_URL=${{ steps.load_config.outputs.grid_url }} --build-arg DRIVER_KIND=grid --build-arg DRIVER_VERSION=${{ steps.load_config.outputs.grid_version }} --cache-from=type=local,src=/tmp/.buildx-cache --cache-to=type=local,dest=/tmp/.buildx-cache-new --output=type=docker -t ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/${{ matrix.image_repo }}:${{ steps.semver.outputs.version }} .
           docker images
       - name: Move cache
         run: |
diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
index c7f650c..aea9e8d 100644
--- a/.github/workflows/main.yaml
+++ b/.github/workflows/main.yaml
@@ -13,7 +13,11 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        driver_kind: ["cuda"]
+        include:
+          - config_key: cuda
+            image_repo: aks-gpu-cuda
+          - config_key: cuda_lts
+            image_repo: aks-gpu-cuda-lts
     steps:
       - uses: actions/checkout@v6
         with:
@@ -21,7 +25,7 @@ jobs:
       - name: Load CUDA config
         id: load_config
         run: |
-          cuda_version=$(yq e '.cuda.version' driver_config.yml)
+          cuda_version=$(yq e '.${{ matrix.config_key }}.version' driver_config.yml)
           echo "CUDA_VERSION=$cuda_version"
           echo "cuda_version=$cuda_version" >> $GITHUB_OUTPUT
       - name: Set up QEMU
@@ -42,9 +46,9 @@ jobs:
         uses: actions/cache@v5
         with:
           path: /tmp/.buildx-cache
-          key: ${{ runner.os }}-buildx-${{ github.sha }}
+          key: ${{ runner.os }}-buildx-${{ matrix.image_repo }}-sha-${{ github.sha }}
           restore-keys: |
-            ${{ runner.os }}-buildx-
+            ${{ runner.os }}-buildx-${{ matrix.image_repo }}-sha-
       - name: Generate timestamp
         id: timestamp
         run: echo "timestamp=$(date +'%Y%m%d%H%M%S')" >> $GITHUB_OUTPUT
@@ -68,10 +72,10 @@ jobs:
           set -x
           echo "tag is: "
           echo ${{ steps.semver.outputs.version }}
-          docker buildx build --platform linux/arm64/v8,linux/amd64 --build-arg DRIVER_KIND=${{ matrix.driver_kind }} --build-arg DRIVER_VERSION=${{ steps.load_config.outputs.cuda_version }} --cache-from=type=local,src=/tmp/.buildx-cache --cache-to=type=local,dest=/tmp/.buildx-cache-new --output=type=docker -t ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/aks-gpu-cuda:${{ steps.semver.outputs.version }} .
+          docker buildx build --platform linux/arm64/v8,linux/amd64 --build-arg DRIVER_KIND=cuda --build-arg DRIVER_VERSION=${{ steps.load_config.outputs.cuda_version }} --cache-from=type=local,src=/tmp/.buildx-cache --cache-to=type=local,dest=/tmp/.buildx-cache-new --output=type=docker -t ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/${{ matrix.image_repo }}:${{ steps.semver.outputs.version }} .
           docker images
           az acr login -n ${{ secrets.AZURE_REGISTRY_SERVER }}
-          docker push ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/aks-gpu-cuda:${{ steps.semver.outputs.version }}
+          docker push ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/${{ matrix.image_repo }}:${{ steps.semver.outputs.version }}
       - name: Move cache
         run: |
           rm -rf /tmp/.buildx-cache
@@ -80,9 +84,11 @@ jobs:
     runs-on: ubuntu-24.04-arm # see https://github.com/actions/partner-runner-images?tab=readme-ov-file#available-images
     strategy:
       matrix:
-        driver_kind: ["cuda"]
-        platform:
-          - linux/arm64
+        include:
+          - config_key: cuda
+            image_repo: aks-gpu-cuda-arm64
+          - config_key: cuda_lts
+            image_repo: aks-gpu-cuda-lts-arm64
     steps:
       - uses: actions/checkout@v6
         with:
@@ -90,7 +96,7 @@ jobs:
       - name: Load CUDA config
         id: load_config
         run: |
-          cuda_version=$(yq e '.cuda.version' driver_config.yml)
+          cuda_version=$(yq e '.${{ matrix.config_key }}.version' driver_config.yml)
           echo "CUDA_VERSION=$cuda_version"
           echo "cuda_version=$cuda_version" >> $GITHUB_OUTPUT
       - name: Set up Docker Buildx
@@ -99,9 +105,9 @@ jobs:
         uses: actions/cache@v5
         with:
           path: /tmp/.buildx-cache
-          key: ${{ runner.os }}-buildx-${{ github.sha }}
+          key: ${{ runner.os }}-buildx-${{ matrix.image_repo }}-sha-${{ github.sha }}
           restore-keys: |
-            ${{ runner.os }}-buildx-
+            ${{ runner.os }}-buildx-${{ matrix.image_repo }}-sha-
       - name: Generate timestamp
         id: timestamp
         run: echo "timestamp=$(date +'%Y%m%d%H%M%S')" >> $GITHUB_OUTPUT
@@ -125,10 +131,10 @@ jobs:
           set -x
           echo "tag is: "
           echo ${{ steps.semver.outputs.version }}
-          docker buildx build --build-arg DRIVER_KIND=${{ matrix.driver_kind }} --build-arg DRIVER_VERSION=${{ steps.load_config.outputs.cuda_version }} --cache-from=type=local,src=/tmp/.buildx-cache --cache-to=type=local,dest=/tmp/.buildx-cache-new --output=type=docker -t ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/aks-gpu-cuda-arm64:${{ steps.semver.outputs.version }} .
+          docker buildx build --build-arg DRIVER_KIND=cuda --build-arg DRIVER_VERSION=${{ steps.load_config.outputs.cuda_version }} --cache-from=type=local,src=/tmp/.buildx-cache --cache-to=type=local,dest=/tmp/.buildx-cache-new --output=type=docker -t ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/${{ matrix.image_repo }}:${{ steps.semver.outputs.version }} .
           docker images
           az acr login -n ${{ secrets.AZURE_REGISTRY_SERVER }}
-          docker push ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/aks-gpu-cuda-arm64:${{ steps.semver.outputs.version }}
+          docker push ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/${{ matrix.image_repo }}:${{ steps.semver.outputs.version }}
       - name: Move cache
         run: |
           rm -rf /tmp/.buildx-cache
@@ -137,7 +143,11 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        driver_kind: ["grid"]
+        include:
+          - config_key: grid
+            image_repo: aks-gpu-grid
+          - config_key: grid_v20
+            image_repo: aks-gpu-grid-v20
     steps:
       - uses: actions/checkout@v6
         with:
@@ -145,8 +155,8 @@ jobs:
       - name: Load GRID config
         id: load_config
         run: |
-          grid_version=$(yq e '.grid.version' driver_config.yml)
-          grid_url=$(yq e '.grid.url' driver_config.yml)
+          grid_version=$(yq e '.${{ matrix.config_key }}.version' driver_config.yml)
+          grid_url=$(yq e '.${{ matrix.config_key }}.url' driver_config.yml)
           echo "GRID_VERSION=$grid_version"
           echo "GRID_URL=$grid_url"
           echo "grid_version=$grid_version" >> $GITHUB_OUTPUT
@@ -161,9 +171,9 @@ jobs:
         uses: actions/cache@v5
         with:
           path: /tmp/.buildx-cache
-          key: ${{ runner.os }}-buildx-${{ matrix.driver_kind}}-${{ steps.load_config.outputs.grid_version }}-${{ github.sha }}
+          key: ${{ runner.os }}-buildx-${{ matrix.image_repo }}-${{ steps.load_config.outputs.grid_version }}-${{ github.sha }}
           restore-keys: |
-            ${{ runner.os }}-buildx-${{ matrix.driver_kind}}-${{ steps.load_config.outputs.grid_version }}
+            ${{ runner.os }}-buildx-${{ matrix.image_repo }}-${{ steps.load_config.outputs.grid_version }}
       - uses: paulhatch/semantic-version@v6.0.2
         with:
           bump_each_commit: false
@@ -184,10 +194,10 @@ jobs:
           set -x
           echo "tag is: "
           echo ${{ steps.semver.outputs.version }}
-          docker buildx build --build-arg DRIVER_URL=${{ steps.load_config.outputs.grid_url }} --build-arg DRIVER_KIND=${{ matrix.driver_kind }} --build-arg DRIVER_VERSION=${{ steps.load_config.outputs.grid_version }} --cache-from=type=local,src=/tmp/.buildx-cache --cache-to=type=local,dest=/tmp/.buildx-cache-new --output=type=docker -t ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/aks-gpu-grid:${{ steps.semver.outputs.version }} .
+          docker buildx build --build-arg DRIVER_URL=${{ steps.load_config.outputs.grid_url }} --build-arg DRIVER_KIND=grid --build-arg DRIVER_VERSION=${{ steps.load_config.outputs.grid_version }} --cache-from=type=local,src=/tmp/.buildx-cache --cache-to=type=local,dest=/tmp/.buildx-cache-new --output=type=docker -t ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/${{ matrix.image_repo }}:${{ steps.semver.outputs.version }} .
           docker images
           az acr login -n ${{ secrets.AZURE_REGISTRY_SERVER }}
-          docker push ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/aks-gpu-grid:${{ steps.semver.outputs.version }}
+          docker push ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/${{ matrix.image_repo }}:${{ steps.semver.outputs.version }}
       - name: Move cache
         run: |
           rm -rf /tmp/.buildx-cache
diff --git a/driver_config.yml b/driver_config.yml
index ac8cee3..6ec11b0 100644
--- a/driver_config.yml
+++ b/driver_config.yml
@@ -2,7 +2,17 @@ cuda:
   # renovate: datasource=custom.nvidia-driver depName=nvidia-cuda-driver versioning=loose
   version: "595.71.05"
 
+cuda_lts:
+  # NVIDIA R580 Long Term Support Branch (LTSB), supported through Aug 2028.
+  version: "580.159.04"
+
 grid:
   version: "570.211.01"
   # We do not support GRID drivers on ARM64 architecture.
   url: "https://download.microsoft.com/download/2a04ca6a-9eec-40d9-9564-9cdea1ab795f/NVIDIA-Linux-x86_64-570.211.01-grid-azure.run"
+
+grid_v20:
+  version: "595.58.03"
+  # We do not support GRID drivers on ARM64 architecture.
+  # GRID v20 is required for RTX PRO 6000 Blackwell Server Edition v6 SKUs.
+  url: "https://download.microsoft.com/download/51239696-ec04-4c02-a6b3-1d9c608fb57c/NVIDIA-Linux-x86_64-595.58.03-grid-azure.run"
diff --git a/justfile b/justfile
index 9049ba2..729c78b 100644
--- a/justfile
+++ b/justfile
@@ -2,18 +2,30 @@ registry := "docker.io/alexeldeib"
 
 default:
 
-pushallcuda: (pushcuda)
+pushallcuda: (pushcuda) (pushcudalts)
 
-pushallgrid: (pushgrid)
+pushallgrid: (pushgrid) (pushgridv20)
 
 pushcuda: (buildcuda)
     docker push {{ registry }}/aks-gpu:$(yq e '.cuda.version' driver_config.yml)-cuda
 
+pushcudalts: (buildcudalts)
+    docker push {{ registry }}/aks-gpu:$(yq e '.cuda_lts.version' driver_config.yml)-cuda-lts
+
 pushgrid: (buildgrid)
     docker push {{ registry }}/aks-gpu:$(yq e '.grid.version' driver_config.yml)-grid
 
+pushgridv20: (buildgridv20)
+    docker push {{ registry }}/aks-gpu:$(yq e '.grid_v20.version' driver_config.yml)-grid-v20
+
 buildgrid:
-    docker build --build-arg DRIVER_URL=$(yq e '.grid.url' driver_config.yml) --build-arg DRIVER_KIND=grid --build-arg DRIVER_VERSION=$(yq e '.grid.version' driver_config.yml) -f Dockerfile -t {{ registry }}/aks-gpu:{{VERSION}}-grid .
+    docker build --build-arg DRIVER_URL=$(yq e '.grid.url' driver_config.yml) --build-arg DRIVER_KIND=grid --build-arg DRIVER_VERSION=$(yq e '.grid.version' driver_config.yml) -f Dockerfile -t {{ registry }}/aks-gpu:$(yq e '.grid.version' driver_config.yml)-grid .
 
+buildgridv20:
+    docker build --build-arg DRIVER_URL=$(yq e '.grid_v20.url' driver_config.yml) --build-arg DRIVER_KIND=grid --build-arg DRIVER_VERSION=$(yq e '.grid_v20.version' driver_config.yml) -f Dockerfile -t {{ registry }}/aks-gpu:$(yq e '.grid_v20.version' driver_config.yml)-grid-v20 .
+
 buildcuda:
     docker build --build-arg DRIVER_KIND=cuda --build-arg DRIVER_VERSION=$(yq e '.cuda.version' driver_config.yml) -f Dockerfile -t {{ registry }}/aks-gpu:$(yq e '.cuda.version' driver_config.yml)-cuda .
+
+buildcudalts:
+    docker build --build-arg DRIVER_KIND=cuda --build-arg DRIVER_VERSION=$(yq e '.cuda_lts.version' driver_config.yml) -f Dockerfile -t {{ registry }}/aks-gpu:$(yq e '.cuda_lts.version' driver_config.yml)-cuda-lts .