NVIDIA
diff --git a/‎.github/actions/fetch_ctk/action.yml‎
Lines changed: 17 additions & 26 deletions b/‎.github/actions/fetch_ctk/action.yml‎
Lines changed: 17 additions & 26 deletions
diff --git a/‎.github/actions/sccache-summary/action.yml‎
Lines changed: 5 additions & 6 deletions b/‎.github/actions/sccache-summary/action.yml‎
Lines changed: 5 additions & 6 deletions
diff --git a/‎.github/workflows/build-wheel.yml‎
Lines changed: 1 addition & 5 deletions b/‎.github/workflows/build-wheel.yml‎
Lines changed: 1 addition & 5 deletions
diff --git a/‎.github/workflows/ci-nightly.yml‎
Lines changed: 257 additions & 0 deletions b/‎.github/workflows/ci-nightly.yml‎
Lines changed: 257 additions & 0 deletions
@@ -14,7 +14,7 @@ inputs:
   cuda-components:
     description: "A list of the CTK components to install as a comma-separated list. e.g. 'cuda_nvcc,cuda_nvrtc,cuda_cudart'"
     required: false
-    default: "cuda_nvcc,cuda_cudart,cuda_crt,libnvvm,cuda_nvrtc,cuda_profiler_api,cuda_cccl,cuda_cupti,libnvjitlink,libcufile,libnvfatbin"
+    default: "cuda_nvcc,cuda_cudart,cuda_crt,libnvvm,cuda_nvrtc,cuda_profiler_api,cuda_cccl,cuda_cupti,libnvjitlink,libcufile,libnvfatbin,libcudla"
   cuda-path:
     description: "where the CTK components will be installed to, relative to $PWD"
     required: false
@@ -27,24 +27,15 @@ runs:
       shell: bash --noprofile --norc -xeuo pipefail {0}
       run: |
         # Pre-process the component list to ensure hash uniqueness
+        # Use the runtime workspace mount so this also works inside container jobs.
+        CTK_REDIST_TOOL="${GITHUB_WORKSPACE}/ci/tools/fetch_ctk_redistrib.py"
         CTK_CACHE_COMPONENTS=${{ inputs.cuda-components }}
-        # Conditionally strip out libnvjitlink for CUDA versions < 12
-        CUDA_MAJOR_VER="$(cut -d '.' -f 1 <<< ${{ inputs.cuda-version }})"
-        if [[ "$CUDA_MAJOR_VER" -lt 12 ]]; then
-          CTK_CACHE_COMPONENTS="${CTK_CACHE_COMPONENTS//libnvjitlink/}"
-        fi
-        # Conditionally strip out cuda_crt and libnvvm for CUDA versions < 13
-        CUDA_MAJOR_VER="$(cut -d '.' -f 1 <<< ${{ inputs.cuda-version }})"
-        if [[ "$CUDA_MAJOR_VER" -lt 13 ]]; then
-          CTK_CACHE_COMPONENTS="${CTK_CACHE_COMPONENTS//cuda_crt/}"
-          CTK_CACHE_COMPONENTS="${CTK_CACHE_COMPONENTS//libnvvm/}"
-        fi
-        # Conditionally strip out libcufile since it does not support Windows
-        if [[ "${{ inputs.host-platform }}" == win-* ]]; then
-          CTK_CACHE_COMPONENTS="${CTK_CACHE_COMPONENTS//libcufile/}"
-        fi
-        # Cleanup stray commas after removing components
-        CTK_CACHE_COMPONENTS="${CTK_CACHE_COMPONENTS//,,/,}"
+        CTK_JSON_URL="https://developer.download.nvidia.com/compute/cuda/redist/redistrib_${{ inputs.cuda-version }}.json"
+        CTK_CACHE_COMPONENTS="$(python "$CTK_REDIST_TOOL" filter-components \
+          --host-platform "${{ inputs.host-platform }}" \
+          --cuda-version "${{ inputs.cuda-version }}" \
+          --components "$CTK_CACHE_COMPONENTS" \
+          --metadata-url "$CTK_JSON_URL")"
 
         HASH=$(echo -n "${CTK_CACHE_COMPONENTS}" | sha256sum | awk '{print $1}')
         echo "CTK_CACHE_KEY=mini-ctk-${{ inputs.cuda-version }}-${{ inputs.host-platform }}-$HASH" >> $GITHUB_ENV
@@ -78,19 +69,17 @@ runs:
         mkdir $CACHE_TMP_DIR
 
         # The binary archives (redist) are guaranteed to be updated as part of the release posting.
+        # Use the runtime workspace mount so this also works inside container jobs.
+        CTK_REDIST_TOOL="${GITHUB_WORKSPACE}/ci/tools/fetch_ctk_redistrib.py"
         CTK_BASE_URL="https://developer.download.nvidia.com/compute/cuda/redist/"
         CTK_JSON_URL="$CTK_BASE_URL/redistrib_${{ inputs.cuda-version }}.json"
+        CTK_JSON_FILE="$CACHE_TMP_DIR/redistrib.json"
+        curl -LSs "$CTK_JSON_URL" -o "$CTK_JSON_FILE"
         if [[ "${{ inputs.host-platform }}" == linux* ]]; then
-          if [[ "${{ inputs.host-platform }}" == "linux-64" ]]; then
-            CTK_SUBDIR="linux-x86_64"
-          elif [[ "${{ inputs.host-platform }}" == "linux-aarch64" ]]; then
-            CTK_SUBDIR="linux-sbsa"
-          fi
           function extract() {
             tar -xvf $1 -C $CACHE_TMP_DIR --strip-components=1
           }
         elif [[ "${{ inputs.host-platform }}" == "win-64" ]]; then
-          CTK_SUBDIR="windows-x86_64"
           function extract() {
             _TEMP_DIR_=$(mktemp -d)
             unzip $1 -d $_TEMP_DIR_
@@ -106,8 +95,10 @@ runs:
             curl -LSs $1 -o $2
           }
           CTK_COMPONENT=$1
-          CTK_COMPONENT_REL_PATH="$(curl -s $CTK_JSON_URL |
-              python -c "import sys, json; print(json.load(sys.stdin)['${CTK_COMPONENT}']['${CTK_SUBDIR}']['relative_path'])")"
+          CTK_COMPONENT_REL_PATH="$(python "$CTK_REDIST_TOOL" component-relative-path \
+            --host-platform "${{ inputs.host-platform }}" \
+            --component "$CTK_COMPONENT" \
+            --metadata-path "$CTK_JSON_FILE")"
           CTK_COMPONENT_URL="${CTK_BASE_URL}/${CTK_COMPONENT_REL_PATH}"
           CTK_COMPONENT_COMPONENT_FILENAME="$(basename $CTK_COMPONENT_REL_PATH)"
           download $CTK_COMPONENT_URL $CTK_COMPONENT_COMPONENT_FILENAME
 
@@ -6,8 +6,6 @@ name: sccache summary
 description: Parse sccache stats JSON and write a summary table to GITHUB_STEP_SUMMARY
 
 # Inspired by NVIDIA/cccl's prepare-execution-summary.py (PR #3621).
-# Only counts C/C++ and CUDA language hits (excludes PTX/CUBIN which are
-# not included in sccache's compile_requests counter).
 
 inputs:
   json-file:
@@ -47,10 +45,11 @@ runs:
         with open(json_file) as f:
             stats = json.load(f)["stats"]
 
-        # compile_requests includes non-compilation calls (linker, etc).
-        # Use cache_hits + cache_misses as the denominator to match sccache's
-        # own "Cache hits rate" which only counts actual compilation requests.
-        counted_languages = {"C/C++", "CUDA"}
+        # compile_requests only counts top-level nvcc invocations, but each
+        # invocation spawns sub-tool compilations (cudafe++, cicc, ptxas) that
+        # sccache tracks under separate language keys.  Count all of them so
+        # the reported rate matches sccache's own "Cache hits rate".
+        counted_languages = {"C/C++", "CUDA", "CUDA (Device code)", "PTX", "CUBIN"}
         hits = sum(
             v for k, v in stats.get("cache_hits", {}).get("counts", {}).items()
             if k in counted_languages
 
@@ -401,11 +401,7 @@ jobs:
 
           OLD_BRANCH=$(yq '.backport_branch' ci/versions.yml)
           OLD_BASENAME="cuda-bindings-python${PYTHON_VERSION_FORMATTED}-cuda*-${{ inputs.host-platform }}*"
-          LATEST_PRIOR_RUN_ID=$(gh run list -b ${OLD_BRANCH} -L 1 -w "ci.yml" -s success -R NVIDIA/cuda-python --json databaseId | jq '.[]| .databaseId')
-          if [[ "$LATEST_PRIOR_RUN_ID" == "" ]]; then
-            echo "LATEST_PRIOR_RUN_ID not found!"
-            exit 1
-          fi
+          LATEST_PRIOR_RUN_ID=$(./ci/tools/lookup-run-id --branch "${OLD_BRANCH}" NVIDIA/cuda-python "CI")
 
           gh run download $LATEST_PRIOR_RUN_ID -p ${OLD_BASENAME} -R NVIDIA/cuda-python
           rm -rf ${OLD_BASENAME}-tests  # exclude cython test artifacts
 
@@ -0,0 +1,257 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# Nightly CI pipeline that tests optional dependencies (PyTorch, numba-cuda)
+# against the latest cuda-python wheels built on main, and runs the standard
+# test suite on runners reserved for nightly-only use (e.g. arm64 l4×2).
+#
+# This workflow does NOT build wheels — it downloads them from the latest
+# successful CI run on main and runs integration/standard tests.
+
+name: 'CI: Nightly optional-deps'
+
+on:
+  schedule:
+    # Daily at 02:17 UTC, i.e. after the midnight main CI build finishes.
+    # Minute 0 is deliberately avoided: GitHub documents high
+    # scheduled-workflow load at the start of every hour, where queued
+    # jobs may be delayed or dropped.
+    - cron: '17 2 * * *'
+  workflow_dispatch:
+    inputs:
+      run-id:
+        type: string
+        description: >
+          Override the CI run ID to download artifacts from. Leave empty
+          to auto-detect the latest successful main run.
+        default: ""
+
+# One concurrency group per workflow, ref and triggering event; starting a
+# new run cancels a run of the same group that is still in progress.
+concurrency:
+  cancel-in-progress: true
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }}
+
+jobs:
+  # Resolve the CI run whose artifacts the nightly jobs consume, and publish
+  # its run ID, its head commit SHA and the CUDA build version read from
+  # ci/versions.yml at that commit as job outputs.
+  find-wheels:
+    runs-on: ubuntu-latest
+    outputs:
+      RUN_ID: ${{ steps.find.outputs.run_id }}
+      HEAD_SHA: ${{ steps.find.outputs.head_sha }}
+      CUDA_BUILD_VER: ${{ steps.find.outputs.cuda_build_ver }}
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+        with:
+          fetch-depth: 1
+
+      - name: Find latest successful CI run on main
+        id: find
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          # Hand the user-supplied input to the script through the
+          # environment rather than interpolating it into the script text,
+          # so its value can never be parsed as shell syntax.
+          RUN_ID_OVERRIDE: ${{ inputs.run-id }}
+        run: |
+          if [[ -n "$RUN_ID_OVERRIDE" ]]; then
+            # Reject anything that is not a plain numeric run ID up front.
+            if [[ ! "$RUN_ID_OVERRIDE" =~ ^[0-9]+$ ]]; then
+              echo "::error::The run-id input must be a numeric workflow run ID"
+              exit 1
+            fi
+            RUN_ID="$RUN_ID_OVERRIDE"
+            # GITHUB_REPOSITORY (owner/repo) is set by the runner.
+            HEAD_SHA=$(gh run view "$RUN_ID" \
+              -R "$GITHUB_REPOSITORY" \
+              --json headSha | jq -r '.headSha')
+          else
+            # lookup-run-id --branch --head-sha prints two lines: run_id then head_sha
+            OUTPUT=$(./ci/tools/lookup-run-id --branch main --head-sha "$GITHUB_REPOSITORY" "CI")
+            RUN_ID=$(echo "$OUTPUT" | sed -n '1p')
+            HEAD_SHA=$(echo "$OUTPUT" | sed -n '2p')
+          fi
+
+          # jq -r prints the literal string "null" for a missing field.
+          if [[ -z "$HEAD_SHA" || "$HEAD_SHA" == "null" ]]; then
+            echo "::error::Could not resolve head SHA for CI run $RUN_ID"
+            exit 1
+          fi
+
+          # Read ci/versions.yml as it was at HEAD_SHA (not the checked-out
+          # ref) and extract the CUDA build version from it.
+          CUDA_BUILD_VER=$(gh api \
+            "repos/$GITHUB_REPOSITORY/contents/ci/versions.yml?ref=$HEAD_SHA" \
+            --jq '.content' \
+            | base64 -d \
+            | yq '.cuda.build.version')
+
+          if [[ -z "$CUDA_BUILD_VER" || "$CUDA_BUILD_VER" == "null" ]]; then
+            echo "::error::Could not resolve CUDA build version from $HEAD_SHA"
+            exit 1
+          fi
+
+          echo "run_id=$RUN_ID" >> "$GITHUB_OUTPUT"
+          echo "head_sha=$HEAD_SHA" >> "$GITHUB_OUTPUT"
+          echo "cuda_build_ver=$CUDA_BUILD_VER" >> "$GITHUB_OUTPUT"
+
+  # ── PyTorch interop tests ──
+
+  # PyTorch tests on linux-64, delegated to the reusable Linux test workflow
+  # and fed with the run resolved by find-wheels.
+  test-pytorch-linux:
+    name: 'Nightly PyTorch (linux-64)'
+    needs: find-wheels
+    # Skipped unless the repository owner is 'nvidia'.
+    if: ${{ github.repository_owner == 'nvidia' }}
+    # Read-only token scopes for the called workflow.
+    permissions:
+      actions: read
+      contents: read
+    uses: ./.github/workflows/test-wheel-linux.yml
+    secrets: inherit
+    with:
+      # Artifact source, taken from the find-wheels outputs.
+      run-id: ${{ needs.find-wheels.outputs.RUN_ID }}
+      sha: ${{ needs.find-wheels.outputs.HEAD_SHA }}
+      build-ctk-ver: ${{ needs.find-wheels.outputs.CUDA_BUILD_VER }}
+      build-type: nightly
+      host-platform: linux-64
+      # Selects entries with MODE == "nightly-pytorch" (presumably applied
+      # to the test matrix by the called workflow; confirm there).
+      test-mode: nightly-pytorch
+      matrix_filter: 'map(select(.MODE == "nightly-pytorch"))'
+
+  # PyTorch tests on linux-aarch64, delegated to the reusable Linux test
+  # workflow and fed with the run resolved by find-wheels.
+  test-pytorch-linux-aarch64:
+    name: 'Nightly PyTorch (linux-aarch64)'
+    needs: find-wheels
+    # Skipped unless the repository owner is 'nvidia'.
+    if: ${{ github.repository_owner == 'nvidia' }}
+    # Read-only token scopes for the called workflow.
+    permissions:
+      actions: read
+      contents: read
+    uses: ./.github/workflows/test-wheel-linux.yml
+    secrets: inherit
+    with:
+      # Artifact source, taken from the find-wheels outputs.
+      run-id: ${{ needs.find-wheels.outputs.RUN_ID }}
+      sha: ${{ needs.find-wheels.outputs.HEAD_SHA }}
+      build-ctk-ver: ${{ needs.find-wheels.outputs.CUDA_BUILD_VER }}
+      build-type: nightly
+      host-platform: linux-aarch64
+      # Selects entries with MODE == "nightly-pytorch" (presumably applied
+      # to the test matrix by the called workflow; confirm there).
+      test-mode: nightly-pytorch
+      matrix_filter: 'map(select(.MODE == "nightly-pytorch"))'
+
+  # PyTorch tests on win-64, delegated to the reusable Windows test workflow
+  # and fed with the run resolved by find-wheels.
+  test-pytorch-windows:
+    name: 'Nightly PyTorch (win-64)'
+    needs: find-wheels
+    # Skipped unless the repository owner is 'nvidia'.
+    if: ${{ github.repository_owner == 'nvidia' }}
+    # Read-only token scopes for the called workflow.
+    permissions:
+      actions: read
+      contents: read
+    uses: ./.github/workflows/test-wheel-windows.yml
+    secrets: inherit
+    with:
+      # Artifact source, taken from the find-wheels outputs.
+      run-id: ${{ needs.find-wheels.outputs.RUN_ID }}
+      sha: ${{ needs.find-wheels.outputs.HEAD_SHA }}
+      build-ctk-ver: ${{ needs.find-wheels.outputs.CUDA_BUILD_VER }}
+      build-type: nightly
+      host-platform: win-64
+      # Selects entries with MODE == "nightly-pytorch" (presumably applied
+      # to the test matrix by the called workflow; confirm there).
+      test-mode: nightly-pytorch
+      matrix_filter: 'map(select(.MODE == "nightly-pytorch"))'
+
+  # ── numba-cuda tests ──
+
+  # numba-cuda tests on linux-64, delegated to the reusable Linux test
+  # workflow and fed with the run resolved by find-wheels.
+  test-numba-cuda-linux-64:
+    name: 'Nightly numba-cuda (linux-64)'
+    needs: find-wheels
+    # Skipped unless the repository owner is 'nvidia'.
+    if: ${{ github.repository_owner == 'nvidia' }}
+    # Read-only token scopes for the called workflow.
+    permissions:
+      actions: read
+      contents: read
+    uses: ./.github/workflows/test-wheel-linux.yml
+    secrets: inherit
+    with:
+      # Artifact source, taken from the find-wheels outputs.
+      run-id: ${{ needs.find-wheels.outputs.RUN_ID }}
+      sha: ${{ needs.find-wheels.outputs.HEAD_SHA }}
+      build-ctk-ver: ${{ needs.find-wheels.outputs.CUDA_BUILD_VER }}
+      build-type: nightly
+      host-platform: linux-64
+      # Selects entries with MODE == "nightly-numba-cuda" (presumably applied
+      # to the test matrix by the called workflow; confirm there).
+      test-mode: nightly-numba-cuda
+      matrix_filter: 'map(select(.MODE == "nightly-numba-cuda"))'
+
+  # numba-cuda tests on linux-aarch64, delegated to the reusable Linux test
+  # workflow and fed with the run resolved by find-wheels.
+  test-numba-cuda-linux-aarch64:
+    name: 'Nightly numba-cuda (linux-aarch64)'
+    needs: find-wheels
+    # Skipped unless the repository owner is 'nvidia'.
+    if: ${{ github.repository_owner == 'nvidia' }}
+    # Read-only token scopes for the called workflow.
+    permissions:
+      actions: read
+      contents: read
+    uses: ./.github/workflows/test-wheel-linux.yml
+    secrets: inherit
+    with:
+      # Artifact source, taken from the find-wheels outputs.
+      run-id: ${{ needs.find-wheels.outputs.RUN_ID }}
+      sha: ${{ needs.find-wheels.outputs.HEAD_SHA }}
+      build-ctk-ver: ${{ needs.find-wheels.outputs.CUDA_BUILD_VER }}
+      build-type: nightly
+      host-platform: linux-aarch64
+      # Selects entries with MODE == "nightly-numba-cuda" (presumably applied
+      # to the test matrix by the called workflow; confirm there).
+      test-mode: nightly-numba-cuda
+      matrix_filter: 'map(select(.MODE == "nightly-numba-cuda"))'
+
+  # numba-cuda tests on win-64, delegated to the reusable Windows test
+  # workflow and fed with the run resolved by find-wheels.
+  test-numba-cuda-windows:
+    name: 'Nightly numba-cuda (win-64)'
+    needs: find-wheels
+    # Skipped unless the repository owner is 'nvidia'.
+    if: ${{ github.repository_owner == 'nvidia' }}
+    # Read-only token scopes for the called workflow.
+    permissions:
+      actions: read
+      contents: read
+    uses: ./.github/workflows/test-wheel-windows.yml
+    secrets: inherit
+    with:
+      # Artifact source, taken from the find-wheels outputs.
+      run-id: ${{ needs.find-wheels.outputs.RUN_ID }}
+      sha: ${{ needs.find-wheels.outputs.HEAD_SHA }}
+      build-ctk-ver: ${{ needs.find-wheels.outputs.CUDA_BUILD_VER }}
+      build-type: nightly
+      host-platform: win-64
+      # Selects entries with MODE == "nightly-numba-cuda" (presumably applied
+      # to the test matrix by the called workflow; confirm there).
+      test-mode: nightly-numba-cuda
+      matrix_filter: 'map(select(.MODE == "nightly-numba-cuda"))'
+
+  # ── Standard tests on nightly-only runners ──
+
+  # Standard test suite on linux-aarch64, delegated to the reusable Linux
+  # test workflow and fed with the run resolved by find-wheels.
+  test-standard-linux-aarch64:
+    name: 'Nightly standard (linux-aarch64)'
+    needs: find-wheels
+    # Skipped unless the repository owner is 'nvidia'.
+    if: ${{ github.repository_owner == 'nvidia' }}
+    # Read-only token scopes for the called workflow.
+    permissions:
+      actions: read
+      contents: read
+    uses: ./.github/workflows/test-wheel-linux.yml
+    secrets: inherit
+    with:
+      # Artifact source, taken from the find-wheels outputs.
+      run-id: ${{ needs.find-wheels.outputs.RUN_ID }}
+      sha: ${{ needs.find-wheels.outputs.HEAD_SHA }}
+      build-ctk-ver: ${{ needs.find-wheels.outputs.CUDA_BUILD_VER }}
+      build-type: nightly
+      host-platform: linux-aarch64
+      # Note the two values differ: test-mode is "standard" while the filter
+      # selects entries with MODE == "nightly-standard".
+      test-mode: standard
+      matrix_filter: 'map(select(.MODE == "nightly-standard"))'
+
+  # ── Status check ──
+
+  # Aggregate gate for the nightly run: fails when find-wheels did not
+  # succeed or when any job listed under `needs` failed or was cancelled.
+  checks:
+    name: Nightly check status
+    if: always()
+    runs-on: ubuntu-latest
+    needs:
+      - find-wheels
+      - test-pytorch-linux
+      - test-pytorch-linux-aarch64
+      - test-pytorch-windows
+      - test-numba-cuda-linux-64
+      - test-numba-cuda-linux-aarch64
+      - test-numba-cuda-windows
+      - test-standard-linux-aarch64
+    steps:
+      - name: Exit
+        run: |
+          # If any dependency was cancelled or failed, that's a failure.
+          #
+          # See ci.yml for the full rationale on why we must use always()
+          # and explicitly check each result rather than relying on the
+          # default behaviour.
+          #
+          # The ${{ }} expressions below are rendered to the literal words
+          # `true` / `false` before the shell runs, so each `if` simply
+          # executes that builtin.
+          #
+          # find-wheels must have succeeded; any other result (including
+          # 'skipped') is a failure.
+          if ${{ needs.find-wheels.result != 'success' }}; then
+            exit 1
+          fi
+          # needs.*.result expands to the result of every job listed under
+          # `needs`, so a job added to that list is covered automatically.
+          # A 'skipped' result is tolerated here.
+          if ${{ contains(needs.*.result, 'failure') ||
+                 contains(needs.*.result, 'cancelled') }}; then
+            exit 1
+          fi
+          exit 0