NVIDIA
diff --git a/‎.github/actions/fetch_ctk/action.yml‎
Lines changed: 17 additions & 26 deletions b/‎.github/actions/fetch_ctk/action.yml‎
Lines changed: 17 additions & 26 deletions
diff --git a/‎.github/actions/sccache-summary/action.yml‎
Lines changed: 5 additions & 6 deletions b/‎.github/actions/sccache-summary/action.yml‎
Lines changed: 5 additions & 6 deletions
diff --git a/‎benchmarks/cuda_bindings/runner/main.py‎
Lines changed: 46 additions & 24 deletions b/‎benchmarks/cuda_bindings/runner/main.py‎
Lines changed: 46 additions & 24 deletions
diff --git a/‎benchmarks/cuda_bindings/tests/test_runner.py‎
Lines changed: 1 addition & 1 deletion b/‎benchmarks/cuda_bindings/tests/test_runner.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎benchmarks/cuda_core/.gitignore‎
Lines changed: 16 additions & 0 deletions b/‎benchmarks/cuda_core/.gitignore‎
Lines changed: 16 additions & 0 deletions
diff --git a/‎benchmarks/cuda_core/AGENTS.md‎
Lines changed: 11 additions & 0 deletions b/‎benchmarks/cuda_core/AGENTS.md‎
Lines changed: 11 additions & 0 deletions
@@ -14,7 +14,7 @@ inputs:
   cuda-components:
     description: "A list of the CTK components to install as a comma-separated list. e.g. 'cuda_nvcc,cuda_nvrtc,cuda_cudart'"
     required: false
-    default: "cuda_nvcc,cuda_cudart,cuda_crt,libnvvm,cuda_nvrtc,cuda_profiler_api,cuda_cccl,cuda_cupti,libnvjitlink,libcufile,libnvfatbin"
+    default: "cuda_nvcc,cuda_cudart,cuda_crt,libnvvm,cuda_nvrtc,cuda_profiler_api,cuda_cccl,cuda_cupti,libnvjitlink,libcufile,libnvfatbin,libcudla"
   cuda-path:
     description: "where the CTK components will be installed to, relative to $PWD"
     required: false
@@ -27,24 +27,15 @@ runs:
       shell: bash --noprofile --norc -xeuo pipefail {0}
       run: |
         # Pre-process the component list to ensure hash uniqueness
+        # Use the runtime workspace mount so this also works inside container jobs.
+        CTK_REDIST_TOOL="${GITHUB_WORKSPACE}/ci/tools/fetch_ctk_redistrib.py"
         CTK_CACHE_COMPONENTS=${{ inputs.cuda-components }}
-        # Conditionally strip out libnvjitlink for CUDA versions < 12
-        CUDA_MAJOR_VER="$(cut -d '.' -f 1 <<< ${{ inputs.cuda-version }})"
-        if [[ "$CUDA_MAJOR_VER" -lt 12 ]]; then
-          CTK_CACHE_COMPONENTS="${CTK_CACHE_COMPONENTS//libnvjitlink/}"
-        fi
-        # Conditionally strip out cuda_crt and libnvvm for CUDA versions < 13
-        CUDA_MAJOR_VER="$(cut -d '.' -f 1 <<< ${{ inputs.cuda-version }})"
-        if [[ "$CUDA_MAJOR_VER" -lt 13 ]]; then
-          CTK_CACHE_COMPONENTS="${CTK_CACHE_COMPONENTS//cuda_crt/}"
-          CTK_CACHE_COMPONENTS="${CTK_CACHE_COMPONENTS//libnvvm/}"
-        fi
-        # Conditionally strip out libcufile since it does not support Windows
-        if [[ "${{ inputs.host-platform }}" == win-* ]]; then
-          CTK_CACHE_COMPONENTS="${CTK_CACHE_COMPONENTS//libcufile/}"
-        fi
-        # Cleanup stray commas after removing components
-        CTK_CACHE_COMPONENTS="${CTK_CACHE_COMPONENTS//,,/,}"
+        CTK_JSON_URL="https://developer.download.nvidia.com/compute/cuda/redist/redistrib_${{ inputs.cuda-version }}.json"
+        CTK_CACHE_COMPONENTS="$(python "$CTK_REDIST_TOOL" filter-components \
+          --host-platform "${{ inputs.host-platform }}" \
+          --cuda-version "${{ inputs.cuda-version }}" \
+          --components "$CTK_CACHE_COMPONENTS" \
+          --metadata-url "$CTK_JSON_URL")"
 
         HASH=$(echo -n "${CTK_CACHE_COMPONENTS}" | sha256sum | awk '{print $1}')
         echo "CTK_CACHE_KEY=mini-ctk-${{ inputs.cuda-version }}-${{ inputs.host-platform }}-$HASH" >> $GITHUB_ENV
@@ -78,19 +69,17 @@ runs:
         mkdir $CACHE_TMP_DIR
 
         # The binary archives (redist) are guaranteed to be updated as part of the release posting.
+        # Use the runtime workspace mount so this also works inside container jobs.
+        CTK_REDIST_TOOL="${GITHUB_WORKSPACE}/ci/tools/fetch_ctk_redistrib.py"
         CTK_BASE_URL="https://developer.download.nvidia.com/compute/cuda/redist/"
         CTK_JSON_URL="$CTK_BASE_URL/redistrib_${{ inputs.cuda-version }}.json"
+        CTK_JSON_FILE="$CACHE_TMP_DIR/redistrib.json"
+        curl -LSs "$CTK_JSON_URL" -o "$CTK_JSON_FILE"
         if [[ "${{ inputs.host-platform }}" == linux* ]]; then
-          if [[ "${{ inputs.host-platform }}" == "linux-64" ]]; then
-            CTK_SUBDIR="linux-x86_64"
-          elif [[ "${{ inputs.host-platform }}" == "linux-aarch64" ]]; then
-            CTK_SUBDIR="linux-sbsa"
-          fi
           function extract() {
             tar -xvf $1 -C $CACHE_TMP_DIR --strip-components=1
           }
         elif [[ "${{ inputs.host-platform }}" == "win-64" ]]; then
-          CTK_SUBDIR="windows-x86_64"
           function extract() {
             _TEMP_DIR_=$(mktemp -d)
             unzip $1 -d $_TEMP_DIR_
@@ -106,8 +95,10 @@ runs:
             curl -LSs $1 -o $2
           }
           CTK_COMPONENT=$1
-          CTK_COMPONENT_REL_PATH="$(curl -s $CTK_JSON_URL |
-              python -c "import sys, json; print(json.load(sys.stdin)['${CTK_COMPONENT}']['${CTK_SUBDIR}']['relative_path'])")"
+          CTK_COMPONENT_REL_PATH="$(python "$CTK_REDIST_TOOL" component-relative-path \
+            --host-platform "${{ inputs.host-platform }}" \
+            --component "$CTK_COMPONENT" \
+            --metadata-path "$CTK_JSON_FILE")"
           CTK_COMPONENT_URL="${CTK_BASE_URL}/${CTK_COMPONENT_REL_PATH}"
           CTK_COMPONENT_COMPONENT_FILENAME="$(basename $CTK_COMPONENT_REL_PATH)"
           download $CTK_COMPONENT_URL $CTK_COMPONENT_COMPONENT_FILENAME
 
@@ -6,8 +6,6 @@ name: sccache summary
 description: Parse sccache stats JSON and write a summary table to GITHUB_STEP_SUMMARY
 
 # Inspired by NVIDIA/cccl's prepare-execution-summary.py (PR #3621).
-# Only counts C/C++ and CUDA language hits (excludes PTX/CUBIN which are
-# not included in sccache's compile_requests counter).
 
 inputs:
   json-file:
@@ -47,10 +45,11 @@ runs:
         with open(json_file) as f:
             stats = json.load(f)["stats"]
 
-        # compile_requests includes non-compilation calls (linker, etc).
-        # Use cache_hits + cache_misses as the denominator to match sccache's
-        # own "Cache hits rate" which only counts actual compilation requests.
-        counted_languages = {"C/C++", "CUDA"}
+        # compile_requests only counts top-level nvcc invocations, but each
+        # invocation spawns sub-tool compilations (cudafe++, cicc, ptxas) that
+        # sccache tracks under separate language keys.  Count all of them so
+        # the reported rate matches sccache's own "Cache hits rate".
+        counted_languages = {"C/C++", "CUDA", "CUDA (Device code)", "PTX", "CUBIN"}
         hits = sum(
             v for k, v in stats.get("cache_hits", {}).get("counts", {}).items()
             if k in counted_languages
 
@@ -16,30 +16,30 @@
 PROJECT_ROOT = Path(__file__).resolve().parent.parent
 BENCH_DIR = PROJECT_ROOT / "benchmarks"
 DEFAULT_OUTPUT = PROJECT_ROOT / "results-python.json"
+DEFAULT_MODULE_NAME_PREFIX = "cuda_bindings_bench"
 # Env var used to propagate the --benchmark filter from the parent to pyperf
 # worker subprocesses. pyperf reconstructs worker argv from scratch and drops
 # custom flags like --benchmark, so without this the worker would register the
 # full bench list and pyperf would run the wrong bench by task index.
-BENCH_FILTER_ENV_VAR = "CUDA_BINDINGS_BENCH_FILTER"
+DEFAULT_BENCH_FILTER_ENV_VAR = "CUDA_BINDINGS_BENCH_FILTER"
 
-PYPERF_INHERITED_ENV_VARS = (
+BASE_PYPERF_INHERITED_ENV_VARS = (
     "CUDA_HOME",
     "CUDA_PATH",
     "CUDA_VISIBLE_DEVICES",
     "LD_LIBRARY_PATH",
     "NVIDIA_VISIBLE_DEVICES",
-    BENCH_FILTER_ENV_VAR,
 )
 _MODULE_CACHE: dict[Path, ModuleType] = {}
 
 
-def load_module(module_path: Path) -> ModuleType:
+def load_module(module_path: Path, module_name_prefix: str = DEFAULT_MODULE_NAME_PREFIX) -> ModuleType:
     module_path = module_path.resolve()
     cached_module = _MODULE_CACHE.get(module_path)
     if cached_module is not None:
         return cached_module
 
-    module_name = f"cuda_bindings_bench_{module_path.stem}"
+    module_name = f"{module_name_prefix}_{module_path.stem}"
     spec = importlib.util.spec_from_file_location(module_name, module_path)
     if spec is None or spec.loader is None:
         raise RuntimeError(f"Failed to load benchmark module: {module_path}")
@@ -64,13 +64,17 @@ def _discover_module_functions(module_path: Path) -> list[str]:
     ]
 
 
-def _lazy_benchmark(module_path: Path, function_name: str) -> Callable[[int], float]:
+def _lazy_benchmark(
+    module_path: Path,
+    function_name: str,
+    module_name_prefix: str = DEFAULT_MODULE_NAME_PREFIX,
+) -> Callable[[int], float]:
     loaded_function: Callable[[int], float] | None = None
 
     def run(loops: int) -> float:
         nonlocal loaded_function
         if loaded_function is None:
-            module = load_module(module_path)
+            module = load_module(module_path, module_name_prefix=module_name_prefix)
             loaded_function = getattr(module, function_name)
         return loaded_function(loops)
 
@@ -86,6 +90,7 @@ def run(loops: int) -> float:
 def _collect_skipped_benchmarks(
     bench_ids: list[str],
     registry: dict[str, Callable[[int], float]],
+    module_name_prefix: str = DEFAULT_MODULE_NAME_PREFIX,
 ) -> set[str]:
     """Return bench IDs that the owning module has marked as unsupported.
 
@@ -106,29 +111,37 @@ def _collect_skipped_benchmarks(
             continue
         module = loaded_modules.get(module_path)
         if module is None:
-            module = load_module(module_path)
+            module = load_module(module_path, module_name_prefix=module_name_prefix)
             loaded_modules[module_path] = module
         module_skip = getattr(module, "SKIPPED_BENCHMARKS", None)
         if module_skip and function_name in module_skip:
             skipped.add(bench_id)
     return skipped
 
 
-def discover_benchmarks() -> dict[str, Callable[[int], float]]:
+def discover_benchmarks(
+    bench_dir: Path | None = None,
+    module_name_prefix: str = DEFAULT_MODULE_NAME_PREFIX,
+) -> dict[str, Callable[[int], float]]:
     """Discover bench_ functions.
 
     Each bench_ function must have the signature: bench_*(loops: int) -> float
     where it calls the operation `loops` times and returns the total elapsed
     time in seconds (using time.perf_counter).
     """
+    # Resolve the default inside the call so tests (and embedders) can
+    # monkeypatch ``BENCH_DIR`` at the module level — Python binds default
+    # args at def-time, so a literal default would ignore later patches.
+    if bench_dir is None:
+        bench_dir = BENCH_DIR
     registry: dict[str, Callable[[int], float]] = {}
-    for module_path in sorted(BENCH_DIR.glob("bench_*.py")):
+    for module_path in sorted(bench_dir.glob("bench_*.py")):
         module_name = module_path.stem
         for function_name in _discover_module_functions(module_path):
             bench_id = benchmark_id(module_name, function_name)
             if bench_id in registry:
                 raise ValueError(f"Duplicate benchmark ID discovered: {bench_id}")
-            registry[bench_id] = _lazy_benchmark(module_path, function_name)
+            registry[bench_id] = _lazy_benchmark(module_path, function_name, module_name_prefix=module_name_prefix)
     return registry
 
 
@@ -152,7 +165,10 @@ def _split_env_vars(arg_value: str) -> list[str]:
     return [env_var for env_var in arg_value.split(",") if env_var]
 
 
-def ensure_pyperf_worker_env(argv: list[str]) -> list[str]:
+def ensure_pyperf_worker_env(
+    argv: list[str],
+    extra_env_vars: tuple[str, ...] = (DEFAULT_BENCH_FILTER_ENV_VAR,),
+) -> list[str]:
     if "--copy-env" in argv:
         return list(argv)
 
@@ -175,7 +191,7 @@ def ensure_pyperf_worker_env(argv: list[str]) -> list[str]:
     if skip_next:
         raise ValueError("Missing value for --inherit-environ")
 
-    for env_var in PYPERF_INHERITED_ENV_VARS:
+    for env_var in (*BASE_PYPERF_INHERITED_ENV_VARS, *extra_env_vars):
         if env_var in os.environ:
             inherited_env.append(env_var)
 
@@ -190,7 +206,7 @@ def ensure_pyperf_worker_env(argv: list[str]) -> list[str]:
     return cleaned
 
 
-def parse_args(argv: list[str]) -> tuple[argparse.Namespace, list[str]]:
+def parse_args(argv: list[str], default_output: Path = DEFAULT_OUTPUT) -> tuple[argparse.Namespace, list[str]]:
     parser = argparse.ArgumentParser(add_help=False)
     parser.add_argument(
         "--benchmark",
@@ -207,19 +223,25 @@ def parse_args(argv: list[str]) -> tuple[argparse.Namespace, list[str]]:
         "-o",
         "--output",
         type=Path,
-        default=DEFAULT_OUTPUT,
-        help=f"JSON output file path (default: {DEFAULT_OUTPUT.name})",
+        default=default_output,
+        help=f"JSON output file path (default: {default_output.name})",
     )
     parsed, remaining = parser.parse_known_args(argv)
     return parsed, remaining
 
 
-def main() -> None:
-    parsed, remaining_argv = parse_args(sys.argv[1:])
+def main(
+    *,
+    bench_dir: Path = BENCH_DIR,
+    default_output: Path = DEFAULT_OUTPUT,
+    module_name_prefix: str = DEFAULT_MODULE_NAME_PREFIX,
+    bench_filter_env_var: str = DEFAULT_BENCH_FILTER_ENV_VAR,
+) -> None:
+    parsed, remaining_argv = parse_args(sys.argv[1:], default_output=default_output)
 
-    registry = discover_benchmarks()
+    registry = discover_benchmarks(bench_dir=bench_dir, module_name_prefix=module_name_prefix)
     if not registry:
-        raise RuntimeError(f"No benchmark functions found in {BENCH_DIR}")
+        raise RuntimeError(f"No benchmark functions found in {bench_dir}")
 
     if parsed.list:
         for bench_id in sorted(registry):
@@ -231,7 +253,7 @@ def main() -> None:
     # the wrong bench. pyperf drops unknown CLI flags when spawning workers,
     # so fall back to an env var carrying the filter.
     requested = list(parsed.benchmark)
-    env_filter = os.environ.get(BENCH_FILTER_ENV_VAR, "")
+    env_filter = os.environ.get(bench_filter_env_var, "")
     if not requested and env_filter:
         requested = [bid for bid in env_filter.split(",") if bid]
 
@@ -243,21 +265,21 @@ def main() -> None:
             raise ValueError(f"Unknown benchmark(s): {unknown}. Known benchmarks: {known}")
         benchmark_ids = requested
         # Propagate to any pyperf worker we're about to spawn.
-        os.environ[BENCH_FILTER_ENV_VAR] = ",".join(benchmark_ids)
+        os.environ[bench_filter_env_var] = ",".join(benchmark_ids)
     else:
         benchmark_ids = sorted(registry)
 
     # Strip any --output args to avoid conflicts with our output handling.
     output_path = parsed.output.resolve()
     remaining_argv = strip_pyperf_output_args(remaining_argv)
-    remaining_argv = ensure_pyperf_worker_env(remaining_argv)
+    remaining_argv = ensure_pyperf_worker_env(remaining_argv, extra_env_vars=(bench_filter_env_var,))
     is_worker = "--worker" in remaining_argv
 
     # Drop benchmarks that the owning module has marked as unavailable on
     # this driver/device. Without this step a single unsupported bench
     # (e.g. TMA on a pre-Hopper GPU) would abort the whole pyperf run,
     # since pyperf treats a raised exception as a fatal worker failure.
-    skipped = _collect_skipped_benchmarks(benchmark_ids, registry)
+    skipped = _collect_skipped_benchmarks(benchmark_ids, registry, module_name_prefix=module_name_prefix)
     if skipped and not is_worker:
         for bench_id in sorted(skipped):
             print(f"Skipping {bench_id}: unsupported on this driver/device", file=sys.stderr)
 
@@ -135,7 +135,7 @@ def test_discover_benchmarks_is_lazy(monkeypatch, tmp_path):
 def test_ensure_pyperf_worker_env_preserves_existing_args(monkeypatch):
     runner_main = load_runner_main(monkeypatch)
 
-    for env_var in runner_main.PYPERF_INHERITED_ENV_VARS:
+    for env_var in runner_main.BASE_PYPERF_INHERITED_ENV_VARS:
         monkeypatch.delenv(env_var, raising=False)
     monkeypatch.setenv("CUDA_PATH", "/opt/cuda")
     monkeypatch.setenv("LD_LIBRARY_PATH", "/opt/cuda/lib64")
 
@@ -0,0 +1,16 @@
+# Build artifacts
+.build/
+__pycache__/
+
+# Benchmark results
+*.json
+.benchmarks/
+
+# Pixi environments
+.pixi/
+
+# Override root .gitignore *.cpp rule (which targets Cython-generated files)
+!benchmarks/cpp/*.cpp
+
+results-python.json
+results-cpp.json
@@ -0,0 +1,11 @@
+# cuda.core benchmarks
+
+Read the README.md in this directory for more details about the benchmarks.
+
+When generating code, verify that the code is correct based on the source for cuda-core
+that can be found in ../../cuda_core.
+
+This suite shares the pyperf runner with `../cuda_bindings/` via a sys.path
+insert in `run_pyperf.py`. The per-suite setup (`runtime.py`, the `benchmarks/`
+module files) lives here. Benchmark IDs are kept identical to the cuda.bindings
+suite so `compare.py` can diff them directly.