Skip to content

Commit 5b106c4

Browse files
authored
Merge branch 'main' into fix-graph-destructor-shutdown
2 parents 327b53e + ff43d26 commit 5b106c4

39 files changed

Lines changed: 6417 additions & 993 deletions

.github/actions/fetch_ctk/action.yml

Lines changed: 17 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ inputs:
1414
cuda-components:
1515
description: "A list of the CTK components to install as a comma-separated list. e.g. 'cuda_nvcc,cuda_nvrtc,cuda_cudart'"
1616
required: false
17-
default: "cuda_nvcc,cuda_cudart,cuda_crt,libnvvm,cuda_nvrtc,cuda_profiler_api,cuda_cccl,cuda_cupti,libnvjitlink,libcufile,libnvfatbin"
17+
default: "cuda_nvcc,cuda_cudart,cuda_crt,libnvvm,cuda_nvrtc,cuda_profiler_api,cuda_cccl,cuda_cupti,libnvjitlink,libcufile,libnvfatbin,libcudla"
1818
cuda-path:
1919
description: "where the CTK components will be installed to, relative to $PWD"
2020
required: false
@@ -27,24 +27,15 @@ runs:
2727
shell: bash --noprofile --norc -xeuo pipefail {0}
2828
run: |
2929
# Pre-process the component list to ensure hash uniqueness
30+
# Use the runtime workspace mount so this also works inside container jobs.
31+
CTK_REDIST_TOOL="${GITHUB_WORKSPACE}/ci/tools/fetch_ctk_redistrib.py"
3032
CTK_CACHE_COMPONENTS=${{ inputs.cuda-components }}
31-
# Conditionally strip out libnvjitlink for CUDA versions < 12
32-
CUDA_MAJOR_VER="$(cut -d '.' -f 1 <<< ${{ inputs.cuda-version }})"
33-
if [[ "$CUDA_MAJOR_VER" -lt 12 ]]; then
34-
CTK_CACHE_COMPONENTS="${CTK_CACHE_COMPONENTS//libnvjitlink/}"
35-
fi
36-
# Conditionally strip out cuda_crt and libnvvm for CUDA versions < 13
37-
CUDA_MAJOR_VER="$(cut -d '.' -f 1 <<< ${{ inputs.cuda-version }})"
38-
if [[ "$CUDA_MAJOR_VER" -lt 13 ]]; then
39-
CTK_CACHE_COMPONENTS="${CTK_CACHE_COMPONENTS//cuda_crt/}"
40-
CTK_CACHE_COMPONENTS="${CTK_CACHE_COMPONENTS//libnvvm/}"
41-
fi
42-
# Conditionally strip out libcufile since it does not support Windows
43-
if [[ "${{ inputs.host-platform }}" == win-* ]]; then
44-
CTK_CACHE_COMPONENTS="${CTK_CACHE_COMPONENTS//libcufile/}"
45-
fi
46-
# Cleanup stray commas after removing components
47-
CTK_CACHE_COMPONENTS="${CTK_CACHE_COMPONENTS//,,/,}"
33+
CTK_JSON_URL="https://developer.download.nvidia.com/compute/cuda/redist/redistrib_${{ inputs.cuda-version }}.json"
34+
CTK_CACHE_COMPONENTS="$(python "$CTK_REDIST_TOOL" filter-components \
35+
--host-platform "${{ inputs.host-platform }}" \
36+
--cuda-version "${{ inputs.cuda-version }}" \
37+
--components "$CTK_CACHE_COMPONENTS" \
38+
--metadata-url "$CTK_JSON_URL")"
4839
4940
HASH=$(echo -n "${CTK_CACHE_COMPONENTS}" | sha256sum | awk '{print $1}')
5041
echo "CTK_CACHE_KEY=mini-ctk-${{ inputs.cuda-version }}-${{ inputs.host-platform }}-$HASH" >> $GITHUB_ENV
@@ -78,19 +69,17 @@ runs:
7869
mkdir $CACHE_TMP_DIR
7970
8071
# The binary archives (redist) are guaranteed to be updated as part of the release posting.
72+
# Use the runtime workspace mount so this also works inside container jobs.
73+
CTK_REDIST_TOOL="${GITHUB_WORKSPACE}/ci/tools/fetch_ctk_redistrib.py"
8174
CTK_BASE_URL="https://developer.download.nvidia.com/compute/cuda/redist/"
8275
CTK_JSON_URL="$CTK_BASE_URL/redistrib_${{ inputs.cuda-version }}.json"
76+
CTK_JSON_FILE="$CACHE_TMP_DIR/redistrib.json"
77+
curl -LSs "$CTK_JSON_URL" -o "$CTK_JSON_FILE"
8378
if [[ "${{ inputs.host-platform }}" == linux* ]]; then
84-
if [[ "${{ inputs.host-platform }}" == "linux-64" ]]; then
85-
CTK_SUBDIR="linux-x86_64"
86-
elif [[ "${{ inputs.host-platform }}" == "linux-aarch64" ]]; then
87-
CTK_SUBDIR="linux-sbsa"
88-
fi
8979
function extract() {
9080
tar -xvf $1 -C $CACHE_TMP_DIR --strip-components=1
9181
}
9282
elif [[ "${{ inputs.host-platform }}" == "win-64" ]]; then
93-
CTK_SUBDIR="windows-x86_64"
9483
function extract() {
9584
_TEMP_DIR_=$(mktemp -d)
9685
unzip $1 -d $_TEMP_DIR_
@@ -106,8 +95,10 @@ runs:
10695
curl -LSs $1 -o $2
10796
}
10897
CTK_COMPONENT=$1
109-
CTK_COMPONENT_REL_PATH="$(curl -s $CTK_JSON_URL |
110-
python -c "import sys, json; print(json.load(sys.stdin)['${CTK_COMPONENT}']['${CTK_SUBDIR}']['relative_path'])")"
98+
CTK_COMPONENT_REL_PATH="$(python "$CTK_REDIST_TOOL" component-relative-path \
99+
--host-platform "${{ inputs.host-platform }}" \
100+
--component "$CTK_COMPONENT" \
101+
--metadata-path "$CTK_JSON_FILE")"
111102
CTK_COMPONENT_URL="${CTK_BASE_URL}/${CTK_COMPONENT_REL_PATH}"
112103
CTK_COMPONENT_COMPONENT_FILENAME="$(basename $CTK_COMPONENT_REL_PATH)"
113104
download $CTK_COMPONENT_URL $CTK_COMPONENT_COMPONENT_FILENAME

.github/actions/sccache-summary/action.yml

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,6 @@ name: sccache summary
66
description: Parse sccache stats JSON and write a summary table to GITHUB_STEP_SUMMARY
77

88
# Inspired by NVIDIA/cccl's prepare-execution-summary.py (PR #3621).
9-
# Only counts C/C++ and CUDA language hits (excludes PTX/CUBIN which are
10-
# not included in sccache's compile_requests counter).
119

1210
inputs:
1311
json-file:
@@ -47,10 +45,11 @@ runs:
4745
with open(json_file) as f:
4846
stats = json.load(f)["stats"]
4947
50-
# compile_requests includes non-compilation calls (linker, etc).
51-
# Use cache_hits + cache_misses as the denominator to match sccache's
52-
# own "Cache hits rate" which only counts actual compilation requests.
53-
counted_languages = {"C/C++", "CUDA"}
48+
# compile_requests only counts top-level nvcc invocations, but each
49+
# invocation spawns sub-tool compilations (cudafe++, cicc, ptxas) that
50+
# sccache tracks under separate language keys. Count all of them so
51+
# the reported rate matches sccache's own "Cache hits rate".
52+
counted_languages = {"C/C++", "CUDA", "CUDA (Device code)", "PTX", "CUBIN"}
5453
hits = sum(
5554
v for k, v in stats.get("cache_hits", {}).get("counts", {}).items()
5655
if k in counted_languages

benchmarks/cuda_bindings/runner/main.py

Lines changed: 46 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -16,30 +16,30 @@
1616
PROJECT_ROOT = Path(__file__).resolve().parent.parent
1717
BENCH_DIR = PROJECT_ROOT / "benchmarks"
1818
DEFAULT_OUTPUT = PROJECT_ROOT / "results-python.json"
19+
DEFAULT_MODULE_NAME_PREFIX = "cuda_bindings_bench"
1920
# Env var used to propagate the --benchmark filter from the parent to pyperf
2021
# worker subprocesses. pyperf reconstructs worker argv from scratch and drops
2122
# custom flags like --benchmark, so without this the worker would register the
2223
# full bench list and pyperf would run the wrong bench by task index.
23-
BENCH_FILTER_ENV_VAR = "CUDA_BINDINGS_BENCH_FILTER"
24+
DEFAULT_BENCH_FILTER_ENV_VAR = "CUDA_BINDINGS_BENCH_FILTER"
2425

25-
PYPERF_INHERITED_ENV_VARS = (
26+
BASE_PYPERF_INHERITED_ENV_VARS = (
2627
"CUDA_HOME",
2728
"CUDA_PATH",
2829
"CUDA_VISIBLE_DEVICES",
2930
"LD_LIBRARY_PATH",
3031
"NVIDIA_VISIBLE_DEVICES",
31-
BENCH_FILTER_ENV_VAR,
3232
)
3333
_MODULE_CACHE: dict[Path, ModuleType] = {}
3434

3535

36-
def load_module(module_path: Path) -> ModuleType:
36+
def load_module(module_path: Path, module_name_prefix: str = DEFAULT_MODULE_NAME_PREFIX) -> ModuleType:
3737
module_path = module_path.resolve()
3838
cached_module = _MODULE_CACHE.get(module_path)
3939
if cached_module is not None:
4040
return cached_module
4141

42-
module_name = f"cuda_bindings_bench_{module_path.stem}"
42+
module_name = f"{module_name_prefix}_{module_path.stem}"
4343
spec = importlib.util.spec_from_file_location(module_name, module_path)
4444
if spec is None or spec.loader is None:
4545
raise RuntimeError(f"Failed to load benchmark module: {module_path}")
@@ -64,13 +64,17 @@ def _discover_module_functions(module_path: Path) -> list[str]:
6464
]
6565

6666

67-
def _lazy_benchmark(module_path: Path, function_name: str) -> Callable[[int], float]:
67+
def _lazy_benchmark(
68+
module_path: Path,
69+
function_name: str,
70+
module_name_prefix: str = DEFAULT_MODULE_NAME_PREFIX,
71+
) -> Callable[[int], float]:
6872
loaded_function: Callable[[int], float] | None = None
6973

7074
def run(loops: int) -> float:
7175
nonlocal loaded_function
7276
if loaded_function is None:
73-
module = load_module(module_path)
77+
module = load_module(module_path, module_name_prefix=module_name_prefix)
7478
loaded_function = getattr(module, function_name)
7579
return loaded_function(loops)
7680

@@ -86,6 +90,7 @@ def run(loops: int) -> float:
8690
def _collect_skipped_benchmarks(
8791
bench_ids: list[str],
8892
registry: dict[str, Callable[[int], float]],
93+
module_name_prefix: str = DEFAULT_MODULE_NAME_PREFIX,
8994
) -> set[str]:
9095
"""Return bench IDs that the owning module has marked as unsupported.
9196
@@ -106,29 +111,37 @@ def _collect_skipped_benchmarks(
106111
continue
107112
module = loaded_modules.get(module_path)
108113
if module is None:
109-
module = load_module(module_path)
114+
module = load_module(module_path, module_name_prefix=module_name_prefix)
110115
loaded_modules[module_path] = module
111116
module_skip = getattr(module, "SKIPPED_BENCHMARKS", None)
112117
if module_skip and function_name in module_skip:
113118
skipped.add(bench_id)
114119
return skipped
115120

116121

117-
def discover_benchmarks() -> dict[str, Callable[[int], float]]:
122+
def discover_benchmarks(
123+
bench_dir: Path | None = None,
124+
module_name_prefix: str = DEFAULT_MODULE_NAME_PREFIX,
125+
) -> dict[str, Callable[[int], float]]:
118126
"""Discover bench_ functions.
119127
120128
Each bench_ function must have the signature: bench_*(loops: int) -> float
121129
where it calls the operation `loops` times and returns the total elapsed
122130
time in seconds (using time.perf_counter).
123131
"""
132+
# Resolve the default inside the call so tests (and embedders) can
133+
# monkeypatch ``BENCH_DIR`` at the module level — Python binds default
134+
# args at def-time, so a literal default would ignore later patches.
135+
if bench_dir is None:
136+
bench_dir = BENCH_DIR
124137
registry: dict[str, Callable[[int], float]] = {}
125-
for module_path in sorted(BENCH_DIR.glob("bench_*.py")):
138+
for module_path in sorted(bench_dir.glob("bench_*.py")):
126139
module_name = module_path.stem
127140
for function_name in _discover_module_functions(module_path):
128141
bench_id = benchmark_id(module_name, function_name)
129142
if bench_id in registry:
130143
raise ValueError(f"Duplicate benchmark ID discovered: {bench_id}")
131-
registry[bench_id] = _lazy_benchmark(module_path, function_name)
144+
registry[bench_id] = _lazy_benchmark(module_path, function_name, module_name_prefix=module_name_prefix)
132145
return registry
133146

134147

@@ -152,7 +165,10 @@ def _split_env_vars(arg_value: str) -> list[str]:
152165
return [env_var for env_var in arg_value.split(",") if env_var]
153166

154167

155-
def ensure_pyperf_worker_env(argv: list[str]) -> list[str]:
168+
def ensure_pyperf_worker_env(
169+
argv: list[str],
170+
extra_env_vars: tuple[str, ...] = (DEFAULT_BENCH_FILTER_ENV_VAR,),
171+
) -> list[str]:
156172
if "--copy-env" in argv:
157173
return list(argv)
158174

@@ -175,7 +191,7 @@ def ensure_pyperf_worker_env(argv: list[str]) -> list[str]:
175191
if skip_next:
176192
raise ValueError("Missing value for --inherit-environ")
177193

178-
for env_var in PYPERF_INHERITED_ENV_VARS:
194+
for env_var in (*BASE_PYPERF_INHERITED_ENV_VARS, *extra_env_vars):
179195
if env_var in os.environ:
180196
inherited_env.append(env_var)
181197

@@ -190,7 +206,7 @@ def ensure_pyperf_worker_env(argv: list[str]) -> list[str]:
190206
return cleaned
191207

192208

193-
def parse_args(argv: list[str]) -> tuple[argparse.Namespace, list[str]]:
209+
def parse_args(argv: list[str], default_output: Path = DEFAULT_OUTPUT) -> tuple[argparse.Namespace, list[str]]:
194210
parser = argparse.ArgumentParser(add_help=False)
195211
parser.add_argument(
196212
"--benchmark",
@@ -207,19 +223,25 @@ def parse_args(argv: list[str]) -> tuple[argparse.Namespace, list[str]]:
207223
"-o",
208224
"--output",
209225
type=Path,
210-
default=DEFAULT_OUTPUT,
211-
help=f"JSON output file path (default: {DEFAULT_OUTPUT.name})",
226+
default=default_output,
227+
help=f"JSON output file path (default: {default_output.name})",
212228
)
213229
parsed, remaining = parser.parse_known_args(argv)
214230
return parsed, remaining
215231

216232

217-
def main() -> None:
218-
parsed, remaining_argv = parse_args(sys.argv[1:])
233+
def main(
234+
*,
235+
bench_dir: Path = BENCH_DIR,
236+
default_output: Path = DEFAULT_OUTPUT,
237+
module_name_prefix: str = DEFAULT_MODULE_NAME_PREFIX,
238+
bench_filter_env_var: str = DEFAULT_BENCH_FILTER_ENV_VAR,
239+
) -> None:
240+
parsed, remaining_argv = parse_args(sys.argv[1:], default_output=default_output)
219241

220-
registry = discover_benchmarks()
242+
registry = discover_benchmarks(bench_dir=bench_dir, module_name_prefix=module_name_prefix)
221243
if not registry:
222-
raise RuntimeError(f"No benchmark functions found in {BENCH_DIR}")
244+
raise RuntimeError(f"No benchmark functions found in {bench_dir}")
223245

224246
if parsed.list:
225247
for bench_id in sorted(registry):
@@ -231,7 +253,7 @@ def main() -> None:
231253
# the wrong bench. pyperf drops unknown CLI flags when spawning workers,
232254
# so fall back to an env var carrying the filter.
233255
requested = list(parsed.benchmark)
234-
env_filter = os.environ.get(BENCH_FILTER_ENV_VAR, "")
256+
env_filter = os.environ.get(bench_filter_env_var, "")
235257
if not requested and env_filter:
236258
requested = [bid for bid in env_filter.split(",") if bid]
237259

@@ -243,21 +265,21 @@ def main() -> None:
243265
raise ValueError(f"Unknown benchmark(s): {unknown}. Known benchmarks: {known}")
244266
benchmark_ids = requested
245267
# Propagate to any pyperf worker we're about to spawn.
246-
os.environ[BENCH_FILTER_ENV_VAR] = ",".join(benchmark_ids)
268+
os.environ[bench_filter_env_var] = ",".join(benchmark_ids)
247269
else:
248270
benchmark_ids = sorted(registry)
249271

250272
# Strip any --output args to avoid conflicts with our output handling.
251273
output_path = parsed.output.resolve()
252274
remaining_argv = strip_pyperf_output_args(remaining_argv)
253-
remaining_argv = ensure_pyperf_worker_env(remaining_argv)
275+
remaining_argv = ensure_pyperf_worker_env(remaining_argv, extra_env_vars=(bench_filter_env_var,))
254276
is_worker = "--worker" in remaining_argv
255277

256278
# Drop benchmarks that the owning module has marked as unavailable on
257279
# this driver/device. Without this step a single unsupported bench
258280
# (e.g. TMA on a pre-Hopper GPU) would abort the whole pyperf run,
259281
# since pyperf treats a raised exception as a fatal worker failure.
260-
skipped = _collect_skipped_benchmarks(benchmark_ids, registry)
282+
skipped = _collect_skipped_benchmarks(benchmark_ids, registry, module_name_prefix=module_name_prefix)
261283
if skipped and not is_worker:
262284
for bench_id in sorted(skipped):
263285
print(f"Skipping {bench_id}: unsupported on this driver/device", file=sys.stderr)

benchmarks/cuda_bindings/tests/test_runner.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ def test_discover_benchmarks_is_lazy(monkeypatch, tmp_path):
135135
def test_ensure_pyperf_worker_env_preserves_existing_args(monkeypatch):
136136
runner_main = load_runner_main(monkeypatch)
137137

138-
for env_var in runner_main.PYPERF_INHERITED_ENV_VARS:
138+
for env_var in runner_main.BASE_PYPERF_INHERITED_ENV_VARS:
139139
monkeypatch.delenv(env_var, raising=False)
140140
monkeypatch.setenv("CUDA_PATH", "/opt/cuda")
141141
monkeypatch.setenv("LD_LIBRARY_PATH", "/opt/cuda/lib64")

benchmarks/cuda_core/.gitignore

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# Build artifacts
2+
.build/
3+
__pycache__/
4+
5+
# Benchmark results
6+
*.json
7+
.benchmarks/
8+
9+
# Pixi environments
10+
.pixi/
11+
12+
# Override root .gitignore *.cpp rule (which targets Cython-generated files)
13+
!benchmarks/cpp/*.cpp
14+
15+
results-python.json
16+
results-cpp.json

benchmarks/cuda_core/AGENTS.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# cuda.core benchmarks
2+
3+
Read the README.md in this directory for more details about the benchmarks.
4+
5+
When generating code, verify that the code is correct based on the source for cuda-core
6+
that can be found in ../../cuda_core.
7+
8+
This suite shares the pyperf runner with `../cuda_bindings/` via a sys.path
9+
insert in `run_pyperf.py`. The per-suite setup (`runtime.py`, the `benchmarks/`
10+
module files) lives here. Benchmark IDs are kept identical to the cuda.bindings
11+
suite so `compare.py` can diff them directly.

0 commit comments

Comments
 (0)