Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
d4a178c
feat(core.utils): persistent program cache
cpcloud Apr 29, 2026
b914f54
feat(core): Program.compile(cache=...) convenience wrapper
cpcloud Apr 29, 2026
e5ad304
refactor(core.utils): import program cache eagerly, fix ruff findings
cpcloud May 4, 2026
29b973b
docs(core): nest Program caches under CUDA compilation toolchain
cpcloud May 5, 2026
7e4f8ed
docs(core.utils): nudge cache callers toward close()
cpcloud May 5, 2026
d201703
docs(core.utils): teach cache examples with public ObjectCode API
cpcloud May 5, 2026
b93110b
feat(core): require Program.compile(cache=...) to be a ProgramCacheResource
cpcloud May 5, 2026
97a526a
test(core): mock driver_version to keep _can_load_generated_ptx as cpdef
cpcloud May 5, 2026
fee1941
docs(core): note cache hits do not write to logs
cpcloud May 5, 2026
f54c6fc
fix(core.utils): cache _LinkerBackend driver decision per key call
cpcloud May 5, 2026
f24c67b
fix(core): lowercase target_type in Program.compile and cache key
cpcloud May 5, 2026
365b1f1
refactor(core.utils): collapse _LINKER_RELEVANT_FIELDS into _LINKER_F…
cpcloud May 5, 2026
af57edb
fix(core.utils): contextual error when path-backed ObjectCode vanishes
cpcloud May 5, 2026
569e43a
docs(core): clarify cache= vs ProgramOptions.no_cache independence
cpcloud May 5, 2026
fbc1b4f
test(core.utils): tighten multiprocess cache assertions
cpcloud May 5, 2026
888fa82
test(core.utils): add InMemoryProgramCache thread test and FileStream…
cpcloud May 5, 2026
67306c6
refactor(core.utils): drop dead empty-key branch in _path_for_key
cpcloud May 5, 2026
e472ff0
perf(core.utils): drop redundant stat in FileStreamProgramCache.__len__
cpcloud May 5, 2026
943010f
feat(core.utils): reject max_size_bytes=0 in cache backends
cpcloud May 5, 2026
c9c7daf
docs(core.utils): note ptxas_options order is intentionally significant
cpcloud May 5, 2026
af40133
docs(core.utils): document the _keys.py <-> _program.pyx import contract
cpcloud May 5, 2026
70989e3
docs(core.utils): note options.name double-hash on PTX is intentional
cpcloud May 5, 2026
871771c
docs(core.utils): document burst-write over-eviction trade-off
cpcloud May 5, 2026
3621362
docs(core): explain why Program.compile cache decode is provably safe
cpcloud May 5, 2026
2e3c080
refactor(core.utils): extract _stat_key for the four stat-guarded paths
cpcloud May 5, 2026
9037d77
refactor(core.utils): collapse Windows sharing-retry loops
cpcloud May 5, 2026
57a72fd
fix(test): walk sharded entries dir in FileStream overwrite assertion
cpcloud May 5, 2026
69f6438
fix(test): collapse nested with into a single contextmanager group (S…
cpcloud May 5, 2026
c6aaaa7
fix(core.utils): keep path-backed ObjectCode error message Windows-friendly
cpcloud May 6, 2026
547d3f3
perf(core.utils): track FileStream cache size incrementally
cpcloud May 6, 2026
4aaca63
test(core.utils): drop SUPPORTED_TARGETS cross-check
cpcloud May 6, 2026
8345735
refactor(core.utils): idiomatic with-as on os.scandir, yield from inner
cpcloud May 6, 2026
b6a2590
refactor(core.utils): extract _iter_tmp_entries and _sum_tmp_sizes
cpcloud May 6, 2026
19d7cf7
fix(core.utils): clamp tracked size at zero on __delitem__
cpcloud May 6, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion cuda_core/cuda/core/_program.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,6 @@ cdef class Program:
object _compile_lock # Per-instance lock for compile-time mutation
bint _use_libdevice # Flag for libdevice loading
bint _libdevice_added
bytes _nvrtc_code # Source code for NVRTC retry (PCH auto-resize)
bytes _code # Source code as bytes: used for key derivation and NVRTC PCH retry
str _code_type # Normalised code_type ("c++", "ptx", "nvvm")
str _pch_status # PCH creation outcome after compile
151 changes: 144 additions & 7 deletions cuda_core/cuda/core/_program.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,12 @@ cdef class Program:
self._h_nvvm.reset()

def compile(
    self,
    target_type: ObjectCodeFormatType | str,
    name_expressions: tuple | list = (),
    logs=None,
    *,
    cache: "ProgramCacheResource | None" = None,
) -> ObjectCode:
    """Compile the program to the specified target type.

    Parameters
    ----------
    target_type : :class:`~cuda.core.ObjectCodeFormatType` or str
        Desired output format. Normalised to a lowercase string so enum
        members, ``"PTX"``, and ``"ptx"`` all take the same compile route
        and derive the same cache key.
    name_expressions : tuple or list, optional
        Name expressions to instantiate (templates and similar). Not
        compatible with ``cache=`` -- see Raises.
    logs : object, optional
        Object with a ``write`` method that receives compilation logs.
        On a cache hit no compilation runs and ``logs`` receives
        nothing -- callers that rely on log output to confirm a compile
        happened should compile without ``cache=``.
    cache : :class:`~cuda.core.utils.ProgramCacheResource`, optional
        If provided, the compiled binary is looked up in ``cache`` under
        a key derived from the program's code, options, and
        ``target_type``. On a hit the cached bytes are wrapped in a
        fresh :class:`~cuda.core.ObjectCode` (same ``target_type`` and
        ``ProgramOptions.name``) without re-compiling; on a miss the
        compile output is stored as raw bytes. Options that require an
        ``extra_digest`` raise ``ValueError`` via
        :func:`~cuda.core.utils.make_program_cache_key`; for those
        compiles, call ``make_program_cache_key(...)`` manually.

        ``cache=`` is independent of ``ProgramOptions.no_cache``: the
        former controls this program-level cache, while ``no_cache`` is
        forwarded to the Linker to disable its in-process JIT cache.
        Neither setting bypasses the other.

    Returns
    -------
    :class:`~cuda.core.ObjectCode`
        The compiled object code.

    Raises
    ------
    TypeError
        If ``cache`` is not a ``ProgramCacheResource`` instance.
    ValueError
        If ``name_expressions`` is non-empty while ``cache=`` is given:
        NVRTC populates ``ObjectCode.symbol_mapping`` at compile time
        and that mapping is not carried in the binary the cache stores,
        so cache hits would silently miss
        ``get_kernel(name_expression)`` lookups.
    """
    # Mirror Program_init's code_type normalization: Program_compile_nvrtc
    # routes on lowercase target_type and make_program_cache_key lowercases
    # too, so every spelling must collapse to one key.
    target_type = str(target_type).lower()

    if cache is None:
        return _program_compile_uncached(self, target_type, name_expressions, logs)

    # Deferred import: cuda.core.utils._program_cache imports ProgramOptions
    # from this module (circular otherwise). Importing from the leaf module
    # also lets tests that monkeypatch make_program_cache_key on that path
    # intercept reliably.
    from cuda.core.utils._program_cache import (
        ProgramCacheResource,
        make_program_cache_key,
    )

    if not isinstance(cache, ProgramCacheResource):
        raise TypeError(
            "cache must be an instance of "
            "cuda.core.utils.ProgramCacheResource; got "
            f"{type(cache).__name__}"
        )

    # NVRTC derives ObjectCode.symbol_mapping from name-expression mangling
    # at compile time; the cache stores only the raw binary. Without this
    # guard a miss would return a mapping-bearing ObjectCode and every
    # subsequent hit one without -- silently breaking later
    # get_kernel(name_expression) lookups. Fail loud instead.
    if name_expressions:
        raise ValueError(
            "Program.compile(cache=...) does not support name_expressions: "
            "ObjectCode.symbol_mapping is populated by NVRTC at compile "
            "time and is not preserved across a cache round-trip, so cache "
            "hits would silently break get_kernel(name_expression) lookups "
            "that the uncached path supports. Compile without cache= when "
            "name_expressions are needed, or look up mangled symbols by "
            "hand from the cached ObjectCode."
        )

    # self._code is always stored as bytes (see Program_init), but
    # make_program_cache_key only accepts bytes when code_type == "nvvm";
    # c++/ptx must be str. Those bytes came from code.encode() on a str
    # that Program_init validated, so the decode is always safe.
    if self._code_type == "nvvm":
        code_for_key = self._code
    else:
        code_for_key = self._code.decode("utf-8")

    key = make_program_cache_key(
        code=code_for_key,
        code_type=self._code_type,
        options=self._options,
        target_type=target_type,
    )

    cached = cache.get(key)
    if cached is None:
        compiled = _program_compile_uncached(self, target_type, name_expressions, logs)
        cache[key] = compiled
        return compiled

    # PTX loadability is a property of the installed driver, not of how the
    # bytes were produced, so the uncached path's warning applies equally to
    # cached PTX. Mirror it here so a hit doesn't hide an incompatibility
    # the uncached call would have surfaced.
    if self._backend == "NVRTC" and target_type == "ptx" and not _can_load_generated_ptx():
        warn(
            "The CUDA driver version is older than the backend version. "
            "The generated ptx will not be loadable by the current driver.",
            stacklevel=2,
            category=RuntimeWarning,
        )
    return ObjectCode._init(cached, target_type, name=self._options.name)

@property
def pch_status(self) -> PCHStatusType | None:
Expand Down Expand Up @@ -505,6 +623,19 @@ class ProgramOptions:
# Private Classes and Helper Functions
# =============================================================================


def _program_compile_uncached(program, target_type, name_expressions, logs):
    """Invoke ``Program_compile`` directly, bypassing the cache wrapper.

    Lives at module level (rather than on the class) so tests can
    monkeypatch it via ``cuda.core._program`` and exercise the caching
    logic in :meth:`Program.compile` without running NVRTC. ``Program``
    is a ``cdef class`` whose methods cannot be reassigned from Python,
    so the patchable seam has to sit outside the class.
    """
    compiled = Program_compile(program, target_type, name_expressions, logs)
    return compiled


# Module-level state for NVVM lazy loading
_nvvm_module = None
_nvvm_import_attempted = False
Expand Down Expand Up @@ -620,6 +751,7 @@ cdef inline int Program_init(Program self, object code, str code_type, object op

self._options = options = check_or_create_options(ProgramOptions, options, "Program options")
code_type = code_type.lower()
self._code_type = code_type
self._compile_lock = threading.Lock()
self._use_libdevice = False
self._libdevice_added = False
Expand All @@ -640,16 +772,18 @@ cdef inline int Program_init(Program self, object code, str code_type, object op
HANDLE_RETURN_NVRTC(NULL, cynvrtc.nvrtcCreateProgram(
&nvrtc_prog, code_ptr, name_ptr, 0, NULL, NULL))
self._h_nvrtc = create_nvrtc_program_handle(nvrtc_prog)
self._nvrtc_code = code_bytes
self._code = code_bytes
self._backend = str(CompilerBackendType.NVRTC)
self._linker = None

elif code_type == "ptx":
assert_type(code, str)
if options.extra_sources is not None:
raise ValueError("extra_sources is not supported by the PTX backend.")
code_bytes = code.encode()
self._code = code_bytes
self._linker = Linker(
ObjectCode._init(code.encode(), code_type), options=_translate_program_options(options)
ObjectCode._init(code_bytes, code_type), options=_translate_program_options(options)
)
self._backend = str(self._linker.backend)

Expand All @@ -659,10 +793,13 @@ cdef inline int Program_init(Program self, object code, str code_type, object op
code = code.encode("utf-8")
elif not isinstance(code, (bytes, bytearray)):
raise TypeError("NVVM IR code must be provided as str, bytes, or bytearray")
self._code = bytes(code) # Coerce bytearray -> bytes so retention type is stable

code_ptr = <const char*>(<bytes>code)
# Use self._code (strictly bytes) for the C pointer so a bytearray
# input doesn't trip the `<bytes>code` cast at runtime.
code_ptr = <const char*>self._code
name_ptr = <const char*>options._name
code_len = len(code)
code_len = len(self._code)

with nogil:
HANDLE_RETURN_NVVM(NULL, cynvvm.nvvmCreateProgram(&nvvm_prog))
Expand Down Expand Up @@ -828,7 +965,7 @@ cdef object Program_compile_nvrtc(Program self, str target_type, object name_exp
HANDLE_RETURN_NVRTC(NULL, cynvrtc.nvrtcSetPCHHeapSize(required))

cdef cynvrtc.nvrtcProgram retry_prog
cdef const char* code_ptr = <const char*>self._nvrtc_code
cdef const char* code_ptr = <const char*>self._code
cdef const char* name_ptr = <const char*>self._options._name
with nogil:
HANDLE_RETURN_NVRTC(NULL, cynvrtc.nvrtcCreateProgram(
Expand Down
8 changes: 0 additions & 8 deletions cuda_core/cuda/core/utils.py

This file was deleted.

23 changes: 23 additions & 0 deletions cuda_core/cuda/core/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

"""Public utilities for ``cuda.core``.

Re-exports the strided-memory-view helpers from
``cuda.core._memoryview`` and the persistent program-cache surface
(cache backends plus key derivation) from
``cuda.core.utils._program_cache``.
"""

from cuda.core._memoryview import (
    StridedMemoryView,
    args_viewable_as_strided_memory,
)
from cuda.core.utils._program_cache import (
    FileStreamProgramCache,
    InMemoryProgramCache,
    ProgramCacheResource,
    make_program_cache_key,
)

__all__ = [
    "FileStreamProgramCache",
    "InMemoryProgramCache",
    "ProgramCacheResource",
    "StridedMemoryView",
    "args_viewable_as_strided_memory",
    "make_program_cache_key",
]
36 changes: 36 additions & 0 deletions cuda_core/cuda/core/utils/_program_cache/__init__.py
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: I forgot that we already have a folder for collecting utils:

cuda_core/cuda/core/_utils

How about we move the program cache files to there? Say,

cuda_core/cuda/core/_utils/_program_cache/

Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

"""Persistent program cache for cuda.core.

Public surface:

* :class:`ProgramCacheResource` -- bytes-in / bytes-out ABC.
* :class:`InMemoryProgramCache` -- thread-safe LRU dict-backed cache.
* :class:`FileStreamProgramCache` -- atomic, multi-process directory cache.
* :func:`make_program_cache_key` -- key derivation for arbitrary
``Program`` configurations.

The package is split into submodules by concern. Tests that need to
monkeypatch internals (Windows flag, version probes, helpers, ...)
should reach into the owning submodule (e.g.
``_program_cache._file_stream._IS_WINDOWS``,
``_program_cache._keys._linker_backend_and_version``) rather than the
package object: the symbols re-exported here are only convenience
aliases and don't intercept calls within the submodules.
"""

from __future__ import annotations

from ._abc import ProgramCacheResource
from ._file_stream import FileStreamProgramCache
from ._in_memory import InMemoryProgramCache
from ._keys import make_program_cache_key

__all__ = [
"FileStreamProgramCache",
"InMemoryProgramCache",
"ProgramCacheResource",
"make_program_cache_key",
]
Loading
Loading