Skip to content

Commit be46eed

Browse files
authored
Merge branch 'main' into rparolin/managed_mem_advise_prefetch
2 parents 5a2bb4c + 2523e97 commit be46eed

15 files changed

Lines changed: 197 additions & 55 deletions

File tree

cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -519,7 +519,7 @@ cdef int _cuPythonInit() except -1 nogil:
519519
cdef char libPath[260]
520520

521521
with gil, __symbol_lock:
522-
usePTDS = os.getenv('CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM', default=0)
522+
usePTDS = bool(int(os.getenv('CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM', default=0)))
523523

524524
# Load library
525525
libPath[0] = 0

cuda_bindings/cuda/bindings/_bindings/cyruntime.pyx.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ cdef int _cudaPythonInit() except -1 nogil:
1515
global __usePTDS
1616

1717
with gil:
18-
__usePTDS = os.getenv('CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM', default=False)
18+
__usePTDS = bool(int(os.getenv('CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM', default=0)))
1919
__cudaPythonInit = True
2020
return __usePTDS
2121

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
.. SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
.. SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
3+
4+
.. module:: cuda.bindings
5+
6+
``cuda-bindings`` 13.2.1 Release notes
7+
======================================
8+
9+
Bugfixes
10+
--------
11+
12+
* Fixed per-thread default stream mode being enabled whenever the
13+
``CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM`` environment variable was set,
14+
even if it was set to ``0``.

cuda_bindings/tests/nvml/test_device.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
from functools import cache
66

7+
import numpy as np
78
import pytest
89

910
from cuda.bindings import nvml
@@ -78,7 +79,7 @@ def test_get_nv_link_supported_bw_modes(all_devices):
7879
assert not hasattr(modes, "total_bw_modes")
7980

8081
for mode in modes.bw_modes:
81-
assert isinstance(mode, int)
82+
assert isinstance(mode, np.uint8)
8283

8384

8485
def test_device_get_pdi(all_devices):

cuda_core/AGENTS.md

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,3 +63,82 @@ This file describes `cuda_core`, the high-level Pythonic CUDA subpackage in the
6363
call-site consistency.
6464
- Prefer explicit error propagation over silent fallback paths.
6565
- If you change public behavior, update tests and docs under `docs/source/`.
66+
67+
## API design guidelines
68+
69+
These are some API design guidelines we try to follow when adding new APIs to
70+
`cuda.core`. These rules only apply to public APIs. Private implementation
71+
details can violate these rules at any time.
72+
73+
Public APIs are the symbols listed in `__all__` within modules or
74+
subpackages that are not prefixed with `_`.
75+
76+
In code reviews, any violations of this section should be raised as
77+
suggestions, not enforced as hard rules. Consistency with existing API design in this code
78+
base is also important.
79+
80+
### Unintentional exposure of symbols
81+
82+
The following things should not be exposed as part of the public API:
83+
84+
- Private symbols (prefixed with `_`)
85+
- Symbols from a third-party module or the standard library
86+
- Helper classes that cannot be instantiated from Python
87+
88+
### Naming
89+
90+
As a blanket rule, we follow the naming guidelines for capitalization in PEP 8.
91+
92+
Naming should be consistent. We should use the same English words for the same
93+
concepts throughout the public API. When abbreviations are used, they should be
94+
commonly understood, and they should also be used consistently across the public
95+
API.
96+
97+
For all attributes of a class:
98+
99+
- Properties and member variables should be nouns
100+
- Methods should be verbs
101+
- Methods that take no arguments, are idempotent and cheap (O(1) or trivial),
102+
and do not mutate observable state should be properties
103+
104+
Make sure conceptual pairs match, e.g. add/remove, get/set, create/delete,
105+
alloc/free.
106+
107+
Free functions should be verbs.
108+
109+
### Enumerations
110+
111+
Enumerations from the underlying `cuda_bindings` should not be re-exposed.
112+
Instead, a new `StrEnum` subclass should be used to define the values. Anywhere
113+
a `StrEnum` is accepted as an argument, a `str` should also be acceptable. An
114+
invalid value should raise an exception. When a function returns a `str` drawn
115+
from a small number of values, return a `StrEnum` subclass instead.
116+
117+
### Exception handling
118+
119+
Raising exceptions is preferred over a C-style return code that must be checked
120+
by the user.
121+
122+
### Type annotations
123+
124+
Python or Cython type annotations should be included for all public APIs. Avoid
125+
the use of `Any` unless absolutely necessary. The argument and return types as
126+
defined in the docstrings should match the type annotations.
127+
128+
### Semantics
129+
130+
Designs involving manual resource management should be avoided. Where
131+
appropriate, provide context managers (implemented with `__enter__` and
132+
`__exit__`, not `contextlib.contextmanager`) or RAII using a `__del__` or
133+
`__dealloc__` method.
134+
135+
### Documentation
136+
137+
The entirety of the public API should be documented in `api.rst` or one of the
138+
subpages linked from it. Classes that are not directly instantiable but which
139+
may be returned through the public API should be documented in `api_private.rst`
140+
so that they are documented but don't appear in the main index.
141+
142+
### API stability
143+
144+
Reviews should point out any change that breaks an existing public API.

cuda_core/cuda/core/_memoryview.pyx

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ cdef dict _torch_type_cache = {}
4343
cdef object _torch_version_ok = None
4444

4545
cdef inline bint _torch_version_check():
46-
"""Return True if 2.3 <= torch <= 2.11 (known AOTI ABI range). Memoized.
46+
"""Return True if 2.3 <= torch <= 2.12 (known AOTI ABI range). Memoized.
4747
4848
Lower bound: AOTI functions we use were introduced in PyTorch 2.3.
4949
Upper bound: the ``pyobj_to_aten_handle`` trick relies on the
@@ -64,7 +64,7 @@ cdef inline bint _torch_version_check():
6464
try:
6565
major, minor = int(torch.__version__.split(".")[0]), \
6666
int(torch.__version__.split(".")[1])
67-
_torch_version_ok = (2, 3) <= (major, minor) <= (2, 11)
67+
_torch_version_ok = (2, 3) <= (major, minor) <= (2, 12)
6868
except (ValueError, IndexError):
6969
_torch_version_ok = False
7070
return <bint>_torch_version_ok

cuda_core/cuda/core/_utils/enum_explanations_helpers.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,11 @@
3535
def _binding_version() -> tuple[int, int, int]:
3636
"""Return the installed ``cuda-bindings`` version, or a conservative old value."""
3737
try:
38-
parts = importlib.metadata.version("cuda-bindings").split(".")[:3]
38+
version = importlib.metadata.version("cuda-bindings")
3939
except importlib.metadata.PackageNotFoundError:
4040
return (0, 0, 0) # For very old versions of cuda-python
41+
42+
parts = version.partition("+")[0].split(".")[:3]
4143
return tuple(int(v) for v in parts)
4244

4345

cuda_core/cuda/core/utils/_program_cache/_file_stream.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -422,11 +422,17 @@ def _path_for_key(self, key: object) -> Path:
422422
k = _as_key_bytes(key)
423423
# Hash the key to a fixed-length identifier so arbitrary-length user
424424
# keys never exceed per-component filename limits (typically 255 on
425-
# ext4 / NTFS). With a 256-bit blake2b digest, the cache relies on
426-
# cryptographic collision resistance for key uniqueness -- two
427-
# distinct keys hashing to the same path is astronomically unlikely
428-
# (~2^-128 with the 32-byte digest in use here).
429-
digest = hashlib.blake2b(k, digest_size=32).hexdigest()
425+
# ext4 / NTFS).
426+
#
427+
# FIPS: must use a FIPS-approved hash algorithm. FIPS-enforcing
428+
# systems can disable non-approved hashlib algorithms (for example
429+
# blake2b) at the OpenSSL level. See #2043.
430+
#
431+
# With a 256-bit SHA-256 digest, the cache relies on collision
432+
# resistance for key uniqueness -- two distinct keys hashing to the
433+
# same path is astronomically unlikely (finding a collision takes
434+
# ~2^128 work).
435+
digest = hashlib.sha256(k, usedforsecurity=False).hexdigest()
430436
return self._entries / digest[:2] / digest[2:]
431437

432438
# -- mapping API ---------------------------------------------------------

cuda_core/cuda/core/utils/_program_cache/_keys.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@
3535
)
3636

3737
# Bump when the key schema changes in a way that invalidates existing caches.
38-
_KEY_SCHEMA_VERSION = 1
38+
_KEY_SCHEMA_VERSION = 2
3939

4040
_VALID_CODE_TYPES = frozenset({"c++", "ptx", "nvvm"})
4141
_VALID_TARGET_TYPES = frozenset({"ptx", "cubin", "ltoir"})
@@ -768,7 +768,10 @@ def make_program_cache_key(
768768
option_bytes = backend.option_fingerprint(options, target_type)
769769
name_tags = backend.encode_name_expressions(name_expressions)
770770

771-
hasher = hashlib.blake2b(digest_size=32)
771+
# IMPORTANT: Must use a FIPS-approved hash algorithm (SHA-2 family).
772+
# FIPS-enforcing systems can disable non-approved hashlib algorithms
773+
# (for example blake2b) at the OpenSSL level. See #2043.
774+
hasher = hashlib.sha256(usedforsecurity=False)
772775

773776
def _update(label: str, payload: bytes) -> None:
774777
hasher.update(label.encode("ascii"))

cuda_core/tests/test_memory.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1449,11 +1449,11 @@ def test_pinned_mr_numa_id_default_no_ipc(init_cuda):
14491449
device = Device()
14501450
skip_if_pinned_memory_unsupported(device)
14511451

1452-
mr = PinnedMemoryResource(PinnedMemoryResourceOptions())
1452+
mr = create_pinned_memory_resource_or_xfail(PinnedMemoryResourceOptions(), xfail_device=device)
14531453
assert mr.numa_id == -1
14541454
mr.close()
14551455

1456-
mr = PinnedMemoryResource(PinnedMemoryResourceOptions(ipc_enabled=False))
1456+
mr = create_pinned_memory_resource_or_xfail(PinnedMemoryResourceOptions(ipc_enabled=False), xfail_device=device)
14571457
assert mr.numa_id == -1
14581458
mr.close()
14591459

@@ -1472,7 +1472,9 @@ def test_pinned_mr_numa_id_default_with_ipc(init_cuda):
14721472
if expected_numa_id < 0:
14731473
pytest.skip("System does not support NUMA")
14741474

1475-
mr = PinnedMemoryResource(PinnedMemoryResourceOptions(ipc_enabled=True, max_size=POOL_SIZE))
1475+
mr = create_pinned_memory_resource_or_xfail(
1476+
PinnedMemoryResourceOptions(ipc_enabled=True, max_size=POOL_SIZE), xfail_device=device
1477+
)
14761478
assert mr.numa_id == expected_numa_id
14771479
mr.close()
14781480

@@ -1486,7 +1488,7 @@ def test_pinned_mr_numa_id_explicit(init_cuda):
14861488
if host_numa_id < 0:
14871489
pytest.skip("System does not support NUMA")
14881490

1489-
mr = PinnedMemoryResource(PinnedMemoryResourceOptions(numa_id=host_numa_id))
1491+
mr = create_pinned_memory_resource_or_xfail(PinnedMemoryResourceOptions(numa_id=host_numa_id), xfail_device=device)
14901492
assert mr.numa_id == host_numa_id
14911493
mr.close()
14921494

@@ -1495,7 +1497,10 @@ def test_pinned_mr_numa_id_explicit(init_cuda):
14951497
if not supports_ipc_mempool(device):
14961498
pytest.skip("Driver rejects IPC-enabled mempool creation on this platform")
14971499

1498-
mr = PinnedMemoryResource(PinnedMemoryResourceOptions(ipc_enabled=True, numa_id=host_numa_id, max_size=POOL_SIZE))
1500+
mr = create_pinned_memory_resource_or_xfail(
1501+
PinnedMemoryResourceOptions(ipc_enabled=True, numa_id=host_numa_id, max_size=POOL_SIZE),
1502+
xfail_device=device,
1503+
)
14991504
assert mr.numa_id == host_numa_id
15001505
mr.close()
15011506

0 commit comments

Comments
 (0)