Merge branch 'main' into rparolin/managed_mem_advise_prefetch

rparolin · web-flow · commit be46eedd5ab6 · 2026-05-19T10:41:52.000-07:00
diff --git a/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in b/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in
@@ -519,7 +519,7 @@ cdef int _cuPythonInit() except -1 nogil:
     cdef char libPath[260]
 
     with gil, __symbol_lock:
-        usePTDS = os.getenv('CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM', default=0)
+        usePTDS = bool(int(os.getenv('CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM', default=0)))
 
         # Load library
         libPath[0] = 0
diff --git a/cuda_bindings/cuda/bindings/_bindings/cyruntime.pyx.in b/cuda_bindings/cuda/bindings/_bindings/cyruntime.pyx.in
@@ -15,7 +15,7 @@ cdef int _cudaPythonInit() except -1 nogil:
         global __usePTDS
 
         with gil:
-            __usePTDS = os.getenv('CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM', default=False)
+            __usePTDS = bool(int(os.getenv('CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM', default=0)))
         __cudaPythonInit = True
         return __usePTDS
 
diff --git a/cuda_bindings/docs/source/release/13.2.1-notes.rst b/cuda_bindings/docs/source/release/13.2.1-notes.rst
@@ -0,0 +1,14 @@
+.. SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+.. SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
+.. module:: cuda.bindings
+
+``cuda-bindings`` 13.2.1 Release notes
+======================================
+
+Bugfixes
+--------
+
+* Fixed a bug where per-thread default stream mode was enabled whenever the
+  ``CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM`` environment variable was set,
+  even if it was set to ``0``.
diff --git a/cuda_bindings/tests/nvml/test_device.py b/cuda_bindings/tests/nvml/test_device.py
@@ -4,6 +4,7 @@
 
 from functools import cache
 
+import numpy as np
 import pytest
 
 from cuda.bindings import nvml
@@ -78,7 +79,7 @@ def test_get_nv_link_supported_bw_modes(all_devices):
         assert not hasattr(modes, "total_bw_modes")
 
         for mode in modes.bw_modes:
-            assert isinstance(mode, int)
+            assert isinstance(mode, np.uint8)
 
 
 def test_device_get_pdi(all_devices):
diff --git a/cuda_core/AGENTS.md b/cuda_core/AGENTS.md
@@ -63,3 +63,82 @@ This file describes `cuda_core`, the high-level Pythonic CUDA subpackage in the
   call-site consistency.
 - Prefer explicit error propagation over silent fallback paths.
 - If you change public behavior, update tests and docs under `docs/source/`.
+
+## API design guidelines
+
+These are some API design guidelines we try to follow when adding new APIs to
+`cuda.core`.  These rules only apply to public APIs.  Private implementation
+details can violate these rules at any time.
+
+Public APIs are the symbols listed in `__all__` of modules or subpackages that
+are not prefixed with `_`.
+
+In code reviews, violations of the guidelines in this section should be raised
+as suggestions, not enforced as hard rules.  Consistency with existing API
+design in this code base is also important.
+
+### Unintentional exposure of symbols
+
+The following things should not be exposed as part of the public API:
+
+- Private symbols (prefixed with `_`)
+- Symbols from a third-party module or the standard library
+- Helper classes that cannot be instantiated from Python
+
+### Naming
+
+As a blanket rule, we follow the naming guidelines for capitalization in PEP 8.
+
+Naming should be consistent.  We should use the same English words for the same
+concepts throughout the public API.  When abbreviations are used, they should be
+commonly understood, and they should also be used consistently across the public
+API.
+
+For all attributes of a class:
+
+- Properties and member variables should be nouns
+- Methods should be verbs
+- Methods that take no arguments, are idempotent and cheap (O(1) or trivial),
+  and do not mutate observable state should be properties
+
+Make sure conceptual pairs match, e.g. add/remove, get/set, create/delete,
+alloc/free.
+
+Free functions should be verbs.
+
+### Enumerations
+
+Enumerations from the underlying `cuda_bindings` should not be re-exposed.
+Instead, a new `StrEnum` subclass should be used to define the values.  Anywhere
+a `StrEnum` is accepted as an argument, a `str` should also be acceptable.  An
+invalid value should raise an exception.  When a function returns a `str` drawn
+from a small number of values, return a `StrEnum` subclass instead.
+
+### Exception handling
+
+Raising exceptions is preferred over a C-style return code that must be checked
+by the user.
+
+### Type annotations
+
+Python or Cython type annotations should be included for all public APIs.  Avoid
+the use of `Any` unless absolutely necessary.  The argument and return types as
+defined in the docstrings should match the type annotations.
+
+### Semantics
+
+Designs involving manual resource management should be avoided.  Where
+appropriate, provide context managers (implemented with `__enter__` and
+`__exit__`, not `contextlib.contextmanager`) or RAII using a `__del__` or
+`__dealloc__` method.
+
+### Documentation
+
+The entirety of the public API should be documented in `api.rst` or one of the
+subpages linked from it.  Classes that are not directly instantiable but which
+may be returned through the public API should be documented in `api_private.rst`
+so that they are documented but don't appear in the main index.
+
+### API stability
+
+Reviews should point out any change that breaks an existing public API.
diff --git a/cuda_core/cuda/core/_memoryview.pyx b/cuda_core/cuda/core/_memoryview.pyx
@@ -43,7 +43,7 @@ cdef dict _torch_type_cache = {}
 cdef object _torch_version_ok = None
 
 cdef inline bint _torch_version_check():
-    """Return True if 2.3 <= torch <= 2.11 (known AOTI ABI range). Memoized.
+    """Return True if 2.3 <= torch <= 2.12 (known AOTI ABI range). Memoized.
 
     Lower bound: AOTI functions we use were introduced in PyTorch 2.3.
     Upper bound: the ``pyobj_to_aten_handle`` trick relies on the
@@ -64,7 +64,7 @@ cdef inline bint _torch_version_check():
     try:
         major, minor = int(torch.__version__.split(".")[0]), \
                        int(torch.__version__.split(".")[1])
-        _torch_version_ok = (2, 3) <= (major, minor) <= (2, 11)
+        _torch_version_ok = (2, 3) <= (major, minor) <= (2, 12)
     except (ValueError, IndexError):
         _torch_version_ok = False
     return <bint>_torch_version_ok
diff --git a/cuda_core/cuda/core/_utils/enum_explanations_helpers.py b/cuda_core/cuda/core/_utils/enum_explanations_helpers.py
@@ -35,9 +35,11 @@
 def _binding_version() -> tuple[int, int, int]:
     """Return the installed ``cuda-bindings`` version, or a conservative old value."""
     try:
-        parts = importlib.metadata.version("cuda-bindings").split(".")[:3]
+        version = importlib.metadata.version("cuda-bindings")
     except importlib.metadata.PackageNotFoundError:
         return (0, 0, 0)  # For very old versions of cuda-python
+
+    parts = version.partition("+")[0].split(".")[:3]
     return tuple(int(v) for v in parts)
 
 
diff --git a/cuda_core/cuda/core/utils/_program_cache/_file_stream.py b/cuda_core/cuda/core/utils/_program_cache/_file_stream.py
@@ -422,11 +422,17 @@ def _path_for_key(self, key: object) -> Path:
         k = _as_key_bytes(key)
         # Hash the key to a fixed-length identifier so arbitrary-length user
         # keys never exceed per-component filename limits (typically 255 on
-        # ext4 / NTFS). With a 256-bit blake2b digest, the cache relies on
-        # cryptographic collision resistance for key uniqueness -- two
-        # distinct keys hashing to the same path is astronomically unlikely
-        # (~2^-128 with the 32-byte digest in use here).
-        digest = hashlib.blake2b(k, digest_size=32).hexdigest()
+        # ext4 / NTFS).
+        #
+        # FIPS: must use a FIPS-approved hash algorithm. FIPS-enforcing
+        # systems can disable non-approved hashlib algorithms (for example
+        # blake2b) at the OpenSSL level. See #2043.
+        #
+        # With a 256-bit SHA-256 digest, the cache relies on collision
+        # resistance for key uniqueness -- finding two distinct keys that
+        # hash to the same path takes ~2^128 work, so an accidental
+        # collision is astronomically unlikely.
+        digest = hashlib.sha256(k, usedforsecurity=False).hexdigest()
         return self._entries / digest[:2] / digest[2:]
 
     # -- mapping API ---------------------------------------------------------
diff --git a/cuda_core/cuda/core/utils/_program_cache/_keys.py b/cuda_core/cuda/core/utils/_program_cache/_keys.py
@@ -35,7 +35,7 @@
 )
 
 # Bump when the key schema changes in a way that invalidates existing caches.
-_KEY_SCHEMA_VERSION = 1
+_KEY_SCHEMA_VERSION = 2
 
 _VALID_CODE_TYPES = frozenset({"c++", "ptx", "nvvm"})
 _VALID_TARGET_TYPES = frozenset({"ptx", "cubin", "ltoir"})
@@ -768,7 +768,10 @@ def make_program_cache_key(
     option_bytes = backend.option_fingerprint(options, target_type)
     name_tags = backend.encode_name_expressions(name_expressions)
 
-    hasher = hashlib.blake2b(digest_size=32)
+    # IMPORTANT: Must use a FIPS-approved hash algorithm (SHA-2 family).
+    # FIPS-enforcing systems can disable non-approved hashlib algorithms
+    # (for example blake2b) at the OpenSSL level. See #2043.
+    hasher = hashlib.sha256(usedforsecurity=False)
 
     def _update(label: str, payload: bytes) -> None:
         hasher.update(label.encode("ascii"))
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
@@ -1449,11 +1449,11 @@ def test_pinned_mr_numa_id_default_no_ipc(init_cuda):
     device = Device()
     skip_if_pinned_memory_unsupported(device)
 
-    mr = PinnedMemoryResource(PinnedMemoryResourceOptions())
+    mr = create_pinned_memory_resource_or_xfail(PinnedMemoryResourceOptions(), xfail_device=device)
     assert mr.numa_id == -1
     mr.close()
 
-    mr = PinnedMemoryResource(PinnedMemoryResourceOptions(ipc_enabled=False))
+    mr = create_pinned_memory_resource_or_xfail(PinnedMemoryResourceOptions(ipc_enabled=False), xfail_device=device)
     assert mr.numa_id == -1
     mr.close()
 
@@ -1472,7 +1472,9 @@ def test_pinned_mr_numa_id_default_with_ipc(init_cuda):
     if expected_numa_id < 0:
         pytest.skip("System does not support NUMA")
 
-    mr = PinnedMemoryResource(PinnedMemoryResourceOptions(ipc_enabled=True, max_size=POOL_SIZE))
+    mr = create_pinned_memory_resource_or_xfail(
+        PinnedMemoryResourceOptions(ipc_enabled=True, max_size=POOL_SIZE), xfail_device=device
+    )
     assert mr.numa_id == expected_numa_id
     mr.close()
 
@@ -1486,7 +1488,7 @@ def test_pinned_mr_numa_id_explicit(init_cuda):
     if host_numa_id < 0:
         pytest.skip("System does not support NUMA")
 
-    mr = PinnedMemoryResource(PinnedMemoryResourceOptions(numa_id=host_numa_id))
+    mr = create_pinned_memory_resource_or_xfail(PinnedMemoryResourceOptions(numa_id=host_numa_id), xfail_device=device)
     assert mr.numa_id == host_numa_id
     mr.close()
 
@@ -1495,7 +1497,10 @@ def test_pinned_mr_numa_id_explicit(init_cuda):
     if not supports_ipc_mempool(device):
         pytest.skip("Driver rejects IPC-enabled mempool creation on this platform")
 
-    mr = PinnedMemoryResource(PinnedMemoryResourceOptions(ipc_enabled=True, numa_id=host_numa_id, max_size=POOL_SIZE))
+    mr = create_pinned_memory_resource_or_xfail(
+        PinnedMemoryResourceOptions(ipc_enabled=True, numa_id=host_numa_id, max_size=POOL_SIZE),
+        xfail_device=device,
+    )
     assert mr.numa_id == host_numa_id
     mr.close()
 
diff --git a/cuda_core/tests/test_program_cache.py b/cuda_core/tests/test_program_cache.py
@@ -1773,7 +1773,7 @@ def test_filestream_cache_size_cap_counts_tmp_files(tmp_path):
 
 def test_filestream_cache_handles_long_keys(tmp_path):
     """Arbitrary-length keys must not overflow per-component filename limits.
-    The filename is a fixed-length 256-bit blake2b digest; key uniqueness
+    The filename is a fixed-length 256-bit digest; key uniqueness
     relies on the digest's collision resistance."""
     from cuda.core.utils import FileStreamProgramCache
 
diff --git a/scripts/run_tests.sh b/scripts/run_tests.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
 set -euo pipefail
@@ -133,7 +133,7 @@ status_from_rc() {
 run_pytest() {
   # Run pytest safely under set -e and return its exit code
   set +e
-  python -m pytest "${PYTEST_FLAGS[@]}" "$@"
+  CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM=0 python -m pytest "${PYTEST_FLAGS[@]}" "$@"
   local rc=$?
   set -e
   return ${rc}
diff --git a/toolshed/README.md b/toolshed/README.md
@@ -0,0 +1,10 @@
+# Shared Toolshed
+
+This directory is a shared scratchpad for scripts that are useful
+intermittently, such as environment bootstrap helpers, ad-hoc reproducers, and
+glue for manual workflows. These tools do not warrant CI coverage, unit tests,
+or the rest of the production-code apparatus.
+
+Entries may bitrot between uses. Keeping them in a shared, versioned location
+gives the next person who needs the same workflow a better starting point than
+starting from scratch.
diff --git a/toolshed/conda_create_for_pathfinder_testing.ps1 b/toolshed/conda_create_for_pathfinder_testing.ps1
@@ -1,31 +1,30 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION
 # SPDX-License-Identifier: Apache-2.0
 
 param(
-    [Parameter(Mandatory = $true)]
-    [string]$CudaVersion
+    [Parameter(Mandatory = $true, Position = 0)]
+    [string]$PythonMajorMinor,
+    [Parameter(Mandatory = $true, Position = 1)]
+    [string]$CudaMajorMinorPatch
 )
 
 $ErrorActionPreference = "Stop"
+Set-StrictMode -Version Latest
 
 & "$env:CONDA_EXE" "shell.powershell" "hook" | Out-String | Invoke-Expression
 
-conda create --yes -n "pathfinder_testing_cu$CudaVersion" python=3.13 "cuda-toolkit=$CudaVersion"
-conda activate "pathfinder_testing_cu$CudaVersion"
+conda create --yes -n "pathfinder_testing_cu$CudaMajorMinorPatch" "python=$PythonMajorMinor" "cuda-toolkit=$CudaMajorMinorPatch"
+conda activate "pathfinder_testing_cu$CudaMajorMinorPatch"
 
+# Keep this list aligned with the Windows-installable subset of
+# cuda_pathfinder/pyproject.toml.
 $cpkgs = @(
     "cusparselt-dev",
     "cutensor",
-    "libcublasmp-dev",
+    "cutlass",
     "libcudss-dev",
-    "libcufftmp-dev",
-    "libmathdx-dev",
-    "libnvshmem3",
-    "libnvshmem-dev",
-    "libnvpl-fft-dev"
+    "libmathdx-dev"
 )
 
-foreach ($cpkg in $cpkgs) {
-    Write-Host "CONDA INSTALL: $cpkg"
-    conda install -y -c conda-forge $cpkg
-}
+Write-Host "CONDA INSTALL: $($cpkgs -join ' ')"
+conda install -y -c conda-forge @cpkgs
diff --git a/toolshed/conda_create_for_pathfinder_testing.sh b/toolshed/conda_create_for_pathfinder_testing.sh