Skip to content

Commit 41cea4c

Browse files
authored
Fix #2049: Expose Buffer._size to Python (#2068)
1 parent 4936801 commit 41cea4c

2 files changed

Lines changed: 102 additions & 2 deletions

File tree

cuda_core/cuda/core/_memory/_buffer.pxd

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
1+
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
#
33
# SPDX-License-Identifier: Apache-2.0
44

@@ -18,13 +18,16 @@ cdef struct _MemAttrs:
1818
cdef class Buffer:
1919
cdef:
2020
DevicePtrHandle _h_ptr
21-
size_t _size
2221
MemoryResource _memory_resource
2322
object _ipc_data
2423
object _owner
2524
_MemAttrs _mem_attrs
2625
bint _mem_attrs_inited
2726
object __weakref__
27+
cdef public:
28+
# Python code in _memory/_virtual_memory_resource.py needs to update
29+
# this value, though it is technically private.
30+
size_t _size
2831

2932

3033
cdef class MemoryResource:

cuda_core/tests/test_memory.py

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
# SPDX-License-Identifier: Apache-2.0
33

4+
import contextlib
45
import ctypes
56
import sys
67

@@ -969,6 +970,102 @@ def test_vmm_allocator_grow_allocation(handle_type):
969970
grown_buffer.close()
970971

971972

973+
@pytest.mark.parametrize("handle_type", get_handle_type())
def test_vmm_allocator_grow_allocation_fast_path(handle_type):
    """Call ``_grow_allocation_fast_path`` directly and verify its bookkeeping.

    :func:`VirtualMemoryResource.modify_allocation` only dispatches to
    :func:`_grow_allocation_fast_path` when the CUDA driver honors a
    ``fixedAddr`` hint located right after an existing allocation. The driver
    almost always declines that hint in practice, which means
    ``test_vmm_allocator_grow_allocation`` ends up on the slow path and never
    reaches the fast-path bookkeeping. To cover it, this test hands the fast
    path a separately reserved VA range so that the final size update
    (``buf._size = new_size``) is executed.

    Because the extension lives at a disjoint VA, the buffer's recorded
    ``size`` ends up larger than the contiguous mapping that starts at its
    handle. That mismatch is tolerated here since only the bookkeeping is
    under test; the mappings are torn down manually at the end.
    """
    dev = Device()
    dev.set_current()

    if not dev.properties.virtual_memory_management_supported:
        pytest.skip("Virtual memory management is not supported on this device")

    handle_type_name, _ = handle_type
    vmm_mr = VirtualMemoryResource(
        dev,
        config=VirtualMemoryResourceOptions(handle_type=handle_type_name),
    )

    two_mib = 2 * 1024 * 1024

    try:
        buffer = vmm_mr.allocate(two_mib)
    except NotImplementedError:
        # Allocation is only expected to be unimplemented for the win32 handle type.
        assert handle_type_name == "win32"
        return

    # Mirror the allocation properties that modify_allocation builds, so that
    # cuMemCreate / _build_access_descriptors inside the fast path receive the
    # same kind of input they would get in production.
    handle_types = driver.CUmemAllocationHandleType
    prop = driver.CUmemAllocationProp()
    prop.type = driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED
    prop.location.type = driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
    prop.location.id = dev.device_id
    prop.allocFlags.gpuDirectRDMACapable = 0
    prop.requestedHandleTypes = (
        handle_types.CU_MEM_HANDLE_TYPE_WIN32_KMT
        if IS_WINDOWS
        else handle_types.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR
    )
    prop.win32HandleMetaData = 0

    granularity_flag = driver.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED
    gran = handle_return(driver.cuMemGetAllocationGranularity(prop, granularity_flag))

    # Round the requested growth up to the next multiple of the granularity.
    aligned_additional_size = (two_mib + gran - 1) & ~(gran - 1)
    original_size = buffer.size
    original_handle = int(buffer.handle)
    new_size = original_size + aligned_additional_size

    # Any valid reserved VA range will do for the extension; where it lands
    # does not matter for exercising the fast path.
    new_ptr = handle_return(driver.cuMemAddressReserve(aligned_additional_size, gran, 0, 0))

    def _unmap_and_release(ptr, size):
        # Retain the allocation handle backing ``ptr``, unmap the range, and
        # release the retained handle even when the unmap call fails.
        retained = handle_return(driver.cuMemRetainAllocationHandle(ptr))
        try:
            handle_return(driver.cuMemUnmap(ptr, size))
        finally:
            handle_return(driver.cuMemRelease(retained))

    try:
        result = vmm_mr._grow_allocation_fast_path(buffer, new_size, prop, aligned_additional_size, new_ptr)

        # Contract of the fast path: the very same buffer object comes back,
        # its handle is untouched, and its size reflects the growth.
        assert result is buffer
        assert int(buffer.handle) == original_handle
        assert buffer.size == new_size
    finally:
        # Manual teardown. The recorded size may now exceed the contiguous
        # mapping rooted at the buffer's handle, so the regular close() path
        # (which calls deallocate(handle, size)) is not safe to use here.
        # Cleanup is best-effort: if the fast path raised part-way through,
        # some of these driver calls can fail, so each step is suppressed on
        # its own and the remaining steps still run. The extension range is
        # torn down first, then the original range.
        teardown_ranges = (
            (new_ptr, aligned_additional_size),
            (original_handle, original_size),
        )
        for ptr, size in teardown_ranges:
            with contextlib.suppress(Exception):
                _unmap_and_release(ptr, size)
            with contextlib.suppress(Exception):
                handle_return(driver.cuMemAddressFree(ptr, size))
1067+
1068+
9721069
def test_vmm_allocator_rdma_unsupported_exception():
9731070
"""Test that VirtualMemoryResource throws an exception when RDMA is requested but device doesn't support it.
9741071

0 commit comments

Comments
 (0)