|
1 | 1 | # SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. |
2 | 2 | # SPDX-License-Identifier: Apache-2.0 |
3 | 3 |
|
| 4 | +import contextlib |
4 | 5 | import ctypes |
5 | 6 | import sys |
6 | 7 |
|
@@ -969,6 +970,102 @@ def test_vmm_allocator_grow_allocation(handle_type): |
969 | 970 | grown_buffer.close() |
970 | 971 |
|
971 | 972 |
|
| 973 | +@pytest.mark.parametrize("handle_type", get_handle_type()) |
| 974 | +def test_vmm_allocator_grow_allocation_fast_path(handle_type): |
| 975 | + """Exercise the contiguous-extension fast path in modify_allocation. |
| 976 | +
|
| 977 | + The dispatch in :func:`VirtualMemoryResource.modify_allocation` routes to |
| 978 | + :func:`_grow_allocation_fast_path` only when the CUDA driver honors a |
| 979 | + ``fixedAddr`` hint pointing immediately after an existing allocation. In |
| 980 | + practice the driver almost always declines that hint, so |
| 981 | + ``test_vmm_allocator_grow_allocation`` above almost always falls through to |
| 982 | + the slow path and the fast-path bookkeeping is rarely exercised. This test |
| 983 | + instead invokes :func:`_grow_allocation_fast_path` directly with a |
| 984 | + separately reserved VA range so the bookkeeping at the tail of the |
| 985 | + function (``buf._size = new_size``) is reached. |
| 986 | +
|
| 987 | + The extension is mapped at a disjoint VA, so the buffer ends up with a |
| 988 | + bookkeeping ``size`` larger than the contiguously-mapped region rooted at |
| 989 | + its handle. That is acceptable for a unit test of the fast-path |
| 990 | + bookkeeping; the test tears the buffer down by hand in its ``finally``. |
| 991 | + """ |
| 992 | + device = Device() |
| 993 | + device.set_current() |
| 994 | + |
| 995 | + if not device.properties.virtual_memory_management_supported: |
| 996 | + pytest.skip("Virtual memory management is not supported on this device") |
| 997 | + |
| 998 | + handle_type_name, _ = handle_type |
| 999 | + options = VirtualMemoryResourceOptions(handle_type=handle_type_name) |
| 1000 | + vmm_mr = VirtualMemoryResource(device, config=options) |
| 1001 | + |
| 1002 | + try: |
| 1003 | + buffer = vmm_mr.allocate(2 * 1024 * 1024) |
| 1004 | + except NotImplementedError: |
| 1005 | + assert handle_type_name == "win32"  # only the win32 handle type may be unimplemented |
| 1006 | + return |
| 1007 | + |
| 1008 | + # Build the prop the same way modify_allocation does, so cuMemCreate / |
| 1009 | + # _build_access_descriptors inside the fast path see the same shape as |
| 1010 | + # in production. |
| 1011 | + prop = driver.CUmemAllocationProp() |
| 1012 | + prop.type = driver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED |
| 1013 | + prop.location.type = driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE |
| 1014 | + prop.location.id = device.device_id |
| 1015 | + prop.allocFlags.gpuDirectRDMACapable = 0 |
| 1016 | + if IS_WINDOWS: |
| 1017 | + prop.requestedHandleTypes = driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_WIN32_KMT |
| 1018 | + else: |
| 1019 | + prop.requestedHandleTypes = driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR |
| 1020 | + prop.win32HandleMetaData = 0 |
| 1021 | + |
| 1022 | + gran = handle_return( |
| 1023 | + driver.cuMemGetAllocationGranularity( |
| 1024 | + prop, driver.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED |
| 1025 | + ) |
| 1026 | + ) |
| 1027 | + |
| 1028 | + aligned_additional_size = ((2 * 1024 * 1024) + gran - 1) & ~(gran - 1)  # round 2 MiB up to a multiple of gran (assumes gran is a power of two) |
| 1029 | + original_size = buffer.size |
| 1030 | + original_handle = int(buffer.handle) |
| 1031 | + new_size = original_size + aligned_additional_size |
| 1032 | + |
| 1033 | + # Reserve a VA range for the extension. Where the range lands does not |
| 1034 | + # matter for exercising the fast path; it only has to be a valid reservation. |
| 1035 | + new_ptr = handle_return(driver.cuMemAddressReserve(aligned_additional_size, gran, 0, 0))  # args: size, alignment, addr hint (0 = none), flags |
| 1036 | + |
| 1037 | + try: |
| 1038 | + result = vmm_mr._grow_allocation_fast_path(buffer, new_size, prop, aligned_additional_size, new_ptr) |
| 1039 | + |
| 1040 | + # Fast-path contract: same buffer, unchanged handle, updated size. |
| 1041 | + assert result is buffer |
| 1042 | + assert int(buffer.handle) == original_handle |
| 1043 | + assert buffer.size == new_size |
| 1044 | + finally: |
| 1045 | + # Tear down by hand. The buffer's bookkeeping size may now exceed the |
| 1046 | + # contiguous mapping rooted at its handle, so the standard close() |
| 1047 | + # path (which calls deallocate(handle, size)) cannot be used safely. |
| 1048 | + # Cleanup is best-effort: on the currently broken build the fast path |
| 1049 | + # raises before its tail work completes, so some of these calls may |
| 1050 | + # fail -- each step is suppressed individually so the rest still run. |
| 1051 | + with contextlib.suppress(Exception): |
| 1052 | + ext_handle = handle_return(driver.cuMemRetainAllocationHandle(new_ptr))  # allocation handle backing the extension mapping |
| 1053 | + try: |
| 1054 | + handle_return(driver.cuMemUnmap(new_ptr, aligned_additional_size)) |
| 1055 | + finally: |
| 1056 | + handle_return(driver.cuMemRelease(ext_handle)) |
| 1057 | + with contextlib.suppress(Exception): |
| 1058 | + handle_return(driver.cuMemAddressFree(new_ptr, aligned_additional_size)) |
| 1059 | + with contextlib.suppress(Exception): |
| 1060 | + orig_handle = handle_return(driver.cuMemRetainAllocationHandle(original_handle))  # allocation handle backing the original mapping |
| 1061 | + try: |
| 1062 | + handle_return(driver.cuMemUnmap(original_handle, original_size)) |
| 1063 | + finally: |
| 1064 | + handle_return(driver.cuMemRelease(orig_handle)) |
| 1065 | + with contextlib.suppress(Exception): |
| 1066 | + handle_return(driver.cuMemAddressFree(original_handle, original_size)) |
| 1067 | + |
| 1068 | + |
972 | 1069 | def test_vmm_allocator_rdma_unsupported_exception(): |
973 | 1070 | """Test that VirtualMemoryResource throws an exception when RDMA is requested but device doesn't support it. |
974 | 1071 |
|
|
0 commit comments