Skip to content

Commit 9819aa9

Browse files
authored
Preserve memory pool CUDA errors and harden OOM tests (#2084)
* cuda.core: preserve memory pool CUDA errors

* test: skip unsupported managed pool warnings
  Skip managed memory warning tests when explicit managed pool creation reports unsupported, now that cuda.core preserves the underlying CUDA error.
  xref: #2084 (comment)

* test: xfail default mempool interop OOM
  Handle default mempool query OOMs with the shared mempool xfail helper so the interop test reports the known platform condition instead of a direct assertion failure.
  xref: #2084 (comment) section 1

* test: xfail graph mempool allocation OOM
  Handle graph allocation OOMs with the shared mempool xfail helper so graph tests report the known platform condition instead of cascading failures.
  xref: #2084 (comment) section 2

* cuda.core: clarify internal empty-handle fallback errors
  Report empty-handle fallback failures in terms of the public operation so users get actionable guidance without exposing non-public helper names.
1 parent ad5001b commit 9819aa9

13 files changed

Lines changed: 222 additions & 116 deletions

cuda_bindings/tests/test_interoperability.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,7 @@
66

77
import cuda.bindings.driver as cuda
88
import cuda.bindings.runtime as cudart
9+
from cuda.bindings._test_helpers.mempool import xfail_if_mempool_oom
910

1011

1112
def supportsMemoryPool():
@@ -87,12 +88,14 @@ def test_interop_graphNode():
8788
def test_interop_memPool():
8889
# DRV to RT
8990
err_dr, pool = cuda.cuDeviceGetDefaultMemPool(0)
91+
xfail_if_mempool_oom(err_dr, "cuDeviceGetDefaultMemPool", 0)
9092
assert err_dr == cuda.CUresult.CUDA_SUCCESS
9193
(err_rt,) = cudart.cudaDeviceSetMemPool(0, pool)
9294
assert err_rt == cudart.cudaError_t.cudaSuccess
9395

9496
# RT to DRV
9597
err_rt, pool = cudart.cudaDeviceGetDefaultMemPool(0)
98+
xfail_if_mempool_oom(err_rt, "cudaDeviceGetDefaultMemPool", 0)
9699
assert err_rt == cudart.cudaError_t.cudaSuccess
97100
(err_dr,) = cuda.cuDeviceSetMemPool(0, pool)
98101
assert err_dr == cuda.CUresult.CUDA_SUCCESS

cuda_core/cuda/core/_memory/_device_memory_resource.pyx

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,7 @@ from cuda.core._memory._ipc cimport IPCAllocationHandle
1313
from cuda.core._resource_handles cimport (
1414
as_cu,
1515
get_device_mempool,
16+
get_last_error,
1617
)
1718
from cuda.core._utils.cuda_utils cimport (
1819
check_or_create_options,
@@ -262,6 +263,14 @@ cdef inline _DMR_init(DeviceMemoryResource self, device_id, options):
262263

263264
if opts is None:
264265
self._h_pool = get_device_mempool(dev_id)
266+
if not self._h_pool:
267+
HANDLE_RETURN(get_last_error())
268+
raise RuntimeError(
269+
f"Failed to initialize DeviceMemoryResource for device {dev_id}: "
270+
"cuda-core returned an empty memory pool handle without recording a CUDA error. "
271+
"This is an internal cuda-core error; please report it with your CUDA driver, "
272+
"CUDA Toolkit, and cuda-python versions."
273+
)
265274
self._mempool_owned = False
266275
MP_raise_release_threshold(self)
267276
else:

cuda_core/cuda/core/_memory/_graph_memory_resource.pyx

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@ from cuda.core._memory._buffer cimport Buffer, Buffer_from_deviceptr_handle, Mem
1111
from cuda.core._resource_handles cimport (
1212
DevicePtrHandle,
1313
deviceptr_alloc_async,
14+
get_last_error,
1415
as_cu,
1516
)
1617

@@ -194,7 +195,13 @@ cdef inline Buffer GMR_allocate(cyGraphMemoryResource self, size_t size, Stream
194195
check_capturing(s)
195196
h_ptr = deviceptr_alloc_async(size, stream._h_stream)
196197
if not h_ptr:
197-
raise RuntimeError("Failed to allocate memory asynchronously")
198+
HANDLE_RETURN(get_last_error())
199+
raise RuntimeError(
200+
f"Failed to allocate {size} bytes from GraphMemoryResource: "
201+
"cuda-core returned an empty allocation handle without recording a CUDA error. "
202+
"This is an internal cuda-core error; please report it with your CUDA driver, "
203+
"CUDA Toolkit, and cuda-python versions."
204+
)
198205
return Buffer_from_deviceptr_handle(h_ptr, size, self, None)
199206

200207

cuda_core/cuda/core/_memory/_ipc.pyx

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -211,7 +211,13 @@ cdef _MemPool MP_from_allocation_handle(cls, alloc_handle):
211211
cdef int ipc_fd = int(alloc_handle)
212212
self._h_pool = create_mempool_handle_ipc(ipc_fd, IPC_HANDLE_TYPE)
213213
if not self._h_pool:
214-
raise RuntimeError("Failed to import memory pool from IPC handle")
214+
HANDLE_RETURN(get_last_error())
215+
raise RuntimeError(
216+
f"Failed to import {cls.__name__} from an allocation handle: "
217+
"cuda-core returned an empty memory pool handle without recording a CUDA error. "
218+
"This is an internal cuda-core error; please report it with your CUDA driver, "
219+
"CUDA Toolkit, and cuda-python versions."
220+
)
215221
self._ipc_data = IPCDataForMR(alloc_handle, True)
216222

217223
# Register it.

cuda_core/cuda/core/_memory/_memory_pool.pyx

Lines changed: 16 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@ from cuda.core._resource_handles cimport (
1717
DevicePtrHandle,
1818
create_mempool_handle,
1919
deviceptr_alloc_from_pool,
20+
get_last_error,
2021
as_cu,
2122
as_py,
2223
)
@@ -228,6 +229,14 @@ cdef int MP_init_create_pool(
228229

229230
self._mempool_owned = True
230231
self._h_pool = create_mempool_handle(properties)
232+
if not self._h_pool:
233+
HANDLE_RETURN(get_last_error())
234+
raise RuntimeError(
235+
f"Failed to initialize {self.__class__.__name__}: "
236+
"cuda-core returned an empty memory pool handle without recording a CUDA error. "
237+
"This is an internal cuda-core error; please report it with your CUDA driver, "
238+
"CUDA Toolkit, and cuda-python versions."
239+
)
231240

232241
if ipc_enabled:
233242
alloc_handle = _ipc.MP_export_mempool(self)
@@ -307,7 +316,13 @@ cdef inline Buffer _MP_allocate(_MemPool self, size_t size, Stream stream):
307316
check_not_capturing(s)
308317
h_ptr = deviceptr_alloc_from_pool(size, self._h_pool, stream._h_stream)
309318
if not h_ptr:
310-
raise RuntimeError("Failed to allocate memory from pool")
319+
HANDLE_RETURN(get_last_error())
320+
raise RuntimeError(
321+
f"Failed to allocate {size} bytes from {self.__class__.__name__}: "
322+
"cuda-core returned an empty allocation handle without recording a CUDA error. "
323+
"This is an internal cuda-core error; please report it with your CUDA driver, "
324+
"CUDA Toolkit, and cuda-python versions."
325+
)
311326
return Buffer_from_deviceptr_handle(h_ptr, size, self, None)
312327

313328

cuda_core/tests/conftest.py

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@
55
import os
66
import pathlib
77
import sys
8+
from contextlib import contextmanager
89
from importlib.metadata import PackageNotFoundError, distribution
910

1011
import pytest
@@ -87,6 +88,8 @@ def create_managed_memory_resource_or_skip(*args, xfail_device=None, **kwargs):
8788
return ManagedMemoryResource(*args, **kwargs)
8889
except CUDAError as e:
8990
xfail_if_mempool_oom(e, _device_id_from_resource_options(xfail_device, args, kwargs))
91+
if "CUDA_ERROR_NOT_SUPPORTED" in str(e):
92+
pytest.skip("ManagedMemoryResource is not supported on this platform/device")
9093
raise
9194
except RuntimeError as e:
9295
if "requires CUDA 13.0" in str(e):
@@ -102,6 +105,15 @@ def create_pinned_memory_resource_or_xfail(*args, xfail_device=None, **kwargs):
102105
raise
103106

104107

108+
@contextmanager
109+
def xfail_on_graph_mempool_oom(device=0):
110+
try:
111+
yield
112+
except CUDAError as e:
113+
xfail_if_mempool_oom(e, "cuGraphAddMemAllocNode", device)
114+
raise
115+
116+
105117
def _device_id_from_resource_options(device, args, kwargs):
106118
if device is not None:
107119
return device

cuda_core/tests/graph/test_graph_definition.py

Lines changed: 66 additions & 47 deletions
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,7 @@
1010
from helpers.graph_kernels import compile_common_kernels
1111
from helpers.misc import try_create_condition
1212

13+
from conftest import xfail_on_graph_mempool_oom
1314
from cuda.core import Device, LaunchConfig
1415
from cuda.core.graph import (
1516
AllocNode,
@@ -201,13 +202,15 @@ def _build_disconnected():
201202
def graph_spec(request, init_cuda):
202203
if request.param is not _build_empty:
203204
_skip_if_no_mempool()
204-
return request.param()
205+
with xfail_on_graph_mempool_oom():
206+
return request.param()
205207

206208

207209
@pytest.fixture(params=_NONEMPTY_BUILDERS)
208210
def nonempty_graph_spec(request, init_cuda):
209211
_skip_if_no_mempool()
210-
return request.param()
212+
with xfail_on_graph_mempool_oom():
213+
return request.param()
211214

212215

213216
# =============================================================================
@@ -562,7 +565,8 @@ def node_spec(request, init_cuda):
562565
if spec.needs_mempool:
563566
_skip_if_no_mempool()
564567
g = GraphDefinition()
565-
node, expected_attrs = spec.builder(g)
568+
with xfail_on_graph_mempool_oom():
569+
node, expected_attrs = spec.builder(g)
566570
return spec, g, node, expected_attrs
567571

568572

@@ -803,18 +807,20 @@ def test_alloc_zero_size_fails(sample_graphdef):
803807
def test_free_creates_dependency(sample_graphdef):
804808
"""Free node depends on its predecessor."""
805809
_skip_if_no_mempool()
806-
alloc = sample_graphdef.allocate(ALLOC_SIZE)
807-
free = alloc.deallocate(alloc.dptr)
810+
with xfail_on_graph_mempool_oom():
811+
alloc = sample_graphdef.allocate(ALLOC_SIZE)
812+
free = alloc.deallocate(alloc.dptr)
808813
assert alloc in free.pred
809814

810815

811816
def test_alloc_free_chain(sample_graphdef):
812817
"""Alloc and free can be chained."""
813818
_skip_if_no_mempool()
814-
a1 = sample_graphdef.allocate(ALLOC_SIZE)
815-
a2 = a1.allocate(ALLOC_SIZE)
816-
f2 = a2.deallocate(a2.dptr)
817-
f1 = f2.deallocate(a1.dptr)
819+
with xfail_on_graph_mempool_oom():
820+
a1 = sample_graphdef.allocate(ALLOC_SIZE)
821+
a2 = a1.allocate(ALLOC_SIZE)
822+
f2 = a2.deallocate(a2.dptr)
823+
f1 = f2.deallocate(a1.dptr)
818824
assert a1 in a2.pred
819825
assert a2 in f2.pred
820826
assert f2 in f1.pred
@@ -842,15 +848,17 @@ def test_alloc_device_option(sample_graphdef, device_spec):
842848
"""Device can be specified as int or Device object."""
843849
_skip_if_no_mempool()
844850
device = Device()
845-
node = sample_graphdef.allocate(ALLOC_SIZE, device=device_spec(device))
851+
with xfail_on_graph_mempool_oom(device):
852+
node = sample_graphdef.allocate(ALLOC_SIZE, device=device_spec(device))
846853
assert node.dptr != 0
847854

848855

849856
def test_alloc_peer_access(mempool_device_x2):
850857
"""AllocNode.peer_access reflects requested peers."""
851858
d0, d1 = mempool_device_x2
852859
g = GraphDefinition()
853-
node = g.allocate(ALLOC_SIZE, device=d0.device_id, peer_access=[d1.device_id])
860+
with xfail_on_graph_mempool_oom(d0):
861+
node = g.allocate(ALLOC_SIZE, device=d0.device_id, peer_access=[d1.device_id])
854862
assert d1.device_id in node.peer_access
855863

856864

@@ -863,8 +871,9 @@ def test_alloc_peer_access(mempool_device_x2):
863871
def test_join_merges_branches(sample_graphdef, num_branches):
864872
"""join() with multiple branches creates correct dependencies."""
865873
_skip_if_no_mempool()
866-
branches = [sample_graphdef.allocate(ALLOC_SIZE) for _ in range(num_branches)]
867-
joined = sample_graphdef.join(*branches)
874+
with xfail_on_graph_mempool_oom():
875+
branches = [sample_graphdef.allocate(ALLOC_SIZE) for _ in range(num_branches)]
876+
joined = sample_graphdef.join(*branches)
868877
assert isinstance(joined, EmptyNode)
869878
assert set(joined.pred) == set(branches)
870879

@@ -956,8 +965,9 @@ def test_instantiate_empty_graph(sample_graphdef, inst_kwargs):
956965
def test_instantiate_with_nodes(sample_graphdef, inst_kwargs):
957966
"""Graph with nodes can be instantiated."""
958967
_skip_if_no_mempool()
959-
sample_graphdef.allocate(ALLOC_SIZE)
960-
sample_graphdef.allocate(ALLOC_SIZE)
968+
with xfail_on_graph_mempool_oom():
969+
sample_graphdef.allocate(ALLOC_SIZE)
970+
sample_graphdef.allocate(ALLOC_SIZE)
961971
graph = _instantiate(sample_graphdef, inst_kwargs)
962972
assert graph is not None
963973

@@ -997,8 +1007,9 @@ def test_instantiate_and_execute_kernel(sample_graphdef, inst_kwargs):
9971007
def test_instantiate_and_execute_alloc_free(sample_graphdef, inst_kwargs):
9981008
"""Graph with alloc/free can be executed."""
9991009
_skip_if_no_mempool()
1000-
alloc = sample_graphdef.allocate(ALLOC_SIZE)
1001-
alloc.deallocate(alloc.dptr)
1010+
with xfail_on_graph_mempool_oom():
1011+
alloc = sample_graphdef.allocate(ALLOC_SIZE)
1012+
alloc.deallocate(alloc.dptr)
10021013

10031014
stream = Device().create_stream()
10041015
graph = _instantiate_and_upload(sample_graphdef, inst_kwargs, stream)
@@ -1010,9 +1021,10 @@ def test_instantiate_and_execute_alloc_free(sample_graphdef, inst_kwargs):
10101021
def test_instantiate_and_execute_memset(sample_graphdef, inst_kwargs):
10111022
"""Graph with alloc/memset/free can be executed."""
10121023
_skip_if_no_mempool()
1013-
alloc = sample_graphdef.allocate(ALLOC_SIZE)
1014-
ms = alloc.memset(alloc.dptr, 0xAB, ALLOC_SIZE)
1015-
ms.deallocate(alloc.dptr)
1024+
with xfail_on_graph_mempool_oom():
1025+
alloc = sample_graphdef.allocate(ALLOC_SIZE)
1026+
ms = alloc.memset(alloc.dptr, 0xAB, ALLOC_SIZE)
1027+
ms.deallocate(alloc.dptr)
10161028

10171029
stream = Device().create_stream()
10181030
graph = _instantiate_and_upload(sample_graphdef, inst_kwargs, stream)
@@ -1026,12 +1038,13 @@ def test_instantiate_and_execute_memcpy(sample_graphdef, inst_kwargs):
10261038
_skip_if_no_mempool()
10271039
import ctypes
10281040

1029-
src_alloc = sample_graphdef.allocate(ALLOC_SIZE)
1030-
dst_alloc = sample_graphdef.allocate(ALLOC_SIZE)
1031-
dep = sample_graphdef.join(src_alloc, dst_alloc)
1032-
ms = dep.memset(src_alloc.dptr, 0xAB, ALLOC_SIZE)
1033-
cp = ms.memcpy(dst_alloc.dptr, src_alloc.dptr, ALLOC_SIZE)
1034-
cp.deallocate(src_alloc.dptr)
1041+
with xfail_on_graph_mempool_oom():
1042+
src_alloc = sample_graphdef.allocate(ALLOC_SIZE)
1043+
dst_alloc = sample_graphdef.allocate(ALLOC_SIZE)
1044+
dep = sample_graphdef.join(src_alloc, dst_alloc)
1045+
ms = dep.memset(src_alloc.dptr, 0xAB, ALLOC_SIZE)
1046+
cp = ms.memcpy(dst_alloc.dptr, src_alloc.dptr, ALLOC_SIZE)
1047+
cp.deallocate(src_alloc.dptr)
10351048

10361049
stream = Device().create_stream()
10371050
graph = _instantiate_and_upload(sample_graphdef, inst_kwargs, stream)
@@ -1166,11 +1179,12 @@ def test_instantiate_and_execute_if_then(sample_graphdef):
11661179
set_handle = mod.get_kernel("set_handle")
11671180
add_one = mod.get_kernel("add_one")
11681181

1169-
alloc = sample_graphdef.allocate(ctypes.sizeof(ctypes.c_int))
1170-
ms = alloc.memset(alloc.dptr, 0, ctypes.sizeof(ctypes.c_int))
1171-
setter = ms.launch(LaunchConfig(grid=1, block=1), set_handle, condition, 1)
1172-
if_node = setter.if_then(condition)
1173-
if_node.then.launch(LaunchConfig(grid=1, block=1), add_one, alloc.dptr)
1182+
with xfail_on_graph_mempool_oom():
1183+
alloc = sample_graphdef.allocate(ctypes.sizeof(ctypes.c_int))
1184+
ms = alloc.memset(alloc.dptr, 0, ctypes.sizeof(ctypes.c_int))
1185+
setter = ms.launch(LaunchConfig(grid=1, block=1), set_handle, condition, 1)
1186+
if_node = setter.if_then(condition)
1187+
if_node.then.launch(LaunchConfig(grid=1, block=1), add_one, alloc.dptr)
11741188

11751189
graph = sample_graphdef.instantiate()
11761190
stream = Device().create_stream()
@@ -1198,13 +1212,14 @@ def test_instantiate_and_execute_if_else(sample_graphdef):
11981212
set_handle = mod.get_kernel("set_handle")
11991213
add_one = mod.get_kernel("add_one")
12001214

1201-
alloc = sample_graphdef.allocate(ctypes.sizeof(ctypes.c_int))
1202-
ms = alloc.memset(alloc.dptr, 0, ctypes.sizeof(ctypes.c_int))
1203-
setter = ms.launch(LaunchConfig(grid=1, block=1), set_handle, condition, 0)
1204-
ie_node = setter.if_else(condition)
1205-
ie_node.then.launch(LaunchConfig(grid=1, block=1), add_one, alloc.dptr)
1206-
n1 = ie_node.else_.launch(LaunchConfig(grid=1, block=1), add_one, alloc.dptr)
1207-
n1.launch(LaunchConfig(grid=1, block=1), add_one, alloc.dptr)
1215+
with xfail_on_graph_mempool_oom():
1216+
alloc = sample_graphdef.allocate(ctypes.sizeof(ctypes.c_int))
1217+
ms = alloc.memset(alloc.dptr, 0, ctypes.sizeof(ctypes.c_int))
1218+
setter = ms.launch(LaunchConfig(grid=1, block=1), set_handle, condition, 0)
1219+
ie_node = setter.if_else(condition)
1220+
ie_node.then.launch(LaunchConfig(grid=1, block=1), add_one, alloc.dptr)
1221+
n1 = ie_node.else_.launch(LaunchConfig(grid=1, block=1), add_one, alloc.dptr)
1222+
n1.launch(LaunchConfig(grid=1, block=1), add_one, alloc.dptr)
12081223

12091224
graph = sample_graphdef.instantiate()
12101225
stream = Device().create_stream()
@@ -1232,12 +1247,13 @@ def test_instantiate_and_execute_switch(sample_graphdef):
12321247
set_handle = mod.get_kernel("set_handle")
12331248
add_one = mod.get_kernel("add_one")
12341249

1235-
alloc = sample_graphdef.allocate(ctypes.sizeof(ctypes.c_int))
1236-
ms = alloc.memset(alloc.dptr, 0, ctypes.sizeof(ctypes.c_int))
1237-
setter = ms.launch(LaunchConfig(grid=1, block=1), set_handle, condition, 2)
1238-
sw_node = setter.switch(condition, 4)
1239-
for branch in sw_node.branches:
1240-
branch.launch(LaunchConfig(grid=1, block=1), add_one, alloc.dptr)
1250+
with xfail_on_graph_mempool_oom():
1251+
alloc = sample_graphdef.allocate(ctypes.sizeof(ctypes.c_int))
1252+
ms = alloc.memset(alloc.dptr, 0, ctypes.sizeof(ctypes.c_int))
1253+
setter = ms.launch(LaunchConfig(grid=1, block=1), set_handle, condition, 2)
1254+
sw_node = setter.switch(condition, 4)
1255+
for branch in sw_node.branches:
1256+
branch.launch(LaunchConfig(grid=1, block=1), add_one, alloc.dptr)
12411257

12421258
graph = sample_graphdef.instantiate()
12431259
stream = Device().create_stream()
@@ -1272,7 +1288,8 @@ def test_conditional_node_type_preserved_by_nodes(sample_graphdef):
12721288
def test_debug_dot_print_creates_file(sample_graphdef, dot_file):
12731289
"""debug_dot_print writes a DOT file."""
12741290
_skip_if_no_mempool()
1275-
sample_graphdef.allocate(ALLOC_SIZE)
1291+
with xfail_on_graph_mempool_oom():
1292+
sample_graphdef.allocate(ALLOC_SIZE)
12761293
sample_graphdef.debug_dot_print(str(dot_file))
12771294
assert dot_file.exists()
12781295
content = dot_file.read_text()
@@ -1282,7 +1299,8 @@ def test_debug_dot_print_creates_file(sample_graphdef, dot_file):
12821299
def test_debug_dot_print_with_options(sample_graphdef, dot_file):
12831300
"""debug_dot_print accepts GraphDebugPrintOptions."""
12841301
_skip_if_no_mempool()
1285-
sample_graphdef.allocate(ALLOC_SIZE)
1302+
with xfail_on_graph_mempool_oom():
1303+
sample_graphdef.allocate(ALLOC_SIZE)
12861304
options = GraphDebugPrintOptions(verbose=True, handles=True)
12871305
sample_graphdef.debug_dot_print(str(dot_file), options)
12881306
assert dot_file.exists()
@@ -1291,6 +1309,7 @@ def test_debug_dot_print_with_options(sample_graphdef, dot_file):
12911309
def test_debug_dot_print_invalid_options(sample_graphdef, dot_file):
12921310
"""debug_dot_print rejects invalid options type."""
12931311
_skip_if_no_mempool()
1294-
sample_graphdef.allocate(ALLOC_SIZE)
1312+
with xfail_on_graph_mempool_oom():
1313+
sample_graphdef.allocate(ALLOC_SIZE)
12951314
with pytest.raises(TypeError, match="options must be a GraphDebugPrintOptions"):
12961315
sample_graphdef.debug_dot_print(str(dot_file), "invalid")

0 commit comments

Comments
 (0)