Skip to content

Commit 7fd04fe

Browse files
authored
Merge branch 'main' into fix-get-process-name-on-wsl
2 parents 960659c + ae34e4c commit 7fd04fe

16 files changed

Lines changed: 261 additions & 23 deletions

ci/tools/setup-sanitizer

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#!/usr/bin/env bash
22

3-
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
44
#
55
# SPDX-License-Identifier: Apache-2.0
66

@@ -12,7 +12,15 @@ set -euo pipefail
1212
if [[ "${SETUP_SANITIZER}" == 1 ]]; then
1313
COMPUTE_SANITIZER="${CUDA_HOME}/bin/compute-sanitizer"
1414
COMPUTE_SANITIZER_VERSION=$(${COMPUTE_SANITIZER} --version | grep -Eo "[0-9]{4}\.[0-9]\.[0-9]" | sed -e 's/\.//g')
15-
SANITIZER_CMD="${COMPUTE_SANITIZER} --target-processes=all --launch-timeout=0 --tool=memcheck --error-exitcode=1 --report-api-errors=no"
15+
# --target-processes=application-only: attach the sanitizer to the parent
16+
# pytest process only. Spawned multiprocessing.Process children run without
17+
# the sanitizer. This aims to mitigate a class of CI hangs where child
18+
# processes take an extreme amount of time to spawn (>30 seconds). Test bugs
19+
# triggered by that specific condition are typically uncovered only in CI,
20+
# where they become emergencies and are difficult to debug. The parent
21+
# process is still fully sanitized, which is where most of the interesting
22+
# host-side IPC plumbing runs anyway.
23+
SANITIZER_CMD="${COMPUTE_SANITIZER} --target-processes=application-only --launch-timeout=0 --tool=memcheck --error-exitcode=1 --report-api-errors=no"
1624
if [[ "$COMPUTE_SANITIZER_VERSION" -ge 202111 ]]; then
1725
SANITIZER_CMD="${SANITIZER_CMD} --padding=32"
1826
fi

cuda_core/AGENTS.md

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -125,12 +125,15 @@ Python or Cython type annotations should be included for all public APIs. Avoid
125125
the use of `Any` unless absolutely necessary. The argument and return types as
126126
defined in the docstrings should match the type annotations.
127127

128+
Python imports should generally be outside of an `if typing.TYPE_CHECKING:` block, even if the imported object is only used in type annotations. Use `if typing.TYPE_CHECKING:` only to avoid creating import cycles. (This guidance maximizes compatibility with the cross-reference mechanisms in Sphinx.)
129+
128130
### Semantics
129131

130-
Designs involving manual resource management should be avoided. Where
131-
appropriate, provide context managers (implemented with `__enter__` and
132-
`__exit__`, not `contextlib.contextmanager`) or RAII using a `__del__` or
133-
`__dealloc__` method.
132+
APIs should exist for both manual resource management (such as `close()`) and
133+
automatic resource management, using context managers or destructors where
134+
appropriate. Context managers should be implemented with `__enter__` and
135+
`__exit__`, not `contextlib.contextmanager`. For destructors use `__dealloc__`
136+
where possible, otherwise `__del__`.
134137

135138
### Documentation
136139

cuda_core/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ cu12 = ["cuda-bindings[all]==12.*"]
5858
cu13 = ["cuda-bindings[all]==13.*"]
5959

6060
[dependency-groups]
61-
test = ["cython>=3.2,<3.3", "setuptools", "pytest>=6.2.4", "pytest-benchmark", "pytest-randomly", "pytest-repeat", "pytest-rerunfailures", "cloudpickle", "psutil", "cffi"]
61+
test = ["cython>=3.2,<3.3", "setuptools", "pytest>=6.2.4", "pytest-benchmark", "pytest-randomly", "pytest-repeat", "pytest-rerunfailures", "pytest-timeout", "cloudpickle", "psutil", "cffi"]
6262
ml-dtypes = ["ml-dtypes>=0.5.4,<0.6.0"]
6363
test-cu12 = [ {include-group = "ml-dtypes" }, {include-group = "test" }, "cupy-cuda12x; python_version < '3.14'", "cuda-toolkit[cudart]==12.*"] # runtime headers needed by CuPy
6464
test-cu13 = [ {include-group = "ml-dtypes" }, {include-group = "test" }, "cupy-cuda13x; python_version < '3.14'", "cuda-toolkit[cudart]==13.*"] # runtime headers needed by CuPy

cuda_core/tests/conftest.py

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -213,14 +213,24 @@ def pop_all_contexts():
213213

214214
@pytest.fixture
215215
def ipc_device():
216-
"""Obtains a device suitable for IPC-enabled mempool tests, or skips."""
216+
"""Obtains a device suitable for IPC-enabled mempool tests, or skips.
217+
218+
The fixture also tracks every ``multiprocessing.Process`` spawned during
219+
the test and kills any survivors at teardown. This prevents a stuck child
220+
(e.g., compute-sanitizer wedged during IPC teardown -- see issue #2004)
221+
from blocking ``ipc_memory_resource``'s ``mr.close()`` for hours.
222+
"""
223+
from helpers.child_processes import track_child_processes
224+
217225
device = Device(0)
218226
device.set_current()
219227

220228
if not device.properties.memory_pools_supported:
221229
pytest.skip("Device does not support mempool operations")
222230

223-
return _require_ipc_mempool_devices((device,))[0]
231+
device = _require_ipc_mempool_devices((device,))[0]
232+
with track_child_processes():
233+
yield device
224234

225235

226236
@pytest.fixture(
@@ -291,8 +301,16 @@ def mempool_device_x3():
291301

292302
@pytest.fixture
293303
def ipc_mempool_device_x2(mempool_device_x2):
294-
"""Fixture that provides two IPC-capable mempool devices, or skips."""
295-
return _require_ipc_mempool_devices(mempool_device_x2)
304+
"""Fixture that provides two IPC-capable mempool devices, or skips.
305+
306+
Also tracks/kills any leftover ``multiprocessing.Process`` children at
307+
teardown for the same reasons documented on :func:`ipc_device`.
308+
"""
309+
from helpers.child_processes import track_child_processes
310+
311+
devices = _require_ipc_mempool_devices(mempool_device_x2)
312+
with track_child_processes():
313+
yield devices
296314

297315

298316
@pytest.fixture(
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
"""Helpers for tests that spawn ``multiprocessing.Process`` children.
5+
6+
These exist primarily to defend IPC tests against a class of CI hang where a
7+
child process spawns too slowly and the parent does not implement proper guards
8+
for that (see issue #2004). Without intervention, a stuck child holds an IPC
9+
memory handle and blocks the parent's ``mr.close()`` in fixture teardown,
10+
leading to deadlock and wedging the test runner for hours.
11+
"""
12+
13+
import contextlib
14+
import multiprocessing.process
15+
import weakref
16+
17+
from cuda_python_test_helpers import under_compute_sanitizer
18+
19+
CHILD_TIMEOUT_SEC_DEFAULT = 30
20+
CHILD_TIMEOUT_SEC_SANITIZER = 120
21+
22+
23+
def child_timeout_sec() -> int:
    """Return the join/wait budget, in seconds, for one child process.

    Runs under compute-sanitizer get the larger
    ``CHILD_TIMEOUT_SEC_SANITIZER`` budget, because the sanitizer makes
    process startup and CUDA context teardown considerably slower. All other
    runs get ``CHILD_TIMEOUT_SEC_DEFAULT``.
    """
    if under_compute_sanitizer():
        return CHILD_TIMEOUT_SEC_SANITIZER
    return CHILD_TIMEOUT_SEC_DEFAULT
30+
31+
32+
def kill_subprocesses(*processes):
    """Kill every given Process object that is still alive.

    Each process is checked with ``is_alive()``; the ones that are alive are
    sent ``kill()`` and then joined. The return value is the list of
    processes that were found alive, in the order they were passed in.

    Intended use is right after the timed joins and before any exit-code
    asserts, so that a lingering child cannot keep IPC handles open past the
    test body and block fixture teardown::

        proc_a.join(timeout=CHILD_TIMEOUT_SEC)
        proc_b.join(timeout=CHILD_TIMEOUT_SEC)
        survivors = kill_subprocesses(proc_a, proc_b)
        assert not survivors, f"timed out waiting on: {[p.name for p in survivors]}"
        assert proc_a.exitcode == 0
        assert proc_b.exitcode == 0

    An empty return value means every child had already exited; asserting
    ``not survivors`` turns a timeout into an ordinary test failure.
    """
    survivors = []
    for child in processes:
        try:
            if not child.is_alive():
                continue
        except (ValueError, AssertionError):
            # The Process object cannot be queried at all; treat it as
            # having nothing left to clean up.
            continue
        try:
            child.kill()
            child.join()
        except (ValueError, AssertionError):
            # Best effort: the child was alive when checked, so it is still
            # reported below even if the kill/join sequence was cut short.
            pass
        survivors.append(child)
    return survivors
65+
66+
67+
@contextlib.contextmanager
def track_child_processes():
    """Context manager that kills any ``multiprocessing.Process`` children still
    alive at exit.

    While the ``with`` block is active,
    ``multiprocessing.process.BaseProcess.__init__`` is replaced by a wrapper
    that records every instance constructed. This covers the delegating
    ``mp.Process`` class as well as direct ``SpawnProcess`` / ``ForkProcess``
    instances (including those created by ``mp.Pool``), since all of them
    inherit from ``BaseProcess``. Instances are held in a ``weakref.WeakSet``,
    so tracking alone never keeps a ``Process`` object alive.

    On exit -- whether the block finished normally or raised -- the original
    ``__init__`` is restored first, then every tracked process that is still
    alive is killed and joined.

    This protects fixture teardown (e.g. ``ipc_memory_resource``'s
    ``mr.close()``) from blocking on IPC handles held by a stuck child --
    see issue #2004.

    Yields:
        None. The context manager is used purely for its exit-time cleanup.
    """
    tracked = weakref.WeakSet()
    base = multiprocessing.process.BaseProcess
    original_init = base.__init__

    def tracking_init(self, *args, **kwargs):
        original_init(self, *args, **kwargs)
        tracked.add(self)

    base.__init__ = tracking_init
    try:
        yield
    finally:
        # Undo the patch before touching any process so that a failure
        # during cleanup can never leave BaseProcess permanently patched.
        base.__init__ = original_init
        # Snapshot the weak set: entries may disappear while we iterate.
        for proc in list(tracked):
            # Tolerate the same exceptions kill_subprocesses() does. A
            # Process that cannot be queried, killed or joined (for example
            # because it was already closed) has nothing we can clean up,
            # and one such process must not prevent cleanup of the others
            # or mask the outcome of the test body.
            with contextlib.suppress(ValueError, AssertionError):
                if proc.is_alive():
                    proc.kill()
                    proc.join()
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
"""Per-directory conftest for memory IPC tests.
5+
6+
Applies an outer-guard ``pytest.mark.timeout`` to every test in this directory.
7+
Individual tests still drive their own per-process waits using
8+
``child_timeout_sec()`` from ``helpers.child_processes``; this marker is the
9+
final fallback so that no IPC test can wedge the CI runner for hours if
10+
deadlock occurs.
11+
"""
12+
13+
import pathlib
14+
15+
import pytest
16+
from helpers.child_processes import child_timeout_sec
17+
18+
_HERE = pathlib.Path(__file__).parent.resolve()
19+
20+
21+
def _outer_timeout_sec() -> int:
    """Return the whole-test timeout, in seconds, for the outer guard.

    The budget is one per-child timeout plus a fixed 30-second allowance.
    The rationale is that IPC tests run their children concurrently, so the
    expected wall-clock time is roughly a single ``child_timeout_sec()``
    no matter how many join/wait calls a test chains together: a join on a
    child that has already exited returns immediately. A test that runs
    longer than this is genuinely stuck, and having the outer guard fire is
    the right outcome -- the per-test asserts would add little diagnostic
    value beyond "test exceeded its budget".
    """
    slack_sec = 30
    return child_timeout_sec() + slack_sec
31+
32+
33+
def pytest_collection_modifyitems(config, items):
    """Attach the outer ``timeout`` marker to the tests in this directory.

    One marker, built from ``_outer_timeout_sec()``, is shared by all
    matching items. An item matches when its resolved file path lies
    somewhere below the directory containing this conftest (``_HERE``).
    Items whose path cannot be resolved are left untouched.
    """
    timeout_mark = pytest.mark.timeout(_outer_timeout_sec())
    for test_item in items:
        try:
            resolved = pathlib.Path(str(test_item.fspath)).resolve()
        except OSError:
            continue
        if _HERE not in resolved.parents:
            continue
        test_item.add_marker(timeout_mark)

cuda_core/tests/memory_ipc/test_errors.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,31 @@
66
import re
77

88
import pytest
9+
from helpers.child_processes import child_timeout_sec, kill_subprocesses
910

1011
from cuda.core import Buffer, Device, DeviceMemoryResource, DeviceMemoryResourceOptions
1112
from cuda.core._utils.cuda_utils import CUDAError
1213

13-
CHILD_TIMEOUT_SEC = 30
14+
CHILD_TIMEOUT_SEC = child_timeout_sec()
1415
NBYTES = 64
1516
POOL_SIZE = 2097152
1617

1718

19+
def test_outer_timeout_marker_is_applied(request):
    """Check that this test carries the outer pytest-timeout marker.

    The marker is expected to come from memory_ipc/conftest.py. A failure
    here means that conftest is not being loaded, or that its
    pytest_collection_modifyitems hook is not adding the marker. Without the
    marker, in-test cleanup would be the only protection against a wedged
    IPC test stalling the GHA runner; that cleanup is meant to be defense in
    depth, not the sole guard.
    """
    budget_sec = child_timeout_sec() + 30
    timeout_marker = request.node.get_closest_marker("timeout")
    assert timeout_marker is not None, "memory_ipc/conftest.py did not apply a timeout marker"
    assert timeout_marker.args == (budget_sec,), f"unexpected timeout value: {timeout_marker.args!r}"
32+
33+
1834
class ChildErrorHarness:
1935
"""Test harness for checking errors in child processes. Subclasses override
2036
PARENT_ACTION, CHILD_ACTION, and ASSERT (see below for examples)."""
@@ -43,6 +59,8 @@ def test_main(self, ipc_device, ipc_memory_resource):
4359

4460
# Wait for the child process.
4561
process.join(timeout=CHILD_TIMEOUT_SEC)
62+
survivors = kill_subprocesses(process)
63+
assert not survivors, "child did not exit within timeout"
4664
assert process.exitcode == 0
4765
finally:
4866
for mr in self._extra_mrs:

cuda_core/tests/memory_ipc/test_event_ipc.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,14 @@
55

66
import pytest
77
from helpers.buffers import compare_equal_buffers, make_scratch_buffer
8+
from helpers.child_processes import child_timeout_sec, kill_subprocesses
89
from helpers.latch import LatchKernel
910
from helpers.logging import TimestampedLogger
1011

1112
from cuda.core import Device, EventOptions
1213

1314
ENABLE_LOGGING = False # Set True for test debugging and development
14-
CHILD_TIMEOUT_SEC = 30
15+
CHILD_TIMEOUT_SEC = child_timeout_sec()
1516
NBYTES = 64
1617

1718

@@ -67,6 +68,8 @@ def test_main(self, ipc_device, ipc_memory_resource):
6768
log("releasing stream1")
6869
latch.release()
6970
process.join(timeout=CHILD_TIMEOUT_SEC)
71+
survivors = kill_subprocesses(process)
72+
assert not survivors, "child did not exit within timeout"
7073
assert process.exitcode == 0
7174
log("done")
7275

@@ -162,6 +165,8 @@ def test_main(self, ipc_device, blocking_sync, use_options_cls, use_option_kw):
162165
assert props[5] is None
163166

164167
process.join(timeout=CHILD_TIMEOUT_SEC)
168+
survivors = kill_subprocesses(process)
169+
assert not survivors, "child did not exit within timeout"
165170
assert process.exitcode == 0
166171

167172
def child_main(self, q_in, q_out):

cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,12 @@
1313
import multiprocessing as mp
1414

1515
import pytest
16+
from helpers.child_processes import child_timeout_sec, kill_subprocesses
1617
from helpers.logging import TimestampedLogger
1718

1819
from cuda.core import Buffer, Device
1920

20-
CHILD_TIMEOUT_SEC = 30
21+
CHILD_TIMEOUT_SEC = child_timeout_sec()
2122
NBYTES = 64
2223
POOL_SIZE = 2097152
2324

@@ -84,6 +85,8 @@ def test_main(self, ipc_device, ipc_memory_resource):
8485

8586
log("waiting for child")
8687
process.join(timeout=CHILD_TIMEOUT_SEC)
88+
survivors = kill_subprocesses(process)
8789
log(f"child exit code: {process.exitcode}")
90+
assert not survivors, "child did not exit within timeout"
8891
assert process.exitcode == 0, f"Child process failed with exit code {process.exitcode}"
8992
log("done")

0 commit comments

Comments
 (0)