Skip to content

Commit d1dbdff

Browse files
committed
tests: add kill_subprocesses helper and apply to IPC tests
Make every IPC test responsible for its own child-process cleanup, rather than leaning on the fixture-level tracker as the primary mechanism.

helpers.child_processes.kill_subprocesses(*processes) kills any of the given processes that are still alive when it is called and returns the list of those processes. Tests pair this with an "assert not survivors" check, so a timeout produces a clean failure message ("timed out waiting on: ['Process-3']") instead of "assert None == 0", and the held IPC handles are released before any further test code runs.

Every join+exitcode pattern in tests/memory_ipc/ is converted to the new shape:

    process.join(timeout=CHILD_TIMEOUT_SEC)
    survivors = kill_subprocesses(process)
    assert not survivors, "child did not exit within timeout"
    assert process.exitcode == 0

For test_send_buffers.py:TestIpcReexport (the test from #2121), the event.wait return value is also captured and asserted on, so a timeout there is reported explicitly rather than dropped on the floor.

The autouse track_child_processes() context manager in the ipc_device fixture remains as defense in depth for any future test that forgets the pattern.

memory_ipc/test_errors.py adds a meta-test, test_outer_timeout_marker_is_applied, that verifies tests/memory_ipc/conftest.py is loaded and applies the pytest.mark.timeout(300) marker. This catches the "nested conftest silently not picked up" failure mode at test time with a clear error message.
1 parent d65186e commit d1dbdff

9 files changed

Lines changed: 100 additions & 11 deletions

File tree

cuda_core/tests/helpers/child_processes.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,41 @@ def child_timeout_sec() -> int:
2929
return CHILD_TIMEOUT_SEC_SANITIZER if under_compute_sanitizer() else CHILD_TIMEOUT_SEC_DEFAULT
3030

3131

32+
def kill_subprocesses(*processes, join_timeout=10.0):
    """Kill any of the given Process objects that are still alive.

    Returns the list of processes that were still alive when the call was made
    (and were therefore sent a kill). Callers should ``assert not survivors``
    to convert a non-empty return value into a clean test failure, e.g.::

        proc_a.join(timeout=CHILD_TIMEOUT_SEC)
        proc_b.join(timeout=CHILD_TIMEOUT_SEC)
        survivors = kill_subprocesses(proc_a, proc_b)
        assert not survivors, f"timed out waiting on: {[p.name for p in survivors]}"
        assert proc_a.exitcode == 0
        assert proc_b.exitcode == 0

    Killing survivors before the subsequent asserts prevents a hung child
    from holding IPC handles past the test body and blocking fixture
    teardown.

    Args:
        *processes: ``multiprocessing.Process``-like objects exposing
            ``is_alive()``, ``kill()`` and ``join(timeout=...)``.
        join_timeout: Maximum number of seconds to wait for each killed
            process to be reaped. Pass ``None`` to wait without bound.

    Returns:
        A list, in argument order, of the processes that were alive on entry.
    """
    killed = []
    for proc in processes:
        try:
            alive = proc.is_alive()
        except (ValueError, AssertionError):
            # ValueError: the Process object has already been closed.
            # AssertionError: the object is not a child of this process.
            # (A Process that was never started simply reports not-alive.)
            # In either case there is nothing for us to clean up.
            continue
        if not alive:
            continue
        with contextlib.suppress(ValueError, AssertionError):
            proc.kill()
            # Bound the reap: a child wedged in an uninterruptible call may
            # not exit promptly even after a kill, and this cleanup helper
            # must never become the thing that hangs the test run.
            proc.join(timeout=join_timeout)
        killed.append(proc)
    return killed
65+
66+
3267
@contextlib.contextmanager
3368
def track_child_processes():
3469
"""Context manager that kills any ``multiprocessing.Process`` children still

cuda_core/tests/memory_ipc/test_errors.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import re
77

88
import pytest
9-
from helpers.child_processes import child_timeout_sec
9+
from helpers.child_processes import child_timeout_sec, kill_subprocesses
1010

1111
from cuda.core import Buffer, Device, DeviceMemoryResource, DeviceMemoryResourceOptions
1212
from cuda.core._utils.cuda_utils import CUDAError
@@ -16,6 +16,20 @@
1616
POOL_SIZE = 2097152
1717

1818

19+
def test_outer_timeout_marker_is_applied(request):
    """Check that memory_ipc/conftest.py attached the outer pytest-timeout marker.

    A failure here means either the per-directory conftest was not loaded or
    its pytest_collection_modifyitems hook did not add the marker. In that
    situation the in-test child-process cleanup would be the only thing
    protecting the GHA runner from a wedged IPC test; that cleanup is meant to
    be defense in depth, not the sole guard.
    """
    # Look up the nearest "timeout" marker on this test's own collection node.
    timeout_marker = request.node.get_closest_marker("timeout")
    assert timeout_marker is not None, "memory_ipc/conftest.py did not apply a timeout marker"
    # The marker is expected to carry exactly one positional argument: 300.
    assert timeout_marker.args == (300,), f"unexpected timeout value: {timeout_marker.args!r}"
31+
32+
1933
class ChildErrorHarness:
2034
"""Test harness for checking errors in child processes. Subclasses override
2135
PARENT_ACTION, CHILD_ACTION, and ASSERT (see below for examples)."""
@@ -44,6 +58,8 @@ def test_main(self, ipc_device, ipc_memory_resource):
4458

4559
# Wait for the child process.
4660
process.join(timeout=CHILD_TIMEOUT_SEC)
61+
survivors = kill_subprocesses(process)
62+
assert not survivors, "child did not exit within timeout"
4763
assert process.exitcode == 0
4864
finally:
4965
for mr in self._extra_mrs:

cuda_core/tests/memory_ipc/test_event_ipc.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
import pytest
77
from helpers.buffers import compare_equal_buffers, make_scratch_buffer
8-
from helpers.child_processes import child_timeout_sec
8+
from helpers.child_processes import child_timeout_sec, kill_subprocesses
99
from helpers.latch import LatchKernel
1010
from helpers.logging import TimestampedLogger
1111

@@ -68,6 +68,8 @@ def test_main(self, ipc_device, ipc_memory_resource):
6868
log("releasing stream1")
6969
latch.release()
7070
process.join(timeout=CHILD_TIMEOUT_SEC)
71+
survivors = kill_subprocesses(process)
72+
assert not survivors, "child did not exit within timeout"
7173
assert process.exitcode == 0
7274
log("done")
7375

@@ -163,6 +165,8 @@ def test_main(self, ipc_device, blocking_sync, use_options_cls, use_option_kw):
163165
assert props[5] is None
164166

165167
process.join(timeout=CHILD_TIMEOUT_SEC)
168+
survivors = kill_subprocesses(process)
169+
assert not survivors, "child did not exit within timeout"
166170
assert process.exitcode == 0
167171

168172
def child_main(self, q_in, q_out):

cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
import multiprocessing as mp
1414

1515
import pytest
16-
from helpers.child_processes import child_timeout_sec
16+
from helpers.child_processes import child_timeout_sec, kill_subprocesses
1717
from helpers.logging import TimestampedLogger
1818

1919
from cuda.core import Buffer, Device
@@ -85,6 +85,8 @@ def test_main(self, ipc_device, ipc_memory_resource):
8585

8686
log("waiting for child")
8787
process.join(timeout=CHILD_TIMEOUT_SEC)
88+
survivors = kill_subprocesses(process)
8889
log(f"child exit code: {process.exitcode}")
90+
assert not survivors, "child did not exit within timeout"
8991
assert process.exitcode == 0, f"Child process failed with exit code {process.exitcode}"
9092
log("done")

cuda_core/tests/memory_ipc/test_leaks.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
else:
1414
HAVE_PSUTIL = True
1515
import pytest
16-
from helpers.child_processes import child_timeout_sec
16+
from helpers.child_processes import child_timeout_sec, kill_subprocesses
1717

1818
CHILD_TIMEOUT_SEC = child_timeout_sec()
1919
NBYTES = 64
@@ -39,6 +39,8 @@ def exec_success(obj, number=1):
3939
process = mp.Process(target=child_main, args=(obj,))
4040
process.start()
4141
process.join(timeout=CHILD_TIMEOUT_SEC)
42+
survivors = kill_subprocesses(process)
43+
assert not survivors, "child did not exit within timeout"
4244
assert process.exitcode == 0
4345

4446

@@ -55,6 +57,8 @@ def exec_launch_failure(obj, number=1):
5557
process = mp.Process(target=child_main_bad, args=(obj,))
5658
process.start()
5759
process.join(timeout=CHILD_TIMEOUT_SEC)
60+
survivors = kill_subprocesses(process)
61+
assert not survivors, "child did not exit within timeout"
5862
assert process.exitcode != 0
5963

6064

@@ -138,5 +142,7 @@ def prime():
138142
process = mp.Process()
139143
process.start()
140144
process.join(timeout=CHILD_TIMEOUT_SEC)
145+
survivors = kill_subprocesses(process)
146+
assert not survivors, "child did not exit within timeout"
141147
assert process.exitcode == 0
142148
prime_was_run = True

cuda_core/tests/memory_ipc/test_memory_ipc.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
import pytest
77
from helpers.buffers import PatternGen
8-
from helpers.child_processes import child_timeout_sec
8+
from helpers.child_processes import child_timeout_sec, kill_subprocesses
99

1010
from cuda.core import Buffer, DeviceMemoryResource
1111

@@ -40,6 +40,8 @@ def test_main(self, ipc_device, ipc_memory_resource):
4040

4141
# Wait for the child process.
4242
process.join(timeout=CHILD_TIMEOUT_SEC)
43+
survivors = kill_subprocesses(process)
44+
assert not survivors, "child did not exit within timeout"
4345
assert process.exitcode == 0
4446

4547
# Verify that the buffer was modified.
@@ -83,6 +85,8 @@ def test_main(self, ipc_device, ipc_memory_resource):
8385
# Wait for the child processes.
8486
p1.join(timeout=CHILD_TIMEOUT_SEC)
8587
p2.join(timeout=CHILD_TIMEOUT_SEC)
88+
survivors = kill_subprocesses(p1, p2)
89+
assert not survivors, f"timed out waiting on: {[p.name for p in survivors]}"
8690
assert p1.exitcode == 0
8791
assert p2.exitcode == 0
8892

@@ -136,6 +140,8 @@ def test_main(self, ipc_device, ipc_memory_resource):
136140
# Wait for children.
137141
p1.join(timeout=CHILD_TIMEOUT_SEC)
138142
p2.join(timeout=CHILD_TIMEOUT_SEC)
143+
survivors = kill_subprocesses(p1, p2)
144+
assert not survivors, f"timed out waiting on: {[p.name for p in survivors]}"
139145
assert p1.exitcode == 0
140146
assert p2.exitcode == 0
141147

@@ -186,6 +192,8 @@ def test_main(self, ipc_device, ipc_memory_resource):
186192
# Wait for children.
187193
p1.join(timeout=CHILD_TIMEOUT_SEC)
188194
p2.join(timeout=CHILD_TIMEOUT_SEC)
195+
survivors = kill_subprocesses(p1, p2)
196+
assert not survivors, f"timed out waiting on: {[p.name for p in survivors]}"
189197
assert p1.exitcode == 0
190198
assert p2.exitcode == 0
191199

cuda_core/tests/memory_ipc/test_peer_access.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
import pytest
77
from helpers.buffers import PatternGen
8-
from helpers.child_processes import child_timeout_sec
8+
from helpers.child_processes import child_timeout_sec, kill_subprocesses
99

1010
from cuda.core import Device, DeviceMemoryResource, DeviceMemoryResourceOptions
1111
from cuda.core._utils.cuda_utils import CUDAError
@@ -36,6 +36,8 @@ def test_main(self, ipc_mempool_device_x2):
3636
process = mp.Process(target=self.child_main, args=(mr,))
3737
process.start()
3838
process.join(timeout=CHILD_TIMEOUT_SEC)
39+
survivors = kill_subprocesses(process)
40+
assert not survivors, "child did not exit within timeout"
3941
assert process.exitcode == 0
4042

4143
# Verify parent's MR still has peer access set (independent state)
@@ -82,6 +84,8 @@ def test_main(self, ipc_mempool_device_x2, grant_access_in_parent):
8284
process = mp.Process(target=self.child_main, args=(mr, buffer))
8385
process.start()
8486
process.join(timeout=CHILD_TIMEOUT_SEC)
87+
survivors = kill_subprocesses(process)
88+
assert not survivors, "child did not exit within timeout"
8589
assert process.exitcode == 0
8690

8791
buffer.close()

cuda_core/tests/memory_ipc/test_send_buffers.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
import pytest
88
from helpers.buffers import PatternGen
9-
from helpers.child_processes import child_timeout_sec
9+
from helpers.child_processes import child_timeout_sec, kill_subprocesses
1010

1111
from cuda.core import Device, DeviceMemoryResource, DeviceMemoryResourceOptions
1212

@@ -38,8 +38,11 @@ def test_main(self, ipc_device, nmrs):
3838
process = mp.Process(target=self.child_main, args=(device, buffers))
3939
process.start()
4040

41-
# Wait for the child process.
41+
# Wait for the child process, then kill any survivor so subsequent
42+
# asserts cannot block on a held IPC handle.
4243
process.join(timeout=CHILD_TIMEOUT_SEC)
44+
survivors = kill_subprocesses(process)
45+
assert not survivors, "child did not exit within timeout"
4346
assert process.exitcode == 0
4447

4548
# Verify that the buffers were modified.
@@ -96,11 +99,16 @@ def test_main(self, ipc_device, ipc_memory_resource):
9699
proc_b.start()
97100
proc_c.start()
98101

99-
# Wait for C to signal completion then clean up.
100-
event_c.wait(timeout=CHILD_TIMEOUT_SEC)
102+
# Wait for C to signal completion, then let B finish and join both.
103+
# Gather all state (event result + joins + survivor kills) before
104+
# asserting so cleanup happens regardless of which check fires.
105+
completed = event_c.wait(timeout=CHILD_TIMEOUT_SEC)
101106
event_b.set() # b can finish now
102107
proc_b.join(timeout=CHILD_TIMEOUT_SEC)
103108
proc_c.join(timeout=CHILD_TIMEOUT_SEC)
109+
survivors = kill_subprocesses(proc_b, proc_c)
110+
assert completed, "process C did not signal completion within timeout"
111+
assert not survivors, f"timed out waiting on: {[p.name for p in survivors]}"
104112
assert proc_b.exitcode == 0
105113
assert proc_c.exitcode == 0
106114

cuda_core/tests/memory_ipc/test_serialize.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
import pytest
99
from helpers.buffers import PatternGen
10-
from helpers.child_processes import child_timeout_sec
10+
from helpers.child_processes import child_timeout_sec, kill_subprocesses
1111

1212
from cuda.core import Buffer, Device, DeviceMemoryResource, PinnedMemoryResource
1313

@@ -47,6 +47,8 @@ def test_main(self, ipc_device, ipc_memory_resource):
4747

4848
# Wait for the child process.
4949
process.join(timeout=CHILD_TIMEOUT_SEC)
50+
survivors = kill_subprocesses(process)
51+
assert not survivors, "child did not exit within timeout"
5052
assert process.exitcode == 0
5153

5254
# Confirm buffers were modified.
@@ -104,6 +106,8 @@ def test_main(self, ipc_device, ipc_memory_resource):
104106

105107
# Wait for the child process.
106108
process.join(timeout=CHILD_TIMEOUT_SEC)
109+
survivors = kill_subprocesses(process)
110+
assert not survivors, "child did not exit within timeout"
107111
assert process.exitcode == 0
108112

109113
# Confirm buffer was modified.
@@ -152,6 +156,8 @@ def test_main(self, ipc_device, ipc_memory_resource):
152156
process = mp.Process(target=self.child_main, args=(alloc_handle, mr, buffer_desc, buffer))
153157
process.start()
154158
process.join(timeout=CHILD_TIMEOUT_SEC)
159+
survivors = kill_subprocesses(process)
160+
assert not survivors, "child did not exit within timeout"
155161
assert process.exitcode == 0
156162

157163
pgen.verify_buffer(buffer, seed=True)

0 commit comments

Comments
 (0)