From a026a0592d55496d9d2eb93c3501fd66e839a467 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 21 May 2026 10:27:04 -0700 Subject: [PATCH 1/6] tests: layered defense against IPC child-process hangs (#2004) Replace the per-test kill-on-timeout approach with three layered fixes that defend the IPC test suite against compute-sanitizer + CUDA driver / toolkit combinations that wedge child processes during IPC teardown. CI layer: ci/tools/setup-sanitizer now passes --target-processes=application-only so spawned multiprocessing.Process children run without the sanitizer attached. This eliminates the root cause of the hang reported in issue #2004 (compute-sanitizer's IPC teardown analysis getting stuck under Python 3.12 + CUDA 12.9.1). The parent pytest process is still fully sanitized. Fixture layer: ipc_device and ipc_mempool_device_x2 fixtures now wrap their yield in track_child_processes(), a context manager that patches multiprocessing.process.BaseProcess.__init__ to record every Process instance constructed during the test and kills any survivor at teardown. This protects ipc_memory_resource.mr.close() from blocking on IPC handles held by a stuck child, regardless of the original failure mode. Outer-guard layer: pytest-timeout is added to the test dep group, and tests/memory_ipc/conftest.py applies pytest.mark.timeout(300) to every test in the directory. This is the final fallback so no IPC test can wedge the GHA runner for hours if a new failure mode defeats the earlier layers. The hardcoded CHILD_TIMEOUT_SEC = 30 in every IPC test module is replaced with a call to helpers.child_processes.child_timeout_sec(), which returns 30 by default and 120 under compute-sanitizer (detected via the existing under_compute_sanitizer() helper). The unused CHILD_TIMEOUT_SEC declaration in test_workerpool.py is removed. 
--- ci/tools/setup-sanitizer | 11 ++- cuda_core/pyproject.toml | 2 +- cuda_core/tests/conftest.py | 26 +++++-- cuda_core/tests/helpers/child_processes.py | 69 +++++++++++++++++++ cuda_core/tests/memory_ipc/conftest.py | 29 ++++++++ cuda_core/tests/memory_ipc/test_errors.py | 3 +- cuda_core/tests/memory_ipc/test_event_ipc.py | 3 +- .../memory_ipc/test_ipc_duplicate_import.py | 3 +- cuda_core/tests/memory_ipc/test_leaks.py | 3 +- cuda_core/tests/memory_ipc/test_memory_ipc.py | 3 +- .../tests/memory_ipc/test_peer_access.py | 3 +- .../tests/memory_ipc/test_send_buffers.py | 3 +- cuda_core/tests/memory_ipc/test_serialize.py | 3 +- cuda_core/tests/memory_ipc/test_workerpool.py | 1 - 14 files changed, 146 insertions(+), 16 deletions(-) create mode 100644 cuda_core/tests/helpers/child_processes.py create mode 100644 cuda_core/tests/memory_ipc/conftest.py diff --git a/ci/tools/setup-sanitizer b/ci/tools/setup-sanitizer index e4904ca58ce..39a3d68d9e2 100755 --- a/ci/tools/setup-sanitizer +++ b/ci/tools/setup-sanitizer @@ -1,6 +1,6 @@ #!/usr/bin/env bash -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: Apache-2.0 @@ -12,7 +12,14 @@ set -euo pipefail if [[ "${SETUP_SANITIZER}" == 1 ]]; then COMPUTE_SANITIZER="${CUDA_HOME}/bin/compute-sanitizer" COMPUTE_SANITIZER_VERSION=$(${COMPUTE_SANITIZER} --version | grep -Eo "[0-9]{4}\.[0-9]\.[0-9]" | sed -e 's/\.//g') - SANITIZER_CMD="${COMPUTE_SANITIZER} --target-processes=all --launch-timeout=0 --tool=memcheck --error-exitcode=1 --report-api-errors=no" + # --target-processes=application-only: attach the sanitizer to the parent + # pytest process only. Spawned multiprocessing.Process children run without + # the sanitizer. 
This avoids a class of CI hangs where compute-sanitizer's + # IPC teardown analysis wedges a child on certain CUDA driver / toolkit + # combinations (see issue #2004). The parent process is still fully + # sanitized, which is where most of the interesting host-side IPC plumbing + # runs anyway. + SANITIZER_CMD="${COMPUTE_SANITIZER} --target-processes=application-only --launch-timeout=0 --tool=memcheck --error-exitcode=1 --report-api-errors=no" if [[ "$COMPUTE_SANITIZER_VERSION" -ge 202111 ]]; then SANITIZER_CMD="${SANITIZER_CMD} --padding=32" fi diff --git a/cuda_core/pyproject.toml b/cuda_core/pyproject.toml index d5a4b5fd29c..bb5a918d8b9 100644 --- a/cuda_core/pyproject.toml +++ b/cuda_core/pyproject.toml @@ -58,7 +58,7 @@ cu12 = ["cuda-bindings[all]==12.*"] cu13 = ["cuda-bindings[all]==13.*"] [dependency-groups] -test = ["cython>=3.2,<3.3", "setuptools", "pytest>=6.2.4", "pytest-benchmark", "pytest-randomly", "pytest-repeat", "pytest-rerunfailures", "cloudpickle", "psutil", "cffi"] +test = ["cython>=3.2,<3.3", "setuptools", "pytest>=6.2.4", "pytest-benchmark", "pytest-randomly", "pytest-repeat", "pytest-rerunfailures", "pytest-timeout", "cloudpickle", "psutil", "cffi"] ml-dtypes = ["ml-dtypes>=0.5.4,<0.6.0"] test-cu12 = [ {include-group = "ml-dtypes" }, {include-group = "test" }, "cupy-cuda12x; python_version < '3.14'", "cuda-toolkit[cudart]==12.*"] # runtime headers needed by CuPy test-cu13 = [ {include-group = "ml-dtypes" }, {include-group = "test" }, "cupy-cuda13x; python_version < '3.14'", "cuda-toolkit[cudart]==13.*"] # runtime headers needed by CuPy diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py index a9c028b2db0..19992015825 100644 --- a/cuda_core/tests/conftest.py +++ b/cuda_core/tests/conftest.py @@ -213,14 +213,24 @@ def pop_all_contexts(): @pytest.fixture def ipc_device(): - """Obtains a device suitable for IPC-enabled mempool tests, or skips.""" + """Obtains a device suitable for IPC-enabled mempool tests, or skips. 
+ + The fixture also tracks every ``multiprocessing.Process`` spawned during + the test and kills any survivors at teardown. This prevents a stuck child + (e.g., compute-sanitizer wedged during IPC teardown -- see issue #2004) + from blocking ``ipc_memory_resource``'s ``mr.close()`` for hours. + """ + from helpers.child_processes import track_child_processes + device = Device(0) device.set_current() if not device.properties.memory_pools_supported: pytest.skip("Device does not support mempool operations") - return _require_ipc_mempool_devices((device,))[0] + device = _require_ipc_mempool_devices((device,))[0] + with track_child_processes(): + yield device @pytest.fixture( @@ -291,8 +301,16 @@ def mempool_device_x3(): @pytest.fixture def ipc_mempool_device_x2(mempool_device_x2): - """Fixture that provides two IPC-capable mempool devices, or skips.""" - return _require_ipc_mempool_devices(mempool_device_x2) + """Fixture that provides two IPC-capable mempool devices, or skips. + + Also tracks/kills any leftover ``multiprocessing.Process`` children at + teardown for the same reasons documented on :func:`ipc_device`. + """ + from helpers.child_processes import track_child_processes + + devices = _require_ipc_mempool_devices(mempool_device_x2) + with track_child_processes(): + yield devices @pytest.fixture( diff --git a/cuda_core/tests/helpers/child_processes.py b/cuda_core/tests/helpers/child_processes.py new file mode 100644 index 00000000000..776ba0a9997 --- /dev/null +++ b/cuda_core/tests/helpers/child_processes.py @@ -0,0 +1,69 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Helpers for tests that spawn ``multiprocessing.Process`` children. 
+ +These exist primarily to defend IPC tests against a class of CI hang where a +child process gets stuck during teardown (e.g., compute-sanitizer's IPC +teardown analysis on certain CUDA driver / toolkit combinations -- see issue +#2004). Without intervention, a zombie child holds an IPC memory handle and +blocks the parent's ``mr.close()`` in fixture teardown, wedging the GHA runner +for hours. +""" + +import contextlib +import multiprocessing.process +import weakref + +from cuda_python_test_helpers import under_compute_sanitizer + +CHILD_TIMEOUT_SEC_DEFAULT = 30 +CHILD_TIMEOUT_SEC_SANITIZER = 120 + + +def child_timeout_sec() -> int: + """Return the per-process join/wait timeout for IPC-style tests. + + Compute-sanitizer significantly slows process startup and CUDA context + teardown, so we use a larger budget when it is active. + """ + return CHILD_TIMEOUT_SEC_SANITIZER if under_compute_sanitizer() else CHILD_TIMEOUT_SEC_DEFAULT + + +@contextlib.contextmanager +def track_child_processes(): + """Context manager that kills any ``multiprocessing.Process`` children still + alive at exit. + + Patches ``multiprocessing.process.BaseProcess.__init__`` to record every + ``Process`` instance constructed inside the ``with`` block. This covers + the delegating ``mp.Process`` class as well as direct ``SpawnProcess`` / + ``ForkProcess`` instances (including those created by ``mp.Pool``), since + all of them inherit from ``BaseProcess``. On exit, any tracked process + that is still alive is killed and joined. + + This protects fixture teardown (e.g. ``ipc_memory_resource``'s + ``mr.close()``) from blocking on IPC handles held by a stuck child -- + see issue #2004. 
+ """ + tracked = weakref.WeakSet() + base = multiprocessing.process.BaseProcess + original_init = base.__init__ + + def tracking_init(self, *args, **kwargs): + original_init(self, *args, **kwargs) + tracked.add(self) + + base.__init__ = tracking_init + try: + yield + finally: + base.__init__ = original_init + for proc in list(tracked): + # is_alive() / kill() raise ValueError if the Process was never + # started or has already been closed; nothing to clean up in that + # case. + with contextlib.suppress(ValueError): + if proc.is_alive(): + proc.kill() + proc.join() diff --git a/cuda_core/tests/memory_ipc/conftest.py b/cuda_core/tests/memory_ipc/conftest.py new file mode 100644 index 00000000000..535333108c3 --- /dev/null +++ b/cuda_core/tests/memory_ipc/conftest.py @@ -0,0 +1,29 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Per-directory conftest for memory IPC tests. + +Applies an outer-guard ``pytest.mark.timeout`` to every test in this directory. +Individual tests still drive their own per-process waits using +``child_timeout_sec()`` from ``helpers.child_processes``; this marker is the +final fallback so that no IPC test can wedge the CI runner for hours if some +new driver / sanitizer / IPC interaction defeats every other layer. +""" + +import pathlib + +import pytest + +_HERE = pathlib.Path(__file__).parent.resolve() +_TIMEOUT_SEC = 300 # 5 minutes per test; generous compared to child_timeout_sec(). 
+ + +def pytest_collection_modifyitems(config, items): + marker = pytest.mark.timeout(_TIMEOUT_SEC) + for item in items: + try: + item_path = pathlib.Path(str(item.fspath)).resolve() + except OSError: + continue + if _HERE in item_path.parents: + item.add_marker(marker) diff --git a/cuda_core/tests/memory_ipc/test_errors.py b/cuda_core/tests/memory_ipc/test_errors.py index d17e63dc90a..3ebd93d2b27 100644 --- a/cuda_core/tests/memory_ipc/test_errors.py +++ b/cuda_core/tests/memory_ipc/test_errors.py @@ -6,11 +6,12 @@ import re import pytest +from helpers.child_processes import child_timeout_sec from cuda.core import Buffer, Device, DeviceMemoryResource, DeviceMemoryResourceOptions from cuda.core._utils.cuda_utils import CUDAError -CHILD_TIMEOUT_SEC = 30 +CHILD_TIMEOUT_SEC = child_timeout_sec() NBYTES = 64 POOL_SIZE = 2097152 diff --git a/cuda_core/tests/memory_ipc/test_event_ipc.py b/cuda_core/tests/memory_ipc/test_event_ipc.py index e1bb45efcfb..1fe68b2ab17 100644 --- a/cuda_core/tests/memory_ipc/test_event_ipc.py +++ b/cuda_core/tests/memory_ipc/test_event_ipc.py @@ -5,13 +5,14 @@ import pytest from helpers.buffers import compare_equal_buffers, make_scratch_buffer +from helpers.child_processes import child_timeout_sec from helpers.latch import LatchKernel from helpers.logging import TimestampedLogger from cuda.core import Device, EventOptions ENABLE_LOGGING = False # Set True for test debugging and development -CHILD_TIMEOUT_SEC = 30 +CHILD_TIMEOUT_SEC = child_timeout_sec() NBYTES = 64 diff --git a/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py b/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py index f0c4951e8e6..0c211ea9dc9 100644 --- a/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py +++ b/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py @@ -13,11 +13,12 @@ import multiprocessing as mp import pytest +from helpers.child_processes import child_timeout_sec from helpers.logging import TimestampedLogger from cuda.core import Buffer, 
Device -CHILD_TIMEOUT_SEC = 30 +CHILD_TIMEOUT_SEC = child_timeout_sec() NBYTES = 64 POOL_SIZE = 2097152 diff --git a/cuda_core/tests/memory_ipc/test_leaks.py b/cuda_core/tests/memory_ipc/test_leaks.py index e2a7e8d096b..63282622a51 100644 --- a/cuda_core/tests/memory_ipc/test_leaks.py +++ b/cuda_core/tests/memory_ipc/test_leaks.py @@ -13,8 +13,9 @@ else: HAVE_PSUTIL = True import pytest +from helpers.child_processes import child_timeout_sec -CHILD_TIMEOUT_SEC = 30 +CHILD_TIMEOUT_SEC = child_timeout_sec() NBYTES = 64 USING_FDS = platform.system() == "Linux" diff --git a/cuda_core/tests/memory_ipc/test_memory_ipc.py b/cuda_core/tests/memory_ipc/test_memory_ipc.py index 54159d81130..92c08146e2a 100644 --- a/cuda_core/tests/memory_ipc/test_memory_ipc.py +++ b/cuda_core/tests/memory_ipc/test_memory_ipc.py @@ -5,10 +5,11 @@ import pytest from helpers.buffers import PatternGen +from helpers.child_processes import child_timeout_sec from cuda.core import Buffer, DeviceMemoryResource -CHILD_TIMEOUT_SEC = 30 +CHILD_TIMEOUT_SEC = child_timeout_sec() NBYTES = 64 NWORKERS = 2 NTASKS = 2 diff --git a/cuda_core/tests/memory_ipc/test_peer_access.py b/cuda_core/tests/memory_ipc/test_peer_access.py index 0c644c25ed4..2c09ad569a9 100644 --- a/cuda_core/tests/memory_ipc/test_peer_access.py +++ b/cuda_core/tests/memory_ipc/test_peer_access.py @@ -5,11 +5,12 @@ import pytest from helpers.buffers import PatternGen +from helpers.child_processes import child_timeout_sec from cuda.core import Device, DeviceMemoryResource, DeviceMemoryResourceOptions from cuda.core._utils.cuda_utils import CUDAError -CHILD_TIMEOUT_SEC = 30 +CHILD_TIMEOUT_SEC = child_timeout_sec() NBYTES = 64 POOL_SIZE = 2097152 diff --git a/cuda_core/tests/memory_ipc/test_send_buffers.py b/cuda_core/tests/memory_ipc/test_send_buffers.py index 041a8539da3..6a25196b89e 100644 --- a/cuda_core/tests/memory_ipc/test_send_buffers.py +++ b/cuda_core/tests/memory_ipc/test_send_buffers.py @@ -6,10 +6,11 @@ import pytest from 
helpers.buffers import PatternGen +from helpers.child_processes import child_timeout_sec from cuda.core import Device, DeviceMemoryResource, DeviceMemoryResourceOptions -CHILD_TIMEOUT_SEC = 30 +CHILD_TIMEOUT_SEC = child_timeout_sec() NBYTES = 64 NMRS = 3 NTASKS = 7 diff --git a/cuda_core/tests/memory_ipc/test_serialize.py b/cuda_core/tests/memory_ipc/test_serialize.py index 78d26387c8a..1979067d328 100644 --- a/cuda_core/tests/memory_ipc/test_serialize.py +++ b/cuda_core/tests/memory_ipc/test_serialize.py @@ -7,10 +7,11 @@ import pytest from helpers.buffers import PatternGen +from helpers.child_processes import child_timeout_sec from cuda.core import Buffer, Device, DeviceMemoryResource, PinnedMemoryResource -CHILD_TIMEOUT_SEC = 30 +CHILD_TIMEOUT_SEC = child_timeout_sec() NBYTES = 64 POOL_SIZE = 2097152 diff --git a/cuda_core/tests/memory_ipc/test_workerpool.py b/cuda_core/tests/memory_ipc/test_workerpool.py index 5d0c7b1a0f5..609fadbcf3e 100644 --- a/cuda_core/tests/memory_ipc/test_workerpool.py +++ b/cuda_core/tests/memory_ipc/test_workerpool.py @@ -10,7 +10,6 @@ from cuda.core import Buffer, Device, DeviceMemoryResource, DeviceMemoryResourceOptions -CHILD_TIMEOUT_SEC = 30 NBYTES = 64 NWORKERS = 2 NMRS = 3 From d65186ec40a77b746e76640a994723dcbc645f53 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 21 May 2026 11:05:49 -0700 Subject: [PATCH 2/6] tests: reframe IPC hang docs as test-side bug rather than CS bug Reword the inline comment in ci/tools/setup-sanitizer and the docstrings in helpers/child_processes.py and memory_ipc/conftest.py to make clear that the underlying problem is insufficient guards in the IPC tests when child processes spawn slowly (>30s under compute-sanitizer). The sanitizer change and the new helpers are mitigations / safety nets; the durable fix is making the tests handle slow children correctly. 
--- ci/tools/setup-sanitizer | 11 ++++++----- cuda_core/tests/helpers/child_processes.py | 9 ++++----- cuda_core/tests/memory_ipc/conftest.py | 4 ++-- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/ci/tools/setup-sanitizer b/ci/tools/setup-sanitizer index 39a3d68d9e2..cce78cedfa9 100755 --- a/ci/tools/setup-sanitizer +++ b/ci/tools/setup-sanitizer @@ -14,11 +14,12 @@ if [[ "${SETUP_SANITIZER}" == 1 ]]; then COMPUTE_SANITIZER_VERSION=$(${COMPUTE_SANITIZER} --version | grep -Eo "[0-9]{4}\.[0-9]\.[0-9]" | sed -e 's/\.//g') # --target-processes=application-only: attach the sanitizer to the parent # pytest process only. Spawned multiprocessing.Process children run without - # the sanitizer. This avoids a class of CI hangs where compute-sanitizer's - # IPC teardown analysis wedges a child on certain CUDA driver / toolkit - # combinations (see issue #2004). The parent process is still fully - # sanitized, which is where most of the interesting host-side IPC plumbing - # runs anyway. + # the sanitizer. This aims to mitigate a class of CI hangs where child + # processes take an extreme amount of time to spawn (>30 seconds). Test bugs + # triggered by that specific condition are typically uncovered only in CI, + # where they become emergencies and are difficult to debug. The parent + # process is still fully sanitized, which is where most of the interesting + # host-side IPC plumbing runs anyway. 
SANITIZER_CMD="${COMPUTE_SANITIZER} --target-processes=application-only --launch-timeout=0 --tool=memcheck --error-exitcode=1 --report-api-errors=no" if [[ "$COMPUTE_SANITIZER_VERSION" -ge 202111 ]]; then SANITIZER_CMD="${SANITIZER_CMD} --padding=32" diff --git a/cuda_core/tests/helpers/child_processes.py b/cuda_core/tests/helpers/child_processes.py index 776ba0a9997..b12c18a21ca 100644 --- a/cuda_core/tests/helpers/child_processes.py +++ b/cuda_core/tests/helpers/child_processes.py @@ -4,11 +4,10 @@ """Helpers for tests that spawn ``multiprocessing.Process`` children. These exist primarily to defend IPC tests against a class of CI hang where a -child process gets stuck during teardown (e.g., compute-sanitizer's IPC -teardown analysis on certain CUDA driver / toolkit combinations -- see issue -#2004). Without intervention, a zombie child holds an IPC memory handle and -blocks the parent's ``mr.close()`` in fixture teardown, wedging the GHA runner -for hours. +child process spawns too slowly and the parent does not implement proper guards +for that (see issue #2004). Without intervention, a zombie child holds an IPC +memory handle and blocks the parent's ``mr.close()`` in fixture teardown, +leading to deadlock and wedging the test runner for hours. """ import contextlib diff --git a/cuda_core/tests/memory_ipc/conftest.py b/cuda_core/tests/memory_ipc/conftest.py index 535333108c3..1de08789e7d 100644 --- a/cuda_core/tests/memory_ipc/conftest.py +++ b/cuda_core/tests/memory_ipc/conftest.py @@ -6,8 +6,8 @@ Applies an outer-guard ``pytest.mark.timeout`` to every test in this directory. Individual tests still drive their own per-process waits using ``child_timeout_sec()`` from ``helpers.child_processes``; this marker is the -final fallback so that no IPC test can wedge the CI runner for hours if some -new driver / sanitizer / IPC interaction defeats every other layer. +final fallback so that no IPC test can wedge the CI runner for hours if +deadlock occurs. 
 """ import pathlib From d1dbdfff121e2359fdf6b03079796152d0c4059f Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 21 May 2026 11:10:28 -0700 Subject: [PATCH 3/6] tests: add kill_subprocesses helper and apply to IPC tests Make every IPC test responsible for its own child-process cleanup, rather than leaning on the fixture-level tracker as the primary mechanism. helpers.child_processes.kill_subprocesses(*processes) returns the list of processes that were still alive when called and kills them. Tests pair this with an "assert not survivors" check, so a timeout produces a clean failure message ("timed out waiting on: ['Process-3']") instead of "assert None == 0", and the held IPC handles are released before any further test code runs. Every join+exitcode pattern in tests/memory_ipc/ is converted to the new shape: process.join(timeout=CHILD_TIMEOUT_SEC) survivors = kill_subprocesses(process) assert not survivors, "child did not exit within timeout" assert process.exitcode == 0 For test_send_buffers.py:TestIpcReexport (the test from #2121), the event.wait return value is also captured and asserted on so a timeout there is reported explicitly rather than dropped on the floor. The fixture-level track_child_processes() context manager (entered by the ipc_device and ipc_mempool_device_x2 fixtures) remains as defense in depth for any future test that forgets the pattern. memory_ipc/test_errors.py adds a meta-test test_outer_timeout_marker_is_applied that verifies tests/memory_ipc/conftest.py is loaded and applies the pytest.mark.timeout(300) marker. This catches the "nested conftest silently not picked up" failure mode at test time with a clear error message. 
--- cuda_core/tests/helpers/child_processes.py | 35 +++++++++++++++++++ cuda_core/tests/memory_ipc/test_errors.py | 18 +++++++++- cuda_core/tests/memory_ipc/test_event_ipc.py | 6 +++- .../memory_ipc/test_ipc_duplicate_import.py | 4 ++- cuda_core/tests/memory_ipc/test_leaks.py | 8 ++++- cuda_core/tests/memory_ipc/test_memory_ipc.py | 10 +++++- .../tests/memory_ipc/test_peer_access.py | 6 +++- .../tests/memory_ipc/test_send_buffers.py | 16 ++++++--- cuda_core/tests/memory_ipc/test_serialize.py | 8 ++++- 9 files changed, 100 insertions(+), 11 deletions(-) diff --git a/cuda_core/tests/helpers/child_processes.py b/cuda_core/tests/helpers/child_processes.py index b12c18a21ca..ed9300c77bd 100644 --- a/cuda_core/tests/helpers/child_processes.py +++ b/cuda_core/tests/helpers/child_processes.py @@ -29,6 +29,41 @@ def child_timeout_sec() -> int: return CHILD_TIMEOUT_SEC_SANITIZER if under_compute_sanitizer() else CHILD_TIMEOUT_SEC_DEFAULT +def kill_subprocesses(*processes): + """Kill any of the given Process objects that are still alive. + + Returns the list of processes that were killed (i.e. that were still alive + when the call was made). Callers should ``assert not survivors`` to convert + a non-empty return value into a clean test failure, e.g.:: + + proc_a.join(timeout=CHILD_TIMEOUT_SEC) + proc_b.join(timeout=CHILD_TIMEOUT_SEC) + survivors = kill_subprocesses(proc_a, proc_b) + assert not survivors, f"timed out waiting on: {[p.name for p in survivors]}" + assert proc_a.exitcode == 0 + assert proc_b.exitcode == 0 + + Killing survivors before the subsequent asserts prevents a zombie child + from holding IPC handles past the test body and blocking fixture + teardown. + """ + killed = [] + for proc in processes: + try: + alive = proc.is_alive() + except (ValueError, AssertionError): + # is_alive() raises if the Process was never started or has + # already been closed; nothing to clean up. 
+ continue + if not alive: + continue + with contextlib.suppress(ValueError, AssertionError): + proc.kill() + proc.join() + killed.append(proc) + return killed + + @contextlib.contextmanager def track_child_processes(): """Context manager that kills any ``multiprocessing.Process`` children still diff --git a/cuda_core/tests/memory_ipc/test_errors.py b/cuda_core/tests/memory_ipc/test_errors.py index 3ebd93d2b27..406d7df5f32 100644 --- a/cuda_core/tests/memory_ipc/test_errors.py +++ b/cuda_core/tests/memory_ipc/test_errors.py @@ -6,7 +6,7 @@ import re import pytest -from helpers.child_processes import child_timeout_sec +from helpers.child_processes import child_timeout_sec, kill_subprocesses from cuda.core import Buffer, Device, DeviceMemoryResource, DeviceMemoryResourceOptions from cuda.core._utils.cuda_utils import CUDAError @@ -16,6 +16,20 @@ POOL_SIZE = 2097152 +def test_outer_timeout_marker_is_applied(request): + """Verify that memory_ipc/conftest.py applies the outer pytest-timeout marker. + + If this test fails, the per-directory conftest is not being loaded, or its + pytest_collection_modifyitems hook is not adding the marker. Without this + marker, the only thing protecting the GHA runner from a wedged IPC test is + the in-test cleanup -- which we want to keep as defense in depth, not as + the sole guard. + """ + marker = request.node.get_closest_marker("timeout") + assert marker is not None, "memory_ipc/conftest.py did not apply a timeout marker" + assert marker.args == (300,), f"unexpected timeout value: {marker.args!r}" + + class ChildErrorHarness: """Test harness for checking errors in child processes. Subclasses override PARENT_ACTION, CHILD_ACTION, and ASSERT (see below for examples).""" @@ -44,6 +58,8 @@ def test_main(self, ipc_device, ipc_memory_resource): # Wait for the child process. 
process.join(timeout=CHILD_TIMEOUT_SEC) + survivors = kill_subprocesses(process) + assert not survivors, "child did not exit within timeout" assert process.exitcode == 0 finally: for mr in self._extra_mrs: diff --git a/cuda_core/tests/memory_ipc/test_event_ipc.py b/cuda_core/tests/memory_ipc/test_event_ipc.py index 1fe68b2ab17..48985e67b58 100644 --- a/cuda_core/tests/memory_ipc/test_event_ipc.py +++ b/cuda_core/tests/memory_ipc/test_event_ipc.py @@ -5,7 +5,7 @@ import pytest from helpers.buffers import compare_equal_buffers, make_scratch_buffer -from helpers.child_processes import child_timeout_sec +from helpers.child_processes import child_timeout_sec, kill_subprocesses from helpers.latch import LatchKernel from helpers.logging import TimestampedLogger @@ -68,6 +68,8 @@ def test_main(self, ipc_device, ipc_memory_resource): log("releasing stream1") latch.release() process.join(timeout=CHILD_TIMEOUT_SEC) + survivors = kill_subprocesses(process) + assert not survivors, "child did not exit within timeout" assert process.exitcode == 0 log("done") @@ -163,6 +165,8 @@ def test_main(self, ipc_device, blocking_sync, use_options_cls, use_option_kw): assert props[5] is None process.join(timeout=CHILD_TIMEOUT_SEC) + survivors = kill_subprocesses(process) + assert not survivors, "child did not exit within timeout" assert process.exitcode == 0 def child_main(self, q_in, q_out): diff --git a/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py b/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py index 0c211ea9dc9..8d450fa8e3f 100644 --- a/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py +++ b/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py @@ -13,7 +13,7 @@ import multiprocessing as mp import pytest -from helpers.child_processes import child_timeout_sec +from helpers.child_processes import child_timeout_sec, kill_subprocesses from helpers.logging import TimestampedLogger from cuda.core import Buffer, Device @@ -85,6 +85,8 @@ def test_main(self, 
ipc_device, ipc_memory_resource): log("waiting for child") process.join(timeout=CHILD_TIMEOUT_SEC) + survivors = kill_subprocesses(process) log(f"child exit code: {process.exitcode}") + assert not survivors, "child did not exit within timeout" assert process.exitcode == 0, f"Child process failed with exit code {process.exitcode}" log("done") diff --git a/cuda_core/tests/memory_ipc/test_leaks.py b/cuda_core/tests/memory_ipc/test_leaks.py index 63282622a51..0d45bd61afa 100644 --- a/cuda_core/tests/memory_ipc/test_leaks.py +++ b/cuda_core/tests/memory_ipc/test_leaks.py @@ -13,7 +13,7 @@ else: HAVE_PSUTIL = True import pytest -from helpers.child_processes import child_timeout_sec +from helpers.child_processes import child_timeout_sec, kill_subprocesses CHILD_TIMEOUT_SEC = child_timeout_sec() NBYTES = 64 @@ -39,6 +39,8 @@ def exec_success(obj, number=1): process = mp.Process(target=child_main, args=(obj,)) process.start() process.join(timeout=CHILD_TIMEOUT_SEC) + survivors = kill_subprocesses(process) + assert not survivors, "child did not exit within timeout" assert process.exitcode == 0 @@ -55,6 +57,8 @@ def exec_launch_failure(obj, number=1): process = mp.Process(target=child_main_bad, args=(obj,)) process.start() process.join(timeout=CHILD_TIMEOUT_SEC) + survivors = kill_subprocesses(process) + assert not survivors, "child did not exit within timeout" assert process.exitcode != 0 @@ -138,5 +142,7 @@ def prime(): process = mp.Process() process.start() process.join(timeout=CHILD_TIMEOUT_SEC) + survivors = kill_subprocesses(process) + assert not survivors, "child did not exit within timeout" assert process.exitcode == 0 prime_was_run = True diff --git a/cuda_core/tests/memory_ipc/test_memory_ipc.py b/cuda_core/tests/memory_ipc/test_memory_ipc.py index 92c08146e2a..0923fe28d8b 100644 --- a/cuda_core/tests/memory_ipc/test_memory_ipc.py +++ b/cuda_core/tests/memory_ipc/test_memory_ipc.py @@ -5,7 +5,7 @@ import pytest from helpers.buffers import PatternGen -from 
helpers.child_processes import child_timeout_sec +from helpers.child_processes import child_timeout_sec, kill_subprocesses from cuda.core import Buffer, DeviceMemoryResource @@ -40,6 +40,8 @@ def test_main(self, ipc_device, ipc_memory_resource): # Wait for the child process. process.join(timeout=CHILD_TIMEOUT_SEC) + survivors = kill_subprocesses(process) + assert not survivors, "child did not exit within timeout" assert process.exitcode == 0 # Verify that the buffer was modified. @@ -83,6 +85,8 @@ def test_main(self, ipc_device, ipc_memory_resource): # Wait for the child processes. p1.join(timeout=CHILD_TIMEOUT_SEC) p2.join(timeout=CHILD_TIMEOUT_SEC) + survivors = kill_subprocesses(p1, p2) + assert not survivors, f"timed out waiting on: {[p.name for p in survivors]}" assert p1.exitcode == 0 assert p2.exitcode == 0 @@ -136,6 +140,8 @@ def test_main(self, ipc_device, ipc_memory_resource): # Wait for children. p1.join(timeout=CHILD_TIMEOUT_SEC) p2.join(timeout=CHILD_TIMEOUT_SEC) + survivors = kill_subprocesses(p1, p2) + assert not survivors, f"timed out waiting on: {[p.name for p in survivors]}" assert p1.exitcode == 0 assert p2.exitcode == 0 @@ -186,6 +192,8 @@ def test_main(self, ipc_device, ipc_memory_resource): # Wait for children. 
p1.join(timeout=CHILD_TIMEOUT_SEC) p2.join(timeout=CHILD_TIMEOUT_SEC) + survivors = kill_subprocesses(p1, p2) + assert not survivors, f"timed out waiting on: {[p.name for p in survivors]}" assert p1.exitcode == 0 assert p2.exitcode == 0 diff --git a/cuda_core/tests/memory_ipc/test_peer_access.py b/cuda_core/tests/memory_ipc/test_peer_access.py index 2c09ad569a9..9e9e2879ae7 100644 --- a/cuda_core/tests/memory_ipc/test_peer_access.py +++ b/cuda_core/tests/memory_ipc/test_peer_access.py @@ -5,7 +5,7 @@ import pytest from helpers.buffers import PatternGen -from helpers.child_processes import child_timeout_sec +from helpers.child_processes import child_timeout_sec, kill_subprocesses from cuda.core import Device, DeviceMemoryResource, DeviceMemoryResourceOptions from cuda.core._utils.cuda_utils import CUDAError @@ -36,6 +36,8 @@ def test_main(self, ipc_mempool_device_x2): process = mp.Process(target=self.child_main, args=(mr,)) process.start() process.join(timeout=CHILD_TIMEOUT_SEC) + survivors = kill_subprocesses(process) + assert not survivors, "child did not exit within timeout" assert process.exitcode == 0 # Verify parent's MR still has peer access set (independent state) @@ -82,6 +84,8 @@ def test_main(self, ipc_mempool_device_x2, grant_access_in_parent): process = mp.Process(target=self.child_main, args=(mr, buffer)) process.start() process.join(timeout=CHILD_TIMEOUT_SEC) + survivors = kill_subprocesses(process) + assert not survivors, "child did not exit within timeout" assert process.exitcode == 0 buffer.close() diff --git a/cuda_core/tests/memory_ipc/test_send_buffers.py b/cuda_core/tests/memory_ipc/test_send_buffers.py index 6a25196b89e..cc7f45d67c2 100644 --- a/cuda_core/tests/memory_ipc/test_send_buffers.py +++ b/cuda_core/tests/memory_ipc/test_send_buffers.py @@ -6,7 +6,7 @@ import pytest from helpers.buffers import PatternGen -from helpers.child_processes import child_timeout_sec +from helpers.child_processes import child_timeout_sec, kill_subprocesses 
from cuda.core import Device, DeviceMemoryResource, DeviceMemoryResourceOptions @@ -38,8 +38,11 @@ def test_main(self, ipc_device, nmrs): process = mp.Process(target=self.child_main, args=(device, buffers)) process.start() - # Wait for the child process. + # Wait for the child process, then kill any survivor so subsequent + # asserts cannot block on a held IPC handle. process.join(timeout=CHILD_TIMEOUT_SEC) + survivors = kill_subprocesses(process) + assert not survivors, "child did not exit within timeout" assert process.exitcode == 0 # Verify that the buffers were modified. @@ -96,11 +99,16 @@ def test_main(self, ipc_device, ipc_memory_resource): proc_b.start() proc_c.start() - # Wait for C to signal completion then clean up. - event_c.wait(timeout=CHILD_TIMEOUT_SEC) + # Wait for C to signal completion, then let B finish and join both. + # Gather all state (event result + joins + survivor kills) before + # asserting so cleanup happens regardless of which check fires. + completed = event_c.wait(timeout=CHILD_TIMEOUT_SEC) event_b.set() # b can finish now proc_b.join(timeout=CHILD_TIMEOUT_SEC) proc_c.join(timeout=CHILD_TIMEOUT_SEC) + survivors = kill_subprocesses(proc_b, proc_c) + assert completed, "process C did not signal completion within timeout" + assert not survivors, f"timed out waiting on: {[p.name for p in survivors]}" assert proc_b.exitcode == 0 assert proc_c.exitcode == 0 diff --git a/cuda_core/tests/memory_ipc/test_serialize.py b/cuda_core/tests/memory_ipc/test_serialize.py index 1979067d328..2f0e429b103 100644 --- a/cuda_core/tests/memory_ipc/test_serialize.py +++ b/cuda_core/tests/memory_ipc/test_serialize.py @@ -7,7 +7,7 @@ import pytest from helpers.buffers import PatternGen -from helpers.child_processes import child_timeout_sec +from helpers.child_processes import child_timeout_sec, kill_subprocesses from cuda.core import Buffer, Device, DeviceMemoryResource, PinnedMemoryResource @@ -47,6 +47,8 @@ def test_main(self, ipc_device, ipc_memory_resource): 
# Wait for the child process. process.join(timeout=CHILD_TIMEOUT_SEC) + survivors = kill_subprocesses(process) + assert not survivors, "child did not exit within timeout" assert process.exitcode == 0 # Confirm buffers were modified. @@ -104,6 +106,8 @@ def test_main(self, ipc_device, ipc_memory_resource): # Wait for the child process. process.join(timeout=CHILD_TIMEOUT_SEC) + survivors = kill_subprocesses(process) + assert not survivors, "child did not exit within timeout" assert process.exitcode == 0 # Confirm buffer was modified. @@ -152,6 +156,8 @@ def test_main(self, ipc_device, ipc_memory_resource): process = mp.Process(target=self.child_main, args=(alloc_handle, mr, buffer_desc, buffer)) process.start() process.join(timeout=CHILD_TIMEOUT_SEC) + survivors = kill_subprocesses(process) + assert not survivors, "child did not exit within timeout" assert process.exitcode == 0 pgen.verify_buffer(buffer, seed=True) From 63e17a96c283dcf7b5977121d29804320f6327c0 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 21 May 2026 11:23:06 -0700 Subject: [PATCH 4/6] tests: add memory_ipc/__init__.py to avoid conftest module collision Without __init__.py in tests/memory_ipc/, pytest imports tests/memory_ipc/conftest.py under module name `conftest`, which collides with the top-level tests/conftest.py. Tests under tests/ that use `from conftest import X` then resolve to the wrong file at import time, yielding ImportErrors like: ImportError: cannot import name 'skipif_need_cuda_headers' from 'conftest' (.../tests/memory_ipc/conftest.py) The fix mirrors tests/system/ (which already has both __init__.py and conftest.py): adding tests/memory_ipc/__init__.py makes memory_ipc a proper package, so its conftest is imported as memory_ipc.conftest and the top-level conftest stays under name `conftest`. 
--- cuda_core/tests/memory_ipc/__init__.py | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 cuda_core/tests/memory_ipc/__init__.py diff --git a/cuda_core/tests/memory_ipc/__init__.py b/cuda_core/tests/memory_ipc/__init__.py new file mode 100644 index 00000000000..27422b3cb7e --- /dev/null +++ b/cuda_core/tests/memory_ipc/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 From fe5b9144b3b87f35b4fd0673255e8e570d41a138 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 21 May 2026 11:33:14 -0700 Subject: [PATCH 5/6] tests: scale memory_ipc outer timeout with child_timeout_sec Replace the fixed 300 s pytest.mark.timeout with 3 * child_timeout_sec(). This is the minimum that lets the worst-case test in the suite (currently TestIpcReexport, with three sequential CHILD_TIMEOUT_SEC waits in the failure path) reach its own asserts before the outer guard fires. Resulting budgets: - Without compute-sanitizer: 90 s (down from 300 s). - Under compute-sanitizer: 360 s (up from 300 s -- the previous value would have killed TestIpcReexport mid-second-join before its "process C did not signal completion within timeout" assertion could produce a clean failure message). The meta-test test_outer_timeout_marker_is_applied now computes the expected timeout from the same formula so the two stay in sync. 
--- cuda_core/tests/memory_ipc/conftest.py | 14 ++++++++++++-- cuda_core/tests/memory_ipc/test_errors.py | 3 ++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/cuda_core/tests/memory_ipc/conftest.py b/cuda_core/tests/memory_ipc/conftest.py index 1de08789e7d..86781d40b9b 100644 --- a/cuda_core/tests/memory_ipc/conftest.py +++ b/cuda_core/tests/memory_ipc/conftest.py @@ -13,13 +13,23 @@ import pathlib import pytest +from helpers.child_processes import child_timeout_sec _HERE = pathlib.Path(__file__).parent.resolve() -_TIMEOUT_SEC = 300 # 5 minutes per test; generous compared to child_timeout_sec(). + + +def _outer_timeout_sec() -> int: + # The worst-case IPC test has three sequential CHILD_TIMEOUT_SEC waits in + # the failure path (e.g. TestIpcReexport: event_c.wait, proc_b.join, + # proc_c.join). Scaling by 3 lets such a test reach its own asserts before + # the outer guard fires, while still cutting the budget by half on + # non-sanitizer runs (90 s vs the previous 300 s) and scaling up under + # compute-sanitizer (360 s). + return 3 * child_timeout_sec() def pytest_collection_modifyitems(config, items): - marker = pytest.mark.timeout(_TIMEOUT_SEC) + marker = pytest.mark.timeout(_outer_timeout_sec()) for item in items: try: item_path = pathlib.Path(str(item.fspath)).resolve() diff --git a/cuda_core/tests/memory_ipc/test_errors.py b/cuda_core/tests/memory_ipc/test_errors.py index 406d7df5f32..799082a2622 100644 --- a/cuda_core/tests/memory_ipc/test_errors.py +++ b/cuda_core/tests/memory_ipc/test_errors.py @@ -25,9 +25,10 @@ def test_outer_timeout_marker_is_applied(request): the in-test cleanup -- which we want to keep as defense in depth, not as the sole guard. 
""" + expected = 3 * child_timeout_sec() marker = request.node.get_closest_marker("timeout") assert marker is not None, "memory_ipc/conftest.py did not apply a timeout marker" - assert marker.args == (300,), f"unexpected timeout value: {marker.args!r}" + assert marker.args == (expected,), f"unexpected timeout value: {marker.args!r}" class ChildErrorHarness: From a476a2ca656e1a763cb9d72acbcfc5a6c52271b9 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 21 May 2026 11:41:38 -0700 Subject: [PATCH 6/6] tests: tighten memory_ipc outer timeout to CHILD_TIMEOUT_SEC + 30 The previous 3 * CHILD_TIMEOUT_SEC scaling assumed worst-case wall-clock where every sequential join/wait hits its full timeout. In practice the children run concurrently, so expected wall-clock is ~CHILD_TIMEOUT_SEC regardless of how many joins the test chains -- once a child is done its join returns immediately. Exceeding CHILD_TIMEOUT_SEC + slack already means something is genuinely stuck, in which case the outer guard firing is the right outcome; the autouse track_child_processes() context manager still cleans up survivors, and the per-test diagnostic message would not be more informative than "test exceeded its budget". New budgets: - Without compute-sanitizer: 60 s (was 90 s). - Under compute-sanitizer: 150 s (was 360 s). The meta-test computes its expected value from the same formula. --- cuda_core/tests/memory_ipc/conftest.py | 16 +++++++++------- cuda_core/tests/memory_ipc/test_errors.py | 2 +- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/cuda_core/tests/memory_ipc/conftest.py b/cuda_core/tests/memory_ipc/conftest.py index 86781d40b9b..a7c8286b4c5 100644 --- a/cuda_core/tests/memory_ipc/conftest.py +++ b/cuda_core/tests/memory_ipc/conftest.py @@ -19,13 +19,15 @@ def _outer_timeout_sec() -> int: - # The worst-case IPC test has three sequential CHILD_TIMEOUT_SEC waits in - # the failure path (e.g. TestIpcReexport: event_c.wait, proc_b.join, - # proc_c.join). 
Scaling by 3 lets such a test reach its own asserts before - # the outer guard fires, while still cutting the budget by half on - # non-sanitizer runs (90 s vs the previous 300 s) and scaling up under - # compute-sanitizer (360 s). - return 3 * child_timeout_sec() + # IPC tests spawn children that run concurrently, so expected wall-clock + # is ~CHILD_TIMEOUT_SEC regardless of how many subsequent join/wait + # timeouts the test chains together (each subsequent join returns + # immediately once its child is already done). Exceeding that already + # means something is genuinely stuck, at which point the outer guard + # firing is the right outcome -- the per-test asserts wouldn't add + # useful diagnostic value over "test exceeded its budget", and the + # autouse track_child_processes() context manager still cleans up. + return child_timeout_sec() + 30 def pytest_collection_modifyitems(config, items): diff --git a/cuda_core/tests/memory_ipc/test_errors.py b/cuda_core/tests/memory_ipc/test_errors.py index 799082a2622..42f34dd61c2 100644 --- a/cuda_core/tests/memory_ipc/test_errors.py +++ b/cuda_core/tests/memory_ipc/test_errors.py @@ -25,7 +25,7 @@ def test_outer_timeout_marker_is_applied(request): the in-test cleanup -- which we want to keep as defense in depth, not as the sole guard. """ - expected = 3 * child_timeout_sec() + expected = child_timeout_sec() + 30 marker = request.node.get_closest_marker("timeout") assert marker is not None, "memory_ipc/conftest.py did not apply a timeout marker" assert marker.args == (expected,), f"unexpected timeout value: {marker.args!r}"