diff --git a/ci/tools/setup-sanitizer b/ci/tools/setup-sanitizer index e4904ca58ce..cce78cedfa9 100755 --- a/ci/tools/setup-sanitizer +++ b/ci/tools/setup-sanitizer @@ -1,6 +1,6 @@ #!/usr/bin/env bash -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: Apache-2.0 @@ -12,7 +12,15 @@ set -euo pipefail if [[ "${SETUP_SANITIZER}" == 1 ]]; then COMPUTE_SANITIZER="${CUDA_HOME}/bin/compute-sanitizer" COMPUTE_SANITIZER_VERSION=$(${COMPUTE_SANITIZER} --version | grep -Eo "[0-9]{4}\.[0-9]\.[0-9]" | sed -e 's/\.//g') - SANITIZER_CMD="${COMPUTE_SANITIZER} --target-processes=all --launch-timeout=0 --tool=memcheck --error-exitcode=1 --report-api-errors=no" + # --target-processes=application-only: attach the sanitizer to the parent + # pytest process only. Spawned multiprocessing.Process children run without + # the sanitizer. This aims to mitigate a class of CI hangs where child + # processes take an extreme amount of time to spawn (>30 seconds). Test bugs + # triggered by that specific condition are typically uncovered only in CI, + # where they become emergencies and are difficult to debug. The parent + # process is still fully sanitized, which is where most of the interesting + # host-side IPC plumbing runs anyway. 
+ SANITIZER_CMD="${COMPUTE_SANITIZER} --target-processes=application-only --launch-timeout=0 --tool=memcheck --error-exitcode=1 --report-api-errors=no" if [[ "$COMPUTE_SANITIZER_VERSION" -ge 202111 ]]; then SANITIZER_CMD="${SANITIZER_CMD} --padding=32" fi diff --git a/cuda_core/pyproject.toml b/cuda_core/pyproject.toml index d5a4b5fd29c..bb5a918d8b9 100644 --- a/cuda_core/pyproject.toml +++ b/cuda_core/pyproject.toml @@ -58,7 +58,7 @@ cu12 = ["cuda-bindings[all]==12.*"] cu13 = ["cuda-bindings[all]==13.*"] [dependency-groups] -test = ["cython>=3.2,<3.3", "setuptools", "pytest>=6.2.4", "pytest-benchmark", "pytest-randomly", "pytest-repeat", "pytest-rerunfailures", "cloudpickle", "psutil", "cffi"] +test = ["cython>=3.2,<3.3", "setuptools", "pytest>=6.2.4", "pytest-benchmark", "pytest-randomly", "pytest-repeat", "pytest-rerunfailures", "pytest-timeout", "cloudpickle", "psutil", "cffi"] ml-dtypes = ["ml-dtypes>=0.5.4,<0.6.0"] test-cu12 = [ {include-group = "ml-dtypes" }, {include-group = "test" }, "cupy-cuda12x; python_version < '3.14'", "cuda-toolkit[cudart]==12.*"] # runtime headers needed by CuPy test-cu13 = [ {include-group = "ml-dtypes" }, {include-group = "test" }, "cupy-cuda13x; python_version < '3.14'", "cuda-toolkit[cudart]==13.*"] # runtime headers needed by CuPy diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py index a9c028b2db0..19992015825 100644 --- a/cuda_core/tests/conftest.py +++ b/cuda_core/tests/conftest.py @@ -213,14 +213,24 @@ def pop_all_contexts(): @pytest.fixture def ipc_device(): - """Obtains a device suitable for IPC-enabled mempool tests, or skips.""" + """Obtains a device suitable for IPC-enabled mempool tests, or skips. + + The fixture also tracks every ``multiprocessing.Process`` spawned during + the test and kills any survivors at teardown. This prevents a stuck child + (e.g., compute-sanitizer wedged during IPC teardown -- see issue #2004) + from blocking ``ipc_memory_resource``'s ``mr.close()`` for hours. 
+ """ + from helpers.child_processes import track_child_processes + device = Device(0) device.set_current() if not device.properties.memory_pools_supported: pytest.skip("Device does not support mempool operations") - return _require_ipc_mempool_devices((device,))[0] + device = _require_ipc_mempool_devices((device,))[0] + with track_child_processes(): + yield device @pytest.fixture( @@ -291,8 +301,16 @@ def mempool_device_x3(): @pytest.fixture def ipc_mempool_device_x2(mempool_device_x2): - """Fixture that provides two IPC-capable mempool devices, or skips.""" - return _require_ipc_mempool_devices(mempool_device_x2) + """Fixture that provides two IPC-capable mempool devices, or skips. + + Also tracks/kills any leftover ``multiprocessing.Process`` children at + teardown for the same reasons documented on :func:`ipc_device`. + """ + from helpers.child_processes import track_child_processes + + devices = _require_ipc_mempool_devices(mempool_device_x2) + with track_child_processes(): + yield devices @pytest.fixture( diff --git a/cuda_core/tests/helpers/child_processes.py b/cuda_core/tests/helpers/child_processes.py new file mode 100644 index 00000000000..ed9300c77bd --- /dev/null +++ b/cuda_core/tests/helpers/child_processes.py @@ -0,0 +1,103 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Helpers for tests that spawn ``multiprocessing.Process`` children. + +These exist primarily to defend IPC tests against a class of CI hang where a +child process spawns too slowly and the parent does not implement proper guards +for that (see issue #2004). Without intervention, a zombie child holds an IPC +memory handle and blocks the parent's ``mr.close()`` in fixture teardown, +leading to deadlock and wedging the test runner for hours. 
+""" + +import contextlib +import multiprocessing.process +import weakref + +from cuda_python_test_helpers import under_compute_sanitizer + +CHILD_TIMEOUT_SEC_DEFAULT = 30 +CHILD_TIMEOUT_SEC_SANITIZER = 120 + + +def child_timeout_sec() -> int: + """Return the per-process join/wait timeout for IPC-style tests. + + Compute-sanitizer significantly slows process startup and CUDA context + teardown, so we use a larger budget when it is active. + """ + return CHILD_TIMEOUT_SEC_SANITIZER if under_compute_sanitizer() else CHILD_TIMEOUT_SEC_DEFAULT + + +def kill_subprocesses(*processes): + """Kill any of the given Process objects that are still alive. + + Returns the list of processes that were killed (i.e. that were still alive + when the call was made). Callers should ``assert not survivors`` to convert + a non-empty return value into a clean test failure, e.g.:: + + proc_a.join(timeout=CHILD_TIMEOUT_SEC) + proc_b.join(timeout=CHILD_TIMEOUT_SEC) + survivors = kill_subprocesses(proc_a, proc_b) + assert not survivors, f"timed out waiting on: {[p.name for p in survivors]}" + assert proc_a.exitcode == 0 + assert proc_b.exitcode == 0 + + Killing survivors before the subsequent asserts prevents a zombie child + from holding IPC handles past the test body and blocking fixture + teardown. + """ + killed = [] + for proc in processes: + try: + alive = proc.is_alive() + except (ValueError, AssertionError): + # is_alive() raises if the Process was never started or has + # already been closed; nothing to clean up. + continue + if not alive: + continue + with contextlib.suppress(ValueError, AssertionError): + proc.kill() + proc.join() + killed.append(proc) + return killed + + +@contextlib.contextmanager +def track_child_processes(): + """Context manager that kills any ``multiprocessing.Process`` children still + alive at exit. + + Patches ``multiprocessing.process.BaseProcess.__init__`` to record every + ``Process`` instance constructed inside the ``with`` block. 
This covers + the delegating ``mp.Process`` class as well as direct ``SpawnProcess`` / + ``ForkProcess`` instances (including those created by ``mp.Pool``), since + all of them inherit from ``BaseProcess``. On exit, any tracked process + that is still alive is killed and joined. + + This protects fixture teardown (e.g. ``ipc_memory_resource``'s + ``mr.close()``) from blocking on IPC handles held by a stuck child -- + see issue #2004. + """ + tracked = weakref.WeakSet() + base = multiprocessing.process.BaseProcess + original_init = base.__init__ + + def tracking_init(self, *args, **kwargs): + original_init(self, *args, **kwargs) + tracked.add(self) + + base.__init__ = tracking_init + try: + yield + finally: + base.__init__ = original_init + for proc in list(tracked): + # is_alive() / kill() raise ValueError once the Process has been + # closed (a never-started Process just reports not alive); nothing + # to clean up in that case. + with contextlib.suppress(ValueError): + if proc.is_alive(): + proc.kill() + proc.join() diff --git a/cuda_core/tests/memory_ipc/__init__.py b/cuda_core/tests/memory_ipc/__init__.py new file mode 100644 index 00000000000..27422b3cb7e --- /dev/null +++ b/cuda_core/tests/memory_ipc/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 diff --git a/cuda_core/tests/memory_ipc/conftest.py b/cuda_core/tests/memory_ipc/conftest.py new file mode 100644 index 00000000000..a7c8286b4c5 --- /dev/null +++ b/cuda_core/tests/memory_ipc/conftest.py @@ -0,0 +1,41 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Per-directory conftest for memory IPC tests. + +Applies an outer-guard ``pytest.mark.timeout`` to every test in this directory. 
+Individual tests still drive their own per-process waits using +``child_timeout_sec()`` from ``helpers.child_processes``; this marker is the +final fallback so that no IPC test can wedge the CI runner for hours if +deadlock occurs. +""" + +import pathlib + +import pytest +from helpers.child_processes import child_timeout_sec + +_HERE = pathlib.Path(__file__).parent.resolve() + + +def _outer_timeout_sec() -> int: + # IPC tests spawn children that run concurrently, so expected wall-clock + # is ~CHILD_TIMEOUT_SEC regardless of how many subsequent join/wait + # timeouts the test chains together (each subsequent join returns + # immediately once its child is already done). Exceeding that already + # means something is genuinely stuck, at which point the outer guard + # firing is the right outcome -- the per-test asserts wouldn't add + # useful diagnostic value over "test exceeded its budget", and the + # track_child_processes() wrapper in the IPC device fixtures still cleans up. + return child_timeout_sec() + 30 + + +def pytest_collection_modifyitems(config, items): + marker = pytest.mark.timeout(_outer_timeout_sec()) + for item in items: + try: + item_path = pathlib.Path(str(item.fspath)).resolve() + except OSError: + continue + if _HERE in item_path.parents: + item.add_marker(marker) diff --git a/cuda_core/tests/memory_ipc/test_errors.py b/cuda_core/tests/memory_ipc/test_errors.py index d17e63dc90a..42f34dd61c2 100644 --- a/cuda_core/tests/memory_ipc/test_errors.py +++ b/cuda_core/tests/memory_ipc/test_errors.py @@ -6,15 +6,31 @@ import re import pytest +from helpers.child_processes import child_timeout_sec, kill_subprocesses from cuda.core import Buffer, Device, DeviceMemoryResource, DeviceMemoryResourceOptions from cuda.core._utils.cuda_utils import CUDAError -CHILD_TIMEOUT_SEC = 30 +CHILD_TIMEOUT_SEC = child_timeout_sec() NBYTES = 64 POOL_SIZE = 2097152 +def test_outer_timeout_marker_is_applied(request): + """Verify that memory_ipc/conftest.py applies the outer 
pytest-timeout marker. + + If this test fails, the per-directory conftest is not being loaded, or its + pytest_collection_modifyitems hook is not adding the marker. Without this + marker, the only thing protecting the GHA runner from a wedged IPC test is + the in-test cleanup -- which we want to keep as defense in depth, not as + the sole guard. + """ + expected = child_timeout_sec() + 30 + marker = request.node.get_closest_marker("timeout") + assert marker is not None, "memory_ipc/conftest.py did not apply a timeout marker" + assert marker.args == (expected,), f"unexpected timeout value: {marker.args!r}" + + class ChildErrorHarness: """Test harness for checking errors in child processes. Subclasses override PARENT_ACTION, CHILD_ACTION, and ASSERT (see below for examples).""" @@ -43,6 +59,8 @@ def test_main(self, ipc_device, ipc_memory_resource): # Wait for the child process. process.join(timeout=CHILD_TIMEOUT_SEC) + survivors = kill_subprocesses(process) + assert not survivors, "child did not exit within timeout" assert process.exitcode == 0 finally: for mr in self._extra_mrs: diff --git a/cuda_core/tests/memory_ipc/test_event_ipc.py b/cuda_core/tests/memory_ipc/test_event_ipc.py index e1bb45efcfb..48985e67b58 100644 --- a/cuda_core/tests/memory_ipc/test_event_ipc.py +++ b/cuda_core/tests/memory_ipc/test_event_ipc.py @@ -5,13 +5,14 @@ import pytest from helpers.buffers import compare_equal_buffers, make_scratch_buffer +from helpers.child_processes import child_timeout_sec, kill_subprocesses from helpers.latch import LatchKernel from helpers.logging import TimestampedLogger from cuda.core import Device, EventOptions ENABLE_LOGGING = False # Set True for test debugging and development -CHILD_TIMEOUT_SEC = 30 +CHILD_TIMEOUT_SEC = child_timeout_sec() NBYTES = 64 @@ -67,6 +68,8 @@ def test_main(self, ipc_device, ipc_memory_resource): log("releasing stream1") latch.release() process.join(timeout=CHILD_TIMEOUT_SEC) + survivors = kill_subprocesses(process) + assert not 
survivors, "child did not exit within timeout" assert process.exitcode == 0 log("done") @@ -162,6 +165,8 @@ def test_main(self, ipc_device, blocking_sync, use_options_cls, use_option_kw): assert props[5] is None process.join(timeout=CHILD_TIMEOUT_SEC) + survivors = kill_subprocesses(process) + assert not survivors, "child did not exit within timeout" assert process.exitcode == 0 def child_main(self, q_in, q_out): diff --git a/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py b/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py index f0c4951e8e6..8d450fa8e3f 100644 --- a/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py +++ b/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py @@ -13,11 +13,12 @@ import multiprocessing as mp import pytest +from helpers.child_processes import child_timeout_sec, kill_subprocesses from helpers.logging import TimestampedLogger from cuda.core import Buffer, Device -CHILD_TIMEOUT_SEC = 30 +CHILD_TIMEOUT_SEC = child_timeout_sec() NBYTES = 64 POOL_SIZE = 2097152 @@ -84,6 +85,8 @@ def test_main(self, ipc_device, ipc_memory_resource): log("waiting for child") process.join(timeout=CHILD_TIMEOUT_SEC) + survivors = kill_subprocesses(process) log(f"child exit code: {process.exitcode}") + assert not survivors, "child did not exit within timeout" assert process.exitcode == 0, f"Child process failed with exit code {process.exitcode}" log("done") diff --git a/cuda_core/tests/memory_ipc/test_leaks.py b/cuda_core/tests/memory_ipc/test_leaks.py index e2a7e8d096b..0d45bd61afa 100644 --- a/cuda_core/tests/memory_ipc/test_leaks.py +++ b/cuda_core/tests/memory_ipc/test_leaks.py @@ -13,8 +13,9 @@ else: HAVE_PSUTIL = True import pytest +from helpers.child_processes import child_timeout_sec, kill_subprocesses -CHILD_TIMEOUT_SEC = 30 +CHILD_TIMEOUT_SEC = child_timeout_sec() NBYTES = 64 USING_FDS = platform.system() == "Linux" @@ -38,6 +39,8 @@ def exec_success(obj, number=1): process = mp.Process(target=child_main, args=(obj,)) 
process.start() process.join(timeout=CHILD_TIMEOUT_SEC) + survivors = kill_subprocesses(process) + assert not survivors, "child did not exit within timeout" assert process.exitcode == 0 @@ -54,6 +57,8 @@ def exec_launch_failure(obj, number=1): process = mp.Process(target=child_main_bad, args=(obj,)) process.start() process.join(timeout=CHILD_TIMEOUT_SEC) + survivors = kill_subprocesses(process) + assert not survivors, "child did not exit within timeout" assert process.exitcode != 0 @@ -137,5 +142,7 @@ def prime(): process = mp.Process() process.start() process.join(timeout=CHILD_TIMEOUT_SEC) + survivors = kill_subprocesses(process) + assert not survivors, "child did not exit within timeout" assert process.exitcode == 0 prime_was_run = True diff --git a/cuda_core/tests/memory_ipc/test_memory_ipc.py b/cuda_core/tests/memory_ipc/test_memory_ipc.py index 54159d81130..0923fe28d8b 100644 --- a/cuda_core/tests/memory_ipc/test_memory_ipc.py +++ b/cuda_core/tests/memory_ipc/test_memory_ipc.py @@ -5,10 +5,11 @@ import pytest from helpers.buffers import PatternGen +from helpers.child_processes import child_timeout_sec, kill_subprocesses from cuda.core import Buffer, DeviceMemoryResource -CHILD_TIMEOUT_SEC = 30 +CHILD_TIMEOUT_SEC = child_timeout_sec() NBYTES = 64 NWORKERS = 2 NTASKS = 2 @@ -39,6 +40,8 @@ def test_main(self, ipc_device, ipc_memory_resource): # Wait for the child process. process.join(timeout=CHILD_TIMEOUT_SEC) + survivors = kill_subprocesses(process) + assert not survivors, "child did not exit within timeout" assert process.exitcode == 0 # Verify that the buffer was modified. @@ -82,6 +85,8 @@ def test_main(self, ipc_device, ipc_memory_resource): # Wait for the child processes. 
p1.join(timeout=CHILD_TIMEOUT_SEC) p2.join(timeout=CHILD_TIMEOUT_SEC) + survivors = kill_subprocesses(p1, p2) + assert not survivors, f"timed out waiting on: {[p.name for p in survivors]}" assert p1.exitcode == 0 assert p2.exitcode == 0 @@ -135,6 +140,8 @@ def test_main(self, ipc_device, ipc_memory_resource): # Wait for children. p1.join(timeout=CHILD_TIMEOUT_SEC) p2.join(timeout=CHILD_TIMEOUT_SEC) + survivors = kill_subprocesses(p1, p2) + assert not survivors, f"timed out waiting on: {[p.name for p in survivors]}" assert p1.exitcode == 0 assert p2.exitcode == 0 @@ -185,6 +192,8 @@ def test_main(self, ipc_device, ipc_memory_resource): # Wait for children. p1.join(timeout=CHILD_TIMEOUT_SEC) p2.join(timeout=CHILD_TIMEOUT_SEC) + survivors = kill_subprocesses(p1, p2) + assert not survivors, f"timed out waiting on: {[p.name for p in survivors]}" assert p1.exitcode == 0 assert p2.exitcode == 0 diff --git a/cuda_core/tests/memory_ipc/test_peer_access.py b/cuda_core/tests/memory_ipc/test_peer_access.py index 0c644c25ed4..9e9e2879ae7 100644 --- a/cuda_core/tests/memory_ipc/test_peer_access.py +++ b/cuda_core/tests/memory_ipc/test_peer_access.py @@ -5,11 +5,12 @@ import pytest from helpers.buffers import PatternGen +from helpers.child_processes import child_timeout_sec, kill_subprocesses from cuda.core import Device, DeviceMemoryResource, DeviceMemoryResourceOptions from cuda.core._utils.cuda_utils import CUDAError -CHILD_TIMEOUT_SEC = 30 +CHILD_TIMEOUT_SEC = child_timeout_sec() NBYTES = 64 POOL_SIZE = 2097152 @@ -35,6 +36,8 @@ def test_main(self, ipc_mempool_device_x2): process = mp.Process(target=self.child_main, args=(mr,)) process.start() process.join(timeout=CHILD_TIMEOUT_SEC) + survivors = kill_subprocesses(process) + assert not survivors, "child did not exit within timeout" assert process.exitcode == 0 # Verify parent's MR still has peer access set (independent state) @@ -81,6 +84,8 @@ def test_main(self, ipc_mempool_device_x2, grant_access_in_parent): process = 
mp.Process(target=self.child_main, args=(mr, buffer)) process.start() process.join(timeout=CHILD_TIMEOUT_SEC) + survivors = kill_subprocesses(process) + assert not survivors, "child did not exit within timeout" assert process.exitcode == 0 buffer.close() diff --git a/cuda_core/tests/memory_ipc/test_send_buffers.py b/cuda_core/tests/memory_ipc/test_send_buffers.py index 041a8539da3..cc7f45d67c2 100644 --- a/cuda_core/tests/memory_ipc/test_send_buffers.py +++ b/cuda_core/tests/memory_ipc/test_send_buffers.py @@ -6,10 +6,11 @@ import pytest from helpers.buffers import PatternGen +from helpers.child_processes import child_timeout_sec, kill_subprocesses from cuda.core import Device, DeviceMemoryResource, DeviceMemoryResourceOptions -CHILD_TIMEOUT_SEC = 30 +CHILD_TIMEOUT_SEC = child_timeout_sec() NBYTES = 64 NMRS = 3 NTASKS = 7 @@ -37,8 +38,11 @@ def test_main(self, ipc_device, nmrs): process = mp.Process(target=self.child_main, args=(device, buffers)) process.start() - # Wait for the child process. + # Wait for the child process, then kill any survivor so subsequent + # asserts cannot block on a held IPC handle. process.join(timeout=CHILD_TIMEOUT_SEC) + survivors = kill_subprocesses(process) + assert not survivors, "child did not exit within timeout" assert process.exitcode == 0 # Verify that the buffers were modified. @@ -95,11 +99,16 @@ def test_main(self, ipc_device, ipc_memory_resource): proc_b.start() proc_c.start() - # Wait for C to signal completion then clean up. - event_c.wait(timeout=CHILD_TIMEOUT_SEC) + # Wait for C to signal completion, then let B finish and join both. + # Gather all state (event result + joins + survivor kills) before + # asserting so cleanup happens regardless of which check fires. 
+ completed = event_c.wait(timeout=CHILD_TIMEOUT_SEC) event_b.set() # b can finish now proc_b.join(timeout=CHILD_TIMEOUT_SEC) proc_c.join(timeout=CHILD_TIMEOUT_SEC) + survivors = kill_subprocesses(proc_b, proc_c) + assert completed, "process C did not signal completion within timeout" + assert not survivors, f"timed out waiting on: {[p.name for p in survivors]}" assert proc_b.exitcode == 0 assert proc_c.exitcode == 0 diff --git a/cuda_core/tests/memory_ipc/test_serialize.py b/cuda_core/tests/memory_ipc/test_serialize.py index 78d26387c8a..2f0e429b103 100644 --- a/cuda_core/tests/memory_ipc/test_serialize.py +++ b/cuda_core/tests/memory_ipc/test_serialize.py @@ -7,10 +7,11 @@ import pytest from helpers.buffers import PatternGen +from helpers.child_processes import child_timeout_sec, kill_subprocesses from cuda.core import Buffer, Device, DeviceMemoryResource, PinnedMemoryResource -CHILD_TIMEOUT_SEC = 30 +CHILD_TIMEOUT_SEC = child_timeout_sec() NBYTES = 64 POOL_SIZE = 2097152 @@ -46,6 +47,8 @@ def test_main(self, ipc_device, ipc_memory_resource): # Wait for the child process. process.join(timeout=CHILD_TIMEOUT_SEC) + survivors = kill_subprocesses(process) + assert not survivors, "child did not exit within timeout" assert process.exitcode == 0 # Confirm buffers were modified. @@ -103,6 +106,8 @@ def test_main(self, ipc_device, ipc_memory_resource): # Wait for the child process. process.join(timeout=CHILD_TIMEOUT_SEC) + survivors = kill_subprocesses(process) + assert not survivors, "child did not exit within timeout" assert process.exitcode == 0 # Confirm buffer was modified. 
@@ -151,6 +156,8 @@ def test_main(self, ipc_device, ipc_memory_resource): process = mp.Process(target=self.child_main, args=(alloc_handle, mr, buffer_desc, buffer)) process.start() process.join(timeout=CHILD_TIMEOUT_SEC) + survivors = kill_subprocesses(process) + assert not survivors, "child did not exit within timeout" assert process.exitcode == 0 pgen.verify_buffer(buffer, seed=True) diff --git a/cuda_core/tests/memory_ipc/test_workerpool.py b/cuda_core/tests/memory_ipc/test_workerpool.py index 5d0c7b1a0f5..609fadbcf3e 100644 --- a/cuda_core/tests/memory_ipc/test_workerpool.py +++ b/cuda_core/tests/memory_ipc/test_workerpool.py @@ -10,7 +10,6 @@ from cuda.core import Buffer, Device, DeviceMemoryResource, DeviceMemoryResourceOptions -CHILD_TIMEOUT_SEC = 30 NBYTES = 64 NWORKERS = 2 NMRS = 3