Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions ci/tools/setup-sanitizer
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env bash

# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

Expand All @@ -12,7 +12,15 @@ set -euo pipefail
if [[ "${SETUP_SANITIZER}" == 1 ]]; then
COMPUTE_SANITIZER="${CUDA_HOME}/bin/compute-sanitizer"
COMPUTE_SANITIZER_VERSION=$(${COMPUTE_SANITIZER} --version | grep -Eo "[0-9]{4}\.[0-9]\.[0-9]" | sed -e 's/\.//g')
SANITIZER_CMD="${COMPUTE_SANITIZER} --target-processes=all --launch-timeout=0 --tool=memcheck --error-exitcode=1 --report-api-errors=no"
# --target-processes=application-only: attach the sanitizer to the parent
# pytest process only. Spawned multiprocessing.Process children run without
# the sanitizer. This aims to mitigate a class of CI hangs where child
# processes take an extreme amount of time to spawn (>30 seconds). Test bugs
# triggered by that specific condition are typically uncovered only in CI,
# where they become emergencies and are difficult to debug. The parent
# process is still fully sanitized, which is where most of the interesting
# host-side IPC plumbing runs anyway.
SANITIZER_CMD="${COMPUTE_SANITIZER} --target-processes=application-only --launch-timeout=0 --tool=memcheck --error-exitcode=1 --report-api-errors=no"
if [[ "$COMPUTE_SANITIZER_VERSION" -ge 202111 ]]; then
SANITIZER_CMD="${SANITIZER_CMD} --padding=32"
fi
Expand Down
2 changes: 1 addition & 1 deletion cuda_core/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ cu12 = ["cuda-bindings[all]==12.*"]
cu13 = ["cuda-bindings[all]==13.*"]

[dependency-groups]
test = ["cython>=3.2,<3.3", "setuptools", "pytest>=6.2.4", "pytest-benchmark", "pytest-randomly", "pytest-repeat", "pytest-rerunfailures", "cloudpickle", "psutil", "cffi"]
test = ["cython>=3.2,<3.3", "setuptools", "pytest>=6.2.4", "pytest-benchmark", "pytest-randomly", "pytest-repeat", "pytest-rerunfailures", "pytest-timeout", "cloudpickle", "psutil", "cffi"]
ml-dtypes = ["ml-dtypes>=0.5.4,<0.6.0"]
test-cu12 = [ {include-group = "ml-dtypes" }, {include-group = "test" }, "cupy-cuda12x; python_version < '3.14'", "cuda-toolkit[cudart]==12.*"] # runtime headers needed by CuPy
test-cu13 = [ {include-group = "ml-dtypes" }, {include-group = "test" }, "cupy-cuda13x; python_version < '3.14'", "cuda-toolkit[cudart]==13.*"] # runtime headers needed by CuPy
Expand Down
26 changes: 22 additions & 4 deletions cuda_core/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,14 +213,24 @@ def pop_all_contexts():

@pytest.fixture
def ipc_device():
"""Obtains a device suitable for IPC-enabled mempool tests, or skips."""
"""Obtains a device suitable for IPC-enabled mempool tests, or skips.

The fixture also tracks every ``multiprocessing.Process`` spawned during
the test and kills any survivors at teardown. This prevents a stuck child
(e.g., compute-sanitizer wedged during IPC teardown -- see issue #2004)
from blocking ``ipc_memory_resource``'s ``mr.close()`` for hours.
"""
from helpers.child_processes import track_child_processes

device = Device(0)
device.set_current()

if not device.properties.memory_pools_supported:
pytest.skip("Device does not support mempool operations")

return _require_ipc_mempool_devices((device,))[0]
device = _require_ipc_mempool_devices((device,))[0]
with track_child_processes():
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Non-blocking: since ipc_memory_resource depends on ipc_device, its teardown still runs before this track_child_processes() context exits. So this fixture-level guard may not protect mr.close() if a test fails before reaching its in-test kill_subprocesses() cleanup. The per-test cleanup and pytest-timeout still cover the known hang, so I don’t think this should block.

yield device


@pytest.fixture(
Expand Down Expand Up @@ -291,8 +301,16 @@ def mempool_device_x3():

@pytest.fixture
def ipc_mempool_device_x2(mempool_device_x2):
"""Fixture that provides two IPC-capable mempool devices, or skips."""
return _require_ipc_mempool_devices(mempool_device_x2)
"""Fixture that provides two IPC-capable mempool devices, or skips.

Also tracks/kills any leftover ``multiprocessing.Process`` children at
teardown for the same reasons documented on :func:`ipc_device`.
"""
from helpers.child_processes import track_child_processes

devices = _require_ipc_mempool_devices(mempool_device_x2)
with track_child_processes():
yield devices


@pytest.fixture(
Expand Down
103 changes: 103 additions & 0 deletions cuda_core/tests/helpers/child_processes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Helpers for tests that spawn ``multiprocessing.Process`` children.

These exist primarily to defend IPC tests against a class of CI hang where a
child process spawns too slowly and the parent does not implement proper guards
for that (see issue #2004). Without intervention, a stuck (still-running) child
holds an IPC memory handle and blocks the parent's ``mr.close()`` in fixture
teardown, leading to deadlock and wedging the test runner for hours.
"""

import contextlib
import multiprocessing.process
import weakref

from cuda_python_test_helpers import under_compute_sanitizer

CHILD_TIMEOUT_SEC_DEFAULT = 30
CHILD_TIMEOUT_SEC_SANITIZER = 120


def child_timeout_sec() -> int:
    """Pick the join/wait budget, in seconds, for one child process.

    Returns ``CHILD_TIMEOUT_SEC_SANITIZER`` when ``under_compute_sanitizer()``
    reports that the sanitizer is active, and ``CHILD_TIMEOUT_SEC_DEFAULT``
    otherwise. The sanitizer budget is the larger one because
    compute-sanitizer significantly slows process startup and CUDA context
    teardown.
    """
    if under_compute_sanitizer():
        return CHILD_TIMEOUT_SEC_SANITIZER
    return CHILD_TIMEOUT_SEC_DEFAULT


def kill_subprocesses(*processes):
    """Force-stop every given Process that has not exited yet.

    Each argument is probed with ``is_alive()``; those still running are sent
    ``kill()`` and then joined. The return value is the list of processes that
    were found alive (in argument order), so an empty list means every child
    had already finished on its own. Typical use turns a non-empty result
    into an ordinary test failure::

        proc_a.join(timeout=CHILD_TIMEOUT_SEC)
        proc_b.join(timeout=CHILD_TIMEOUT_SEC)
        survivors = kill_subprocesses(proc_a, proc_b)
        assert not survivors, f"timed out waiting on: {[p.name for p in survivors]}"
        assert proc_a.exitcode == 0
        assert proc_b.exitcode == 0

    Doing the kill *before* the remaining asserts matters: a failing assert
    would otherwise leave a lingering child holding IPC handles after the
    test body, which can block fixture teardown.
    """
    reaped = []
    for child in processes:
        try:
            still_running = child.is_alive()
        except (ValueError, AssertionError):
            # A closed Process raises ValueError, and multiprocessing asserts
            # that only the creating process may query a child. Either way
            # there is nothing here for us to clean up.
            still_running = False
        if still_running:
            # The child may finish or be closed between the probe above and
            # the kill; that race is harmless, so ignore the same errors.
            with contextlib.suppress(ValueError, AssertionError):
                child.kill()
                child.join()
            reaped.append(child)
    return reaped


@contextlib.contextmanager
def track_child_processes():
    """Kill any ``multiprocessing.Process`` created in the ``with`` body that
    is still alive when the body exits.

    While the context is active, ``multiprocessing.process.BaseProcess.__init__``
    is wrapped so that every instance constructed is remembered. Because the
    hook sits on ``BaseProcess``, it sees the delegating ``mp.Process`` class
    as well as ``SpawnProcess`` / ``ForkProcess`` objects (including those an
    ``mp.Pool`` creates), as all of them derive from ``BaseProcess``. Instances
    are held in a ``WeakSet``, so tracking alone does not keep them alive.

    On exit -- whether normal or via an exception -- the original ``__init__``
    is put back first, then each remembered process that is still running is
    killed and joined. The goal is to stop a stuck child from holding IPC
    handles that fixture teardown (e.g. ``ipc_memory_resource``'s
    ``mr.close()``) would otherwise block on -- see issue #2004.
    """
    process_base = multiprocessing.process.BaseProcess
    saved_init = process_base.__init__
    seen = weakref.WeakSet()

    def recording_init(self, *args, **kwargs):
        # Construct the Process exactly as usual, then remember it.
        saved_init(self, *args, **kwargs)
        seen.add(self)

    process_base.__init__ = recording_init
    try:
        yield
    finally:
        # Undo the patch before cleanup so nothing created from here on is
        # recorded.
        process_base.__init__ = saved_init
        # Snapshot the weak set: entries may vanish while we iterate.
        for child in tuple(seen):
            # A Process that has already been closed raises ValueError from
            # is_alive() / kill(); there is nothing left to clean up then.
            with contextlib.suppress(ValueError):
                if not child.is_alive():
                    continue
                child.kill()
                child.join()
3 changes: 3 additions & 0 deletions cuda_core/tests/memory_ipc/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
41 changes: 41 additions & 0 deletions cuda_core/tests/memory_ipc/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Per-directory conftest for memory IPC tests.

Applies an outer-guard ``pytest.mark.timeout`` to every test in this directory.
Individual tests still drive their own per-process waits using
``child_timeout_sec()`` from ``helpers.child_processes``; this marker is the
final fallback so that no IPC test can wedge the CI runner for hours if
deadlock occurs.
"""

import pathlib

import pytest
from helpers.child_processes import child_timeout_sec

_HERE = pathlib.Path(__file__).parent.resolve()


def _outer_timeout_sec() -> int:
    """Return the whole-test timeout: the per-child budget plus 30 seconds."""
    # Children of an IPC test run concurrently, so the expected wall-clock
    # time is roughly one CHILD_TIMEOUT_SEC no matter how many join/wait
    # timeouts a test chains together: once a child has finished, any later
    # join on it returns immediately. A test that runs longer than that is
    # genuinely stuck, and letting the outer guard fire is the right
    # outcome -- the per-test asserts would add little diagnostic value over
    # "test exceeded its budget", and the track_child_processes() context
    # manager entered by the IPC device fixtures still cleans up.
    per_child_budget = child_timeout_sec()
    return per_child_budget + 30


def pytest_collection_modifyitems(config, items):
    """Attach the outer ``timeout`` marker to every test under this directory.

    The hook receives all collected items, so each one is filtered by path:
    only items whose resolved file path has this conftest's directory among
    its parents are marked. Items whose path cannot be resolved are left
    untouched.
    """
    timeout_mark = pytest.mark.timeout(_outer_timeout_sec())

    def _is_under_this_dir(item):
        # Resolve first so the comparison against the resolved _HERE is
        # like-for-like.
        try:
            resolved = pathlib.Path(str(item.fspath)).resolve()
        except OSError:
            return False
        return _HERE in resolved.parents

    for item in items:
        if _is_under_this_dir(item):
            item.add_marker(timeout_mark)
20 changes: 19 additions & 1 deletion cuda_core/tests/memory_ipc/test_errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,31 @@
import re

import pytest
from helpers.child_processes import child_timeout_sec, kill_subprocesses

from cuda.core import Buffer, Device, DeviceMemoryResource, DeviceMemoryResourceOptions
from cuda.core._utils.cuda_utils import CUDAError

CHILD_TIMEOUT_SEC = 30
CHILD_TIMEOUT_SEC = child_timeout_sec()
NBYTES = 64
POOL_SIZE = 2097152


def test_outer_timeout_marker_is_applied(request):
    """Check that this test carries the outer pytest-timeout marker.

    The marker is expected to come from memory_ipc/conftest.py. A failure
    here means either that per-directory conftest is not being loaded or its
    pytest_collection_modifyitems hook is not adding the marker. Without the
    marker, in-test cleanup would be the only thing keeping a wedged IPC test
    from stalling the GHA runner; that cleanup is meant to be defense in
    depth, not the sole guard.
    """
    # The expected value is the per-child budget plus 30 seconds.
    want_args = (child_timeout_sec() + 30,)
    timeout_marker = request.node.get_closest_marker("timeout")
    assert timeout_marker is not None, "memory_ipc/conftest.py did not apply a timeout marker"
    assert timeout_marker.args == want_args, f"unexpected timeout value: {timeout_marker.args!r}"


class ChildErrorHarness:
"""Test harness for checking errors in child processes. Subclasses override
PARENT_ACTION, CHILD_ACTION, and ASSERT (see below for examples)."""
Expand Down Expand Up @@ -43,6 +59,8 @@ def test_main(self, ipc_device, ipc_memory_resource):

# Wait for the child process.
process.join(timeout=CHILD_TIMEOUT_SEC)
survivors = kill_subprocesses(process)
assert not survivors, "child did not exit within timeout"
assert process.exitcode == 0
finally:
for mr in self._extra_mrs:
Expand Down
7 changes: 6 additions & 1 deletion cuda_core/tests/memory_ipc/test_event_ipc.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,14 @@

import pytest
from helpers.buffers import compare_equal_buffers, make_scratch_buffer
from helpers.child_processes import child_timeout_sec, kill_subprocesses
from helpers.latch import LatchKernel
from helpers.logging import TimestampedLogger

from cuda.core import Device, EventOptions

ENABLE_LOGGING = False # Set True for test debugging and development
CHILD_TIMEOUT_SEC = 30
CHILD_TIMEOUT_SEC = child_timeout_sec()
NBYTES = 64


Expand Down Expand Up @@ -67,6 +68,8 @@ def test_main(self, ipc_device, ipc_memory_resource):
log("releasing stream1")
latch.release()
process.join(timeout=CHILD_TIMEOUT_SEC)
survivors = kill_subprocesses(process)
assert not survivors, "child did not exit within timeout"
assert process.exitcode == 0
log("done")

Expand Down Expand Up @@ -162,6 +165,8 @@ def test_main(self, ipc_device, blocking_sync, use_options_cls, use_option_kw):
assert props[5] is None

process.join(timeout=CHILD_TIMEOUT_SEC)
survivors = kill_subprocesses(process)
assert not survivors, "child did not exit within timeout"
assert process.exitcode == 0

def child_main(self, q_in, q_out):
Expand Down
5 changes: 4 additions & 1 deletion cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,12 @@
import multiprocessing as mp

import pytest
from helpers.child_processes import child_timeout_sec, kill_subprocesses
from helpers.logging import TimestampedLogger

from cuda.core import Buffer, Device

CHILD_TIMEOUT_SEC = 30
CHILD_TIMEOUT_SEC = child_timeout_sec()
NBYTES = 64
POOL_SIZE = 2097152

Expand Down Expand Up @@ -84,6 +85,8 @@ def test_main(self, ipc_device, ipc_memory_resource):

log("waiting for child")
process.join(timeout=CHILD_TIMEOUT_SEC)
survivors = kill_subprocesses(process)
log(f"child exit code: {process.exitcode}")
assert not survivors, "child did not exit within timeout"
assert process.exitcode == 0, f"Child process failed with exit code {process.exitcode}"
log("done")
9 changes: 8 additions & 1 deletion cuda_core/tests/memory_ipc/test_leaks.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,9 @@
else:
HAVE_PSUTIL = True
import pytest
from helpers.child_processes import child_timeout_sec, kill_subprocesses

CHILD_TIMEOUT_SEC = 30
CHILD_TIMEOUT_SEC = child_timeout_sec()
NBYTES = 64

USING_FDS = platform.system() == "Linux"
Expand All @@ -38,6 +39,8 @@ def exec_success(obj, number=1):
process = mp.Process(target=child_main, args=(obj,))
process.start()
process.join(timeout=CHILD_TIMEOUT_SEC)
survivors = kill_subprocesses(process)
assert not survivors, "child did not exit within timeout"
assert process.exitcode == 0


Expand All @@ -54,6 +57,8 @@ def exec_launch_failure(obj, number=1):
process = mp.Process(target=child_main_bad, args=(obj,))
process.start()
process.join(timeout=CHILD_TIMEOUT_SEC)
survivors = kill_subprocesses(process)
assert not survivors, "child did not exit within timeout"
assert process.exitcode != 0


Expand Down Expand Up @@ -137,5 +142,7 @@ def prime():
process = mp.Process()
process.start()
process.join(timeout=CHILD_TIMEOUT_SEC)
survivors = kill_subprocesses(process)
assert not survivors, "child did not exit within timeout"
assert process.exitcode == 0
prime_was_run = True
Loading
Loading