From c62413e06d83b1b4d5f9bf0e3871155c55ad994b Mon Sep 17 00:00:00 2001 From: Aryan Date: Thu, 14 May 2026 01:16:31 -0400 Subject: [PATCH 1/2] Validate checkpoint GPU UUID inputs Signed-off-by: Aryan --- cuda_core/cuda/core/checkpoint.py | 11 +- cuda_core/tests/test_checkpoint_helpers.py | 114 +++++++++++++++++++++ 2 files changed, 123 insertions(+), 2 deletions(-) create mode 100644 cuda_core/tests/test_checkpoint_helpers.py diff --git a/cuda_core/cuda/core/checkpoint.py b/cuda_core/cuda/core/checkpoint.py index 7f811013d19..a648911678e 100644 --- a/cuda_core/cuda/core/checkpoint.py +++ b/cuda_core/cuda/core/checkpoint.py @@ -239,14 +239,21 @@ def _as_cuuuid(driver, value, buffers): the ``"xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"`` format returned by :attr:`Device.uuid`. """ + if isinstance(value, driver.CUuuid): + return value if isinstance(value, str): - raw = bytes.fromhex(value.replace("-", "")) + try: + raw = bytes.fromhex(value.replace("-", "")) + except ValueError: + raise ValueError( + f"GPU UUID string must be 32 hex characters (with optional hyphens), got {value!r}" + ) from None if len(raw) != 16: raise ValueError(f"GPU UUID string must be 32 hex characters (with optional hyphens), got {value!r}") buf = _ctypes.create_string_buffer(raw, 16) buffers.append(buf) return driver.CUuuid(_ctypes.addressof(buf)) - return value + raise TypeError("GPU UUID values must be CUDA UUID objects or UUID strings") __all__ = [ diff --git a/cuda_core/tests/test_checkpoint_helpers.py b/cuda_core/tests/test_checkpoint_helpers.py new file mode 100644 index 00000000000..b79726c647e --- /dev/null +++ b/cuda_core/tests/test_checkpoint_helpers.py @@ -0,0 +1,114 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Source-level tests for checkpoint helper validation. 
+ +These tests load ``cuda/core/checkpoint.py`` directly from source with small +stub modules, so they can run without importing the full built ``cuda.core`` +package. Run with ``--noconftest`` in environments that do not have the CUDA +extensions available: + + pytest cuda_core/tests/test_checkpoint_helpers.py --noconftest +""" + +from __future__ import annotations + +import importlib.util +import sys +import types +from pathlib import Path + +import pytest + + +def _load_checkpoint_module(monkeypatch): + cuda_pkg = types.ModuleType("cuda") + cuda_pkg.__path__ = [] + core_pkg = types.ModuleType("cuda.core") + core_pkg.__path__ = [] + utils_pkg = types.ModuleType("cuda.core._utils") + utils_pkg.__path__ = [] + bindings_pkg = types.ModuleType("cuda.bindings") + bindings_pkg.__path__ = [] + + cuda_utils = types.ModuleType("cuda.core._utils.cuda_utils") + cuda_utils.handle_return = lambda result: result + + version_mod = types.ModuleType("cuda.core._utils.version") + version_mod.binding_version = lambda: (13, 0, 2) + version_mod.driver_version = lambda: (12, 8, 0) + + typing_mod = types.ModuleType("cuda.core.typing") + typing_mod.ProcessStateType = str + + driver_mod = types.ModuleType("cuda.bindings.driver") + + class CUuuid: + def __init__(self, value): + self.value = value + + class CUcheckpointGpuPair: + def __init__(self): + self.oldUuid = None + self.newUuid = None + + class CUcheckpointRestoreArgs: + def __init__(self): + self.gpuPairs = None + self.gpuPairsCount = 0 + + driver_mod.CUuuid = CUuuid + driver_mod.CUcheckpointGpuPair = CUcheckpointGpuPair + driver_mod.CUcheckpointRestoreArgs = CUcheckpointRestoreArgs + + modules = { + "cuda": cuda_pkg, + "cuda.core": core_pkg, + "cuda.core._utils": utils_pkg, + "cuda.core._utils.cuda_utils": cuda_utils, + "cuda.core._utils.version": version_mod, + "cuda.core.typing": typing_mod, + "cuda.bindings": bindings_pkg, + "cuda.bindings.driver": driver_mod, + } + for name, module in modules.items(): + 
monkeypatch.setitem(sys.modules, name, module) + + checkpoint_path = Path(__file__).parent.parent / "cuda" / "core" / "checkpoint.py" + spec = importlib.util.spec_from_file_location("cuda.core._checkpoint_test", checkpoint_path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module, driver_mod + + +def test_make_restore_args_rejects_non_uuid_values(monkeypatch): + checkpoint, driver = _load_checkpoint_module(monkeypatch) + + with pytest.raises(TypeError, match="GPU UUID values must be CUDA UUID objects or UUID strings"): + checkpoint._make_restore_args(driver, {"01234567-89ab-cdef-0123-456789abcdef": object()}) + + +@pytest.mark.parametrize( + "bad_uuid", + [ + pytest.param("not-hex-uuid-0000-0000-000000000000", id="non_hex"), + pytest.param("01234567-89ab-cdef-0123-456789abcde", id="short"), + ], +) +def test_make_restore_args_rejects_invalid_uuid_strings(monkeypatch, bad_uuid): + checkpoint, driver = _load_checkpoint_module(monkeypatch) + + with pytest.raises(ValueError, match="GPU UUID string must be 32 hex characters"): + checkpoint._make_restore_args(driver, {bad_uuid: "01234567-89ab-cdef-0123-456789abcdef"}) + + +def test_make_restore_args_accepts_uuid_objects(monkeypatch): + checkpoint, driver = _load_checkpoint_module(monkeypatch) + + old_uuid = driver.CUuuid(111) + new_uuid = driver.CUuuid(222) + args = checkpoint._make_restore_args(driver, {old_uuid: new_uuid}) + + assert args.gpuPairsCount == 1 + assert args.gpuPairs[0].oldUuid is old_uuid + assert args.gpuPairs[0].newUuid is new_uuid From 083e41112a285aa1731d4252f330ec0b0042af46 Mon Sep 17 00:00:00 2001 From: Aryan Date: Fri, 15 May 2026 22:56:27 -0400 Subject: [PATCH 2/2] Narrow checkpoint GPU UUID restore inputs --- cuda_core/cuda/core/checkpoint.py | 9 +- cuda_core/docs/source/api.rst | 8 +- cuda_core/tests/test_checkpoint_helpers.py | 114 --------------------- 3 files changed, 7 insertions(+), 124 deletions(-) delete mode 100644 
cuda_core/tests/test_checkpoint_helpers.py diff --git a/cuda_core/cuda/core/checkpoint.py b/cuda_core/cuda/core/checkpoint.py index a648911678e..97c63251087 100644 --- a/cuda_core/cuda/core/checkpoint.py +++ b/cuda_core/cuda/core/checkpoint.py @@ -235,12 +235,9 @@ def _make_restore_args(driver, gpu_mapping: _Mapping[_Any, _Any] | None): def _as_cuuuid(driver, value, buffers): """Convert *value* to a ``CUuuid``. - Accepts a ``CUuuid`` instance (returned as-is) or a UUID string in - the ``"xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"`` format returned by - :attr:`Device.uuid`. + Accepts a UUID string in the ``"xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"`` + format returned by :attr:`Device.uuid`. """ - if isinstance(value, driver.CUuuid): - return value if isinstance(value, str): try: raw = bytes.fromhex(value.replace("-", "")) @@ -253,7 +250,7 @@ def _as_cuuuid(driver, value, buffers): buf = _ctypes.create_string_buffer(raw, 16) buffers.append(buf) return driver.CUuuid(_ctypes.addressof(buf)) - raise TypeError("GPU UUID values must be CUDA UUID objects or UUID strings") + raise TypeError("GPU UUID values must be UUID strings") __all__ = [ diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index 0a88a5bd4b6..7953a623502 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -241,10 +241,10 @@ should be used during restore. For migration workflows, provide mappings for every GPU visible to the NVIDIA kernel-mode driver at checkpoint time. User-space masking such as ``CUDA_VISIBLE_DEVICES`` does not reduce this mapping requirement, so applications that rely on user-space GPU masking may -not be valid migration targets. The mapping may use ``CUuuid`` objects or the -UUID strings returned by :attr:`Device.uuid`. A successful restore returns the -process to the locked state; call ``Process.unlock`` after restore to allow -CUDA API calls to resume. +not be valid migration targets. 
The mapping must use the UUID strings +returned by :attr:`Device.uuid`. A successful restore returns the process to +the locked state; call ``Process.unlock`` after restore to allow CUDA API +calls to resume. The CUDA driver requires restore to run from the process restore thread. Use ``Process.restore_thread_id`` to discover that thread before calling diff --git a/cuda_core/tests/test_checkpoint_helpers.py b/cuda_core/tests/test_checkpoint_helpers.py deleted file mode 100644 index b79726c647e..00000000000 --- a/cuda_core/tests/test_checkpoint_helpers.py +++ /dev/null @@ -1,114 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -"""Source-level tests for checkpoint helper validation. - -These tests load ``cuda/core/checkpoint.py`` directly from source with small -stub modules, so they can run without importing the full built ``cuda.core`` -package. Run with ``--noconftest`` in environments that do not have the CUDA -extensions available: - - pytest cuda_core/tests/test_checkpoint_helpers.py --noconftest -""" - -from __future__ import annotations - -import importlib.util -import sys -import types -from pathlib import Path - -import pytest - - -def _load_checkpoint_module(monkeypatch): - cuda_pkg = types.ModuleType("cuda") - cuda_pkg.__path__ = [] - core_pkg = types.ModuleType("cuda.core") - core_pkg.__path__ = [] - utils_pkg = types.ModuleType("cuda.core._utils") - utils_pkg.__path__ = [] - bindings_pkg = types.ModuleType("cuda.bindings") - bindings_pkg.__path__ = [] - - cuda_utils = types.ModuleType("cuda.core._utils.cuda_utils") - cuda_utils.handle_return = lambda result: result - - version_mod = types.ModuleType("cuda.core._utils.version") - version_mod.binding_version = lambda: (13, 0, 2) - version_mod.driver_version = lambda: (12, 8, 0) - - typing_mod = types.ModuleType("cuda.core.typing") - typing_mod.ProcessStateType = str - - driver_mod =
types.ModuleType("cuda.bindings.driver") - - class CUuuid: - def __init__(self, value): - self.value = value - - class CUcheckpointGpuPair: - def __init__(self): - self.oldUuid = None - self.newUuid = None - - class CUcheckpointRestoreArgs: - def __init__(self): - self.gpuPairs = None - self.gpuPairsCount = 0 - - driver_mod.CUuuid = CUuuid - driver_mod.CUcheckpointGpuPair = CUcheckpointGpuPair - driver_mod.CUcheckpointRestoreArgs = CUcheckpointRestoreArgs - - modules = { - "cuda": cuda_pkg, - "cuda.core": core_pkg, - "cuda.core._utils": utils_pkg, - "cuda.core._utils.cuda_utils": cuda_utils, - "cuda.core._utils.version": version_mod, - "cuda.core.typing": typing_mod, - "cuda.bindings": bindings_pkg, - "cuda.bindings.driver": driver_mod, - } - for name, module in modules.items(): - monkeypatch.setitem(sys.modules, name, module) - - checkpoint_path = Path(__file__).parent.parent / "cuda" / "core" / "checkpoint.py" - spec = importlib.util.spec_from_file_location("cuda.core._checkpoint_test", checkpoint_path) - module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(module) - return module, driver_mod - - -def test_make_restore_args_rejects_non_uuid_values(monkeypatch): - checkpoint, driver = _load_checkpoint_module(monkeypatch) - - with pytest.raises(TypeError, match="GPU UUID values must be CUDA UUID objects or UUID strings"): - checkpoint._make_restore_args(driver, {"01234567-89ab-cdef-0123-456789abcdef": object()}) - - -@pytest.mark.parametrize( - "bad_uuid", - [ - pytest.param("not-hex-uuid-0000-0000-000000000000", id="non_hex"), - pytest.param("01234567-89ab-cdef-0123-456789abcde", id="short"), - ], -) -def test_make_restore_args_rejects_invalid_uuid_strings(monkeypatch, bad_uuid): - checkpoint, driver = _load_checkpoint_module(monkeypatch) - - with pytest.raises(ValueError, match="GPU UUID string must be 32 hex characters"): - checkpoint._make_restore_args(driver, {bad_uuid: "01234567-89ab-cdef-0123-456789abcdef"}) - - -def 
test_make_restore_args_accepts_uuid_objects(monkeypatch): - checkpoint, driver = _load_checkpoint_module(monkeypatch) - - old_uuid = driver.CUuuid(111) - new_uuid = driver.CUuuid(222) - args = checkpoint._make_restore_args(driver, {old_uuid: new_uuid}) - - assert args.gpuPairsCount == 1 - assert args.gpuPairs[0].oldUuid is old_uuid - assert args.gpuPairs[0].newUuid is new_uuid