diff --git a/cuda_core/cuda/core/checkpoint.py b/cuda_core/cuda/core/checkpoint.py index 7f811013d19..97c63251087 100644 --- a/cuda_core/cuda/core/checkpoint.py +++ b/cuda_core/cuda/core/checkpoint.py @@ -235,18 +235,22 @@ def _make_restore_args(driver, gpu_mapping: _Mapping[_Any, _Any] | None): def _as_cuuuid(driver, value, buffers): """Convert *value* to a ``CUuuid``. - Accepts a ``CUuuid`` instance (returned as-is) or a UUID string in - the ``"xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"`` format returned by - :attr:`Device.uuid`. + Accepts a UUID string in the ``"xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"`` + format returned by :attr:`Device.uuid`. """ if isinstance(value, str): - raw = bytes.fromhex(value.replace("-", "")) + try: + raw = bytes.fromhex(value.replace("-", "")) + except ValueError: + raise ValueError( + f"GPU UUID string must be 32 hex characters (with optional hyphens), got {value!r}" + ) from None if len(raw) != 16: raise ValueError(f"GPU UUID string must be 32 hex characters (with optional hyphens), got {value!r}") buf = _ctypes.create_string_buffer(raw, 16) buffers.append(buf) return driver.CUuuid(_ctypes.addressof(buf)) - return value + raise TypeError("GPU UUID values must be UUID strings") __all__ = [ diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index 0a88a5bd4b6..7953a623502 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -241,10 +241,10 @@ should be used during restore. For migration workflows, provide mappings for every GPU visible to the NVIDIA kernel-mode driver at checkpoint time. User-space masking such as ``CUDA_VISIBLE_DEVICES`` does not reduce this mapping requirement, so applications that rely on user-space GPU masking may -not be valid migration targets. The mapping may use ``CUuuid`` objects or the -UUID strings returned by :attr:`Device.uuid`. 
A successful restore returns the -process to the locked state; call ``Process.unlock`` after restore to allow -CUDA API calls to resume. +not be valid migration targets. The mapping must use the UUID strings +returned by :attr:`Device.uuid`. A successful restore returns the process to +the locked state; call ``Process.unlock`` after restore to allow CUDA API +calls to resume. The CUDA driver requires restore to run from the process restore thread. Use ``Process.restore_thread_id`` to discover that thread before calling