diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py index f2d7c85b62e..9769a39977f 100644 --- a/cuda_core/cuda/core/__init__.py +++ b/cuda_core/cuda/core/__init__.py @@ -78,6 +78,17 @@ class _PatchedProperty(metaclass=_PatchedPropMeta): WorkqueueResource, WorkqueueResourceOptions, ) +from cuda.core._array import Array, ArrayFormat +from cuda.core._mipmapped_array import MipmappedArray +from cuda.core._texture import ( + AddressMode, + FilterMode, + ReadMode, + ResourceDescriptor, + TextureDescriptor, + TextureObject, +) +from cuda.core._surface import SurfaceObject from cuda.core._event import Event, EventOptions from cuda.core._graphics import GraphicsResource from cuda.core._launch_config import LaunchConfig diff --git a/cuda_core/cuda/core/_array.pxd b/cuda_core/cuda/core/_array.pxd new file mode 100644 index 00000000000..73529cac48e --- /dev/null +++ b/cuda_core/cuda/core/_array.pxd @@ -0,0 +1,25 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from libc.stdint cimport intptr_t +from cuda.bindings cimport cydriver + + +cdef class Array: + + cdef: + cydriver.CUarray _handle + tuple _shape # (w,), (w, h), or (w, h, d) + cydriver.CUarray_format _format + unsigned int _num_channels # 1, 2, or 4 + int _device_id + intptr_t _context + bint _owning + bint _surface_load_store + # Optional strong reference to a parent owner (e.g. a MipmappedArray + # whose level this Array views). When set, the parent must outlive + # this Array because the underlying CUarray belongs to the parent. + object _parent_ref + + cpdef close(self) diff --git a/cuda_core/cuda/core/_array.pyx b/cuda_core/cuda/core/_array.pyx new file mode 100644 index 00000000000..7d02dcd5d21 --- /dev/null +++ b/cuda_core/cuda/core/_array.pyx @@ -0,0 +1,439 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +cimport cpython +from libc.stdint cimport intptr_t +from libc.string cimport memset + +from cuda.bindings cimport cydriver +from cuda.core._memory._buffer cimport Buffer +from cuda.core._stream cimport Stream +from cuda.core._utils.cuda_utils cimport ( + HANDLE_RETURN, + _get_current_context_ptr, + _get_current_device_id, +) + +import enum + + +class ArrayFormat(enum.IntEnum): + """Element format for a :class:`Array` allocation. + + Mirrors ``CUarray_format`` from the CUDA driver API. + """ + UINT8 = cydriver.CU_AD_FORMAT_UNSIGNED_INT8 + UINT16 = cydriver.CU_AD_FORMAT_UNSIGNED_INT16 + UINT32 = cydriver.CU_AD_FORMAT_UNSIGNED_INT32 + INT8 = cydriver.CU_AD_FORMAT_SIGNED_INT8 + INT16 = cydriver.CU_AD_FORMAT_SIGNED_INT16 + INT32 = cydriver.CU_AD_FORMAT_SIGNED_INT32 + FLOAT16 = cydriver.CU_AD_FORMAT_HALF + FLOAT32 = cydriver.CU_AD_FORMAT_FLOAT + + +# Bytes per element (single channel) for each format. +_FORMAT_ELEM_SIZE = { + int(ArrayFormat.UINT8): 1, + int(ArrayFormat.INT8): 1, + int(ArrayFormat.UINT16): 2, + int(ArrayFormat.INT16): 2, + int(ArrayFormat.FLOAT16): 2, + int(ArrayFormat.UINT32): 4, + int(ArrayFormat.INT32): 4, + int(ArrayFormat.FLOAT32): 4, +} + + +cdef void _fill_array_endpoint( + cydriver.CUDA_MEMCPY3D* p, Array arr, bint is_src +) noexcept: + """Populate the src or dst array fields of a CUDA_MEMCPY3D struct.""" + if is_src: + p.srcMemoryType = cydriver.CU_MEMORYTYPE_ARRAY + p.srcArray = arr._handle + p.srcXInBytes = 0 + p.srcY = 0 + p.srcZ = 0 + else: + p.dstMemoryType = cydriver.CU_MEMORYTYPE_ARRAY + p.dstArray = arr._handle + p.dstXInBytes = 0 + p.dstY = 0 + p.dstZ = 0 + + +cdef int _fill_host_endpoint( + cydriver.CUDA_MEMCPY3D* p, + object obj, + bint is_src, + size_t width_bytes, + size_t height, + size_t required, + cpython.Py_buffer* pybuf_out, +) except -1: + """Populate src/dst host fields from a buffer-protocol ``obj``. 
+ + Acquires a Py_buffer view; the caller is responsible for releasing it + (this function always returns with the view held when it returns 1). + """ + cdef int flags = cpython.PyBUF_SIMPLE + if not is_src: + flags |= cpython.PyBUF_WRITABLE + if cpython.PyObject_GetBuffer(obj, pybuf_out, flags) != 0: + raise TypeError( + f"Source/destination must be a Buffer or a contiguous " + f"buffer-protocol object, got {type(obj).__name__}" + ) + if pybuf_out.len < required: + cpython.PyBuffer_Release(pybuf_out) + raise ValueError( + f"Host buffer has {pybuf_out.len} bytes, smaller than the array " + f"extent ({required} bytes)" + ) + if is_src: + p.srcMemoryType = cydriver.CU_MEMORYTYPE_HOST + p.srcHost = pybuf_out.buf + p.srcPitch = width_bytes + p.srcHeight = height + p.srcXInBytes = 0 + p.srcY = 0 + p.srcZ = 0 + else: + p.dstMemoryType = cydriver.CU_MEMORYTYPE_HOST + p.dstHost = pybuf_out.buf + p.dstPitch = width_bytes + p.dstHeight = height + p.dstXInBytes = 0 + p.dstY = 0 + p.dstZ = 0 + return 1 + + +cdef int _fill_linear_endpoint( + cydriver.CUDA_MEMCPY3D* p, + object obj, + bint is_src, + size_t width_bytes, + size_t height, + size_t depth, + cpython.Py_buffer* pybuf_out, +) except -1: + """Populate the src or dst linear fields. Returns 1 if pybuf_out was + filled (caller must release it), 0 otherwise. 
+ """ + cdef intptr_t ptr + cdef size_t required = width_bytes * height * depth + if isinstance(obj, Buffer): + if (obj).size < required: + raise ValueError( + f"Buffer size ({(obj).size} bytes) is smaller than " + f"the array extent ({required} bytes)" + ) + ptr = int((obj).handle) + if is_src: + p.srcMemoryType = cydriver.CU_MEMORYTYPE_DEVICE + p.srcDevice = ptr + p.srcPitch = width_bytes + p.srcHeight = height + p.srcXInBytes = 0 + p.srcY = 0 + p.srcZ = 0 + else: + p.dstMemoryType = cydriver.CU_MEMORYTYPE_DEVICE + p.dstDevice = ptr + p.dstPitch = width_bytes + p.dstHeight = height + p.dstXInBytes = 0 + p.dstY = 0 + p.dstZ = 0 + return 0 + return _fill_host_endpoint( + p, obj, is_src, width_bytes, height, required, pybuf_out + ) + + +cdef _copy3d(Array arr, object other, object stream, bint to_array): + """Issue a full-array async 3D memcpy between ``arr`` and ``other``. + + Direction is determined by ``to_array``: True copies *into* arr, False + copies *out of* arr. + """ + cdef cydriver.CUDA_MEMCPY3D params + cdef cpython.Py_buffer pybuf + cdef int got_buffer = 0 + cdef intptr_t stream_handle + cdef cydriver.CUstream c_stream + + if not isinstance(stream, Stream): + raise TypeError(f"stream must be a Stream, got {type(stream).__name__}") + + memset(¶ms, 0, sizeof(params)) + width_bytes, height, depth = arr._extent_bytes() + params.WidthInBytes = width_bytes + params.Height = height + params.Depth = depth + + try: + if to_array: + got_buffer = _fill_linear_endpoint( + ¶ms, other, True, width_bytes, height, depth, &pybuf + ) + _fill_array_endpoint(¶ms, arr, False) + else: + _fill_array_endpoint(¶ms, arr, True) + got_buffer = _fill_linear_endpoint( + ¶ms, other, False, width_bytes, height, depth, &pybuf + ) + + stream_handle = int((stream).handle) + c_stream = stream_handle + with nogil: + HANDLE_RETURN(cydriver.cuMemcpy3DAsync(¶ms, c_stream)) + finally: + if got_buffer: + cpython.PyBuffer_Release(&pybuf) + + +cdef class Array: + """An opaque, hardware-laid-out 
GPU allocation for texture/surface access. + + Distinct from :class:`Buffer`: a ``CUarray`` has no exposed device pointer + and can only be accessed from kernels through a :class:`TextureObject` or + :class:`SurfaceObject`. Its memory layout is chosen by the driver for 2D/3D + spatial locality. + + Construct via :meth:`from_descriptor`. Only plain 1D/2D/3D allocations are + supported in this initial version; layered/cubemap/sparse variants will + follow once their shape semantics are settled. + """ + + def __init__(self, *args, **kwargs): + raise RuntimeError( + "Array cannot be instantiated directly. Use Array.from_descriptor()." + ) + + @classmethod + def from_descriptor(cls, *, shape, format, num_channels, surface_load_store=False): + """Allocate a new CUDA array. + + Parameters + ---------- + shape : tuple of int + ``(width,)``, ``(width, height)``, or ``(width, height, depth)`` + in elements. + format : ArrayFormat + Element format. + num_channels : int + Channels per element. Must be 1, 2, or 4. + surface_load_store : bool + If True, allocate with ``CUDA_ARRAY3D_SURFACE_LDST`` so the array + can be bound as a :class:`SurfaceObject` for kernel-side writes. + Default False. 
+ + Returns + ------- + Array + """ + if not isinstance(format, ArrayFormat): + raise TypeError(f"format must be an ArrayFormat, got {type(format).__name__}") + if isinstance(num_channels, bool) or num_channels not in (1, 2, 4): + raise ValueError(f"num_channels must be 1, 2, or 4, got {num_channels!r}") + + try: + shape_t = tuple(int(s) for s in shape) + except TypeError as e: + raise TypeError(f"shape must be a tuple of ints, got {type(shape).__name__}") from e + if not 1 <= len(shape_t) <= 3: + raise ValueError(f"shape rank must be 1, 2, or 3, got {len(shape_t)}") + for i, dim in enumerate(shape_t): + if dim < 1: + raise ValueError(f"shape[{i}] must be >= 1, got {dim}") + + cdef Array self = cls.__new__(cls) + self._owning = True + self._shape = shape_t + self._format = format + self._num_channels = num_channels + self._surface_load_store = bool(surface_load_store) + self._context = _get_current_context_ptr() + self._device_id = _get_current_device_id() + self._parent_ref = None + + cdef cydriver.CUarray_format c_format = format + cdef cydriver.CUDA_ARRAY3D_DESCRIPTOR desc3d + cdef cydriver.CUDA_ARRAY_DESCRIPTOR desc2d + cdef int rank = len(shape_t) + cdef unsigned int flags = ( + cydriver.CUDA_ARRAY3D_SURFACE_LDST if surface_load_store else 0 + ) + + # cuArrayCreate (2D path) does not accept flags; use the 3D descriptor + # whenever any flag is set or shape is 3D. 
+ if rank == 3 or flags != 0: + memset(&desc3d, 0, sizeof(desc3d)) + desc3d.Width = shape_t[0] + desc3d.Height = (shape_t[1] if rank >= 2 else 0) + desc3d.Depth = (shape_t[2] if rank >= 3 else 0) + desc3d.Format = c_format + desc3d.NumChannels = num_channels + desc3d.Flags = flags + with nogil: + HANDLE_RETURN(cydriver.cuArray3DCreate(&self._handle, &desc3d)) + else: + memset(&desc2d, 0, sizeof(desc2d)) + desc2d.Width = shape_t[0] + desc2d.Height = (shape_t[1] if rank == 2 else 0) + desc2d.Format = c_format + desc2d.NumChannels = num_channels + with nogil: + HANDLE_RETURN(cydriver.cuArrayCreate(&self._handle, &desc2d)) + + return self + + @classmethod + def _from_handle(cls, intptr_t handle, bint owning, *, device_id=None): + """Wrap an externally-allocated ``CUarray``. + + Intended for graphics interop (``cuGraphicsSubResourceGetMappedArray``) + where the array is owned by the graphics API. With ``owning=False``, + :meth:`close` and ``__dealloc__`` will not free the handle. Shape, + format, and channel count are queried from the driver. 
+ """ + cdef Array self = cls.__new__(cls) + self._handle = handle + self._owning = owning + self._context = _get_current_context_ptr() + self._device_id = _get_current_device_id() if device_id is None else int(device_id) + self._parent_ref = None + + cdef cydriver.CUDA_ARRAY3D_DESCRIPTOR desc + with nogil: + HANDLE_RETURN(cydriver.cuArray3DGetDescriptor(&desc, self._handle)) + + if desc.Depth > 0: + self._shape = (int(desc.Width), int(desc.Height), int(desc.Depth)) + elif desc.Height > 0: + self._shape = (int(desc.Width), int(desc.Height)) + else: + self._shape = (int(desc.Width),) + self._format = desc.Format + self._num_channels = desc.NumChannels + self._surface_load_store = bool(desc.Flags & cydriver.CUDA_ARRAY3D_SURFACE_LDST) + return self + + @property + def handle(self): + """The underlying ``CUarray`` as an integer.""" + return self._handle + + @property + def shape(self): + """Allocation shape, in elements.""" + return self._shape + + @property + def format(self): + """The element :class:`ArrayFormat`.""" + return ArrayFormat(self._format) + + @property + def num_channels(self): + """Channels per element (1, 2, or 4).""" + return self._num_channels + + @property + def element_size(self): + """Bytes per element (format size * channels).""" + return _FORMAT_ELEM_SIZE[self._format] * self._num_channels + + @property + def device(self): + """The :class:`Device` this array was allocated on.""" + from cuda.core._device import Device + return Device(self._device_id) + + @property + def surface_load_store(self): + """True if this array was created with ``CUDA_ARRAY3D_SURFACE_LDST`` + and can be bound as a :class:`SurfaceObject`.""" + return self._surface_load_store + + def _extent_bytes(self): + """Return (width_bytes, height, depth) for cuMemcpy3D, with height/depth + normalized to >=1 for lower-rank arrays.""" + cdef int rank = len(self._shape) + cdef size_t w = self._shape[0] * ( + _FORMAT_ELEM_SIZE[self._format] * self._num_channels + ) + cdef size_t h = 
(self._shape[1] if rank >= 2 else 1) + cdef size_t d = (self._shape[2] if rank >= 3 else 1) + return w, h, d + + def copy_from(self, src, *, stream): + """Copy a full-array's worth of data into this array. + + Parameters + ---------- + src : Buffer or buffer-protocol object + Source data. Must contain at least ``self.size_bytes`` bytes + of contiguous data. + stream : Stream + Stream to issue the copy on. + """ + _copy3d(self, src, stream, to_array=True) + + def copy_to(self, dst, *, stream): + """Copy a full-array's worth of data out of this array. + + Parameters + ---------- + dst : Buffer or writable buffer-protocol object + Destination. Must have at least ``self.size_bytes`` bytes of + writable, contiguous space. + stream : Stream + Stream to issue the copy on. + """ + _copy3d(self, dst, stream, to_array=False) + + @property + def size_bytes(self): + """Total bytes of array storage (``prod(shape) * element_size``).""" + cdef size_t n = 1 + for s in self._shape: + n *= s + return n * (_FORMAT_ELEM_SIZE[self._format] * self._num_channels) + + cpdef close(self): + """Destroy the underlying ``CUarray`` if owned by this object.""" + cdef cydriver.CUarray h = self._handle + cdef bint owning = self._owning + self._handle = NULL + # Drop the parent reference (if any) so a non-owning level Array + # stops pinning its MipmappedArray after close(). + self._parent_ref = None + if h != NULL and owning: + HANDLE_RETURN(cydriver.cuArrayDestroy(h)) + + def __dealloc__(self): + # Cython destructors cannot raise; any cuArrayDestroy error here is + # silently dropped. Callers needing visibility should use close(). 
+ if self._handle != NULL and self._owning: + cydriver.cuArrayDestroy(self._handle) + self._handle = NULL + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc, tb): + self.close() + + def __repr__(self): + return ( + f"Array(shape={self._shape}, " + f"format={ArrayFormat(self._format).name}, " + f"num_channels={self._num_channels})" + ) diff --git a/cuda_core/cuda/core/_mipmapped_array.pxd b/cuda_core/cuda/core/_mipmapped_array.pxd new file mode 100644 index 00000000000..52afc1968cc --- /dev/null +++ b/cuda_core/cuda/core/_mipmapped_array.pxd @@ -0,0 +1,22 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from libc.stdint cimport intptr_t +from cuda.bindings cimport cydriver + + +cdef class MipmappedArray: + + cdef: + cydriver.CUmipmappedArray _handle + tuple _shape # (w,), (w, h), or (w, h, d) + cydriver.CUarray_format _format + unsigned int _num_channels # 1, 2, or 4 + unsigned int _num_levels + int _device_id + intptr_t _context + bint _owning + bint _surface_load_store + + cpdef close(self) diff --git a/cuda_core/cuda/core/_mipmapped_array.pyx b/cuda_core/cuda/core/_mipmapped_array.pyx new file mode 100644 index 00000000000..c149d907f62 --- /dev/null +++ b/cuda_core/cuda/core/_mipmapped_array.pyx @@ -0,0 +1,229 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from libc.stdint cimport intptr_t +from libc.string cimport memset + +from cuda.bindings cimport cydriver +from cuda.core._array cimport Array +from cuda.core._array import ArrayFormat +from cuda.core._utils.cuda_utils cimport ( + HANDLE_RETURN, + _get_current_context_ptr, + _get_current_device_id, +) + + +cdef class MipmappedArray: + """A mipmapped CUDA array for texture/surface access across levels. + + Wraps ``CUmipmappedArray``. 
Each mip level is a distinct, hardware-laid-out + allocation accessible only via a :class:`TextureObject` (or by retrieving + the level's :class:`Array` and binding it as a :class:`SurfaceObject`). + Destroying the :class:`MipmappedArray` destroys all level arrays + implicitly, so the :class:`Array` instances returned by :meth:`get_level` + are non-owning and hold a strong reference back to their parent. + + Construct via :meth:`from_descriptor`. + """ + + def __init__(self, *args, **kwargs): + raise RuntimeError( + "MipmappedArray cannot be instantiated directly. " + "Use MipmappedArray.from_descriptor()." + ) + + @classmethod + def from_descriptor( + cls, *, shape, format, num_channels, num_levels, surface_load_store=False + ): + """Allocate a new mipmapped CUDA array. + + Parameters + ---------- + shape : tuple of int + ``(width,)``, ``(width, height)``, or ``(width, height, depth)`` + in elements, for the base (level 0) mip. + format : ArrayFormat + Element format. + num_channels : int + Channels per element. Must be 1, 2, or 4. + num_levels : int + Number of mip levels to allocate; must be >= 1. The driver caps + this at the log2 of the largest dimension; passing a larger value + yields a driver error. + surface_load_store : bool + If True, allocate with ``CUDA_ARRAY3D_SURFACE_LDST`` so individual + levels (obtained via :meth:`get_level`) can be bound as + :class:`SurfaceObject` for kernel-side writes. Default False. 
+ + Returns + ------- + MipmappedArray + """ + if not isinstance(format, ArrayFormat): + raise TypeError(f"format must be an ArrayFormat, got {type(format).__name__}") + if isinstance(num_channels, bool) or num_channels not in (1, 2, 4): + raise ValueError(f"num_channels must be 1, 2, or 4, got {num_channels!r}") + + try: + shape_t = tuple(int(s) for s in shape) + except TypeError as e: + raise TypeError(f"shape must be a tuple of ints, got {type(shape).__name__}") from e + if not 1 <= len(shape_t) <= 3: + raise ValueError(f"shape rank must be 1, 2, or 3, got {len(shape_t)}") + for i, dim in enumerate(shape_t): + if dim < 1: + raise ValueError(f"shape[{i}] must be >= 1, got {dim}") + + levels = int(num_levels) + if levels < 1: + raise ValueError(f"num_levels must be >= 1, got {levels}") + + cdef MipmappedArray self = cls.__new__(cls) + self._owning = True + self._shape = shape_t + self._format = format + self._num_channels = num_channels + self._num_levels = levels + self._surface_load_store = bool(surface_load_store) + self._context = _get_current_context_ptr() + self._device_id = _get_current_device_id() + + cdef cydriver.CUarray_format c_format = format + cdef cydriver.CUDA_ARRAY3D_DESCRIPTOR desc3d + cdef int rank = len(shape_t) + cdef unsigned int flags = ( + cydriver.CUDA_ARRAY3D_SURFACE_LDST if surface_load_store else 0 + ) + cdef unsigned int c_levels = levels + + # Mipmap creation uses the 3D descriptor regardless of rank; lower-rank + # shapes use Height=0/Depth=0 sentinels, matching cuArray3DCreate. 
+ memset(&desc3d, 0, sizeof(desc3d)) + desc3d.Width = shape_t[0] + desc3d.Height = (shape_t[1] if rank >= 2 else 0) + desc3d.Depth = (shape_t[2] if rank >= 3 else 0) + desc3d.Format = c_format + desc3d.NumChannels = num_channels + desc3d.Flags = flags + with nogil: + HANDLE_RETURN( + cydriver.cuMipmappedArrayCreate(&self._handle, &desc3d, c_levels) + ) + + return self + + def get_level(self, level): + """Return a non-owning :class:`Array` view of the given mip level. + + Parameters + ---------- + level : int + Mip level index in ``[0, num_levels)``. + + Returns + ------- + Array + A non-owning :class:`Array` wrapping the level's ``CUarray``. + The :class:`MipmappedArray` is kept alive for the lifetime of the + returned :class:`Array`; the underlying storage is released only + when this :class:`MipmappedArray` is destroyed. + """ + lvl = int(level) + if lvl < 0: + raise ValueError(f"level must be >= 0, got {lvl}") + if lvl >= self._num_levels: + raise ValueError( + f"level ({lvl}) must be < num_levels ({self._num_levels})" + ) + + cdef cydriver.CUarray level_handle + cdef unsigned int c_level = lvl + with nogil: + HANDLE_RETURN( + cydriver.cuMipmappedArrayGetLevel(&level_handle, self._handle, c_level) + ) + + # Wrap as a non-owning Array; the level's underlying CUarray belongs + # to this MipmappedArray and must not be destroyed independently. + arr = Array._from_handle( + level_handle, False, device_id=self._device_id + ) + # Strong ref back to the parent so the mipmap outlives the level view. 
+ (arr)._parent_ref = self + return arr + + @property + def handle(self): + """The underlying ``CUmipmappedArray`` as an integer.""" + return self._handle + + @property + def shape(self): + """Base-level (level 0) allocation shape, in elements.""" + return self._shape + + @property + def format(self): + """The element :class:`ArrayFormat`.""" + return ArrayFormat(self._format) + + @property + def num_channels(self): + """Channels per element (1, 2, or 4).""" + return self._num_channels + + @property + def num_levels(self): + """Number of mip levels.""" + return int(self._num_levels) + + @property + def surface_load_store(self): + """True if this mipmap (and each of its levels) was created with + ``CUDA_ARRAY3D_SURFACE_LDST`` and can back a :class:`SurfaceObject`.""" + return self._surface_load_store + + @property + def device(self): + """The :class:`Device` this mipmap was allocated on.""" + from cuda.core._device import Device + return Device(self._device_id) + + cpdef close(self): + """Destroy the underlying ``CUmipmappedArray`` if owned. + + After ``close()`` any level :class:`Array` returned by :meth:`get_level` + becomes invalid; callers must not access them. + """ + cdef cydriver.CUmipmappedArray h = self._handle + cdef bint owning = self._owning + self._handle = NULL + if h != NULL and owning: + HANDLE_RETURN(cydriver.cuMipmappedArrayDestroy(h)) + + def __dealloc__(self): + # Cython destructors cannot raise; any cuMipmappedArrayDestroy error + # here is silently dropped. Callers needing visibility should use + # close(). 
+ if self._handle != NULL and self._owning: + cydriver.cuMipmappedArrayDestroy(self._handle) + self._handle = NULL + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc, tb): + self.close() + + def __repr__(self): + return ( + f"MipmappedArray(shape={self._shape}, " + f"format={ArrayFormat(self._format).name}, " + f"num_channels={self._num_channels}, " + f"num_levels={self._num_levels})" + ) diff --git a/cuda_core/cuda/core/_surface.pxd b/cuda_core/cuda/core/_surface.pxd new file mode 100644 index 00000000000..ba7791d5172 --- /dev/null +++ b/cuda_core/cuda/core/_surface.pxd @@ -0,0 +1,17 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from libc.stdint cimport intptr_t +from cuda.bindings cimport cydriver + + +cdef class SurfaceObject: + + cdef: + cydriver.CUsurfObject _handle + object _source_ref # keep backing Array alive + int _device_id + intptr_t _context + + cpdef close(self) diff --git a/cuda_core/cuda/core/_surface.pyx b/cuda_core/cuda/core/_surface.pyx new file mode 100644 index 00000000000..62cdecc9a01 --- /dev/null +++ b/cuda_core/cuda/core/_surface.pyx @@ -0,0 +1,133 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from libc.stdint cimport intptr_t +from libc.string cimport memset + +from cuda.bindings cimport cydriver +from cuda.core._array cimport Array +from cuda.core._texture import ResourceDescriptor +from cuda.core._utils.cuda_utils cimport ( + HANDLE_RETURN, + _get_current_context_ptr, + _get_current_device_id, +) + + +cdef class SurfaceObject: + """A bindless surface handle for kernel-side typed load/store. + + Wraps ``cuSurfObjectCreate``. 
Unlike a :class:`TextureObject`, a surface + has no sampling state (no filtering, no addressing modes, no normalization); + kernels read and write through it using integer pixel coordinates. + + The backing :class:`Array` must have been created with + ``surface_load_store=True`` and is kept alive for the lifetime of this + object to prevent dangling handles. + + Construct via :meth:`from_array` or :meth:`from_descriptor`. Passes to + kernels as a 64-bit handle (via the ``handle`` property). + """ + + def __init__(self, *args, **kwargs): + raise RuntimeError( + "SurfaceObject cannot be instantiated directly. " + "Use SurfaceObject.from_array() or SurfaceObject.from_descriptor()." + ) + + @classmethod + def from_array(cls, array): + """Create a surface object directly from an :class:`Array`. + + The array must have been created with ``surface_load_store=True``. + """ + if not isinstance(array, Array): + raise TypeError(f"array must be an Array, got {type(array).__name__}") + return cls.from_descriptor(resource=ResourceDescriptor.from_array(array)) + + @classmethod + def from_descriptor(cls, *, resource): + """Create a surface object from a :class:`ResourceDescriptor`. + + Parameters + ---------- + resource : ResourceDescriptor + Must wrap an :class:`Array` allocated with + ``surface_load_store=True``. Linear/pitch2d resources are not + valid surface backings. 
+ """ + if not isinstance(resource, ResourceDescriptor): + raise TypeError( + f"resource must be a ResourceDescriptor, got " + f"{type(resource).__name__}" + ) + if resource.kind != "array": + raise ValueError( + f"SurfaceObject requires an array-backed ResourceDescriptor, " + f"got kind={resource.kind!r}" + ) + + cdef Array arr = resource.source + if not arr.surface_load_store: + raise ValueError( + "Array must be created with surface_load_store=True to be " + "bound as a SurfaceObject" + ) + + cdef cydriver.CUDA_RESOURCE_DESC res_desc + memset(&res_desc, 0, sizeof(res_desc)) + res_desc.resType = cydriver.CU_RESOURCE_TYPE_ARRAY + res_desc.res.array.hArray = arr._handle + + cdef SurfaceObject self = cls.__new__(cls) + self._source_ref = resource + self._context = _get_current_context_ptr() + self._device_id = _get_current_device_id() + + with nogil: + HANDLE_RETURN( + cydriver.cuSurfObjectCreate(&self._handle, &res_desc) + ) + return self + + @property + def handle(self): + """The underlying ``CUsurfObject`` as an integer (64-bit kernel arg).""" + return self._handle + + @property + def resource(self): + """The :class:`ResourceDescriptor` this surface was built from.""" + return self._source_ref + + @property + def device(self): + from cuda.core._device import Device + return Device(self._device_id) + + cpdef close(self): + """Destroy the underlying ``CUsurfObject``.""" + cdef cydriver.CUsurfObject h = self._handle + self._handle = 0 + self._source_ref = None + if h != 0: + HANDLE_RETURN(cydriver.cuSurfObjectDestroy(h)) + + def __dealloc__(self): + # Cython destructors cannot raise; any cuSurfObjectDestroy error is + # silently dropped. Callers needing visibility should use close(). 
+ if self._handle != 0: + cydriver.cuSurfObjectDestroy(self._handle) + self._handle = 0 + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc, tb): + self.close() + + def __repr__(self): + return f"SurfaceObject(handle=0x{self._handle:x})" diff --git a/cuda_core/cuda/core/_texture.pxd b/cuda_core/cuda/core/_texture.pxd new file mode 100644 index 00000000000..4d2d5004069 --- /dev/null +++ b/cuda_core/cuda/core/_texture.pxd @@ -0,0 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from libc.stdint cimport intptr_t +from cuda.bindings cimport cydriver + + +cdef class TextureObject: + + cdef: + cydriver.CUtexObject _handle + object _source_ref # keep backing Array (or other resource) alive + object _texture_desc # original TextureDescriptor for introspection + int _device_id + intptr_t _context + + cpdef close(self) diff --git a/cuda_core/cuda/core/_texture.pyx b/cuda_core/cuda/core/_texture.pyx new file mode 100644 index 00000000000..6ccffcadbb1 --- /dev/null +++ b/cuda_core/cuda/core/_texture.pyx @@ -0,0 +1,572 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from libc.stdint cimport intptr_t +from libc.string cimport memset + +from cuda.bindings cimport cydriver +from cuda.core._array cimport Array +from cuda.core._array import ArrayFormat, _FORMAT_ELEM_SIZE +from cuda.core._memory._buffer cimport Buffer +from cuda.core._mipmapped_array cimport MipmappedArray +from cuda.core._mipmapped_array import MipmappedArray as _PyMipmappedArray +from cuda.core._utils.cuda_utils cimport ( + HANDLE_RETURN, + _get_current_context_ptr, + _get_current_device_id, +) + +import enum +from dataclasses import dataclass + + +# Driver texture-descriptor flag bits (CU_TRSF_*). 
class ResourceDescriptor:
    """Describes the memory backing a :class:`TextureObject`.

    Construct via the ``from_*`` classmethods:

    - :meth:`from_array` wraps a :class:`Array` (works for both
      :class:`TextureObject` and :class:`SurfaceObject`).
    - :meth:`from_mipmapped_array` wraps a :class:`MipmappedArray` for
      mipmapped texture sampling.
    - :meth:`from_linear` wraps a :class:`Buffer` as a typed 1D fetch. Texture
      objects built from a linear resource do not support filtering,
      normalized coordinates, or addressing modes.
    - :meth:`from_pitch2d` wraps a :class:`Buffer` as a row-pitched 2D image.
      Supports filtering and 2D addressing, but only 2D access.

    Linear, pitch2D, and mipmapped resources cannot back a
    :class:`SurfaceObject` — those require an :class:`Array` allocated with
    ``surface_load_store=True``.
    """

    __slots__ = (
        "_kind", "_source",
        "_format", "_num_channels",
        "_size_bytes",
        "_width", "_height", "_pitch_bytes",
    )

    def __init__(self):
        raise RuntimeError(
            "ResourceDescriptor cannot be instantiated directly. "
            "Use ResourceDescriptor.from_* factories."
        )

    @classmethod
    def _create(
        cls, kind, source, *,
        format=None, num_channels=None, size_bytes=None,
        width=None, height=None, pitch_bytes=None,
    ):
        """Build an instance without running ``__init__`` and set every slot.

        Fields that do not apply to ``kind`` are stored as ``None`` so that
        all properties are readable on every descriptor.
        """
        self = cls.__new__(cls)
        self._kind = kind
        self._source = source
        self._format = format
        self._num_channels = num_channels
        self._size_bytes = size_bytes
        self._width = width
        self._height = height
        self._pitch_bytes = pitch_bytes
        return self

    @staticmethod
    def _element_size(buffer, format, num_channels):
        """Validate the buffer/format/channel arguments shared by the
        buffer-backed factories and return the size of one element in bytes.

        Raises
        ------
        TypeError
            If ``buffer`` is not a :class:`Buffer` or ``format`` is not an
            :class:`ArrayFormat`.
        ValueError
            If ``num_channels`` is a ``bool`` or is not 1, 2, or 4.
        """
        if not isinstance(buffer, Buffer):
            raise TypeError(f"buffer must be a Buffer, got {type(buffer).__name__}")
        if not isinstance(format, ArrayFormat):
            raise TypeError(f"format must be an ArrayFormat, got {type(format).__name__}")
        # bool is an int subclass, so True would otherwise be accepted as 1.
        if isinstance(num_channels, bool) or num_channels not in (1, 2, 4):
            raise ValueError(f"num_channels must be 1, 2, or 4, got {num_channels!r}")
        return _FORMAT_ELEM_SIZE[int(format)] * int(num_channels)

    @classmethod
    def from_array(cls, array):
        """Build a resource descriptor backed by a :class:`Array`.

        Raises
        ------
        TypeError
            If ``array`` is not an :class:`Array`.
        """
        if not isinstance(array, Array):
            raise TypeError(f"array must be an Array, got {type(array).__name__}")
        return cls._create("array", array)

    @classmethod
    def from_mipmapped_array(cls, mipmapped_array):
        """Build a resource descriptor backed by a :class:`MipmappedArray`.

        Suitable for binding to a :class:`TextureObject` for mipmapped
        sampling. Not valid as a :class:`SurfaceObject` backing: surfaces
        require a single :class:`Array` level (obtain via
        :meth:`MipmappedArray.get_level`).

        Raises
        ------
        TypeError
            If ``mipmapped_array`` is not a :class:`MipmappedArray`.
        """
        if not isinstance(mipmapped_array, _PyMipmappedArray):
            raise TypeError(
                f"mipmapped_array must be a MipmappedArray, got "
                f"{type(mipmapped_array).__name__}"
            )
        return cls._create("mipmapped_array", mipmapped_array)

    @classmethod
    def from_linear(cls, buffer, *, format, num_channels, size_bytes=None):
        """Build a resource descriptor for a linear (typed 1D) texture fetch.

        Parameters
        ----------
        buffer : Buffer
            Device-memory backing. Must remain alive for the lifetime of any
            :class:`TextureObject` built from this descriptor.
        format : ArrayFormat
            Element format.
        num_channels : int
            Channels per element. Must be 1, 2, or 4.
        size_bytes : int, optional
            Bytes of ``buffer`` to bind. Defaults to ``buffer.size``. Must not
            exceed it.

        Raises
        ------
        TypeError
            If ``buffer`` or ``format`` has the wrong type.
        ValueError
            If ``num_channels`` is invalid, or the bound size exceeds the
            buffer, is smaller than one element, or is not a whole number of
            elements.

        Notes
        -----
        Texture objects built from a linear resource ignore the
        :class:`TextureDescriptor` addressing/filtering fields — kernels read
        through a typed 1D fetch with bounds checking only.
        """
        elem = cls._element_size(buffer, format, num_channels)

        buf_size = int(buffer.size)
        if size_bytes is None:
            size = buf_size
        else:
            size = int(size_bytes)
            if size > buf_size:
                raise ValueError(
                    f"size_bytes ({size}) exceeds buffer.size ({buf_size})"
                )
        # These two checks also apply to the defaulted size (buffer.size).
        if size < elem:
            raise ValueError(
                f"size_bytes ({size}) must be at least one element ({elem} bytes)"
            )
        if size % elem != 0:
            raise ValueError(
                f"size_bytes ({size}) must be a multiple of element size "
                f"({elem} bytes for {format.name} x {num_channels})"
            )

        return cls._create(
            "linear", buffer,
            format=int(format),
            num_channels=int(num_channels),
            size_bytes=size,
        )

    @classmethod
    def from_pitch2d(
        cls, buffer, *, format, num_channels, width, height, pitch_bytes
    ):
        """Build a resource descriptor for a row-pitched 2D image.

        Parameters
        ----------
        buffer : Buffer
            Device-memory backing. Must remain alive for the lifetime of any
            :class:`TextureObject` built from this descriptor.
        format : ArrayFormat
            Element format.
        num_channels : int
            Channels per element. Must be 1, 2, or 4.
        width : int
            Image width, in elements.
        height : int
            Image height, in rows.
        pitch_bytes : int
            Distance between consecutive rows, in bytes. Must be at least
            ``width * format_size * num_channels`` and meet the driver's
            ``CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT``.

        Raises
        ------
        TypeError
            If ``buffer`` or ``format`` has the wrong type.
        ValueError
            If ``num_channels`` is invalid, ``width`` or ``height`` is below
            1, ``pitch_bytes`` is smaller than one row, or
            ``pitch_bytes * height`` exceeds ``buffer.size``. Pitch alignment
            is not checked here.
        """
        elem = cls._element_size(buffer, format, num_channels)

        w = int(width)
        h = int(height)
        p = int(pitch_bytes)
        if w < 1:
            raise ValueError(f"width must be >= 1, got {w}")
        if h < 1:
            raise ValueError(f"height must be >= 1, got {h}")
        min_pitch = w * elem
        if p < min_pitch:
            raise ValueError(
                f"pitch_bytes ({p}) must be >= width * element_size ({min_pitch})"
            )
        buf_size = int(buffer.size)
        if p * h > buf_size:
            raise ValueError(
                f"pitch_bytes * height ({p * h}) exceeds buffer.size ({buf_size})"
            )

        return cls._create(
            "pitch2d", buffer,
            format=int(format),
            num_channels=int(num_channels),
            width=w,
            height=h,
            pitch_bytes=p,
        )

    @property
    def kind(self):
        """Resource kind: ``"array"``, ``"mipmapped_array"``, ``"linear"``,
        or ``"pitch2d"``."""
        return self._kind

    @property
    def source(self):
        """The backing object passed to the ``from_*`` factory."""
        return self._source

    @property
    def format(self):
        """The element :class:`ArrayFormat` (``None`` for array- and
        mipmapped-array-backed descriptors)."""
        return None if self._format is None else ArrayFormat(self._format)

    @property
    def num_channels(self):
        """Channels per element (``None`` for array- and
        mipmapped-array-backed descriptors)."""
        return self._num_channels

    @property
    def size_bytes(self):
        """Bytes bound for a linear resource (``None`` for other kinds)."""
        return self._size_bytes

    @property
    def width(self):
        """Pitch2D image width, in elements (``None`` for other kinds)."""
        return self._width

    @property
    def height(self):
        """Pitch2D image height, in rows (``None`` for other kinds)."""
        return self._height

    @property
    def pitch_bytes(self):
        """Pitch2D row pitch, in bytes (``None`` for other kinds)."""
        return self._pitch_bytes

    def __repr__(self):
        if self._kind == "linear":
            return (
                f"ResourceDescriptor(kind='linear', format={self.format.name}, "
                f"num_channels={self._num_channels}, size_bytes={self._size_bytes})"
            )
        if self._kind == "pitch2d":
            return (
                f"ResourceDescriptor(kind='pitch2d', format={self.format.name}, "
                f"num_channels={self._num_channels}, "
                f"width={self._width}, height={self._height}, "
                f"pitch_bytes={self._pitch_bytes})"
            )
        return f"ResourceDescriptor(kind={self._kind!r})"
def _normalize_address_modes(address_mode):
    """Expand ``address_mode`` into exactly three :class:`AddressMode` values.

    A single :class:`AddressMode` is applied to all three axes. Any other
    iterable must yield 1-3 :class:`AddressMode` entries; missing trailing
    axes reuse the last entry supplied.

    Raises
    ------
    TypeError
        If ``address_mode`` is neither an :class:`AddressMode` nor iterable,
        or if any entry is not an :class:`AddressMode`.
    ValueError
        If the iterable has fewer than 1 or more than 3 entries.
    """
    if isinstance(address_mode, AddressMode):
        return (address_mode,) * 3

    try:
        entries = tuple(address_mode)
    except TypeError as err:
        raise TypeError(
            "address_mode must be an AddressMode or a tuple of AddressMode"
        ) from err

    count = len(entries)
    if count < 1 or count > 3:
        raise ValueError(
            f"address_mode tuple must have 1-3 entries, got {count}"
        )

    for index, entry in enumerate(entries):
        if isinstance(entry, AddressMode):
            continue
        raise TypeError(
            f"address_mode[{index}] must be an AddressMode, got {type(entry).__name__}"
        )

    # Fill the unspecified axes with the last mode given.
    return entries + (entries[-1],) * (3 - count)
    @classmethod
    def from_descriptor(cls, *, resource, texture_descriptor):
        """Create a texture object from a resource + sampling descriptor.

        Both arguments are keyword-only. The descriptors are translated into
        zero-initialised ``CUDA_RESOURCE_DESC`` / ``CUDA_TEXTURE_DESC``
        structs and passed to ``cuTexObjectCreate``.

        Parameters
        ----------
        resource : ResourceDescriptor
            Memory backing the texture. A reference to it is stored on the
            returned object.
        texture_descriptor : TextureDescriptor
            Sampling state. A reference to it is stored on the returned
            object.

        Returns
        -------
        TextureObject

        Raises
        ------
        TypeError
            If ``resource`` or ``texture_descriptor`` has the wrong type, or
            if ``address_mode``, ``filter_mode``, ``read_mode``, or
            ``mipmap_filter_mode`` is not the expected enum.
        ValueError
            If ``address_mode`` does not have 1-3 entries,
            ``max_anisotropy`` is negative, or ``border_color`` does not
            have exactly 4 elements.
        NotImplementedError
            If ``resource.kind`` is not one of the handled kinds.
        RuntimeError
            If no CUDA context is current on the calling thread.

        Driver failures are reported through ``HANDLE_RETURN``.
        """
        if not isinstance(resource, ResourceDescriptor):
            raise TypeError(
                f"resource must be a ResourceDescriptor, got "
                f"{type(resource).__name__}"
            )
        if not isinstance(texture_descriptor, TextureDescriptor):
            raise TypeError(
                f"texture_descriptor must be a TextureDescriptor, got "
                f"{type(texture_descriptor).__name__}"
            )

        # Zero both structs so every field not set below is 0.
        cdef cydriver.CUDA_RESOURCE_DESC res_desc
        cdef cydriver.CUDA_TEXTURE_DESC tex_desc
        memset(&res_desc, 0, sizeof(res_desc))
        memset(&tex_desc, 0, sizeof(tex_desc))

        # --- Resource descriptor ---
        # Typed locals give direct access to the cdef attributes
        # (``_handle``) of the backing objects.
        cdef Array arr
        cdef MipmappedArray mip
        cdef Buffer buf
        cdef intptr_t devptr
        if resource.kind == "array":
            arr = resource.source
            res_desc.resType = cydriver.CU_RESOURCE_TYPE_ARRAY
            res_desc.res.array.hArray = arr._handle
        elif resource.kind == "mipmapped_array":
            mip = resource.source
            res_desc.resType = cydriver.CU_RESOURCE_TYPE_MIPMAPPED_ARRAY
            res_desc.res.mipmap.hMipmappedArray = mip._handle
        elif resource.kind == "linear":
            buf = resource.source
            devptr = int(buf.handle)
            res_desc.resType = cydriver.CU_RESOURCE_TYPE_LINEAR
            res_desc.res.linear.devPtr = devptr
            res_desc.res.linear.format = resource._format
            res_desc.res.linear.numChannels = resource._num_channels
            res_desc.res.linear.sizeInBytes = resource._size_bytes
        elif resource.kind == "pitch2d":
            buf = resource.source
            devptr = int(buf.handle)
            res_desc.resType = cydriver.CU_RESOURCE_TYPE_PITCH2D
            res_desc.res.pitch2D.devPtr = devptr
            res_desc.res.pitch2D.format = resource._format
            res_desc.res.pitch2D.numChannels = resource._num_channels
            res_desc.res.pitch2D.width = resource._width
            res_desc.res.pitch2D.height = resource._height
            res_desc.res.pitch2D.pitchInBytes = resource._pitch_bytes
        else:
            raise NotImplementedError(
                f"ResourceDescriptor kind {resource.kind!r} is not yet supported"
            )

        # --- Texture descriptor ---
        # Always a 3-tuple: a scalar or short tuple is padded by the helper.
        modes = _normalize_address_modes(texture_descriptor.address_mode)
        tex_desc.addressMode[0] = modes[0]
        tex_desc.addressMode[1] = modes[1]
        tex_desc.addressMode[2] = modes[2]

        if not isinstance(texture_descriptor.filter_mode, FilterMode):
            raise TypeError(
                f"filter_mode must be a FilterMode, got "
                f"{type(texture_descriptor.filter_mode).__name__}"
            )
        tex_desc.filterMode = texture_descriptor.filter_mode

        if not isinstance(texture_descriptor.read_mode, ReadMode):
            raise TypeError(
                f"read_mode must be a ReadMode, got "
                f"{type(texture_descriptor.read_mode).__name__}"
            )

        cdef unsigned int flags = 0
        # CU_TRSF_READ_AS_INTEGER suppresses normalization, so it maps to
        # ReadMode.ELEMENT_TYPE.
        if texture_descriptor.read_mode == ReadMode.ELEMENT_TYPE:
            flags |= _TRSF_READ_AS_INTEGER
        if texture_descriptor.normalized_coords:
            flags |= _TRSF_NORMALIZED_COORDINATES
        if texture_descriptor.srgb:
            flags |= _TRSF_SRGB
        if texture_descriptor.disable_trilinear_optimization:
            flags |= _TRSF_DISABLE_TRILINEAR_OPTIMIZATION
        if texture_descriptor.seamless_cubemap:
            flags |= _TRSF_SEAMLESS_CUBEMAP
        tex_desc.flags = flags

        if texture_descriptor.max_anisotropy < 0:
            raise ValueError("max_anisotropy must be >= 0")
        tex_desc.maxAnisotropy = texture_descriptor.max_anisotropy

        if not isinstance(texture_descriptor.mipmap_filter_mode, FilterMode):
            raise TypeError(
                f"mipmap_filter_mode must be a FilterMode, got "
                f"{type(texture_descriptor.mipmap_filter_mode).__name__}"
            )
        tex_desc.mipmapFilterMode = texture_descriptor.mipmap_filter_mode
        tex_desc.mipmapLevelBias = texture_descriptor.mipmap_level_bias
        tex_desc.minMipmapLevelClamp = texture_descriptor.min_mipmap_level_clamp
        tex_desc.maxMipmapLevelClamp = texture_descriptor.max_mipmap_level_clamp

        cdef int i
        if texture_descriptor.border_color is None:
            # No border colour given: use zero for all four components.
            for i in range(4):
                tex_desc.borderColor[i] = 0.0
        else:
            bc = tuple(texture_descriptor.border_color)
            if len(bc) != 4:
                raise ValueError(
                    f"border_color must have 4 elements, got {len(bc)}"
                )
            for i in range(4):
                tex_desc.borderColor[i] = bc[i]

        # Allocate via __new__ because __init__ deliberately raises.
        cdef TextureObject self = cls.__new__(cls)
        # Holding the descriptor also holds its ``source`` (the backing
        # memory) for as long as this object references it.
        self._source_ref = resource
        self._texture_desc = texture_descriptor
        # Record the context/device that are current at creation time; the
        # context helper raises RuntimeError when none is bound.
        self._context = _get_current_context_ptr()
        self._device_id = _get_current_device_id()

        with nogil:
            HANDLE_RETURN(
                cydriver.cuTexObjectCreate(&self._handle, &res_desc, &tex_desc, NULL)
            )
        return self
cdef intptr_t _get_current_context_ptr() except? 0:
    """Return the current thread's bound CUcontext as an intptr_t.

    Queries ``cuCtxGetCurrent`` with the GIL released.

    Raises ``RuntimeError`` if no context is current. A failing driver call
    is reported through ``HANDLE_RETURN``.
    """
    # Start from NULL so the "no context" check below never reads an
    # indeterminate value.
    cdef cydriver.CUcontext ctx = NULL
    with nogil:
        HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx))
    if ctx == NULL:
        raise RuntimeError("an active CUDA context is required")
    # CUcontext is a pointer type (it is compared against NULL above);
    # Cython does not coerce pointers to integers implicitly, so the
    # conversion to intptr_t must be an explicit cast.
    return <intptr_t>ctx
All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +# ################################################################################ +# +# This example demonstrates cuda.core.Array, TextureObject, and SurfaceObject +# in combination with GraphicsResource for CUDA/OpenGL interop: a classic +# "Doom-style" procedural fire effect. A scalar heat field lives on a +# ping-ponged single-channel UINT8 CUDA Array; each frame the field is advected +# upward with a horizontal jitter and a small decay, then colorized through a +# 1D fire-palette TextureObject straight into an OpenGL PBO. Requires pyglet. +# +# ################################################################################ + +# What this example teaches +# ========================= +# - How to combine a 2D UINT8 Array (the heat field) and a 1D RGBA8 Array (the +# color palette) under the same texture/surface API. +# - How to ping-pong a scalar field via Array + SurfaceObject writes and +# TextureObject reads, similar to the reaction-diffusion example but with a +# single channel. +# - How to use TextureObject(NORMALIZED_FLOAT) on a UINT8 palette so a +# tex1D lookup returns RGBA in [0, 1] -- no manual unpacking needed. +# - How to wire mouse / keyboard events into a CUDA simulation without +# blocking the event loop. +# +# How it works +# ============ +# The heat field is a WIDTH x HEIGHT grid of integer intensities in +# [0, MAX_INTENSITY] (0..36), stored as UINT8. Each frame we: +# +# 1. step kernel: for every pixel, +# - if y is near the bottom AND ambient injection is on, write random +# high heat ("the embers"); +# - if the mouse button is held, paint a hot disc near the cursor; +# - otherwise read a horizontally-jittered sample from the row "below" +# (i.e. one texel toward the bottom of the screen) and subtract a +# small decay. This is what creates the upward-flickering motion. +# 2. colorize kernel: per pixel, sample the heat, look it up in a 1D RGBA8 +# fire palette via tex1D, and write RGBA bytes into the PBO.
+# +# PING-PONG (two single-channel UINT8 Arrays) +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +-------------+ tex2D +-------------+ +# | heat_a | ----------------> | | +# | (UINT8 x1) | | step_fire | +# +-------------+ | kernel | +# | | +# +-------------+ surf2Dwrite | | +# | heat_b | <---------------- | | +# | (UINT8 x1) | +-------------+ +# +-------------+ +# (swap) +# +# Orientation +# ----------- +# OpenGL displays texel row 0 at the bottom of the window. The fullscreen quad +# in create_display_resources() flips t so that kernel y=0 lands at the TOP of +# the screen -- this lets the kernel keep the intuitive "inject at y = h-1, +# advect from y+1 -> y" convention while the visible flames rise upward. +# Mouse coordinates from pyglet (y=0 at window bottom) are flipped to the +# kernel's y-down convention on entry. +# +# surf2Dwrite x-in-bytes +# ---------------------- +# `surf2Dwrite` takes the x coordinate in BYTES, not in elements. The heat +# surfaces here are single-channel UINT8, so the byte offset is `x * 1`; a +# float surface would instead need `x * sizeof(float)` = `x * 4`. Getting this +# wrong silently writes to the wrong columns. +# +# What you should see +# =================== +# A flickering wall of doom-style fire rising from the bottom of the window. +# Hold the mouse button and drag to paint a torch of heat at the cursor. +# Press SPACE to toggle the ambient embers along the bottom row (the fire +# will die out when ambient is OFF). Press R to clear the heat field. +# Press Escape or close the window to exit. The window title shows FPS and +# whether ambient injection is currently on.
+# + +# /// script +# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"] +# /// + +import ctypes +import sys +import time + +import numpy as np + +from cuda.core import ( + AddressMode, + Array, + ArrayFormat, + Device, + FilterMode, + GraphicsResource, + LaunchConfig, + Program, + ProgramOptions, + ReadMode, + ResourceDescriptor, + SurfaceObject, + TextureDescriptor, + TextureObject, + launch, +) + +# --------------------------------------------------------------------------- +# Simulation parameters (feel free to change these) +# --------------------------------------------------------------------------- +# Window dimensions (what the user sees). +WINDOW_WIDTH = 640 +WINDOW_HEIGHT = 480 + +# Simulation dimensions (the heat-field grid). Doom's actual screen was +# 320x200; we use 320x100 so the canonical decay rate of ~1 intensity unit +# per row (random {0, 1, 2}, average 1) produces flames that reach ~36% of +# the screen height -- the recognizable "tall licking flames" look. +# NEAREST-filtered upscale to the 640x480 window stretches vertically 4.8x, +# giving the chunky retro pixel-doubled appearance. +WIDTH = 320 +HEIGHT = 100 + +# Canonical Doom fire palette: 37 hand-tuned colors (intensity 0..36 -> RGB). +# Source: https://github.com/tiagomenegaz/doom-fire (and Fabien Sanglard's +# analysis of the original PSX Doom fire effect). +PALETTE_SIZE = 37 +MAX_INTENSITY = 36 +TORCH_RADIUS = 12 # pixel radius of the mouse-painted hot disc (sim space) + + +# ============================= Helper functions ============================= +# +# The functions below set up CUDA and OpenGL. If you're here to learn about +# Array/TextureObject/SurfaceObject, skip ahead to main() -- the interesting +# part is there. These helpers exist so that main() reads like a short story +# instead of a wall of boilerplate. 
def setup_cuda():
    """Initialise device 0 and build everything needed to launch the kernels.

    Makes device 0 current, exits the process with status 1 (after printing
    to stderr) when its compute capability major version is below 3,
    creates a stream, compiles ``KERNEL_SOURCE`` to a cubin, and prepares a
    launch configuration that covers a WIDTH x HEIGHT grid with 16x16
    thread blocks.

    Returns (device, stream, kernels, configs), where ``kernels`` and
    ``configs`` are dicts keyed by ``"step"`` and ``"colorize"``.
    """
    device = Device(0)
    device.set_current()

    # Surface load/store itself dates back to SM 2.0, but the bindless
    # surface objects (cuSurfObjectCreate) used here need SM 3.0+.
    capability = device.compute_capability
    if capability.major < 3:
        print(
            "This example requires a GPU with compute capability >= 3.0 for "
            f"bindless surface objects. Found sm_{capability.major}{capability.minor}.",
            file=sys.stderr,
        )
        sys.exit(1)

    stream = device.create_stream()

    # Short key -> kernel symbol in the compiled module.
    entry_points = {"step": "step_fire", "colorize": "colorize_fire"}

    # Built as C++ so that the templated tex1D / tex2D overloads resolve.
    options = ProgramOptions(std="c++17", arch=f"sm_{device.arch}")
    module = Program(KERNEL_SOURCE, code_type="c++", options=options).compile(
        "cubin",
        name_expressions=tuple(entry_points.values()),
    )
    kernels = {key: module.get_kernel(symbol) for key, symbol in entry_points.items()}

    # One thread per pixel; round the grid up so partial tiles are covered.
    block = (16, 16, 1)
    grid = (-(-WIDTH // block[0]), -(-HEIGHT // block[1]), 1)
    shared_config = LaunchConfig(grid=grid, block=block)
    # Both kernels run over the same WIDTH x HEIGHT domain, so they share
    # a single LaunchConfig instance.
    configs = dict.fromkeys(("step", "colorize"), shared_config)

    return device, stream, kernels, configs
+ + Standard OpenGL boilerplate for a textured fullscreen quad. The texcoord + `t` is flipped versus the plasma example so that kernel y=0 lands at the + TOP of the screen. That lets the fire kernel keep the intuitive + "inject at the largest y, advect upward" convention while the visible + flames rise toward the top. + + Returns (shader_program, vertex_array_id, texture_id). + """ + from pyglet.graphics.shader import Shader, ShaderProgram + + vert = Shader(VERTEX_SHADER_SOURCE, "vertex") + frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment") + shader_prog = ShaderProgram(vert, frag) + + # Fullscreen quad (two triangles covering the entire window). Note the + # flipped t coordinates compared to gl_interop_plasma: (-1, -1) gets t=1 + # so screen-bottom samples the kernel's largest-y row. + quad_verts = np.array( + [ + # x, y, s, t (position + texture coordinate) + -1, -1, 0, 1, + 1, -1, 1, 1, + 1, 1, 1, 0, + -1, -1, 0, 1, + 1, 1, 1, 0, + -1, 1, 0, 0, + ], + dtype=np.float32, + ) + + vao = ctypes.c_uint(0) + gl.glGenVertexArrays(1, ctypes.byref(vao)) + gl.glBindVertexArray(vao.value) + + vbo = ctypes.c_uint(0) + gl.glGenBuffers(1, ctypes.byref(vbo)) + gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value) + gl.glBufferData( + gl.GL_ARRAY_BUFFER, + quad_verts.nbytes, + quad_verts.ctypes.data_as(ctypes.c_void_p), + gl.GL_STATIC_DRAW, + ) + + stride = 4 * 4 # 4 floats * 4 bytes each = 16 bytes per vertex + pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position") + gl.glEnableVertexAttribArray(pos_loc) + gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0)) + + tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord") + gl.glEnableVertexAttribArray(tc_loc) + gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8)) + + gl.glBindVertexArray(0) + + # Empty texture (filled each frame from the PBO). 
def create_pixel_buffer(gl, width, height):
    """Create a Pixel Buffer Object (PBO) -- the bridge between CUDA and OpenGL.

    Generates one GL buffer, allocates ``width * height * 4`` bytes of
    uninitialised storage for it on the ``GL_PIXEL_UNPACK_BUFFER`` target
    with ``GL_DYNAMIC_DRAW`` usage, and leaves that target unbound again.

    Returns (pbo_gl_name, size_in_bytes).
    """
    target = gl.GL_PIXEL_UNPACK_BUFFER
    size_in_bytes = width * height * 4  # RGBA, 1 byte per channel

    buffer_name = ctypes.c_uint(0)
    gl.glGenBuffers(1, ctypes.byref(buffer_name))

    gl.glBindBuffer(target, buffer_name.value)
    # A None data pointer allocates storage without uploading anything.
    gl.glBufferData(target, size_in_bytes, None, gl.GL_DYNAMIC_DRAW)
    gl.glBindBuffer(target, 0)

    return buffer_name.value, size_in_bytes
def make_heat_texture(arr):
    """Wrap `arr` in a TextureObject that samples with POINT + CLAMP.

    The texture returns raw element values (ELEMENT_TYPE) and is addressed
    with non-normalized coordinates. POINT filtering is what gives Doom
    fire its chunky retro look; LINEAR would smooth the per-frame
    horizontal jitter into a uniform glow that doesn't read as fire.
    """
    backing = ResourceDescriptor.from_array(arr)
    sampling_options = {
        "address_mode": AddressMode.CLAMP,
        "filter_mode": FilterMode.POINT,
        "read_mode": ReadMode.ELEMENT_TYPE,
        # Non-normalized: the step kernel addresses texels in pixel space.
        "normalized_coords": False,
    }
    sampling = TextureDescriptor(**sampling_options)
    return TextureObject.from_descriptor(
        resource=backing,
        texture_descriptor=sampling,
    )
def make_palette_array_and_texture(stream):
    """Allocate the 1D RGBA8 palette Array, upload, and bind as a texture.

    The palette from build_fire_palette() is copied into a PALETTE_SIZE-wide,
    4-channel UINT8 Array on `stream`, then wrapped in a TextureObject that
    uses CLAMP addressing, POINT filtering, NORMALIZED_FLOAT reads, and
    normalized coordinates.

    Returns (palette_array, palette_texture). Both must be closed by the
    caller (or used inside `with` blocks).
    """
    palette = build_fire_palette()  # shape (PALETTE_SIZE, 4), uint8
    arr = Array.from_descriptor(
        shape=(PALETTE_SIZE,),
        format=ArrayFormat.UINT8,
        num_channels=4,
    )
    # 1D Array bytes match a flat (PALETTE_SIZE * 4) uint8 buffer.
    arr.copy_from(np.ascontiguousarray(palette), stream=stream)

    res_desc = ResourceDescriptor.from_array(arr)
    tex_desc = TextureDescriptor(
        address_mode=AddressMode.CLAMP,
        # POINT keeps the palette stops as discrete color bands -- the
        # classic Doom fire palette is indexed, not gradient-blended.
        filter_mode=FilterMode.POINT,
        # NORMALIZED_FLOAT: a tex1D fetch returns each UINT8 channel as a
        # float in [0, 1], so the colorize kernel can multiply by 255 and
        # store directly without manual unpacking.
        read_mode=ReadMode.NORMALIZED_FLOAT,
        # Normalized: the lookup coordinate is expressed in [0, 1] rather
        # than in texel indices. Because the filter above is POINT, each
        # coordinate selects a single palette entry; adjacent entries are
        # NOT blended (that would require FilterMode.LINEAR).
        normalized_coords=True,
    )
    tex = TextureObject.from_descriptor(resource=res_desc, texture_descriptor=tex_desc)
    return arr, tex
+ state = { + "current": "a", # which array holds the latest heat field + "frame_index": 0, # passed into the step kernel as `t` + "ambient": True, # SPACE toggles bottom-row injection + "mouse_down": False, + "mouse_x": 0, + "mouse_y": 0, + } + + def current_read_write(): + if state["current"] == "a": + return tex_a, surf_b, "b" # read a, write b, next current = b + return tex_b, surf_a, "a" + + def clear_field(): + """Zero both heat arrays and seed the bottom row at full intensity. + + Array.copy_from is the simplest reset path -- a dedicated clear + kernel would be faster but is unnecessary for an interactive demo. + The bottom row is set to MAX_INTENSITY so the very first frame + already has a fire source to advect from. + """ + seed = np.zeros((HEIGHT, WIDTH), dtype=np.uint8) + seed[HEIGHT - 1, :] = MAX_INTENSITY # canonical Doom fire source + arr_a.copy_from(np.ascontiguousarray(seed), stream=stream) + arr_b.copy_from(np.ascontiguousarray(seed), stream=stream) + state["current"] = "a" + + # Seed at startup so frame 1 already has a source row. + clear_field() + stream.sync() + + # --- Step 7: Render loop --- + start_time = time.monotonic() + frame_count = 0 + fps_time = start_time + + @window.event + def on_key_press(symbol, _modifiers): + key = pyglet.window.key + if symbol == key.ESCAPE: + window.close() + return + if symbol == key.SPACE: + state["ambient"] = not state["ambient"] + return + if symbol == key.R: + clear_field() + return + + # Map window coords (WINDOW_WIDTH x WINDOW_HEIGHT, y=0 at bottom) to + # simulation coords (WIDTH x HEIGHT, y=0 at top). 
+ def _window_to_sim(x, y): + sx = int(x * WIDTH / WINDOW_WIDTH) + sy = int((WINDOW_HEIGHT - 1 - y) * HEIGHT / WINDOW_HEIGHT) + return sx, sy + + @window.event + def on_mouse_press(x, y, _button, _modifiers): + state["mouse_down"] = True + state["mouse_x"], state["mouse_y"] = _window_to_sim(x, y) + + @window.event + def on_mouse_release(_x, _y, _button, _modifiers): + state["mouse_down"] = False + + @window.event + def on_mouse_drag(x, y, _dx, _dy, _buttons, _modifiers): + state["mouse_down"] = True + state["mouse_x"], state["mouse_y"] = _window_to_sim(x, y) + + @window.event + def on_draw(): + nonlocal frame_count, fps_time + + window.clear() + + # (a) Advance the heat field by one step. + tex_read, surf_write, next_current = current_read_write() + launch( + stream, + configs["step"], + kernels["step"], + np.uint64(tex_read.handle), + np.uint64(surf_write.handle), + np.int32(WIDTH), + np.int32(HEIGHT), + np.uint32(state["frame_index"]), + np.int32(state["mouse_x"]), + np.int32(state["mouse_y"]), + np.int32(1 if state["mouse_down"] else 0), + np.int32(1 if state["ambient"] else 0), + ) + state["current"] = next_current + state["frame_index"] += 1 + + # (b) Colorize the latest state into the OpenGL PBO. + tex_heat = tex_a if state["current"] == "a" else tex_b + with resource.map(stream=stream) as buf: + launch( + stream, + configs["colorize"], + kernels["colorize"], + np.uint64(tex_heat.handle), + np.uint64(palette_tex.handle), + buf.handle, + np.int32(WIDTH), + np.int32(HEIGHT), + ) + # Unmap happens automatically when the `with` block exits. + + # (c) Tell OpenGL to copy the PBO contents into our texture. + copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT) + + # (d) Draw the texture to the screen. 
+ draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id) + + # FPS counter (shown in window title) + frame_count += 1 + now = time.monotonic() + if now - fps_time >= 1.0: + fps = frame_count / (now - fps_time) + ambient_label = "on" if state["ambient"] else "off" + window.set_caption( + "cuda.core Array/Texture/Surface - Doom Fire" + f" ({WIDTH}x{HEIGHT}, {fps:.0f} FPS," + f" ambient {ambient_label})" + ) + frame_count = 0 + fps_time = now + + @window.event + def on_close(): + # Release everything we opened, in reverse order. Each of these is a + # context manager too, but pyglet owns the event loop here so we + # release explicitly to be deterministic about ordering. + resource.close() + tex_a.close() + tex_b.close() + surf_a.close() + surf_b.close() + palette_tex.close() + palette_arr.close() + arr_a.close() + arr_b.close() + stream.close() + + pyglet.app.run(interval=0) + + +# ======================== GPU code (CUDA + GLSL) ============================ +# +# These source strings are kept at the bottom of the file so they don't +# distract from the Python logic above. The important things to know: +# +# - KERNEL_SOURCE contains two CUDA C++ kernels: +# * step_fire -- advances the heat field. Reads previous state via a +# TextureObject (LINEAR + CLAMP, non-normalized) and +# writes the next state via a SurfaceObject. Bakes +# the bottom-row injection, mouse torch, and upward +# jittered advection into a single pass. +# * colorize_fire -- per pixel: read heat from the heat TextureObject, +# look up the fire palette via tex1D, write +# RGBA bytes to the OpenGL PBO. +# +# - VERTEX_SHADER_SOURCE / FRAGMENT_SHADER_SOURCE are GLSL. They draw a +# texture onto a rectangle covering the entire window. The quad's t +# coordinate is flipped versus the plasma example so that y=0 maps to the +# top of the screen (see create_display_resources for why). 
#
# ============================================================================

# CUDA C++ source for the two Doom-fire kernels (step_fire, colorize_fire).
# Texture-object fetches are function templates whose return type cannot be
# deduced from the arguments, so every tex1D/tex2D call names its element
# type explicitly.
KERNEL_SOURCE = r"""
// Small, deterministic, GPU-friendly hash. Returns a value in [0, 1).
// Drives the per-pixel {0, 1, 2} decay/shift in step_fire, which gives the
// fire its characteristic horizontal flicker. (The bottom source row does
// not use it: that row is written with a fixed intensity.)
__device__ __forceinline__ float hash3(unsigned int x, unsigned int y,
                                       unsigned int t) {
    unsigned int h = x * 374761393u + y * 668265263u + t * 2246822519u;
    h = (h ^ (h >> 13)) * 1274126177u;
    h ^= (h >> 16);
    return (float)(h & 0x00ffffffu) / (float)0x01000000u;
}

// Canonical Doom-fire step (gather form of the original scatter algorithm).
//
// Reference scatter (one cell per JS source row):
//     decay = random in {0, 1, 2}
//     below = state[x, y+1]
//     new   = max(0, below - decay)
//     state[x - decay, y] = new   // writes LEFT of source -> leftward lean
//
// Equivalent gather (one CUDA thread per destination cell):
//     decay = hash(x, y, t) in {0, 1, 2}
//     below = state[x + decay, y+1]   // reads from the right-shifted source
//     new   = max(0, below - decay)
//     state[x, y] = new
//
// The right-shifted gather reads the same data the leftward-shifted scatter
// would have produced.

extern "C"
__global__
void step_fire(cudaTextureObject_t tex_read,
               cudaSurfaceObject_t surf_write,
               int width, int height,
               unsigned int t,
               int mouse_x, int mouse_y, int mouse_active,
               int ambient_on) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;

    const int MAX_I = 36;

    // 1) Mouse torch: a hot disc painted at the cursor (overrides everything).
    if (mouse_active) {
        int dx = x - mouse_x;
        int dy = y - mouse_y;
        if (dx * dx + dy * dy <= 12 * 12) {  // matches host TORCH_RADIUS
            surf2Dwrite((unsigned char)MAX_I, surf_write, x, y);
            return;
        }
    }

    // 2) Bottom row is the steady fire source. Hardcoded to MAX_I when the
    //    ambient ember bed is on; zero otherwise (lets the fire die down).
    if (y == height - 1) {
        surf2Dwrite((unsigned char)(ambient_on ? MAX_I : 0),
                    surf_write, x, y);
        return;
    }

    // 3) Gather from the row below with random {0, 1, 2} horizontal shift
    //    and matching intensity decay -- the canonical Doom-fire update.
    float jitter_h = hash3((unsigned int)x, (unsigned int)y, t);
    int decay = (int)(jitter_h * 3.0f);  // 0, 1, or 2
    int src_x = x + decay;
    if (src_x >= width) src_x = width - 1;
    // Explicit <unsigned char>: the fetch returns the raw intensity byte.
    unsigned char below = tex2D<unsigned char>(tex_read,
                                               (float)src_x + 0.5f,
                                               (float)y + 1.5f);
    int new_i = (int)below - decay;
    if (new_i < 0) new_i = 0;

    // UINT8 is 1 byte, so surf2Dwrite's x argument is already the byte offset.
    surf2Dwrite((unsigned char)new_i, surf_write, x, y);
}

extern "C"
__global__
void colorize_fire(cudaTextureObject_t tex_heat,
                   cudaTextureObject_t palette_tex,
                   unsigned char* output,
                   int width, int height) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;

    // Heat texture is UINT8 + ELEMENT_TYPE: tex2D<unsigned char> returns the
    // raw intensity byte (0..36).
    unsigned char h = tex2D<unsigned char>(tex_heat,
                                           (float)x + 0.5f,
                                           (float)y + 0.5f);

    // Palette texture is 1D normalized RGBA8 with POINT filtering and 37
    // entries. Index i lands at coord (i + 0.5) / 37 -- the texel center,
    // which POINT samples exactly.
    const float palette_size = 37.0f;
    float u = ((float)h + 0.5f) / palette_size;
    // NORMALIZED_FLOAT read of a 4-channel UINT8 texel -> float4 in [0, 1].
    float4 c = tex1D<float4>(palette_tex, u);

    // Round to nearest when converting back to bytes: byte/255 * 255 can come
    // out a hair below the integer in float, and plain truncation would then
    // land one below the stored palette value.
    int idx = (y * width + x) * 4;
    output[idx + 0] = (unsigned char)(c.x * 255.0f + 0.5f);
    output[idx + 1] = (unsigned char)(c.y * 255.0f + 0.5f);
    output[idx + 2] = (unsigned char)(c.z * 255.0f + 0.5f);
    output[idx + 3] = 255;
}
"""

# GLSL shaders -- these just display a texture on a fullscreen rectangle.
# Nothing CUDA-specific here.
+ +VERTEX_SHADER_SOURCE = """#version 330 core +in vec2 position; +in vec2 texcoord; +out vec2 v_texcoord; +void main() { + gl_Position = vec4(position, 0.0, 1.0); + v_texcoord = texcoord; +} +""" + +FRAGMENT_SHADER_SOURCE = """#version 330 core +in vec2 v_texcoord; +out vec4 fragColor; +uniform sampler2D tex; +void main() { + fragColor = texture(tex, v_texcoord); +} +""" + + +if __name__ == "__main__": + main() diff --git a/cuda_core/examples/gl_interop_image_show.py b/cuda_core/examples/gl_interop_image_show.py new file mode 100644 index 00000000000..4bdd55e1569 --- /dev/null +++ b/cuda_core/examples/gl_interop_image_show.py @@ -0,0 +1,428 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +# ################################################################################ +# +# Minimal "Hello World" for the cuda.core texture/surface stack. +# +# Allocates a small `Array`, fills it with a procedural image once, binds it +# as a `TextureObject`, and uses a single CUDA kernel to sample that texture +# at every screen pixel (with a scale + rotation transform) and write the +# result into an OpenGL PBO for display. +# +# Nothing else: no `SurfaceObject`, no ping-pong, no simulation, no mipmaps. +# If you have never touched the new APIs before, open this file first. +# +# ################################################################################ +# +# What this example teaches +# ========================= +# - Allocate an `Array` and upload data into it with `Array.copy_from`. +# - Build a `TextureObject` from a `ResourceDescriptor` + `TextureDescriptor`. +# - The visual difference between `FilterMode.POINT` and `FilterMode.LINEAR` +# (press F to toggle live). +# - That filter mode is baked into the `TextureDescriptor` at creation time, +# so changing it requires destroying and rebuilding the `TextureObject`. 
+# +# How it works +# ============ +# Startup (once): +# +-------------------+ copy_from +----------+ +# | host numpy image | ------------> | Array | (UINT8 RGBA, 64x64) +# +-------------------+ +----+-----+ +# | +# v +# +-------------+ +# | TextureObj | (filter mode = POINT) +# +-------------+ +# +# Each frame: +# - kernel `sample_image` reads from the TextureObject at a transformed +# (u, v) per screen pixel and writes RGBA bytes to the GL PBO. +# - OpenGL copies the PBO into a screen texture and draws it. +# +# What you should see +# =================== +# A 64x64 procedural test pattern (checkerboard + colored gradient stripes + +# diagonal lines) magnified to fill the window. Press F to switch between +# POINT (blocky) and LINEAR (smooth) sampling; the difference is immediately +# visible. Press R to start/stop a slow rotation. Esc to quit. +# + +# /// script +# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"] +# /// + +import ctypes +import math +import sys +import time + +import numpy as np + +from cuda.core import ( + AddressMode, + Array, + ArrayFormat, + Device, + FilterMode, + GraphicsResource, + LaunchConfig, + Program, + ProgramOptions, + ReadMode, + ResourceDescriptor, + TextureDescriptor, + TextureObject, + launch, +) + +# --------------------------------------------------------------------------- +# Constants +# --------------------------------------------------------------------------- +WIDTH = 640 +HEIGHT = 480 +IMAGE_SIZE = 64 # the source Array is IMAGE_SIZE x IMAGE_SIZE RGBA8 + + +# ============================= Helper functions ============================= + + +def make_test_image(size): + """Build a (size, size, 4) uint8 RGBA test pattern. + + Designed so the filter-mode difference is obvious: hard-edged checkerboard + (POINT preserves the edges; LINEAR smooths them) plus a vertical color + gradient stripe (LINEAR blends smoothly between palette stops) plus two + diagonal hairlines (POINT preserves them; LINEAR softens them). 
+ """ + img = np.zeros((size, size, 4), dtype=np.uint8) + # 8x8 black/white checkerboard + cells = size // 8 + for y in range(size): + for x in range(size): + if ((x // cells) + (y // cells)) & 1: + img[y, x, :3] = 255 + # vertical RGB gradient strip down the left third + strip = size // 3 + img[:, :strip, 0] = np.linspace(255, 0, size, dtype=np.uint8)[:, None].repeat(strip, axis=1) + img[:, :strip, 1] = np.linspace(0, 255, size, dtype=np.uint8)[:, None].repeat(strip, axis=1) + img[:, :strip, 2] = 128 + # two diagonal red hairlines + for d in range(size): + img[d, d, :] = [255, 0, 0, 255] + if d < size - 4: + img[d, d + 4, :] = [255, 0, 0, 255] + img[:, :, 3] = 255 # opaque + return img + + +def setup_cuda(): + """Compile the kernel and return (device, stream, kernel, launch_config).""" + dev = Device(0) + dev.set_current() + stream = dev.create_stream() + + program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}") + prog = Program(KERNEL_SOURCE, code_type="c++", options=program_options) + mod = prog.compile("cubin", name_expressions=("sample_image",)) + kernel = mod.get_kernel("sample_image") + + block = (16, 16, 1) + grid = ( + (WIDTH + block[0] - 1) // block[0], + (HEIGHT + block[1] - 1) // block[1], + 1, + ) + config = LaunchConfig(grid=grid, block=block) + return dev, stream, kernel, config + + +def create_window(): + """Open a pyglet window. 
Returns (window, gl_module, pyglet_module).""" + try: + import pyglet + from pyglet.gl import gl as _gl + except ImportError: + print( + "This example requires pyglet >= 2.0.\nInstall it with: pip install pyglet", + file=sys.stderr, + ) + sys.exit(1) + + window = pyglet.window.Window( + WIDTH, + HEIGHT, + caption="cuda.core Array + TextureObject - Image Show", + vsync=False, + ) + return window, _gl, pyglet + + +def create_display_resources(gl, width, height): + """Standard pyglet boilerplate: shader, fullscreen quad, screen texture.""" + from pyglet.graphics.shader import Shader, ShaderProgram + + vert = Shader(VERTEX_SHADER_SOURCE, "vertex") + frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment") + shader_prog = ShaderProgram(vert, frag) + + quad_verts = np.array( + [ + -1, -1, 0, 0, + 1, -1, 1, 0, + 1, 1, 1, 1, + -1, -1, 0, 0, + 1, 1, 1, 1, + -1, 1, 0, 1, + ], + dtype=np.float32, + ) + + vao = ctypes.c_uint(0) + gl.glGenVertexArrays(1, ctypes.byref(vao)) + gl.glBindVertexArray(vao.value) + + vbo = ctypes.c_uint(0) + gl.glGenBuffers(1, ctypes.byref(vbo)) + gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value) + gl.glBufferData( + gl.GL_ARRAY_BUFFER, + quad_verts.nbytes, + quad_verts.ctypes.data_as(ctypes.c_void_p), + gl.GL_STATIC_DRAW, + ) + + stride = 4 * 4 + pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position") + gl.glEnableVertexAttribArray(pos_loc) + gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0)) + tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord") + gl.glEnableVertexAttribArray(tc_loc) + gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8)) + gl.glBindVertexArray(0) + + tex = ctypes.c_uint(0) + gl.glGenTextures(1, ctypes.byref(tex)) + gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value) + gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_NEAREST) + gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_NEAREST) + gl.glTexImage2D( + 
gl.GL_TEXTURE_2D, 0, gl.GL_RGBA8, width, height, 0, + gl.GL_RGBA, gl.GL_UNSIGNED_BYTE, None, + ) + return shader_prog, vao.value, tex.value + + +def create_pixel_buffer(gl, width, height): + """Create the GL PBO that CUDA writes RGBA pixels into each frame.""" + pbo = ctypes.c_uint(0) + gl.glGenBuffers(1, ctypes.byref(pbo)) + gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo.value) + nbytes = width * height * 4 + gl.glBufferData(gl.GL_PIXEL_UNPACK_BUFFER, nbytes, None, gl.GL_DYNAMIC_DRAW) + gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0) + return pbo.value + + +def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height): + gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo_id) + gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id) + gl.glTexSubImage2D( + gl.GL_TEXTURE_2D, 0, 0, 0, width, height, + gl.GL_RGBA, gl.GL_UNSIGNED_BYTE, None, + ) + gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0) + + +def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id): + gl.glUseProgram(shader_prog.id) + gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id) + gl.glBindVertexArray(vao_id) + gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6) + gl.glBindVertexArray(0) + gl.glUseProgram(0) + + +def make_texture(arr, filter_mode): + """Build a `TextureObject` for `arr` with the given FilterMode. + + Filter mode is baked into the descriptor at creation; to switch modes + we close this object and call this helper again. + """ + res_desc = ResourceDescriptor.from_array(arr) + tex_desc = TextureDescriptor( + address_mode=AddressMode.CLAMP, + filter_mode=filter_mode, + # UINT8 source + NORMALIZED_FLOAT means tex2D returns each + # channel as a float in [0, 1] -- handy for the colorize math below. 
+ read_mode=ReadMode.NORMALIZED_FLOAT, + normalized_coords=True, + ) + return TextureObject.from_descriptor(resource=res_desc, texture_descriptor=tex_desc) + + +# ================================== main() ================================== + + +def main(): + # --- Step 1: Set up CUDA (compile kernel, create stream) --- + dev, stream, kernel, config = setup_cuda() + + # --- Step 2: Open a window --- + window, gl, pyglet = create_window() + + # --- Step 3: Create GL resources (shader, fullscreen quad, screen tex) --- + shader_prog, quad_vao, screen_tex = create_display_resources(gl, WIDTH, HEIGHT) + + # --- Step 4: Create the PBO that CUDA will write into --- + pbo_id = create_pixel_buffer(gl, WIDTH, HEIGHT) + resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard") + + # --- Step 5: Allocate the source `Array` and upload the test pattern --- + arr = Array.from_descriptor( + shape=(IMAGE_SIZE, IMAGE_SIZE), + format=ArrayFormat.UINT8, + num_channels=4, + ) + host_image = make_test_image(IMAGE_SIZE) + arr.copy_from(np.ascontiguousarray(host_image), stream=stream) + stream.sync() + + # --- Step 6: Bind the Array as a TextureObject (initially POINT) --- + state = {"filter": FilterMode.POINT, "rotate": False, "angle": 0.0} + tex = make_texture(arr, state["filter"]) + + @window.event + def on_key_press(symbol, _modifiers): + key = pyglet.window.key + nonlocal tex + if symbol == key.ESCAPE: + window.close() + elif symbol == key.F: + # Filter mode is baked at TextureObject creation time. Swapping + # it means closing the old one and building a new one. 
+ state["filter"] = ( + FilterMode.LINEAR if state["filter"] == FilterMode.POINT + else FilterMode.POINT + ) + tex.close() + tex = make_texture(arr, state["filter"]) + elif symbol == key.R: + state["rotate"] = not state["rotate"] + + # --- Step 7: Render loop --- + start = time.monotonic() + last_t = start + frame_count = 0 + fps_time = start + + @window.event + def on_draw(): + nonlocal frame_count, fps_time, last_t + now = time.monotonic() + if state["rotate"]: + state["angle"] += (now - last_t) * 0.5 # rad/sec + last_t = now + + window.clear() + with resource.map(stream=stream) as buf: + launch( + stream, + config, + kernel, + np.uint64(tex.handle), + buf.handle, + np.int32(WIDTH), + np.int32(HEIGHT), + np.float32(state["angle"]), + ) + copy_pbo_to_texture(gl, pbo_id, screen_tex, WIDTH, HEIGHT) + draw_fullscreen_quad(gl, shader_prog, quad_vao, screen_tex) + + frame_count += 1 + if now - fps_time >= 1.0: + fps = frame_count / (now - fps_time) + window.set_caption( + f"cuda.core Array + TextureObject - Image Show " + f"(filter={state['filter'].name}, " + f"rotate={'on' if state['rotate'] else 'off'}, " + f"{fps:.0f} FPS)" + ) + frame_count = 0 + fps_time = now + + @window.event + def on_close(): + tex.close() + arr.close() + resource.close() + stream.close() + + pyglet.app.run(interval=0) + + +# ============================== GPU code (kernel) ============================ + +KERNEL_SOURCE = r""" +extern "C" +__global__ +void sample_image(cudaTextureObject_t tex, + unsigned char* output, + int width, int height, + float angle) { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= width || y >= height) return; + + // Center the screen pixel around (0, 0) in [-aspect, aspect] x [-1, 1]. 
+ float aspect = (float)width / (float)height; + float sx = ((float)x / (float)width - 0.5f) * 2.0f * aspect; + float sy = ((float)y / (float)height - 0.5f) * 2.0f; + + // Inverse-rotate the screen point: rotating the image by +angle means + // each output pixel reads from the source rotated by -angle. + float c = cosf(-angle), s = sinf(-angle); + float rx = c * sx - s * sy; + float ry = s * sx + c * sy; + + // Map rotated screen point to the [0, 1] x [0, 1] texture domain so the + // image (drawn centered, fitting ~75% of the window height) lands on it. + const float scale = 0.75f; + float u = (rx / (2.0f * scale)) + 0.5f; + float v = (ry / (2.0f * scale)) + 0.5f; + + // AddressMode.CLAMP means out-of-range u/v sample the edge texel. + float4 col = tex2D(tex, u, v); + + int idx = (y * width + x) * 4; + output[idx + 0] = (unsigned char)(col.x * 255.0f); + output[idx + 1] = (unsigned char)(col.y * 255.0f); + output[idx + 2] = (unsigned char)(col.z * 255.0f); + output[idx + 3] = 255; +} +""" + +VERTEX_SHADER_SOURCE = """#version 330 core +in vec2 position; +in vec2 texcoord; +out vec2 v_texcoord; +void main() { + gl_Position = vec4(position, 0.0, 1.0); + v_texcoord = texcoord; +} +""" + +FRAGMENT_SHADER_SOURCE = """#version 330 core +in vec2 v_texcoord; +out vec4 fragColor; +uniform sampler2D tex; +void main() { + fragColor = texture(tex, v_texcoord); +} +""" + + +if __name__ == "__main__": + main() diff --git a/cuda_core/examples/gl_interop_lenia.py b/cuda_core/examples/gl_interop_lenia.py new file mode 100644 index 00000000000..c1772514a70 --- /dev/null +++ b/cuda_core/examples/gl_interop_lenia.py @@ -0,0 +1,805 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +# ################################################################################ +# +# This example demonstrates cuda.core.Array, TextureObject, and SurfaceObject +# in combination with GraphicsResource for CUDA/OpenGL interop. A Lenia +# continuous cellular automaton is ping-ponged between two CUDA arrays each +# frame: a TextureObject provides smooth (LINEAR + WRAP) sampled reads through +# a large bell-shaped neighborhood kernel, and a SurfaceObject provides typed +# writes. The final state is colorized straight into an OpenGL PBO. Requires +# pyglet. +# +# ################################################################################ + +# What this example teaches +# ========================= +# - How to drive a wide-radius convolution from a TextureObject configured for +# LINEAR + WRAP + normalized coordinates. The same Array is then bound as a +# SurfaceObject for the typed write back, requiring `surface_load_store=True` +# at allocation time. +# - How a single-channel `float` Array differs from the multi-channel layout +# used in the Gray-Scott example: `num_channels=1`, `tex2D<float>` reads, and +# a 4-byte x-stride in `surf2Dwrite`. +# - How to host-precompute a normalization constant for a stencil with a +# variable-shape support (the bell-curve neighborhood), then pass it as a +# plain float kernel argument. +# +# How it works +# ============ +# Lenia (Bert Wang-Chak Chan, 2018) generalizes Conway's Game of Life to +# continuous space, time, and state. Each cell holds a real value in [0, 1]. +# Per step, every cell: +# +# 1. Integrates a smooth bell-shaped neighborhood kernel K against the +# current state to produce a "potential" U: +# +# U(x) = sum over offsets (dx, dy) inside a disk of radius R of +# K(|(dx, dy)|) * state(x + (dx, dy)) +# divided by sum of K (host-precomputed). +# +# K(r) = exp(-((r / R) - mu_K)^2 / (2 * sigma_K^2)) for r <= R. +# +# 2.
Applies the growth function G and updates the state: +# +# state_new = clamp(state_old + dt * (2 * exp(-(U - mu)^2 / +# (2 * sigma^2)) - 1), 0, 1). +# +# Two single-channel `float` arrays are ping-ponged each frame: a +# TextureObject reads one (sampled with LINEAR + WRAP so the disk wraps +# toroidally) and a SurfaceObject writes the other. +# +# PING-PONG (two arrays, swap each step) +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +--------------+ tex2D +------------------+ +# | arr_a | ----------------> | | +# | state | | convolve_lenia | +# +--------------+ | kernel | +# | (+ growth fn) | +# +--------------+ surf2Dwrite | | +# | arr_b | <---------------- | | +# | state | +------------------+ +# +--------------+ +# (swap) +# +# After the step we run a separate `colorize_lenia` kernel that samples the +# new state and writes RGBA bytes straight into the OpenGL PBO via +# GraphicsResource. No data ever travels across the PCIe bus during the frame. +# +# Why LINEAR + WRAP + normalized coords? +# -------------------------------------- +# Lenia's neighborhood radius (R = 13) is wide enough that boundary handling +# really matters. AddressMode.WRAP gives a toroidal world for free, and it is +# only supported in normalized coordinate mode (see the CUDA Programming +# Guide). LINEAR filtering is essentially free on the hardware -- here it +# softens the integer-offset reads a hair, which keeps the dynamics smooth. +# Sample coordinates are `(x + dx + 0.5) / W`; values < 0 or > 1 are fine, +# WRAP handles them. +# +# Channel byte width in surf2Dwrite +# --------------------------------- +# `surf2Dwrite` takes the x coordinate in BYTES, not in elements. For a +# single-channel `float` surface that means `x * sizeof(float)` = `x * 4`. +# (The Gray-Scott example uses 8 because it stores `float2`.) +# +# One step per frame +# ------------------ +# Each step convolves a (2R+1)^2 = 729-tap neighborhood for every pixel, which +# is much heavier than a Gray-Scott 5-point Laplacian. 
With dt = 0.1 the +# dynamics are slow enough that one step per displayed frame is plenty. There +# is no `N_STEPS` loop. +# +# What you should see +# =================== +# A window showing soft, glider-like blobs drifting across the field on a +# teal-on-black palette. Press R to reseed with a new Gaussian blob, 1 to +# clear the field, and Escape to exit. The window title shows the current +# FPS. +# + +# /// script +# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"] +# /// + +import ctypes +import math +import sys +import time + +import numpy as np + +from cuda.core import ( + AddressMode, + Array, + ArrayFormat, + Device, + FilterMode, + GraphicsResource, + LaunchConfig, + Program, + ProgramOptions, + ReadMode, + ResourceDescriptor, + SurfaceObject, + TextureDescriptor, + TextureObject, + launch, +) + +# --------------------------------------------------------------------------- +# Simulation parameters (feel free to change these) +# --------------------------------------------------------------------------- +WIDTH = 256 +HEIGHT = 256 + +# Neighborhood / kernel shape +R = 13 # convolution radius in pixels (texture-space) +MU_K = 0.5 # bell center for the neighborhood weight K(r/R) +SIGMA_K = 0.15 # bell width for K + +# Growth function shape +MU = 0.15 # bell center for the growth function G(U) +SIGMA = 0.015 # bell width for G + +DT = 0.1 # time step + +# Initial blob radius and peak for the Gaussian seed. +# The radius must be large relative to the neighborhood radius R=13 so the +# kernel-integrated potential U lands near the growth bell's center mu=0.15. +# With SEED_RADIUS=36, U at the blob's centre starts near mu and the field +# survives the first step; smaller seeds collapse to zero within one frame +# because U is far outside the narrow (sigma=0.015) growth bell. 
+SEED_RADIUS = 36.0 +SEED_PEAK = 0.5 + +# Seed modes (kept in sync with the seed_blob kernel) +SEED_MODE_CLEAR = 0 +SEED_MODE_BLOB = 1 + + +# ============================= Helper functions ============================= +# +# The functions below set up CUDA and OpenGL. If you're here to learn about +# Array/TextureObject/SurfaceObject, skip ahead to main() -- the interesting +# part is there. These helpers exist so that main() reads like a short story +# instead of a wall of boilerplate. +# ============================================================================ + + +def compute_kernel_norm(radius, mu_k, sigma_k): + """Precompute 1 / (sum of K(r)) for the bell-shaped neighborhood weight. + + Mirrors exactly what the device kernel does so the convolution is energy- + preserving: walks the (2R+1)x(2R+1) box, accumulates + `exp(-(r/R - mu_k)^2 / (2*sigma_k^2))` for `r <= R`, and returns the + reciprocal sum as a float32. + """ + inv_two_sigma2 = 1.0 / (2.0 * sigma_k * sigma_k) + inv_r = 1.0 / float(radius) + total = 0.0 + for dy in range(-radius, radius + 1): + for dx in range(-radius, radius + 1): + r = math.sqrt(dx * dx + dy * dy) + if r > radius: + continue + rn = r * inv_r - mu_k + total += math.exp(-(rn * rn) * inv_two_sigma2) + if total <= 0.0: + raise RuntimeError("kernel normalization sum collapsed to zero") + return np.float32(1.0 / total) + + +def setup_cuda(): + """Compile the CUDA kernels and return (device, stream, kernels, configs). + + Returns a dict of kernels keyed by name and matching LaunchConfigs. + """ + dev = Device(0) + dev.set_current() + + # SurfaceObject requires surface load/store, which has existed since SM 2.0, + # but bindless surface objects (cuSurfObjectCreate) require SM 3.0+. + cc = dev.compute_capability + if cc.major < 3: + print( + "This example requires a GPU with compute capability >= 3.0 for " + f"bindless surface objects. 
Found sm_{cc.major}{cc.minor}.", + file=sys.stderr, + ) + sys.exit(1) + + stream = dev.create_stream() + + # Compile as C++ so the templated tex2D overload resolves. + program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}") + prog = Program(KERNEL_SOURCE, code_type="c++", options=program_options) + mod = prog.compile( + "cubin", + name_expressions=("convolve_lenia", "colorize_lenia", "seed_blob"), + ) + + kernels = { + "step": mod.get_kernel("convolve_lenia"), + "colorize": mod.get_kernel("colorize_lenia"), + "seed": mod.get_kernel("seed_blob"), + } + + block = (16, 16, 1) + grid = ( + (WIDTH + block[0] - 1) // block[0], + (HEIGHT + block[1] - 1) // block[1], + 1, + ) + config = LaunchConfig(grid=grid, block=block) + # All three kernels are pixel-parallel over a WIDTH x HEIGHT grid, so they + # can share a launch config. + configs = {"step": config, "colorize": config, "seed": config} + + return dev, stream, kernels, configs + + +def create_window(): + """Open a pyglet window and return (window, gl_module, pyglet).""" + try: + import pyglet + from pyglet.gl import gl as _gl + except ImportError: + print( + "This example requires pyglet >= 2.0.\nInstall it with: pip install pyglet", + file=sys.stderr, + ) + sys.exit(1) + + window = pyglet.window.Window( + WIDTH, + HEIGHT, + caption="cuda.core Array/Texture/Surface - Lenia", + vsync=False, + ) + return window, _gl, pyglet + + +def create_display_resources(gl, width, height): + """Create the GL objects needed to show a texture on screen. + + This sets up a shader program, a fullscreen quad, and an empty texture. + None of this is CUDA-specific -- it's standard OpenGL boilerplate for + rendering a textured quad. + + Returns (shader_program, vertex_array_id, texture_id). The shader_program + is a pyglet ShaderProgram object (must be kept alive). 
+ """ + from pyglet.graphics.shader import Shader, ShaderProgram + + # Shader program -- just passes texture coordinates through + vert = Shader(VERTEX_SHADER_SOURCE, "vertex") + frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment") + shader_prog = ShaderProgram(vert, frag) + + # Fullscreen quad (two triangles covering the entire window) + quad_verts = np.array( + [ + # x, y, s, t (position + texture coordinate) + -1, + -1, + 0, + 0, + 1, + -1, + 1, + 0, + 1, + 1, + 1, + 1, + -1, + -1, + 0, + 0, + 1, + 1, + 1, + 1, + -1, + 1, + 0, + 1, + ], + dtype=np.float32, + ) + + vao = ctypes.c_uint(0) + gl.glGenVertexArrays(1, ctypes.byref(vao)) + gl.glBindVertexArray(vao.value) + + vbo = ctypes.c_uint(0) + gl.glGenBuffers(1, ctypes.byref(vbo)) + gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value) + gl.glBufferData( + gl.GL_ARRAY_BUFFER, + quad_verts.nbytes, + quad_verts.ctypes.data_as(ctypes.c_void_p), + gl.GL_STATIC_DRAW, + ) + + stride = 4 * 4 # 4 floats * 4 bytes each = 16 bytes per vertex + pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position") + gl.glEnableVertexAttribArray(pos_loc) + gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0)) + + tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord") + gl.glEnableVertexAttribArray(tc_loc) + gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8)) + + gl.glBindVertexArray(0) + + # Empty texture (will be filled each frame from the PBO) + tex = ctypes.c_uint(0) + gl.glGenTextures(1, ctypes.byref(tex)) + gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value) + gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_LINEAR) + gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_LINEAR) + gl.glTexImage2D( + gl.GL_TEXTURE_2D, + 0, + gl.GL_RGBA8, + width, + height, + 0, + gl.GL_RGBA, + gl.GL_UNSIGNED_BYTE, + None, + ) + + return shader_prog, vao.value, tex.value + + +def create_pixel_buffer(gl, width, height): + """Create a Pixel Buffer 
def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height):
    """Upload the PBO's pixels into the GL texture (GPU-to-GPU).

    Args:
        gl: The GL binding module (provides the `gl*` calls and constants).
        pbo_id: GL name of the pixel buffer object holding RGBA8 pixels.
        tex_id: GL name of the destination 2D texture.
        width: Width in pixels of the region to update.
        height: Height in pixels of the region to update.
    """
    unpack_target = gl.GL_PIXEL_UNPACK_BUFFER
    texture_target = gl.GL_TEXTURE_2D

    gl.glBindBuffer(unpack_target, pbo_id)
    gl.glBindTexture(texture_target, tex_id)

    # Update the whole base level starting at the origin. Passing None as the
    # data argument means "read from the currently bound PBO", not from CPU
    # memory.
    mip_level, x_offset, y_offset = 0, 0, 0
    gl.glTexSubImage2D(
        texture_target,
        mip_level,
        x_offset,
        y_offset,
        width,
        height,
        gl.GL_RGBA,
        gl.GL_UNSIGNED_BYTE,
        None,
    )

    # Unbind so later texture uploads are not redirected to the PBO.
    gl.glBindBuffer(unpack_target, 0)
def make_texture(arr):
    """Create a TextureObject over `arr` with LINEAR + WRAP + normalized coords.

    Args:
        arr: The Array to expose for sampled reads.

    Returns:
        The TextureObject built from a ResourceDescriptor for `arr` and the
        sampler settings below.
    """
    sampler_settings = {
        "address_mode": AddressMode.WRAP,
        "filter_mode": FilterMode.LINEAR,
        "read_mode": ReadMode.ELEMENT_TYPE,
        # WRAP/MIRROR addressing modes require normalized coordinates.
        "normalized_coords": True,
    }
    return TextureObject.from_descriptor(
        resource=ResourceDescriptor.from_array(arr),
        texture_descriptor=TextureDescriptor(**sampler_settings),
    )
def main():
    """Run the interactive Lenia viewer until the window is closed.

    Sets up CUDA and OpenGL through the helper functions, allocates the two
    ping-pong state Arrays together with their texture/surface handles, seeds
    the field, and then hands control to the pyglet event loop. Every frame
    advances the simulation by one step and colorizes the result into the
    OpenGL PBO.

    Keys: Escape closes the window, R reseeds with a new blob, 1 clears the
    field.
    """
    # --- Step 1: Set up CUDA (compile kernels, create stream) ---
    _dev, stream, kernels, configs = setup_cuda()

    # --- Step 2: Open a window ---
    window, gl, pyglet = create_window()

    # --- Step 3: Create GL resources for drawing a texture to screen ---
    # (Standard OpenGL boilerplate -- not CUDA-specific.)
    shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT)

    # --- Step 4: Create the Pixel Buffer Object (PBO) ---
    # The PBO is GPU memory owned by OpenGL. It's the bridge between the
    # two worlds: CUDA writes into it, OpenGL reads from it.
    pbo_id, _ = create_pixel_buffer(gl, WIDTH, HEIGHT)

    # --- Step 5: Register the PBO with CUDA ---
    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")

    # --- Step 6: Allocate the two ping-pong state Arrays ---
    # Both are single-channel `float` with `surface_load_store=True` so
    # they can be bound as SurfaceObjects.
    arr_a, arr_b = make_state_arrays()

    # --- Step 7: Pre-create the four bindless handles ---
    # Creating these once is much cheaper than rebuilding them every
    # step. The simulation loop just picks which read/write pair to use.
    tex_a = make_texture(arr_a)
    tex_b = make_texture(arr_b)
    surf_a = SurfaceObject.from_array(arr_a)
    surf_b = SurfaceObject.from_array(arr_b)

    # --- Step 8: Precompute the bell-curve normalization constant ---
    # The neighborhood weight K(r) is unnormalized in the kernel; we
    # divide by sum(K) so the convolution is a weighted mean rather than
    # an unbounded integral. Doing this on the host once at startup is
    # much cheaper than redoing it on the device every step.
    inv_weight_sum = compute_kernel_norm(R, MU_K, SIGMA_K)

    # --- Step 9: Seed an initial Gaussian blob into arr_a (writes via surf_a) ---
    seed_state(stream, kernels, configs, surf_a, SEED_MODE_BLOB, seed_value=0)
    # After seeding, `arr_a` is the "current" state.
    state = {"current": "a", "seed": 0}

    # --- Step 10: Render loop ---
    frame_count = 0
    fps_time = time.monotonic()

    def current_read_write():
        """Return (texture to read, surface to write, name of the next current)."""
        if state["current"] == "a":
            return tex_a, surf_b, "b"  # read a, write b, next current = b
        return tex_b, surf_a, "a"

    @window.event
    def on_key_press(symbol, _modifiers):
        key = pyglet.window.key
        if symbol == key.ESCAPE:
            window.close()
            return
        if symbol == key.R:
            # Reseed with a new Gaussian blob; bump the seed so the jitter
            # pattern changes each time.
            state["seed"] += 1
            seed_state(stream, kernels, configs, surf_a, SEED_MODE_BLOB, state["seed"])
            state["current"] = "a"
            return
        if symbol == key._1:
            # Clear the field. Useful to confirm the simulation is quiet when
            # the state is zero.
            seed_state(stream, kernels, configs, surf_a, SEED_MODE_CLEAR, 0)
            state["current"] = "a"
            return

    @window.event
    def on_draw():
        nonlocal frame_count, fps_time

        window.clear()

        # (a) Run one Lenia step. The convolution kernel reads the current
        #     state via a TextureObject (LINEAR + WRAP gives toroidal
        #     wrapping at the border), evaluates the growth function, and
        #     writes the new state via a SurfaceObject. One step per frame
        #     is intentional: dt = 0.1 is small, and the (2R+1)^2 = 729-tap
        #     stencil is heavy enough that going faster would not help.
        tex_read, surf_write, next_current = current_read_write()
        launch(
            stream,
            configs["step"],
            kernels["step"],
            np.uint64(tex_read.handle),
            np.uint64(surf_write.handle),
            np.int32(WIDTH),
            np.int32(HEIGHT),
            np.int32(R),
            np.float32(MU_K),
            np.float32(SIGMA_K),
            np.float32(MU),
            np.float32(SIGMA),
            np.float32(DT),
            inv_weight_sum,
        )
        state["current"] = next_current

        # (b) Colorize the latest state into the OpenGL PBO.
        tex_read = tex_a if state["current"] == "a" else tex_b
        with resource.map(stream=stream) as buf:
            launch(
                stream,
                configs["colorize"],
                kernels["colorize"],
                np.uint64(tex_read.handle),
                buf.handle,
                np.int32(WIDTH),
                np.int32(HEIGHT),
            )
        # Unmap happens automatically when the `with` block exits.

        # (c) Tell OpenGL to copy the PBO contents into our texture.
        copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT)

        # (d) Draw the texture to the screen.
        draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id)

        # FPS counter (shown in window title)
        frame_count += 1
        now = time.monotonic()
        if now - fps_time >= 1.0:
            fps = frame_count / (now - fps_time)
            window.set_caption(
                "cuda.core Array/Texture/Surface - Lenia"
                f" ({WIDTH}x{HEIGHT}, R={R}, {fps:.0f} FPS)"
            )
            frame_count = 0
            fps_time = now

    @window.event
    def on_close():
        # `launch` is asynchronous, so kernels queued by the last frame may
        # still be sampling the textures or writing the surfaces. Drain the
        # stream before destroying anything those kernels use.
        stream.sync()

        # Release dependents before the objects they were built from: the
        # GL registration, then the bindless handles, then the Arrays behind
        # them, and the stream last. pyglet owns the event loop here, so
        # everything is released explicitly.
        resource.close()
        tex_a.close()
        tex_b.close()
        surf_a.close()
        surf_b.close()
        arr_a.close()
        arr_b.close()
        stream.close()

    pyglet.app.run(interval=0)
The important things to know: +# +# - KERNEL_SOURCE contains three CUDA C++ kernels: +# * seed_blob -- sets the initial state via SurfaceObject writes. +# Either clears the field (mode = 0) or paints a +# Gaussian blob centered in the field (mode = 1). +# * convolve_lenia -- reads previous state via TextureObject (with +# LINEAR + WRAP bilinear filtering), integrates a +# bell-shaped neighborhood K(r/R) to produce the +# potential U, applies the growth function G(U), +# and writes the next state via SurfaceObject. +# * colorize_lenia -- reads the new state via TextureObject and writes +# RGBA bytes into the OpenGL PBO using a simple +# teal-on-black gradient. +# +# - VERTEX_SHADER_SOURCE / FRAGMENT_SHADER_SOURCE are GLSL. They draw a +# texture onto a rectangle covering the entire window. Nothing interesting. +# +# ============================================================================ + +KERNEL_SOURCE = r""" +// All kernels run one thread per output pixel and bounds-check at the top. +// `surf2Dwrite` takes the x offset in BYTES; for a single-channel float +// surface that means `x * sizeof(float)` = `x * 4`. + +extern "C" +__global__ +void seed_blob(cudaSurfaceObject_t surf, + int width, int height, + int mode, + unsigned int seed, + float radius, + float peak) { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= width || y >= height) return; + + float value = 0.0f; + if (mode == 1) { + // Gaussian blob centered in the field with a small deterministic + // jitter that breaks symmetry differently on each reseed. 
+ float cx = (float)(width / 2); + float cy = (float)(height / 2); + float dx = (float)x - cx; + float dy = (float)y - cy; + float r2 = dx * dx + dy * dy; + float inv = 1.0f / (radius * radius); + value = peak * expf(-r2 * inv); + + unsigned int h = (unsigned int)x * 374761393u + + (unsigned int)y * 668265263u + seed * 2246822519u; + h = (h ^ (h >> 13)) * 1274126177u; + h = h ^ (h >> 16); + float noise = (h & 0xffffu) / 65535.0f; // in [0, 1] + value += 0.02f * (noise - 0.5f); + if (value < 0.0f) value = 0.0f; + if (value > 1.0f) value = 1.0f; + } + + // float is 4 bytes; surf2Dwrite takes the x offset in BYTES. + surf2Dwrite(value, surf, x * (int)sizeof(float), y); +} + +extern "C" +__global__ +void convolve_lenia(cudaTextureObject_t tex, + cudaSurfaceObject_t surf, + int width, int height, + int R, + float mu_k, float sigma_k, + float mu, float sigma, + float dt, + float inv_weight_sum) { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= width || y >= height) return; + + // Normalized texture coordinates: WRAP addressing requires them. The + // (x + dx + 0.5) / W idiom places the sample at the texel center; values + // outside [0, 1] are fine because WRAP wraps them toroidally. + float inv_w = 1.0f / (float)width; + float inv_h = 1.0f / (float)height; + float inv_R = 1.0f / (float)R; + float inv_two_sigma_k2 = 1.0f / (2.0f * sigma_k * sigma_k); + float inv_two_sigma2 = 1.0f / (2.0f * sigma * sigma); + + // Integrate the bell-shaped weight K(r/R) against the current state. 
+ float U = 0.0f; + for (int dy = -R; dy <= R; ++dy) { + for (int dx = -R; dx <= R; ++dx) { + float fdx = (float)dx; + float fdy = (float)dy; + float r2 = fdx * fdx + fdy * fdy; + float r = sqrtf(r2); + if (r > (float)R) continue; // restrict to the disk + float rn = r * inv_R - mu_k; + float w = expf(-(rn * rn) * inv_two_sigma_k2); + + float sx = ((float)x + fdx + 0.5f) * inv_w; + float sy = ((float)y + fdy + 0.5f) * inv_h; + float s = tex2D(tex, sx, sy); + U += w * s; + } + } + U *= inv_weight_sum; // host-precomputed 1 / sum(K) + + // Read the current cell value (point sample at the texel center). + float sx0 = ((float)x + 0.5f) * inv_w; + float sy0 = ((float)y + 0.5f) * inv_h; + float state = tex2D(tex, sx0, sy0); + + // Growth function G(U) = 2 * exp(-(U - mu)^2 / (2 * sigma^2)) - 1, + // mapping U near mu to +1 (grow) and U far from mu to -1 (shrink). + float du = U - mu; + float G = 2.0f * expf(-(du * du) * inv_two_sigma2) - 1.0f; + + float new_state = state + dt * G; + if (new_state < 0.0f) new_state = 0.0f; + if (new_state > 1.0f) new_state = 1.0f; + + surf2Dwrite(new_state, surf, x * (int)sizeof(float), y); +} + +extern "C" +__global__ +void colorize_lenia(cudaTextureObject_t tex, + unsigned char* output, + int width, int height) { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= width || y >= height) return; + + float inv_w = 1.0f / (float)width; + float inv_h = 1.0f / (float)height; + float cx = ((float)x + 0.5f) * inv_w; + float cy = ((float)y + 0.5f) * inv_h; + + float v = tex2D(tex, cx, cy); + if (v < 0.0f) v = 0.0f; + if (v > 1.0f) v = 1.0f; + + // Linear interpolation from a deep teal at v = 0 to a bright teal at + // v = 1. Two stops -- simple, easy to read, no LUT required. 
+ // (0, 15, 30, 255) -> (50, 200, 180, 255) + float r = ( 0.0f + v * ( 50.0f - 0.0f)); + float g = ( 15.0f + v * (200.0f - 15.0f)); + float b = ( 30.0f + v * (180.0f - 30.0f)); + + int idx = (y * width + x) * 4; + output[idx + 0] = (unsigned char)r; + output[idx + 1] = (unsigned char)g; + output[idx + 2] = (unsigned char)b; + output[idx + 3] = 255; +} +""" + +# GLSL shaders -- these just display a texture on a fullscreen rectangle. +# Nothing CUDA-specific here. + +VERTEX_SHADER_SOURCE = """#version 330 core +in vec2 position; +in vec2 texcoord; +out vec2 v_texcoord; +void main() { + gl_Position = vec4(position, 0.0, 1.0); + v_texcoord = texcoord; +} +""" + +FRAGMENT_SHADER_SOURCE = """#version 330 core +in vec2 v_texcoord; +out vec4 fragColor; +uniform sampler2D tex; +void main() { + fragColor = texture(tex, v_texcoord); +} +""" + + +if __name__ == "__main__": + main() diff --git a/cuda_core/examples/gl_interop_mandelbrot.py b/cuda_core/examples/gl_interop_mandelbrot.py new file mode 100644 index 00000000000..11abca54c22 --- /dev/null +++ b/cuda_core/examples/gl_interop_mandelbrot.py @@ -0,0 +1,692 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +# ################################################################################ +# +# This example demonstrates cuda.core.Array and TextureObject used as a *color +# lookup table* (palette LUT) for a real-time Mandelbrot deep-zoom explorer. +# A CUDA kernel computes smooth iteration counts and uses tex1D with +# LINEAR + CLAMP + NORMALIZED_FLOAT sampling to read a 256-entry RGBA palette, +# writing the final RGBA bytes straight into an OpenGL PBO via GraphicsResource. +# Requires pyglet. 
+# +# ################################################################################ + +# What this example teaches +# ========================= +# - How to use a 1D cuda.core.Array as a palette and bind it via a +# TextureObject for hardware-filtered color lookups inside a kernel. +# - How LINEAR + AddressMode.CLAMP + ReadMode.NORMALIZED_FLOAT + normalized +# coordinates give you a free `texture(palette, t)` style sampler that +# returns a float4 in [0, 1] regardless of the underlying storage format. +# - How to drive a real-time interactive viewer: mouse pan, scroll-wheel zoom +# anchored at the cursor, and key-driven iteration cap. +# +# How it works +# ============ +# The Mandelbrot set is defined by iterating z -> z^2 + c starting from +# z = 0; pixels are colored by how quickly z escapes the disk of radius 2. +# +# +---------+ ResourceDescriptor.from_array +# | Array | --------------------------------+ +# | float4 | v +# | size 256| +-------------------+ +# +---------+ | TextureObject | +# ^ copy_from(host) | (palette LUT) | +# | +---------+---------+ +# host palette | +# (numpy float32x4, 256 stops) | +# v +# tex1D(palette, t) +# | +# v +# +-----------------------+ +# | mandelbrot kernel | +# | (one thread / pixel) | +# +-----------+-----------+ +# | +# v GraphicsResource.map +# +-----------------------+ +# | OpenGL PBO (RGBA8) | +# +-----------------------+ +# +# Smooth iteration count +# ---------------------- +# A plain integer escape count produces ugly banded colors. With a bailout +# radius R = 2 (escape when |z|^2 > 4), we use the standard smooth formula: +# +# mu = iter + 1 - log(log(|z|)) / log(2) +# +# At the escape step |z| > 2, so log(|z|) > log(2) > 0 and log(log(|z|)) is +# finite. We compute this in double and cast to float for the palette lookup. +# +# Cursor-anchored zoom +# -------------------- +# On scroll, we want the world point under the mouse cursor to remain under +# the cursor after the zoom. 
We capture (wx, wy) under the cursor with the +# old scale, multiply the scale by 0.9 (zoom in) or 1.1 (zoom out), then +# back-solve cx, cy so the same screen pixel still maps to (wx, wy): +# +# cx_new = wx - (mouse_x - W/2) * scale_new +# cy_new = wy - (mouse_y - H/2) * scale_new +# +# Why double precision for cx, cy, scale? +# --------------------------------------- +# Float32 runs out of mantissa bits around 1e6x zoom; double gets you to +# roughly 1e13x before the pixel grid coarsens visibly. The kernel takes +# cx, cy, scale as doubles and only narrows to float for the color lookup. +# +# Address mode note +# ----------------- +# We use AddressMode.CLAMP. The `fmodf(mu * 0.02f, 1.0f)` cycling formula +# already keeps the palette index in [0, 1), so CLAMP and WRAP differ only +# within half a texel of either end of the palette: with LINEAR filtering, +# WRAP blends the first and last entries there, while CLAMP holds the edge +# color. +# +# What you should see +# =================== +# A window showing the Mandelbrot set. Drag with the left mouse button to +# pan, scroll the wheel to zoom in/out at the cursor, press R to reset the +# view, and `[`/`]` to lower/raise the iteration cap. The window title shows +# the current zoom level, center, max_iter, and FPS. Close the window or +# press Escape to exit. 
def setup_cuda():
    """Select device 0 and build everything needed to launch `mandelbrot`.

    Returns:
        A tuple `(device, stream, kernel, config)`: the current Device, a
        stream created on it, the compiled `mandelbrot` kernel, and a
        LaunchConfig with 16x16 thread blocks tiling a WIDTH x HEIGHT image.

    Exits the process with status 1 when the device's compute capability
    major version is below 3.
    """
    device = Device(0)
    device.set_current()

    # Bindless texture objects (cuTexObjectCreate) require SM 3.0+.
    capability = device.compute_capability
    if capability.major < 3:
        print(
            "This example requires a GPU with compute capability >= 3.0 for "
            f"bindless texture objects. Found sm_{capability.major}{capability.minor}.",
            file=sys.stderr,
        )
        sys.exit(1)

    stream = device.create_stream()

    # Compile as C++ so the templated tex1D overload resolves.
    options = ProgramOptions(std="c++17", arch=f"sm_{device.arch}")
    program = Program(KERNEL_SOURCE, code_type="c++", options=options)
    module = program.compile("cubin", name_expressions=("mandelbrot",))
    kernel = module.get_kernel("mandelbrot")

    # Round the block count up in x and y so partial tiles at the right and
    # top edges are still covered.
    threads_per_block = (16, 16, 1)
    blocks_xy = tuple(
        -(-extent // threads)
        for extent, threads in zip((WIDTH, HEIGHT), threads_per_block)
    )
    config = LaunchConfig(grid=blocks_xy + (1,), block=threads_per_block)

    return device, stream, kernel, config
+ """ + from pyglet.graphics.shader import Shader, ShaderProgram + + # Shader program -- just passes texture coordinates through + vert = Shader(VERTEX_SHADER_SOURCE, "vertex") + frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment") + shader_prog = ShaderProgram(vert, frag) + + # Fullscreen quad (two triangles covering the entire window) + quad_verts = np.array( + [ + # x, y, s, t (position + texture coordinate) + -1, + -1, + 0, + 0, + 1, + -1, + 1, + 0, + 1, + 1, + 1, + 1, + -1, + -1, + 0, + 0, + 1, + 1, + 1, + 1, + -1, + 1, + 0, + 1, + ], + dtype=np.float32, + ) + + vao = ctypes.c_uint(0) + gl.glGenVertexArrays(1, ctypes.byref(vao)) + gl.glBindVertexArray(vao.value) + + vbo = ctypes.c_uint(0) + gl.glGenBuffers(1, ctypes.byref(vbo)) + gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value) + gl.glBufferData( + gl.GL_ARRAY_BUFFER, + quad_verts.nbytes, + quad_verts.ctypes.data_as(ctypes.c_void_p), + gl.GL_STATIC_DRAW, + ) + + stride = 4 * 4 # 4 floats * 4 bytes each = 16 bytes per vertex + pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position") + gl.glEnableVertexAttribArray(pos_loc) + gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0)) + + tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord") + gl.glEnableVertexAttribArray(tc_loc) + gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8)) + + gl.glBindVertexArray(0) + + # Empty texture (will be filled each frame from the PBO) + tex = ctypes.c_uint(0) + gl.glGenTextures(1, ctypes.byref(tex)) + gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value) + gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_LINEAR) + gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_LINEAR) + gl.glTexImage2D( + gl.GL_TEXTURE_2D, + 0, + gl.GL_RGBA8, + width, + height, + 0, + gl.GL_RGBA, + gl.GL_UNSIGNED_BYTE, + None, + ) + + return shader_prog, vao.value, tex.value + + +def create_pixel_buffer(gl, width, height): + """Create a Pixel Buffer 
def build_palette(size=None):
    """Build a flat RGBA float32 palette by lerping through fixed color stops.

    The stops run deep blue -> cyan -> yellow -> orange -> red -> magenta ->
    black, at fixed positions in [0, 1]. Entry `i` of the palette is the
    piecewise-linear interpolation of those stops at `i / (size - 1)`.

    Args:
        size: Number of palette entries. Defaults to the module-level
            `PALETTE_SIZE` when omitted, which keeps existing zero-argument
            callers unchanged.

    Returns:
        A C-contiguous numpy array of shape `(size * 4,)`, dtype float32,
        laid out as R, G, B, A per entry, with every channel in [0, 1].

    Raises:
        ValueError: If `size` is smaller than 1.
    """
    entries = PALETTE_SIZE if size is None else int(size)
    if entries < 1:
        raise ValueError(f"palette size must be at least 1, got {entries}")

    stops = np.array(
        [
            [0.00, 0.02, 0.05, 0.30, 1.0],  # position, R, G, B, A
            [0.16, 0.10, 0.50, 0.90, 1.0],  # cyan
            [0.42, 1.00, 0.95, 0.20, 1.0],  # yellow
            [0.58, 1.00, 0.55, 0.10, 1.0],  # orange
            [0.74, 0.95, 0.10, 0.10, 1.0],  # red
            [0.90, 0.65, 0.10, 0.85, 1.0],  # magenta
            [1.00, 0.00, 0.00, 0.00, 1.0],  # black
        ],
        dtype=np.float32,
    )
    positions = stops[:, 0]
    colors = stops[:, 1:]

    # Sample positions for every entry; a single entry sits at position 0.
    samples = np.linspace(0.0, 1.0, entries)

    # np.interp does the bracketing-segment search and the lerp for all
    # entries at once; it works on one channel at a time.
    pal = np.empty((entries, 4), dtype=np.float32)
    for channel in range(colors.shape[1]):
        pal[:, channel] = np.interp(samples, positions, colors[:, channel])

    # Flatten to (entries * 4,) so the byte layout matches a
    # float4 x entries 1D Array.
    return np.ascontiguousarray(pal.reshape(-1), dtype=np.float32)
def main():
    """Run the interactive Mandelbrot viewer until the window is closed.

    Sets up CUDA and OpenGL through the helper functions, uploads the palette
    into a 1D Array bound as a TextureObject, and then hands control to the
    pyglet event loop. Every frame launches the kernel into the mapped OpenGL
    PBO and draws the result.

    Controls: left-drag pans, the scroll wheel zooms around the cursor, R
    resets the view, `[` / `]` lower / raise the iteration cap, Escape closes
    the window.
    """
    # --- Step 1: Set up CUDA (compile kernel, create stream) ---
    _dev, stream, kernel, config = setup_cuda()

    # --- Step 2: Open a window ---
    window, gl, pyglet = create_window()

    # --- Step 3: Create GL resources for drawing a texture to screen ---
    # (Standard OpenGL boilerplate -- not CUDA-specific.)
    shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT)

    # --- Step 4: Create the Pixel Buffer Object (PBO) ---
    # The PBO is GPU memory owned by OpenGL. It's the bridge between the
    # two worlds: CUDA writes into it, OpenGL reads from it.
    pbo_id, _ = create_pixel_buffer(gl, WIDTH, HEIGHT)

    # --- Step 5: Register the PBO with CUDA ---
    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")

    # --- Step 6: Build and upload the palette LUT ---
    # One 1D Array, 256 entries of float4 RGBA. The host-side palette is
    # a flat numpy float32 array; copy_from() does an async H2D copy, so
    # we sync the stream once afterwards to make sure the data has landed
    # before we start sampling from it in the render loop.
    host_palette = build_palette()
    palette_arr = Array.from_descriptor(
        shape=(PALETTE_SIZE,),
        format=ArrayFormat.FLOAT32,
        num_channels=4,
    )
    palette_arr.copy_from(host_palette, stream=stream)
    stream.sync()

    # --- Step 7: Bind the palette Array as a TextureObject (LUT) ---
    palette_tex = make_palette_texture(palette_arr)

    # --- Step 8: Render loop ---
    frame_count = 0
    fps_time = time.monotonic()

    # View state. cx, cy, scale are kept in Python floats (double precision)
    # and converted to np.float64 on each kernel launch.
    view = {
        "cx": float(DEFAULT_CX),
        "cy": float(DEFAULT_CY),
        "scale": float(DEFAULT_SCALE),
        "max_iter": int(DEFAULT_MAX_ITER),
    }

    def screen_to_world(mouse_x, mouse_y):
        """Map a pyglet mouse coordinate to the world point currently under it.

        Pyglet's window origin is bottom-left and the rendered texture's
        origin is also bottom-left, so no y-flip is needed.
        """
        wx = view["cx"] + (mouse_x - WIDTH / 2.0) * view["scale"]
        wy = view["cy"] + (mouse_y - HEIGHT / 2.0) * view["scale"]
        return wx, wy

    @window.event
    def on_key_press(symbol, _modifiers):
        key = pyglet.window.key
        if symbol == key.ESCAPE:
            window.close()
            return
        if symbol == key.R:
            view["cx"] = float(DEFAULT_CX)
            view["cy"] = float(DEFAULT_CY)
            view["scale"] = float(DEFAULT_SCALE)
            view["max_iter"] = int(DEFAULT_MAX_ITER)
            return
        if symbol == key.BRACKETLEFT:
            view["max_iter"] = max(MIN_MAX_ITER, view["max_iter"] - ITER_STEP)
            return
        if symbol == key.BRACKETRIGHT:
            view["max_iter"] = min(MAX_MAX_ITER, view["max_iter"] + ITER_STEP)
            return

    @window.event
    def on_mouse_drag(_x, _y, dx, dy, buttons, _modifiers):
        if buttons & pyglet.window.mouse.LEFT:
            # Pan: keep the world point under the cursor pinned to it. With
            # world = center + (pixel - half) * scale on both axes (see
            # screen_to_world), moving the cursor by (dx, dy) pixels means
            # moving the center by (-dx, -dy) * scale. No y-flip: pyglet's
            # dy is positive upward, the same direction as world y.
            view["cx"] -= dx * view["scale"]
            view["cy"] -= dy * view["scale"]

    @window.event
    def on_mouse_scroll(x, y, _scroll_x, scroll_y):
        # A purely horizontal scroll carries no zoom direction; ignore it
        # instead of treating it as "zoom out".
        if scroll_y == 0:
            return
        # Cursor-anchored zoom: keep the world point under the cursor pinned.
        wx, wy = screen_to_world(x, y)
        factor = 0.9 if scroll_y > 0 else 1.1
        view["scale"] *= factor
        # Back-solve cx, cy so screen pixel (x, y) still maps to (wx, wy).
        view["cx"] = wx - (x - WIDTH / 2.0) * view["scale"]
        view["cy"] = wy - (y - HEIGHT / 2.0) * view["scale"]

    @window.event
    def on_draw():
        nonlocal frame_count, fps_time

        window.clear()

        # (a) Map the PBO so CUDA can write to it. This gives us a Buffer
        #     whose .handle is a device pointer pointing into the GL PBO.
        with resource.map(stream=stream) as buf:
            launch(
                stream,
                config,
                kernel,
                np.uint64(palette_tex.handle),  # bindless texture handle
                buf.handle,  # output PBO (RGBA8)
                np.int32(WIDTH),
                np.int32(HEIGHT),
                np.float64(view["cx"]),
                np.float64(view["cy"]),
                np.float64(view["scale"]),
                np.int32(view["max_iter"]),
            )
        # Unmap happens automatically when the `with` block exits.

        # (b) Tell OpenGL to copy the PBO contents into our texture.
        copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT)

        # (c) Draw the texture to the screen.
        draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id)

        # FPS counter (shown in window title)
        frame_count += 1
        now = time.monotonic()
        if now - fps_time >= 1.0:
            fps = frame_count / (now - fps_time)
            zoom = 1.0 / view["scale"] if view["scale"] > 0 else 0.0
            window.set_caption(
                "cuda.core Array/Texture - Mandelbrot"
                f" | zoom {zoom:.3e}x"
                f" | center ({view['cx']:.6f}, {view['cy']:.6f})"
                f" | iter {view['max_iter']}"
                f" | {fps:.0f} FPS"
            )
            frame_count = 0
            fps_time = now

    @window.event
    def on_close():
        # `launch` is asynchronous, so the last frame's kernel may still be
        # sampling the palette texture. Drain the stream before destroying
        # anything it uses.
        stream.sync()

        # Release dependents before the objects they were built from: the
        # GL registration, then the texture handle, then the Array behind
        # it, and the stream last. pyglet owns the event loop here, so
        # everything is released explicitly.
        resource.close()
        palette_tex.close()
        palette_arr.close()
        stream.close()

    pyglet.app.run(interval=0)
+ double zr = 0.0; + double zi = 0.0; + double zr2 = 0.0; + double zi2 = 0.0; + int iter = 0; + while (iter < max_iter && (zr2 + zi2) <= 4.0) { + zi = 2.0 * zr * zi + c_im; + zr = zr2 - zi2 + c_re; + zr2 = zr * zr; + zi2 = zi * zi; + ++iter; + } + + unsigned char r, g, b; + if (iter >= max_iter) { + // Inside the set (or close enough): solid black. + r = 0; + g = 0; + b = 0; + } else { + // Smooth iteration count: + // mu = iter + 1 - log(log(|z|)) / log(2) + // = iter + 1 - log(0.5 * log(|z|^2)) / log(2) + // At escape, |z|^2 > 4, so 0.5 * log(|z|^2) > log(2) > 0 -- the + // outer log is well-defined. Compute in double, narrow to float + // for the palette lookup. + double log_zn = 0.5 * log(zr2 + zi2); + double nu = log(log_zn) / log(2.0); + float mu = (float)((double)(iter + 1) - nu); + + // Cycle through the palette: 0.02 controls how quickly we wrap + // through the gradient as the iteration count climbs. + float t = fmodf(mu * 0.02f, 1.0f); + if (t < 0.0f) t += 1.0f; // fmodf can return negative for negative mu + + float4 rgba = tex1D(palette, t); + + // Clamp before narrowing to bytes. + float fr = rgba.x; if (fr < 0.0f) fr = 0.0f; if (fr > 1.0f) fr = 1.0f; + float fg = rgba.y; if (fg < 0.0f) fg = 0.0f; if (fg > 1.0f) fg = 1.0f; + float fb = rgba.z; if (fb < 0.0f) fb = 0.0f; if (fb > 1.0f) fb = 1.0f; + r = (unsigned char)(fr * 255.0f); + g = (unsigned char)(fg * 255.0f); + b = (unsigned char)(fb * 255.0f); + } + + int idx = (y * width + x) * 4; + output[idx + 0] = r; + output[idx + 1] = g; + output[idx + 2] = b; + output[idx + 3] = 255; +} +""" + +# GLSL shaders -- these just display a texture on a fullscreen rectangle. +# Nothing CUDA-specific here. 
+ +VERTEX_SHADER_SOURCE = """#version 330 core +in vec2 position; +in vec2 texcoord; +out vec2 v_texcoord; +void main() { + gl_Position = vec4(position, 0.0, 1.0); + v_texcoord = texcoord; +} +""" + +FRAGMENT_SHADER_SOURCE = """#version 330 core +in vec2 v_texcoord; +out vec4 fragColor; +uniform sampler2D tex; +void main() { + fragColor = texture(tex, v_texcoord); +} +""" + + +if __name__ == "__main__": + main() diff --git a/cuda_core/examples/gl_interop_mipmap_lod.py b/cuda_core/examples/gl_interop_mipmap_lod.py new file mode 100644 index 00000000000..38b09513464 --- /dev/null +++ b/cuda_core/examples/gl_interop_mipmap_lod.py @@ -0,0 +1,717 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +# ################################################################################ +# +# This example demonstrates the new cuda.core texture/surface stack: +# MipmappedArray, SurfaceObject, and a TextureObject that does trilinear +# (LINEAR mipmap + LINEAR filter) sampling with user-controlled LOD bias. +# Requires pyglet. +# +# ################################################################################ + +# What this example teaches +# ========================= +# How to allocate a mipmap pyramid as a single MipmappedArray, populate each +# level from a CUDA kernel by binding it as a SurfaceObject, and then sample +# the whole pyramid from a TextureObject with manual LOD bias. +# +# How it works +# ============ +# A mipmap pyramid is a stack of progressively-halved images of the same +# texture. The base level (level 0) holds the highest-resolution version; each +# subsequent level is a 2x2 box-filtered downsample of the level below it: +# +# level 0: 512 x 512 <- highest detail +# level 1: 256 x 256 +# level 2: 128 x 128 +# ... 
+# level 9: 1 x 1 <- a single average color +# +# At sample time, the GPU picks the mip level that best matches the on-screen +# size of the texel, optionally blending between adjacent levels (trilinear). +# Selecting a coarser level than the "right" one is called a positive LOD bias +# and produces a softer/blurrier image; a negative bias selects finer levels +# (sharper but more aliased when undersampled). +# +# +----------------------+ +-----------------------+ +# | MipmappedArray | | TextureObject | +# | (single allocation, | <--- | (samples the whole | +# | 10 mip levels) | | pyramid w/ trilinear | +# +----------------------+ | filtering) | +# ^ ^ +-----------------------+ +# | | +# | +---- one SurfaceObject per level, used at BUILD time only +# | to let a kernel write pixels into that level. +# | +# +----------- get_level(L) returns a NON-OWNING Array view of level L; +# the storage belongs to the parent MipmappedArray. +# +# STARTUP -- one-time mipmap build +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# 1. Allocate MipmappedArray (10 levels, float4 RGBA, surface_load_store=True). +# 2. Level 0: launch `seed_base` kernel -> SurfaceObject -> high-frequency +# procedural pattern. +# 3. For L = 1..num_levels-1: launch `downsample` kernel: +# - reads level L-1 through a TextureObject (POINT-filtered) +# - writes level L through a SurfaceObject +# - 4-sample box average of the parent's 2x2 footprint. +# +# PER FRAME (render loop) +# ~~~~~~~~~~~~~~~~~~~~~~~ +# The display TextureObject samples the whole pyramid with `tex2DLod`, +# where the LOD is computed per-pixel as `log2(zoom) + lod_bias`. The result +# is written to a GL PBO via GraphicsResource, then drawn as a textured quad. +# +# What you should see +# =================== +# A 512x512 procedural pattern (concentric rings + diagonal grid) shown +# stretched across the window. 
Use the mouse wheel to zoom in/out (this +# implicitly changes the LOD), and use the bracket keys `[` / `]` to add a +# manual LOD bias on top of that. Press `R` to reset. +# +# Mouse wheel zoom in / out +# [ LOD bias -= 0.25 (sharper, more aliased) +# ] LOD bias += 0.25 (blurrier, samples a coarser level) +# R reset zoom + bias +# Escape / close quit +# +# The window title shows the current zoom, manual bias, and effective LOD. +# Close the window or press Escape to exit. +# + +# /// script +# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"] +# /// + +import ctypes +import math +import sys +import time + +import numpy as np + +from cuda.core import ( + AddressMode, + Array, + ArrayFormat, + Device, + FilterMode, + GraphicsResource, + LaunchConfig, + MipmappedArray, + Program, + ProgramOptions, + ReadMode, + ResourceDescriptor, + SurfaceObject, + TextureDescriptor, + TextureObject, + launch, +) + +# --------------------------------------------------------------------------- +# Configuration (feel free to change these) +# --------------------------------------------------------------------------- +WIDTH = 800 +HEIGHT = 600 +BASE_SIZE = 512 # Texture base-level edge length (must be a power of two). +LOD_BIAS_STEP = 0.25 + + +# ============================= Helper functions ============================= +# +# The functions below set up CUDA, OpenGL, and the mipmap pyramid. If you're +# here to learn about MipmappedArray / SurfaceObject / mipmapped TextureObject, +# you can skip straight to main() -- the interesting part is there. These +# helpers exist so that main() reads like a short story. 
def build_mipmap_pyramid(mip, num_levels, stream, kernels):
    """Populate every level of `mip` using SurfaceObject writes.

    Strategy
    --------
    * Level 0 is filled directly by `seed_base`, which writes a procedural
      pattern through a SurfaceObject bound to level 0.
    * Each subsequent level L is filled by `downsample`, which reads level L-1
      through a POINT-filtered TextureObject and box-averages a 2x2 footprint
      into level L through a SurfaceObject.
    * All launches are issued on the single `stream`, so the kernels run in
      issue order and one `stream.sync()` at the end waits for the whole
      chain.
    * Every SurfaceObject / TextureObject passed to a kernel is kept open
      until AFTER that sync.  Closing a handle is a host-side call made while
      the kernel that uses it may still be queued; the driver documentation
      does not describe handle destruction as stream-ordered, so the handles
      are only released once the stream has drained.

    Parameters
    ----------
    mip
        The MipmappedArray to fill; `get_level(L)` is used to obtain a
        non-owning Array view of each level.
    num_levels : int
        Number of levels to populate (level 0 .. num_levels - 1).
    stream
        Stream on which every kernel is launched and which is synchronized
        before returning.
    kernels : dict
        Must contain the compiled kernels under the keys "seed_base" and
        "downsample".
    """
    import contextlib  # function-scope import, as create_window() does for pyglet

    block = (16, 16, 1)

    def grid_for(edge):
        """Return the grid that covers an `edge` x `edge` image with `block`."""
        return (
            (edge + block[0] - 1) // block[0],
            (edge + block[1] - 1) // block[1],
            1,
        )

    with contextlib.ExitStack() as live_handles:
        # ---- Level 0: seed the base image ---------------------------------
        # get_level() returns a non-owning view, so it is NOT entered as a
        # context manager; the parent MipmappedArray owns the storage.
        base_arr = mip.get_level(0)
        base_surf = live_handles.enter_context(SurfaceObject.from_array(base_arr))
        launch(
            stream,
            LaunchConfig(grid=grid_for(BASE_SIZE), block=block),
            kernels["seed_base"],
            np.uint64(base_surf.handle),
            np.int32(BASE_SIZE),
            np.int32(BASE_SIZE),
        )

        # ---- Levels 1..N-1: box-filter downsample --------------------------
        # Each iteration reads level (L-1) through a TextureObject and writes
        # level L through a SurfaceObject.
        src_tex_desc = TextureDescriptor(
            address_mode=AddressMode.CLAMP,
            filter_mode=FilterMode.POINT,  # explicit per-texel reads
            read_mode=ReadMode.ELEMENT_TYPE,
            normalized_coords=False,  # integer pixel coordinates
        )
        for level in range(1, num_levels):
            parent_size = BASE_SIZE >> (level - 1)
            level_size = BASE_SIZE >> level
            if level_size < 1:
                # num_levels exceeds what BASE_SIZE can be halved into.
                break

            src_arr = mip.get_level(level - 1)
            dst_arr = mip.get_level(level)
            src_tex = live_handles.enter_context(
                TextureObject.from_descriptor(
                    resource=ResourceDescriptor.from_array(src_arr),
                    texture_descriptor=src_tex_desc,
                )
            )
            dst_surf = live_handles.enter_context(SurfaceObject.from_array(dst_arr))
            launch(
                stream,
                LaunchConfig(grid=grid_for(level_size), block=block),
                kernels["downsample"],
                np.uint64(src_tex.handle),
                np.uint64(dst_surf.handle),
                np.int32(parent_size),
                np.int32(level_size),
            )

        # One sync is enough for ordering -- the whole build chain ran on this
        # stream.  It must happen before the ExitStack unwinds so that no
        # handle is closed while a kernel that references it is still pending.
        stream.sync()
def main():
    """Build the mipmap pyramid, then run the interactive LOD viewer.

    Allocates the MipmappedArray, fills every level, wraps the whole pyramid
    in a trilinear TextureObject, registers a GL pixel buffer with CUDA and
    installs the pyglet handlers before entering the pyglet event loop.
    """
    # --- Step 1: CUDA setup (compile kernels, create stream) ---
    dev, stream, kernels, _arch = setup_cuda()

    # --- Step 2: Allocate the pyramid and fill every level ---
    # Kernel-side writes need surface_load_store=True.
    num_levels = int(math.log2(BASE_SIZE)) + 1
    top_level = float(num_levels - 1)
    mip = MipmappedArray.from_descriptor(
        shape=(BASE_SIZE, BASE_SIZE),
        format=ArrayFormat.FLOAT32,
        num_channels=4,
        num_levels=num_levels,
        surface_load_store=True,
    )
    build_mipmap_pyramid(mip, num_levels, stream, kernels)

    # --- Step 3: One trilinear texture over the WHOLE pyramid ---
    # The descriptor's own mipmap_level_bias is left at 0.0: the user bias is
    # passed to the display kernel as an argument and added to the LOD there,
    # so the TextureObject never has to be rebuilt when the bias changes.
    sampler_desc = TextureDescriptor(
        address_mode=AddressMode.WRAP,
        filter_mode=FilterMode.LINEAR,
        read_mode=ReadMode.ELEMENT_TYPE,
        normalized_coords=True,
        mipmap_filter_mode=FilterMode.LINEAR,  # trilinear
        mipmap_level_bias=0.0,
        min_mipmap_level_clamp=0.0,
        max_mipmap_level_clamp=top_level,
    )
    display_tex = TextureObject.from_descriptor(
        resource=ResourceDescriptor.from_mipmapped_array(mip),
        texture_descriptor=sampler_desc,
    )

    # --- Step 4: Window plus the GL/CUDA bridge ---
    window, gl, pyglet = create_window()
    shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT)
    pbo_id, _ = create_pixel_buffer(gl, WIDTH, HEIGHT)
    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")

    # --- Step 5: Render-loop state ---
    # `zoom` scales the sampled UV range around the window centre (the display
    # kernel computes uv = (uv - 0.5) * zoom + 0.5).  zoom > 1 therefore shows
    # MORE of the WRAP-tiled texture, so texels get smaller on screen and a
    # coarser level is selected via log2(zoom); zoom < 1 magnifies the texture
    # and the LOD clamps at level 0.  `lod_bias` is a manual offset on top.
    zoom = 1.0
    lod_bias = 0.0
    frames = 0
    last_report = time.monotonic()

    threads = (16, 16, 1)
    blocks = (-(-WIDTH // threads[0]), -(-HEIGHT // threads[1]), 1)  # ceil-div
    config = LaunchConfig(grid=blocks, block=threads)

    def effective_lod():
        # Mirrors the display kernel's formula, clamped to the legal range so
        # the window title matches what the GPU actually samples.
        unclamped = math.log2(max(zoom, 1e-6)) + lod_bias
        return max(0.0, min(top_level, unclamped))

    @window.event
    def on_draw():
        nonlocal frames, last_report

        window.clear()

        # Map the PBO for CUDA, let the display kernel sample the pyramid and
        # write RGBA8 into it; leaving the `with` block unmaps the PBO again.
        with resource.map(stream=stream) as buf:
            launch(
                stream,
                config,
                kernels["display"],
                buf.handle,
                np.int32(WIDTH),
                np.int32(HEIGHT),
                np.uint64(display_tex.handle),
                np.float32(zoom),
                np.float32(lod_bias),
                np.float32(top_level),
            )

        copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT)
        draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id)

        # Refresh the title roughly once per second.
        frames += 1
        now = time.monotonic()
        elapsed = now - last_report
        if elapsed >= 1.0:
            fps = frames / elapsed
            window.set_caption(
                "MipmappedArray LOD viewer "
                f"({WIDTH}x{HEIGHT}, {fps:.0f} FPS) -- "
                f"zoom={zoom:.2f}, "
                f"bias={lod_bias:+.2f}, "
                f"LOD={effective_lod():.2f}"
            )
            frames = 0
            last_report = now

    @window.event
    def on_mouse_scroll(_x, _y, _scroll_x, scroll_y):
        nonlocal zoom
        # Each wheel step scales zoom by 1.125; the result is clamped to
        # [1/64, 64].
        if scroll_y == 0:
            return
        scaled = zoom * 1.125 ** scroll_y
        zoom = max(1.0 / 64.0, min(64.0, scaled))

    @window.event
    def on_key_press(symbol, _modifiers):
        nonlocal zoom, lod_bias
        keys = pyglet.window.key
        if symbol == keys.R:
            zoom, lod_bias = 1.0, 0.0
        elif symbol == keys.BRACKETLEFT:
            lod_bias = max(-float(num_levels), lod_bias - LOD_BIAS_STEP)
        elif symbol == keys.BRACKETRIGHT:
            lod_bias = min(float(num_levels), lod_bias + LOD_BIAS_STEP)

    @window.event
    def on_close():
        # Release the CUDA-side objects in reverse construction order.
        resource.close()
        display_tex.close()
        mip.close()
        stream.close()

    pyglet.app.run(interval=0)
+// -------------------------------------------------------------------------- +__device__ __forceinline__ float clampf(float v, float a, float b) { + return fminf(fmaxf(v, a), b); +} + +// CUDA does not ship a builtin "fract" so we provide one (used by seed_base). +__device__ __forceinline__ float fracf(float v) { + return v - floorf(v); +} + +// -------------------------------------------------------------------------- +// seed_base: write a procedural high-frequency pattern to level 0. +// +// surf is a SurfaceObject bound to the level-0 Array (float4 RGBA). The +// pattern is a colorful blend of concentric rings, a diagonal grid, and a +// radial sweep, designed to have plenty of fine detail so the difference +// between mip levels is visually obvious. +// -------------------------------------------------------------------------- +extern "C" __global__ +void seed_base(cudaSurfaceObject_t surf, int width, int height) { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= width || y >= height) return; + + float u = ((float)x + 0.5f) / (float)width; + float v = ((float)y + 0.5f) / (float)height; + + // Concentric rings centered on the image. + float cx = u - 0.5f; + float cy = v - 0.5f; + float r = sqrtf(cx * cx + cy * cy); + float rings = 0.5f + 0.5f * sinf(r * 80.0f); + + // Diagonal grid -- thin lines about every 1/16 of the image. + float gx = fabsf(fracf(u * 16.0f) - 0.5f); + float gy = fabsf(fracf(v * 16.0f) - 0.5f); + float grid = (gx < 0.05f || gy < 0.05f) ? 1.0f : 0.0f; + + // Angular sweep gives the rings some color variation. + float theta = atan2f(cy, cx); + float sweep = 0.5f + 0.5f * sinf(theta * 6.0f); + + // Combine into an RGBA color. Keep values in [0, 1]. 
+ float red = clampf(rings * (0.4f + 0.6f * sweep) + 0.3f * grid, 0.0f, 1.0f); + float green = clampf(rings * (0.6f - 0.4f * sweep) + 0.3f * grid, 0.0f, 1.0f); + float blue = clampf(0.4f + 0.4f * sweep + 0.5f * grid, 0.0f, 1.0f); + float alpha = 1.0f; + + float4 px = make_float4(red, green, blue, alpha); + + // Surface writes index x in BYTES (this is the classic gotcha). + surf2Dwrite(px, surf, x * (int)sizeof(float4), y); +} + +// -------------------------------------------------------------------------- +// downsample: box-filter a 2x2 footprint of the parent level into one texel. +// +// src is a POINT-filtered TextureObject bound to level (L-1). +// dst is a SurfaceObject bound to level L. +// (dst_w, dst_h) is the size of level L. +// (src_w = 2 * dst_w, src_h = 2 * dst_h is implicit and unused; we pass it +// only for the bounds check.) +// +// Texture coordinates: tex2D with non-normalized coords returns texel (i, j) +// when sampled at (i + 0.5, j + 0.5). So for output texel (x, y) the four +// parent texels live at parent-coords (2x + 0.5, 2y + 0.5), (2x + 1.5, ...). +// -------------------------------------------------------------------------- +extern "C" __global__ +void downsample(cudaTextureObject_t src, + cudaSurfaceObject_t dst, + int src_size, + int dst_size) { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= dst_size || y >= dst_size) return; + + float fx = 2.0f * (float)x; + float fy = 2.0f * (float)y; + + float4 a = tex2D(src, fx + 0.5f, fy + 0.5f); + float4 b = tex2D(src, fx + 1.5f, fy + 0.5f); + float4 c = tex2D(src, fx + 0.5f, fy + 1.5f); + float4 d = tex2D(src, fx + 1.5f, fy + 1.5f); + + float4 px; + px.x = 0.25f * (a.x + b.x + c.x + d.x); + px.y = 0.25f * (a.y + b.y + c.y + d.y); + px.z = 0.25f * (a.z + b.z + c.z + d.z); + px.w = 0.25f * (a.w + b.w + c.w + d.w); + + // Silence unused-variable warning for the convenience parameter. 
+ (void)src_size; + + surf2Dwrite(px, dst, x * (int)sizeof(float4), y); +} + +// -------------------------------------------------------------------------- +// display: per-pixel mipmap sample with manual LOD bias. +// +// tex is a TextureObject built from the whole MipmappedArray (LINEAR + +// LINEAR mipmap filter, normalized coords). For each output pixel we compute +// a single per-thread LOD from `zoom` and `lod_bias`, then sample with +// tex2DLod. Output is written as RGBA8 into a linear byte buffer. +// -------------------------------------------------------------------------- +extern "C" __global__ +void display(unsigned char *output, + int width, + int height, + cudaTextureObject_t tex, + float zoom, + float lod_bias, + float max_lod) { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= width || y >= height) return; + + // Normalized window coords in [0, 1]. + float u = ((float)x + 0.5f) / (float)width; + float v = ((float)y + 0.5f) / (float)height; + + // Zoom around the window center so the user sees the effect symmetrically. + u = (u - 0.5f) * zoom + 0.5f; + v = (v - 0.5f) * zoom + 0.5f; + + // LOD: zoom > 1 means the texture is being stretched (each texel covers + // more screen area), which intuitively corresponds to selecting a coarser + // (higher) mip level. log2(zoom) yields exactly that. lod_bias is added + // on top, and the final value is clamped to the legal range. + float lod = log2f(fmaxf(zoom, 1e-6f)) + lod_bias; + lod = clampf(lod, 0.0f, max_lod); + + float4 c = tex2DLod(tex, u, v, lod); + + int idx = (y * width + x) * 4; + output[idx + 0] = (unsigned char)(clampf(c.x, 0.0f, 1.0f) * 255.0f); + output[idx + 1] = (unsigned char)(clampf(c.y, 0.0f, 1.0f) * 255.0f); + output[idx + 2] = (unsigned char)(clampf(c.z, 0.0f, 1.0f) * 255.0f); + output[idx + 3] = 255; +} +""" + +# GLSL shaders -- these just display a texture on a fullscreen rectangle. +# Nothing CUDA-specific here. 
+ +VERTEX_SHADER_SOURCE = """#version 330 core +in vec2 position; +in vec2 texcoord; +out vec2 v_texcoord; +void main() { + gl_Position = vec4(position, 0.0, 1.0); + v_texcoord = texcoord; +} +""" + +FRAGMENT_SHADER_SOURCE = """#version 330 core +in vec2 v_texcoord; +out vec4 fragColor; +uniform sampler2D tex; +void main() { + fragColor = texture(tex, v_texcoord); +} +""" + + +if __name__ == "__main__": + main() diff --git a/cuda_core/examples/gl_interop_ocean.py b/cuda_core/examples/gl_interop_ocean.py new file mode 100644 index 00000000000..177e7b8d320 --- /dev/null +++ b/cuda_core/examples/gl_interop_ocean.py @@ -0,0 +1,836 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +# ################################################################################ +# +# This example demonstrates cuda.core.Array, TextureObject, and SurfaceObject +# in combination with GraphicsResource for CUDA/OpenGL interop. A real-time +# Gerstner-wave ocean is rebuilt every frame: a heightmap Array is rewritten +# through a SurfaceObject, sampled through a TextureObject with LINEAR + WRAP +# filtering for normal estimation, and shaded with Phong + Fresnel sky +# reflection straight into an OpenGL PBO. Requires pyglet. +# +# ################################################################################ + +# What this example teaches +# ========================= +# - How to use a CUDA Array as a typed heightmap that is simultaneously +# written by one kernel (via SurfaceObject) and sampled by another (via +# TextureObject) within the same frame. +# - How LINEAR filtering + WRAP addressing + normalized coordinates gives +# essentially-free bilinear neighbor lookups for finite-difference normal +# estimation on a tiling heightmap. +# - How to compose Array/TextureObject/SurfaceObject with GraphicsResource so +# the entire render path never leaves the GPU. 
+# +# How it works +# ============ +# Gerstner waves are a sum of N moving sinusoids with directional vectors -- +# a classic ocean approximation that looks shockingly close to FFT ocean at a +# glance without any external library dependencies. For each heightmap texel: +# +# h(x, z, t) = sum_i A_i * sin( D_i . (x, z) * k_i - w_i * t + phi_i ) +# +# where k_i = 2*pi / wavelength_i and w_i = sqrt(g * k_i) is the dispersion +# relation for deep-water gravity waves. We bake 12 waves with hand-picked +# directions / wavelengths / amplitudes / phases into the kernel as constant +# arrays. Weather presets just scale amplitude and speed at the host level. +# +# PER FRAME (all on GPU) +# ~~~~~~~~~~~~~~~~~~~~~~ +# +-----------------+ surf2Dwrite +--------------+ +# | update_height | --------------> | heightmap | +# | kernel | | Array | +# +-----------------+ | (FLOAT32) | +# +--------------+ +# | +# | tex2D (LINEAR + WRAP) +# v +# +-----------------+ write RGBA8 +# | render_ocean | ----------------> PBO +# | kernel | +# +-----------------+ +# +# Why LINEAR + WRAP + normalized coords? +# -------------------------------------- +# WRAP / MIRROR addressing modes require normalized coordinates (see the CUDA +# Programming Guide). The ocean naturally tiles, so WRAP gives free seamless +# horizon repetition. LINEAR filtering means our four-tap finite-difference +# normal estimate gets bilinear interpolation between texels for free, which +# smooths the lighting noticeably without a single extra ALU instruction. +# +# Channel byte width in surf2Dwrite +# --------------------------------- +# surf2Dwrite takes the x coordinate in BYTES, not in elements. For a +# single-channel float surface that means `x * sizeof(float)` = `x * 4`. +# Getting this wrong silently corrupts every other column. +# +# What you should see +# =================== +# A window showing a real-time animated ocean rendered with Phong shading and +# a Fresnel-modulated sky reflection. 
Drag with the left mouse button to +# orbit, scroll to zoom, press 1/2/3 to switch weather presets (calm / +# breezy / stormy), press P to pause animation, Escape to exit. Window title +# shows preset name and FPS. +# + +# /// script +# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"] +# /// + +import ctypes +import math +import sys +import time + +import numpy as np + +from cuda.core import ( + AddressMode, + Array, + ArrayFormat, + Device, + FilterMode, + GraphicsResource, + LaunchConfig, + Program, + ProgramOptions, + ReadMode, + ResourceDescriptor, + SurfaceObject, + TextureDescriptor, + TextureObject, + launch, +) + +# --------------------------------------------------------------------------- +# Window and heightmap dimensions (feel free to change these) +# --------------------------------------------------------------------------- +WIDTH = 1024 +HEIGHT = 768 +GRID = 512 # heightmap resolution (GRID x GRID texels) + +# Weather presets: (amplitude_scale, speed_scale, label). +# These are applied as multiplicative scalars on top of the per-wave amplitude +# and angular-frequency arrays baked into the kernel, so a single compiled +# binary can render every preset. +PRESETS = { + "1": (0.35, 0.7, "calm"), + "2": (1.00, 1.0, "breezy"), + "3": (1.85, 1.4, "stormy"), +} +DEFAULT_PRESET = "2" + +# Initial camera (orbit-around-origin) parameters. +INITIAL_YAW = 0.6 # radians around world-y +INITIAL_PITCH = 0.35 # radians above the horizon (small positive = looking down) +INITIAL_DISTANCE = 5.0 # camera distance from origin +PITCH_LIMIT = 1.4 # clamp |pitch| to keep basis non-degenerate (< pi/2) +ZOOM_MIN = 1.5 +ZOOM_MAX = 30.0 + + +# ============================= Helper functions ============================= +# +# The functions below set up CUDA and OpenGL. If you're here to learn about +# Array/TextureObject/SurfaceObject, skip ahead to main() -- the interesting +# part is there. 
def setup_cuda():
    """Select device 0, compile both ocean kernels, and build launch configs.

    Returns ``(dev, stream, kernels, configs)``.  ``kernels`` and ``configs``
    are dicts sharing the keys ``"update"`` and ``"render"``:

    - ``"update"`` is the ``update_height`` kernel, launched over the
      heightmap (GRID x GRID texels).
    - ``"render"`` is the ``render_ocean`` kernel, launched over the output
      pixels (WIDTH x HEIGHT).

    Prints a message to stderr and exits with status 1 when the device's
    compute-capability major version is below 3.
    """
    dev = Device(0)
    dev.set_current()

    # Surface load/store has existed since SM 2.0, but bindless surface
    # objects (cuSurfObjectCreate) require SM 3.0+, so refuse older devices.
    cc = dev.compute_capability
    if cc.major < 3:
        print(
            "This example requires a GPU with compute capability >= 3.0 for "
            f"bindless surface objects. Found sm_{cc.major}{cc.minor}.",
            file=sys.stderr,
        )
        sys.exit(1)

    stream = dev.create_stream()

    # Compile as C++ so the templated tex2D overload resolves.
    module = Program(
        KERNEL_SOURCE,
        code_type="c++",
        options=ProgramOptions(std="c++17", arch=f"sm_{dev.arch}"),
    ).compile(
        "cubin",
        name_expressions=("update_height", "render_ocean"),
    )

    kernels = {
        key: module.get_kernel(entry_point)
        for key, entry_point in (
            ("update", "update_height"),
            ("render", "render_ocean"),
        )
    }

    threads = (16, 16, 1)

    def blocks_covering(extent_x, extent_y):
        # Round up so a partially filled block still covers the last
        # rows/columns of the domain.
        return (
            (extent_x + threads[0] - 1) // threads[0],
            (extent_y + threads[1] - 1) // threads[1],
            1,
        )

    configs = {
        "update": LaunchConfig(grid=blocks_covering(GRID, GRID), block=threads),
        "render": LaunchConfig(grid=blocks_covering(WIDTH, HEIGHT), block=threads),
    }
    return dev, stream, kernels, configs
sys.exit(1) + + window = pyglet.window.Window( + WIDTH, + HEIGHT, + caption="cuda.core Array/Texture/Surface - Gerstner Ocean", + vsync=False, + ) + return window, _gl, pyglet + + +def create_display_resources(gl, width, height): + """Create the GL objects needed to show a texture on screen. + + Standard OpenGL boilerplate -- not CUDA-specific. Returns + (shader_program, vao_id, tex_id). The shader_program is a pyglet + ShaderProgram object (must be kept alive). + """ + from pyglet.graphics.shader import Shader, ShaderProgram + + vert = Shader(VERTEX_SHADER_SOURCE, "vertex") + frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment") + shader_prog = ShaderProgram(vert, frag) + + # Fullscreen quad (two triangles covering the entire window). + quad_verts = np.array( + [ + -1, -1, 0, 0, + 1, -1, 1, 0, + 1, 1, 1, 1, + -1, -1, 0, 0, + 1, 1, 1, 1, + -1, 1, 0, 1, + ], + dtype=np.float32, + ) + + vao = ctypes.c_uint(0) + gl.glGenVertexArrays(1, ctypes.byref(vao)) + gl.glBindVertexArray(vao.value) + + vbo = ctypes.c_uint(0) + gl.glGenBuffers(1, ctypes.byref(vbo)) + gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value) + gl.glBufferData( + gl.GL_ARRAY_BUFFER, + quad_verts.nbytes, + quad_verts.ctypes.data_as(ctypes.c_void_p), + gl.GL_STATIC_DRAW, + ) + + stride = 4 * 4 + pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position") + gl.glEnableVertexAttribArray(pos_loc) + gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0)) + tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord") + gl.glEnableVertexAttribArray(tc_loc) + gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8)) + gl.glBindVertexArray(0) + + tex = ctypes.c_uint(0) + gl.glGenTextures(1, ctypes.byref(tex)) + gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value) + gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_LINEAR) + gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_LINEAR) + gl.glTexImage2D( + gl.GL_TEXTURE_2D, 0, 
def create_pixel_buffer(gl, width, height):
    """Allocate a pixel-unpack buffer (PBO) sized for one RGBA8 frame.

    Returns ``(buffer_id, size_in_bytes)`` where the size is
    ``width * height * 4``.  Storage is allocated without initial data, and
    the GL_PIXEL_UNPACK_BUFFER binding is reset to 0 before returning.
    """
    target = gl.GL_PIXEL_UNPACK_BUFFER
    frame_bytes = width * height * 4  # four bytes per RGBA8 pixel

    buffer_id = ctypes.c_uint(0)
    gl.glGenBuffers(1, ctypes.byref(buffer_id))
    gl.glBindBuffer(target, buffer_id.value)
    gl.glBufferData(target, frame_bytes, None, gl.GL_DYNAMIC_DRAW)
    gl.glBindBuffer(target, 0)
    return buffer_id.value, frame_bytes


def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height):
    """Upload the PBO's pixels into the GL texture (GPU-to-GPU).

    Binds ``pbo_id`` as the pixel-unpack buffer and ``tex_id`` as the 2D
    texture, updates the full ``width`` x ``height`` region of mip level 0 as
    RGBA / unsigned byte, then unbinds the pixel-unpack buffer.
    """
    unpack_target = gl.GL_PIXEL_UNPACK_BUFFER
    texture_target = gl.GL_TEXTURE_2D

    gl.glBindBuffer(unpack_target, pbo_id)
    gl.glBindTexture(texture_target, tex_id)
    # While a pixel-unpack buffer is bound, the trailing data argument is an
    # offset into that buffer rather than a host pointer; None is presumably
    # passed through as offset 0 by the GL bindings.
    gl.glTexSubImage2D(
        texture_target, 0, 0, 0, width, height,
        gl.GL_RGBA, gl.GL_UNSIGNED_BYTE, None,
    )
    gl.glBindBuffer(unpack_target, 0)


def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id):
    """Render ``tex_id`` onto the fullscreen quad stored in ``vao_id``.

    Activates ``shader_prog``, binds the texture and vertex array, issues a
    six-vertex GL_TRIANGLES draw, then unbinds the vertex array and program.
    """
    vertex_count = 6  # two triangles

    gl.glUseProgram(shader_prog.id)
    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
    gl.glBindVertexArray(vao_id)
    gl.glDrawArrays(gl.GL_TRIANGLES, 0, vertex_count)
    gl.glBindVertexArray(0)
    gl.glUseProgram(0)


def make_heightmap_array():
    """Allocate the GRID x GRID single-channel FLOAT32 heightmap Array.

    The Array is created with ``surface_load_store=True``.
    """
    heightmap = Array.from_descriptor(
        shape=(GRID, GRID),
        format=ArrayFormat.FLOAT32,
        num_channels=1,
        surface_load_store=True,
    )
    return heightmap
def orbit_camera_position(yaw, pitch, distance):
    """Return the world-space ``(x, y, z)`` of a camera orbiting the origin.

    The camera sits ``distance`` away from the origin and looks at it; world
    up is +y.  ``pitch`` is the elevation angle above the xz-plane (0 places
    the camera on the horizon, +1.4 is nearly directly overhead) and ``yaw``
    rotates the camera around the y axis.  Both angles are in radians.
    """
    # Elevation depends only on pitch.
    elevation = distance * math.sin(pitch)
    # Radius of the orbit circle once projected onto the xz-plane.
    ground_radius = distance * math.cos(pitch)
    return (
        ground_radius * math.sin(yaw),
        elevation,
        ground_radius * math.cos(yaw),
    )
+ height_arr = make_heightmap_array() + height_tex = make_height_texture(height_arr) + height_surf = SurfaceObject.from_array(height_arr) + + # --- Step 7: Camera + animation state --- + state = { + "preset": DEFAULT_PRESET, + "yaw": INITIAL_YAW, + "pitch": INITIAL_PITCH, + "distance": INITIAL_DISTANCE, + "drag": False, + "paused": False, + "t_anim": 0.0, + "t_prev": time.monotonic(), + } + + # --- Step 8: Render loop --- + frame_count = 0 + fps_time = state["t_prev"] + + @window.event + def on_draw(): + nonlocal frame_count, fps_time + + window.clear() + + # Advance animation time only when not paused, so pausing freezes the + # ocean exactly where it was rather than letting it lurch when resumed. + now = time.monotonic() + dt = now - state["t_prev"] + state["t_prev"] = now + if not state["paused"]: + state["t_anim"] += dt + t = state["t_anim"] + + amp_scale, speed_scale, _label = PRESETS[state["preset"]] + + # (a) Rebuild the heightmap for time t. + launch( + stream, + configs["update"], + kernels["update"], + np.uint64(height_surf.handle), + np.int32(GRID), + np.int32(GRID), + np.float32(t), + np.float32(amp_scale), + np.float32(speed_scale), + ) + + # (b) Render the scene: sample the heightmap through the texture, + # estimate normals via finite differences, shade with Phong + + # Fresnel sky reflection, write RGBA8 into the OpenGL PBO. + cam_x, cam_y, cam_z = orbit_camera_position( + state["yaw"], state["pitch"], state["distance"] + ) + with resource.map(stream=stream) as buf: + launch( + stream, + configs["render"], + kernels["render"], + np.uint64(height_tex.handle), + buf.handle, + np.int32(WIDTH), + np.int32(HEIGHT), + np.float32(cam_x), + np.float32(cam_y), + np.float32(cam_z), + np.float32(t), + ) + # Unmap happens automatically when the `with` block exits. + + # (c) PBO -> GL texture (GPU-to-GPU). + copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT) + + # (d) Draw the texture to the screen. 
+ draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id) + + # FPS counter (shown in window title) + frame_count += 1 + if now - fps_time >= 1.0: + fps = frame_count / (now - fps_time) + label = PRESETS[state["preset"]][2] + paused = " [paused]" if state["paused"] else "" + window.set_caption( + "cuda.core Array/Texture/Surface - Gerstner Ocean" + f" [{label}]{paused} ({WIDTH}x{HEIGHT}, {fps:.0f} FPS)" + ) + frame_count = 0 + fps_time = now + + # --- Mouse: drag to orbit, scroll to zoom ------------------------------ + @window.event + def on_mouse_press(x, y, button, modifiers): + if button == pyglet.window.mouse.LEFT: + state["drag"] = True + + @window.event + def on_mouse_release(x, y, button, modifiers): + if button == pyglet.window.mouse.LEFT: + state["drag"] = False + + @window.event + def on_mouse_drag(x, y, dx, dy, buttons, modifiers): + if not (buttons & pyglet.window.mouse.LEFT): + return + # Rotate yaw on horizontal drag, pitch on vertical drag. The yaw + # direction matches the camera moving with the cursor. + state["yaw"] -= dx * 0.005 + state["pitch"] -= dy * 0.005 + # Clamp pitch to keep the camera basis non-degenerate (never look + # straight down/up the world-y axis). + if state["pitch"] > PITCH_LIMIT: + state["pitch"] = PITCH_LIMIT + if state["pitch"] < -PITCH_LIMIT: + state["pitch"] = -PITCH_LIMIT + + @window.event + def on_mouse_scroll(x, y, scroll_x, scroll_y): + # Geometric zoom in camera distance; clamp to a sensible range. 
# ======================== GPU code (CUDA C++) ===============================
#
# KERNEL_SOURCE contains two CUDA C++ kernels:
#   - update_height: per-heightmap-texel.  Sums 12 sinusoidal waves and writes
#                    one float per texel through a surface object.
#   - render_ocean:  per-screen-pixel.  Builds a camera ray, intersects the
#                    ocean plane (y=0), samples the heightmap through a
#                    texture object, estimates the normal via finite
#                    differences, and shades with Phong + Fresnel sky
#                    reflection.  Misses go to a vertical sky gradient.
#
# The texture fetch uses the explicit tex2D<float> form: the texture-object
# overload is a template whose return type cannot be deduced from its
# arguments, so omitting <float> fails to compile.
# ============================================================================

KERNEL_SOURCE = r"""
// ---------------------------------------------------------------------------
// Wave bank: 12 Gerstner-ish waves with hand-picked parameters.
//
// Wavelengths span 0.05 .. 1.0 world units.  Amplitudes decrease with
// frequency so that long swells dominate and short ripples ride on top
// (a rough Phillips/JONSWAP-style envelope, but coarsely hand-tuned for
// visual punch rather than physical accuracy).
//
// Directions are spread non-uniformly around the unit circle to avoid the
// streaky-grid look you get from evenly-spaced directions.
// ---------------------------------------------------------------------------
__constant__ float c_dirx[12] = {
     1.000f,  0.866f,  0.500f,  0.000f, -0.500f, -0.866f,
    -1.000f, -0.940f, -0.500f,  0.174f,  0.643f,  0.940f
};
__constant__ float c_dirz[12] = {
    0.000f, 0.500f, 0.866f, 1.000f, 0.866f, 0.500f,
    0.000f, 0.342f, 0.866f, 0.985f, 0.766f, 0.342f
};
__constant__ float c_wavelen[12] = {
    1.000f, 0.730f, 0.520f, 0.380f, 0.260f, 0.190f,
    0.140f, 0.105f, 0.085f, 0.070f, 0.058f, 0.050f
};
__constant__ float c_amp[12] = {
    0.080f, 0.060f, 0.045f, 0.034f, 0.025f, 0.018f,
    0.013f, 0.010f, 0.0075f, 0.0055f, 0.0040f, 0.0030f
};
__constant__ float c_phase[12] = {
    0.00f, 1.20f, 2.10f, 0.40f, 3.70f, 5.10f,
    2.65f, 4.85f, 1.55f, 6.05f, 3.20f, 0.95f
};

// Deep-water dispersion: w = sqrt(g * k), with k = 2*pi / wavelength.
__device__ __forceinline__ float angular_freq(float wavelength) {
    const float G = 9.81f;
    float k = 6.2831853f / wavelength;
    return sqrtf(G * k);
}

// World extent (in world units) covered by one tile of the heightmap.
// The heightmap WRAPs, so the ocean tiles seamlessly every tile_extent()
// world units.
__device__ __forceinline__ float tile_extent() { return 4.0f; }

// ---------------------------------------------------------------------------
// Tiny vec3 helpers.  Kept inline + __forceinline__ so they stay free.
// ---------------------------------------------------------------------------
struct V3 { float x, y, z; };

__device__ __forceinline__ V3 v3(float x, float y, float z) {
    V3 r; r.x = x; r.y = y; r.z = z; return r;
}
__device__ __forceinline__ V3 v_add(V3 a, V3 b) {
    return v3(a.x + b.x, a.y + b.y, a.z + b.z);
}
__device__ __forceinline__ V3 v_sub(V3 a, V3 b) {
    return v3(a.x - b.x, a.y - b.y, a.z - b.z);
}
__device__ __forceinline__ V3 v_scale(V3 a, float s) {
    return v3(a.x * s, a.y * s, a.z * s);
}
__device__ __forceinline__ float v_dot(V3 a, V3 b) {
    return a.x * b.x + a.y * b.y + a.z * b.z;
}
__device__ __forceinline__ V3 v_cross(V3 a, V3 b) {
    return v3(a.y * b.z - a.z * b.y,
              a.z * b.x - a.x * b.z,
              a.x * b.y - a.y * b.x);
}
__device__ __forceinline__ V3 v_normalize(V3 a) {
    float inv = rsqrtf(fmaxf(v_dot(a, a), 1e-20f));
    return v_scale(a, inv);
}

// ---------------------------------------------------------------------------
// update_height: each thread computes one heightmap texel.
//
// Sums the 12 Gerstner waves at world position (x, z), using the
// amplitude_scale and speed_scale knobs to switch between weather presets
// without recompiling the kernel.  Writes one float via surf2Dwrite.
// ---------------------------------------------------------------------------
extern "C" __global__
void update_height(cudaSurfaceObject_t surf,
                   int width, int height,
                   float t,
                   float amp_scale, float speed_scale) {
    int ix = blockIdx.x * blockDim.x + threadIdx.x;
    int iy = blockIdx.y * blockDim.y + threadIdx.y;
    if (ix >= width || iy >= height) return;

    // Map texel (ix, iy) to world position (x, z) inside one tile.
    float inv_w = 1.0f / (float)width;
    float inv_h = 1.0f / (float)height;
    float te = tile_extent();
    float wx = ((float)ix + 0.5f) * inv_w * te;
    float wz = ((float)iy + 0.5f) * inv_h * te;

    float h = 0.0f;
    #pragma unroll
    for (int i = 0; i < 12; ++i) {
        float k = 6.2831853f / c_wavelen[i];
        float w = angular_freq(c_wavelen[i]) * speed_scale;
        float arg = (c_dirx[i] * wx + c_dirz[i] * wz) * k - w * t + c_phase[i];
        h += c_amp[i] * sinf(arg);
    }
    h *= amp_scale;

    // Single-channel float surface: byte offset is x * sizeof(float).
    surf2Dwrite(h, surf, ix * (int)sizeof(float), iy);
}

// ---------------------------------------------------------------------------
// Sample the heightmap at a world position.  Texture is normalized + WRAP,
// so we just divide world coords by tile_extent.  WRAP gives us the tiling
// for free at the horizon.
//
// The explicit <float> is required: the texture-object tex2D overload is a
// template whose return type cannot be deduced from the call arguments.
// ---------------------------------------------------------------------------
__device__ __forceinline__ float sample_height(cudaTextureObject_t tex,
                                               float wx, float wz) {
    float inv_te = 1.0f / tile_extent();
    return tex2D<float>(tex, wx * inv_te, wz * inv_te);
}

// ---------------------------------------------------------------------------
// Sky gradient: a vertical interpolation from a soft horizon to a deeper
// overhead blue.  `up_angle` is in [-1, 1] (the y component of the ray dir).
// ---------------------------------------------------------------------------
__device__ __forceinline__ V3 sky_color(float up_angle) {
    // Clamp to [0, 1] so straight-down rays still get a horizon color.
    float a = fmaxf(0.0f, fminf(1.0f, up_angle));
    // Soft pale-blue horizon
    V3 horizon = v3(0.70f, 0.82f, 0.92f);
    // Deeper blue overhead
    V3 zenith  = v3(0.18f, 0.34f, 0.62f);
    // Curve so the gradient isn't linear -- horizon stays brighter longer.
    float t = powf(a, 0.6f);
    return v_add(v_scale(horizon, 1.0f - t), v_scale(zenith, t));
}

// ---------------------------------------------------------------------------
// render_ocean: each thread shades one screen pixel.
//
//   1. Reconstruct the camera basis from cam_pos (orbiting origin, world-up).
//   2. Build a perspective ray through the pixel.
//   3. Intersect ray with y = 0 plane; if it misses, return sky gradient.
//   4. Sample heightmap at hit point; finite-difference for the normal.
//   5. Phong diffuse + specular, blended with Fresnel sky reflection.
//   6. Write RGBA8 into the OpenGL PBO.
// ---------------------------------------------------------------------------
extern "C" __global__
void render_ocean(cudaTextureObject_t tex,
                  unsigned char* out,
                  int w, int h,
                  float cam_x, float cam_y, float cam_z,
                  float /*t*/) {
    int px = blockIdx.x * blockDim.x + threadIdx.x;
    int py = blockIdx.y * blockDim.y + threadIdx.y;
    if (px >= w || py >= h) return;

    // ---- Camera basis ----
    // Forward looks from cam_pos toward origin.  World up is +y.
    // The host clamps |pitch| below pi/2, so forward is never parallel to
    // world-up and the cross product below is well-defined.  cam_y itself
    // may be <= 0 (camera at or below the plane); the t_hit <= 0 branch
    // further down covers that case by falling back to the sky color.
    V3 cam_pos = v3(cam_x, cam_y, cam_z);
    V3 forward = v_normalize(v_sub(v3(0.0f, 0.0f, 0.0f), cam_pos));
    V3 world_up = v3(0.0f, 1.0f, 0.0f);
    V3 right = v_normalize(v_cross(forward, world_up));
    V3 cam_up = v_cross(right, forward);

    // ---- Pixel ray (perspective) ----
    float aspect = (float)w / (float)h;
    float fov = 1.0472f;  // 60 degrees vertical FoV
    float scale = tanf(fov * 0.5f);
    float ndc_x = (2.0f * ((float)px + 0.5f) / (float)w - 1.0f) * aspect * scale;
    float ndc_y = (1.0f - 2.0f * ((float)py + 0.5f) / (float)h) * scale;
    V3 dir = v_normalize(v_add(v_add(forward,
                                     v_scale(right, ndc_x)),
                               v_scale(cam_up, ndc_y)));

    // ---- Background sky if the ray misses the ocean plane ----
    // The ocean is the y=0 plane; we only count hits with rays going downward
    // (dir.y < 0).  Anything else is sky.  A small eps avoids near-horizontal
    // rays producing absurd hit distances.
    V3 col;
    const float HIT_EPS = 1e-3f;
    if (dir.y > -HIT_EPS) {
        col = sky_color(dir.y);
    } else {
        // ---- Hit the ocean plane ----
        float t_hit = -cam_y / dir.y;
        if (t_hit <= 0.0f) {
            // Camera under the surface -- treat as sky to avoid garbage.
            col = sky_color(dir.y);
        } else {
            V3 p = v_add(cam_pos, v_scale(dir, t_hit));

            // ---- Sample heightmap; estimate normal via finite differences ----
            // The heightmap tiles every tile_extent() world units (WRAP), so
            // we use a small world-space epsilon.  Four taps -> central
            // differences in x and z.
            const float FD = 0.01f;
            float h_c  = sample_height(tex, p.x,      p.z);
            float h_xp = sample_height(tex, p.x + FD, p.z);
            float h_xm = sample_height(tex, p.x - FD, p.z);
            float h_zp = sample_height(tex, p.x,      p.z + FD);
            float h_zm = sample_height(tex, p.x,      p.z - FD);
            float dh_dx = (h_xp - h_xm) / (2.0f * FD);
            float dh_dz = (h_zp - h_zm) / (2.0f * FD);
            // Normal of the surface y = h(x, z) is (-dh/dx, 1, -dh/dz).
            V3 N = v_normalize(v3(-dh_dx, 1.0f, -dh_dz));

            // ---- Lighting ----
            V3 L = v_normalize(v3(0.55f, 0.65f, 0.35f));  // sun: high+side
            V3 V = v_normalize(v_sub(cam_pos, p));        // view direction
            // Reflect L about N: R = 2*(N.L)*N - L
            float ndotl = fmaxf(0.0f, v_dot(N, L));
            V3 R = v_normalize(v_sub(v_scale(N, 2.0f * v_dot(N, L)), L));

            // Phong specular highlight on wave crests.
            float spec = powf(fmaxf(0.0f, v_dot(R, V)), 32.0f);

            // Diffuse: deep-sea blue-green.
            V3 deep    = v3(0.04f, 0.18f, 0.28f);
            V3 shallow = v3(0.10f, 0.32f, 0.42f);
            // Tiny height-based shading bias so crests look slightly brighter.
            float tint = 0.5f + 0.5f * fmaxf(-1.0f, fminf(1.0f, h_c * 6.0f));
            V3 base = v_add(v_scale(deep, 1.0f - tint),
                            v_scale(shallow, tint));

            // Diffuse term + ambient.
            V3 diffuse = v_add(v_scale(base, 0.18f),
                               v_scale(base, 0.82f * ndotl));

            // Fresnel-modulated sky reflection.  Sample the sky in the
            // reflected-view direction so reflections of overhead show
            // overhead colors, etc.  View reflection: Rv = 2*(N.V)*N - V.
            float ndotv = fmaxf(0.0f, v_dot(N, V));
            V3 Rv = v_normalize(v_sub(v_scale(N, 2.0f * v_dot(N, V)), V));
            V3 reflected_sky = sky_color(fmaxf(0.0f, Rv.y));
            float F = powf(1.0f - ndotv, 5.0f);
            // Clamp Fresnel just in case of NaN-prone edge cases.
            if (F < 0.0f) F = 0.0f;
            if (F > 1.0f) F = 1.0f;

            // Blend: more reflection at grazing angles.
            V3 lit = v_add(v_scale(diffuse, 1.0f - F),
                           v_scale(reflected_sky, F));

            // Add specular highlight (sun color).
            V3 sun_col = v3(1.0f, 0.96f, 0.85f);
            col = v_add(lit, v_scale(sun_col, spec));
        }
    }

    // ---- Tonemap + write ----
    // Simple Reinhard-ish curve keeps highlights in [0, 1].
    col.x = col.x / (1.0f + col.x);
    col.y = col.y / (1.0f + col.y);
    col.z = col.z / (1.0f + col.z);

    int idx = (py * w + px) * 4;
    out[idx + 0] = (unsigned char)(fmaxf(0.0f, fminf(1.0f, col.x)) * 255.0f);
    out[idx + 1] = (unsigned char)(fmaxf(0.0f, fminf(1.0f, col.y)) * 255.0f);
    out[idx + 2] = (unsigned char)(fmaxf(0.0f, fminf(1.0f, col.z)) * 255.0f);
    out[idx + 3] = 255;
}
"""
+# +# ################################################################################ + +# What this example teaches +# ========================= +# - How to allocate a CUDA Array with `surface_load_store=True` so the same +# memory can be bound as both a TextureObject (for sampled reads) and a +# SurfaceObject (for typed writes). +# - How to use FilterMode.LINEAR + AddressMode.WRAP + normalized coordinates +# to get free hardware bilinear interpolation on a toroidal world. +# - How to compose Array/TextureObject/SurfaceObject with GraphicsResource so +# the entire simulation never leaves the GPU. +# +# How it works +# ============ +# Gray-Scott is a two-species (U, V) reaction-diffusion system. At each cell +# the rule is roughly: +# +# du/dt = Du * laplacian(u) - u*v*v + F*(1 - u) +# dv/dt = Dv * laplacian(v) + u*v*v - (F + k)*v +# +# Different choices of F and k yield strikingly different patterns: coral, +# mitosis, spots, and many more. We pack (U, V) into the two channels of a +# `float2` Array. +# +# PING-PONG (two arrays, swap each step) +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +--------------+ tex2D +--------------+ +# | arr_a | ----------------> | | +# | (U, V) state | | gray_scott | +# +--------------+ | kernel | +# | | +# +--------------+ surf2Dwrite | | +# | arr_b | <---------------- | | +# | (U, V) state | +--------------+ +# +--------------+ +# (swap) +# +# Each frame we do N_STEPS iterations of the kernel above, then run a separate +# `colorize` kernel that samples V from the final state and writes RGBA bytes +# straight into the OpenGL PBO via GraphicsResource. No data ever travels +# across the PCIe bus during the frame. +# +# Why LINEAR + WRAP + normalized coords? +# -------------------------------------- +# Addressing modes WRAP and MIRROR are only supported with normalized +# coordinates (see the CUDA Programming Guide and the SDK's +# simplePitchLinearTexture sample). 
We use WRAP so that neighbor lookups at +# the image edge automatically wrap around -- i.e. a torus. LINEAR filtering +# is essentially free on the hardware and gives smoother diffusion than POINT +# sampling would. We sample at the texel center `(x + 0.5) / W` so the +# neighbor offsets line up exactly on integer texel positions. +# +# Channel byte width in surf2Dwrite +# --------------------------------- +# `surf2Dwrite` takes the x coordinate in BYTES, not in elements. For a +# `float2` surface that means `x * sizeof(float2)` = `x * 8`. Getting this +# wrong silently corrupts every other column. +# +# What you should see +# =================== +# A window showing animated, organic-looking patterns growing and dividing +# (think coral, spots, or mitosing cells). Press 1/2/3 to switch presets, +# R to reseed, and Escape to exit. The window title shows the current FPS +# and active preset. +# + +# /// script +# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"] +# /// + +import ctypes +import sys +import time + +import numpy as np + +from cuda.core import ( + AddressMode, + Array, + ArrayFormat, + Device, + FilterMode, + GraphicsResource, + LaunchConfig, + Program, + ProgramOptions, + ReadMode, + ResourceDescriptor, + SurfaceObject, + TextureDescriptor, + TextureObject, + launch, +) + +# --------------------------------------------------------------------------- +# Simulation parameters (feel free to change these) +# --------------------------------------------------------------------------- +WIDTH = 512 +HEIGHT = 512 +N_STEPS = 8 # Gray-Scott iterations per displayed frame +DU = 0.16 # diffusion rate for U +DV = 0.08 # diffusion rate for V +DT = 1.0 # time step (Gray-Scott is stable at 1.0 with these D's) + +# Named presets: (F, k, label) tuples. F is the feed rate, k is the kill rate. +# These are classic Gray-Scott regimes documented all over the literature. 
+PRESETS = { + "1": (0.0545, 0.062, "coral"), + "2": (0.0367, 0.0649, "mitosis"), + "3": (0.030, 0.062, "spots"), +} +DEFAULT_PRESET = "1" + + +# ============================= Helper functions ============================= +# +# The functions below set up CUDA and OpenGL. If you're here to learn about +# Array/TextureObject/SurfaceObject, skip ahead to main() -- the interesting +# part is there. These helpers exist so that main() reads like a short story +# instead of a wall of boilerplate. +# ============================================================================ + + +def setup_cuda(): + """Compile the CUDA kernels and return (device, stream, kernels, configs). + + Returns a dict of kernels keyed by name and matching LaunchConfigs. + """ + dev = Device(0) + dev.set_current() + + # SurfaceObject requires surface load/store, which has existed since SM 2.0, + # but bindless surface objects (cuSurfObjectCreate) require SM 3.0+. + cc = dev.compute_capability + if cc.major < 3: + print( + "This example requires a GPU with compute capability >= 3.0 for " + f"bindless surface objects. Found sm_{cc.major}{cc.minor}.", + file=sys.stderr, + ) + sys.exit(1) + + stream = dev.create_stream() + + # Compile as C++ so the templated tex2D overload resolves. + program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}") + prog = Program(KERNEL_SOURCE, code_type="c++", options=program_options) + mod = prog.compile( + "cubin", + name_expressions=("gray_scott_step", "colorize", "seed_initial"), + ) + + kernels = { + "step": mod.get_kernel("gray_scott_step"), + "colorize": mod.get_kernel("colorize"), + "seed": mod.get_kernel("seed_initial"), + } + + block = (16, 16, 1) + grid = ( + (WIDTH + block[0] - 1) // block[0], + (HEIGHT + block[1] - 1) // block[1], + 1, + ) + config = LaunchConfig(grid=grid, block=block) + # All three kernels are pixel-parallel over a WIDTH x HEIGHT grid, so they + # can share a launch config. 
+ configs = {"step": config, "colorize": config, "seed": config} + + return dev, stream, kernels, configs + + +def create_window(): + """Open a pyglet window and return (window, gl_module, pyglet).""" + try: + import pyglet + from pyglet.gl import gl as _gl + except ImportError: + print( + "This example requires pyglet >= 2.0.\nInstall it with: pip install pyglet", + file=sys.stderr, + ) + sys.exit(1) + + window = pyglet.window.Window( + WIDTH, + HEIGHT, + caption="cuda.core Array/Texture/Surface - Gray-Scott Reaction Diffusion", + vsync=False, + ) + return window, _gl, pyglet + + +def create_display_resources(gl, width, height): + """Create the GL objects needed to show a texture on screen. + + This sets up a shader program, a fullscreen quad, and an empty texture. + None of this is CUDA-specific -- it's standard OpenGL boilerplate for + rendering a textured quad. + + Returns (shader_program, vertex_array_id, texture_id). The shader_program + is a pyglet ShaderProgram object (must be kept alive). 
+ """ + from pyglet.graphics.shader import Shader, ShaderProgram + + # Shader program -- just passes texture coordinates through + vert = Shader(VERTEX_SHADER_SOURCE, "vertex") + frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment") + shader_prog = ShaderProgram(vert, frag) + + # Fullscreen quad (two triangles covering the entire window) + quad_verts = np.array( + [ + # x, y, s, t (position + texture coordinate) + -1, + -1, + 0, + 0, + 1, + -1, + 1, + 0, + 1, + 1, + 1, + 1, + -1, + -1, + 0, + 0, + 1, + 1, + 1, + 1, + -1, + 1, + 0, + 1, + ], + dtype=np.float32, + ) + + vao = ctypes.c_uint(0) + gl.glGenVertexArrays(1, ctypes.byref(vao)) + gl.glBindVertexArray(vao.value) + + vbo = ctypes.c_uint(0) + gl.glGenBuffers(1, ctypes.byref(vbo)) + gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value) + gl.glBufferData( + gl.GL_ARRAY_BUFFER, + quad_verts.nbytes, + quad_verts.ctypes.data_as(ctypes.c_void_p), + gl.GL_STATIC_DRAW, + ) + + stride = 4 * 4 # 4 floats * 4 bytes each = 16 bytes per vertex + pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position") + gl.glEnableVertexAttribArray(pos_loc) + gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0)) + + tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord") + gl.glEnableVertexAttribArray(tc_loc) + gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8)) + + gl.glBindVertexArray(0) + + # Empty texture (will be filled each frame from the PBO) + tex = ctypes.c_uint(0) + gl.glGenTextures(1, ctypes.byref(tex)) + gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value) + gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_LINEAR) + gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_LINEAR) + gl.glTexImage2D( + gl.GL_TEXTURE_2D, + 0, + gl.GL_RGBA8, + width, + height, + 0, + gl.GL_RGBA, + gl.GL_UNSIGNED_BYTE, + None, + ) + + return shader_prog, vao.value, tex.value + + +def create_pixel_buffer(gl, width, height): + """Create a Pixel Buffer 
Object (PBO) -- the bridge between CUDA and OpenGL. + + A PBO is a GPU-side buffer that OpenGL can read from when uploading pixels + to a texture. By registering this same buffer with CUDA, the CUDA kernel + can write directly into it. + + Returns (pbo_gl_name, size_in_bytes). + """ + pbo = ctypes.c_uint(0) + gl.glGenBuffers(1, ctypes.byref(pbo)) + gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo.value) + nbytes = width * height * 4 # RGBA, 1 byte per channel + gl.glBufferData(gl.GL_PIXEL_UNPACK_BUFFER, nbytes, None, gl.GL_DYNAMIC_DRAW) + gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0) + return pbo.value, nbytes + + +def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height): + """Copy pixel data from the PBO into the GL texture (GPU-to-GPU).""" + gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo_id) + gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id) + gl.glTexSubImage2D( + gl.GL_TEXTURE_2D, + 0, + 0, + 0, + width, + height, + gl.GL_RGBA, + gl.GL_UNSIGNED_BYTE, + None, # None = read from the currently bound PBO, not from CPU + ) + gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0) + + +def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id): + """Draw the texture to the screen using the fullscreen quad.""" + gl.glUseProgram(shader_prog.id) + gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id) + gl.glBindVertexArray(vao_id) + gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6) + gl.glBindVertexArray(0) + gl.glUseProgram(0) + + +def make_state_arrays(): + """Allocate the two `float2` ping-pong arrays that hold the (U, V) state.""" + arr_a = Array.from_descriptor( + shape=(WIDTH, HEIGHT), + format=ArrayFormat.FLOAT32, + num_channels=2, + surface_load_store=True, + ) + arr_b = Array.from_descriptor( + shape=(WIDTH, HEIGHT), + format=ArrayFormat.FLOAT32, + num_channels=2, + surface_load_store=True, + ) + return arr_a, arr_b + + +def make_texture(arr): + """Bind `arr` as a TextureObject configured for LINEAR + WRAP + normalized.""" + res_desc = ResourceDescriptor.from_array(arr) + tex_desc = 
TextureDescriptor( + address_mode=AddressMode.WRAP, + filter_mode=FilterMode.LINEAR, + read_mode=ReadMode.ELEMENT_TYPE, + # WRAP/MIRROR addressing modes require normalized coordinates. + normalized_coords=True, + ) + return TextureObject.from_descriptor(resource=res_desc, texture_descriptor=tex_desc) + + +def seed_state(stream, kernels, configs, write_surf, seed_value): + """Re-initialize the array behind `write_surf` with the Gray-Scott starting state. + + Takes a long-lived SurfaceObject (not a fresh one): `launch` is async, so + creating a SurfaceObject inside a `with` block that closes immediately + after `launch` returns would destroy the surface handle before the kernel + actually runs against it. + """ + launch( + stream, + configs["seed"], + kernels["seed"], + np.uint64(write_surf.handle), + np.int32(WIDTH), + np.int32(HEIGHT), + np.uint32(seed_value), + ) + + +# ================================== main() ================================== + + +def main(): + # --- Step 1: Set up CUDA (compile kernels, create stream) --- + dev, stream, kernels, configs = setup_cuda() + + # --- Step 2: Open a window --- + window, gl, pyglet = create_window() + + # --- Step 3: Create GL resources for drawing a texture to screen --- + # (Standard OpenGL boilerplate -- not CUDA-specific.) + shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT) + + # --- Step 4: Create the Pixel Buffer Object (PBO) --- + # The PBO is GPU memory owned by OpenGL. It's the bridge between the + # two worlds: CUDA writes into it, OpenGL reads from it. + pbo_id, _ = create_pixel_buffer(gl, WIDTH, HEIGHT) + + # --- Step 5: Register the PBO with CUDA --- + resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard") + + # --- Step 6: Allocate the two ping-pong state Arrays --- + # Both are `float2` (channel 0 = U, channel 1 = V) with + # surface_load_store=True so they can be bound as SurfaceObjects. 
+ arr_a, arr_b = make_state_arrays() + + # --- Step 7: Pre-create the four bindless handles --- + # Creating these once up front is much cheaper than recreating them + # every step. We keep both texture and surface handles for each + # array; the simulation loop just picks which pair to use. + tex_a = make_texture(arr_a) + tex_b = make_texture(arr_b) + surf_a = SurfaceObject.from_array(arr_a) + surf_b = SurfaceObject.from_array(arr_b) + + # --- Step 8: Seed the initial state into arr_a (writes via surf_a) --- + seed_state(stream, kernels, configs, surf_a, seed_value=0) + # After seeding, `arr_a` is the "current" state. + state = {"current": "a", "preset": DEFAULT_PRESET, "seed": 0} + + # --- Step 9: Render loop --- + start_time = time.monotonic() + frame_count = 0 + fps_time = start_time + + def current_read_write(): + if state["current"] == "a": + return tex_a, surf_b, "b" # read a, write b, next current = b + return tex_b, surf_a, "a" + + @window.event + def on_key_press(symbol, _modifiers): + key = pyglet.window.key + if symbol == key.ESCAPE: + window.close() + return + if symbol == key.R: + state["seed"] += 1 + seed_state(stream, kernels, configs, surf_a, seed_value=state["seed"]) + state["current"] = "a" + return + for digit_key, name in ( + (key._1, "1"), + (key._2, "2"), + (key._3, "3"), + ): + if symbol == digit_key: + state["preset"] = name + return + + @window.event + def on_draw(): + nonlocal frame_count, fps_time + + window.clear() + F, k, _label = PRESETS[state["preset"]] + + # (a) Run N_STEPS Gray-Scott iterations. Each step reads from one + # array via a TextureObject (LINEAR + WRAP gives wrapping + + # bilinear sampling) and writes to the other via a SurfaceObject.
+ for _ in range(N_STEPS): + tex_read, surf_write, next_current = current_read_write() + launch( + stream, + configs["step"], + kernels["step"], + np.uint64(tex_read.handle), + np.uint64(surf_write.handle), + np.int32(WIDTH), + np.int32(HEIGHT), + np.float32(DU), + np.float32(DV), + np.float32(F), + np.float32(k), + np.float32(DT), + ) + state["current"] = next_current + + # (b) Colorize the latest state into the OpenGL PBO. + tex_read = tex_a if state["current"] == "a" else tex_b + with resource.map(stream=stream) as buf: + launch( + stream, + configs["colorize"], + kernels["colorize"], + np.uint64(tex_read.handle), + buf.handle, + np.int32(WIDTH), + np.int32(HEIGHT), + ) + # Unmap happens automatically when the `with` block exits. + + # (c) Tell OpenGL to copy the PBO contents into our texture. + copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT) + + # (d) Draw the texture to the screen. + draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id) + + # FPS counter (shown in window title) + frame_count += 1 + now = time.monotonic() + if now - fps_time >= 1.0: + fps = frame_count / (now - fps_time) + label = PRESETS[state["preset"]][2] + window.set_caption( + "cuda.core Array/Texture/Surface - Gray-Scott" + f" [{label}] ({WIDTH}x{HEIGHT}, {fps:.0f} FPS," + f" {N_STEPS} steps/frame)" + ) + frame_count = 0 + fps_time = now + + @window.event + def on_close(): + # Release everything we opened, in reverse order. Each of these is a + # context manager too, but pyglet owns the event loop here so we + # release explicitly. + resource.close() + tex_a.close() + tex_b.close() + surf_a.close() + surf_b.close() + arr_a.close() + arr_b.close() + stream.close() + + pyglet.app.run(interval=0) + + +# ======================== GPU code (CUDA + GLSL) ============================ +# +# These source strings are kept at the bottom of the file so they don't +# distract from the Python logic above. 
The important things to know: +# +# - KERNEL_SOURCE contains three CUDA C++ kernels: +# * seed_initial -- sets initial (U, V) state via SurfaceObject writes +# * gray_scott_step -- reads previous state via TextureObject (with +# LINEAR + WRAP bilinear filtering) and writes the +# next state via SurfaceObject. Coordinates are +# normalized to [0, 1] because WRAP requires it. +# * colorize -- reads the V channel via TextureObject and writes +# RGBA bytes into the OpenGL PBO using a simple +# three-stop "magma-ish" gradient. +# +# - VERTEX_SHADER_SOURCE / FRAGMENT_SHADER_SOURCE are GLSL. They draw a +# texture onto a rectangle covering the entire window. Nothing interesting. +# +# ============================================================================ + +KERNEL_SOURCE = r""" +// The host passes the integer texture width and height; each kernel derives +// the inverse dimensions itself (inv_w = 1 / width, inv_h = 1 / height) to +// convert integer pixel coordinates to normalized texture coordinates. + +extern "C" +__global__ +void seed_initial(cudaSurfaceObject_t surf, + int width, int height, + unsigned int seed) { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= width || y >= height) return; + + // Start from U = 1, V = 0; a 40x40 centered square is seeded below, and V + // gets a small deterministic jitter that differs with each reseed value. + float u = 1.0f; + float v = 0.0f; + + int half_w = width / 2; + int half_h = height / 2; + if (x >= half_w - 20 && x < half_w + 20 && + y >= half_h - 20 && y < half_h + 20) { + v = 1.0f; + // Knock U down a bit inside the seed square so V can grow. + u = 0.5f; + } + + // Cheap deterministic pseudo-random noise (xorshift on packed coords).
+ unsigned int h = (unsigned int)x * 374761393u + + (unsigned int)y * 668265263u + seed * 2246822519u; + h = (h ^ (h >> 13)) * 1274126177u; + h = h ^ (h >> 16); + float noise = (h & 0xffffu) / 65535.0f; // in [0, 1] + v += 0.02f * (noise - 0.5f); // small +/- jitter + if (v < 0.0f) v = 0.0f; + if (v > 1.0f) v = 1.0f; + + // float2 is 8 bytes; surf2Dwrite takes the x offset in BYTES. + surf2Dwrite(make_float2(u, v), surf, x * (int)sizeof(float2), y); +} + +extern "C" +__global__ +void gray_scott_step(cudaTextureObject_t tex, + cudaSurfaceObject_t surf, + int width, int height, + float Du, float Dv, + float F, float k_kill, + float dt) { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= width || y >= height) return; + + // Normalized coordinates: WRAP addressing only works in normalized mode. + // Each texel center sits at ((i + 0.5) / W, (j + 0.5) / H). + float inv_w = 1.0f / (float)width; + float inv_h = 1.0f / (float)height; + float cx = (x + 0.5f) * inv_w; + float cy = (y + 0.5f) * inv_h; + + // 5-point Laplacian stencil. LINEAR filtering does nothing extra here + // because the offsets land exactly on neighboring texel centers, but the + // toroidal WRAP at the boundary is essential for a periodic world. + float2 c = tex2D(tex, cx, cy); + float2 l = tex2D(tex, cx - inv_w, cy); + float2 r = tex2D(tex, cx + inv_w, cy); + float2 u_n = tex2D(tex, cx, cy - inv_h); + float2 d_n = tex2D(tex, cx, cy + inv_h); + + float lap_u = (l.x + r.x + u_n.x + d_n.x) - 4.0f * c.x; + float lap_v = (l.y + r.y + u_n.y + d_n.y) - 4.0f * c.y; + + float u = c.x; + float v = c.y; + float uvv = u * v * v; + + float du = Du * lap_u - uvv + F * (1.0f - u); + float dv = Dv * lap_v + uvv - (F + k_kill) * v; + + float new_u = u + dt * du; + float new_v = v + dt * dv; + + // Clamp to keep things numerically sane after long runs. 
+ if (new_u < 0.0f) new_u = 0.0f; + if (new_u > 1.0f) new_u = 1.0f; + if (new_v < 0.0f) new_v = 0.0f; + if (new_v > 1.0f) new_v = 1.0f; + + surf2Dwrite(make_float2(new_u, new_v), surf, + x * (int)sizeof(float2), y); +} + +extern "C" +__global__ +void colorize(cudaTextureObject_t tex, + unsigned char* output, + int width, int height) { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= width || y >= height) return; + + float inv_w = 1.0f / (float)width; + float inv_h = 1.0f / (float)height; + float cx = (x + 0.5f) * inv_w; + float cy = (y + 0.5f) * inv_h; + + float2 c = tex2D(tex, cx, cy); + float v = c.y; + if (v < 0.0f) v = 0.0f; + if (v > 1.0f) v = 1.0f; + + // Three-stop "magma-ish" gradient: dark purple -> orange -> pale yellow. + // Implemented as two linear interpolations stitched together at v = 0.5 + // so the result is reasonably perceptually smooth without a lookup table. + float r, g, b; + if (v < 0.5f) { + float t = v * 2.0f; // [0, 1] over the low half + r = 0.05f + t * (0.85f - 0.05f); + g = 0.02f + t * (0.30f - 0.02f); + b = 0.20f + t * (0.10f - 0.20f); + } else { + float t = (v - 0.5f) * 2.0f; // [0, 1] over the high half + r = 0.85f + t * (1.00f - 0.85f); + g = 0.30f + t * (0.95f - 0.30f); + b = 0.10f + t * (0.70f - 0.10f); + } + + int idx = (y * width + x) * 4; + output[idx + 0] = (unsigned char)(r * 255.0f); + output[idx + 1] = (unsigned char)(g * 255.0f); + output[idx + 2] = (unsigned char)(b * 255.0f); + output[idx + 3] = 255; +} +""" + +# GLSL shaders -- these just display a texture on a fullscreen rectangle. +# Nothing CUDA-specific here. 
+ +VERTEX_SHADER_SOURCE = """#version 330 core +in vec2 position; +in vec2 texcoord; +out vec2 v_texcoord; +void main() { + gl_Position = vec4(position, 0.0, 1.0); + v_texcoord = texcoord; +} +""" + +FRAGMENT_SHADER_SOURCE = """#version 330 core +in vec2 v_texcoord; +out vec4 fragColor; +uniform sampler2D tex; +void main() { + fragColor = texture(tex, v_texcoord); +} +""" + + +if __name__ == "__main__": + main() diff --git a/cuda_core/examples/gl_interop_sdf_volume.py b/cuda_core/examples/gl_interop_sdf_volume.py new file mode 100644 index 00000000000..05299cc278f --- /dev/null +++ b/cuda_core/examples/gl_interop_sdf_volume.py @@ -0,0 +1,827 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +# ################################################################################ +# +# This example demonstrates cuda.core's 3D Array + trilinear TextureObject by +# baking a procedural Signed Distance Field (SDF) volume once at startup and +# then ray-marching it every frame to render an orbitable 3D scene. The +# SurfaceObject is used during the one-shot bake; the TextureObject (with +# LINEAR + CLAMP + normalized coords) drives the per-frame ray march. The +# whole pipeline stays on the GPU through GraphicsResource. Requires pyglet. +# +# ################################################################################ + +# What this example teaches +# ========================= +# - How to allocate a 3D cuda.core.Array (cuArray3DCreate under the hood) and +# bind it as both a SurfaceObject (for one-shot kernel writes) and a +# TextureObject (for hardware-accelerated trilinear sampling). +# - How to ray-march a baked SDF volume from a CUDA kernel, sampling via +# tex3D and writing pixels straight into an OpenGL PBO. +# - How to wire mouse + keyboard input into a pyglet/cuda.core interop loop. 
+# +# How it works +# ============ +# The signed distance field of a "gyroid intersected with a sphere" is baked +# once into a 128 x 128 x 128 single-channel float volume: +# +# gyroid(p) = sin(p.x*tau)cos(p.y*tau) +# + sin(p.y*tau)cos(p.z*tau) +# + sin(p.z*tau)cos(p.x*tau) +# sdf_gyroid = |gyroid(p)| - 0.20 # slab around the gyroid surface +# sdf_sphere = length(p) - 0.9 # bounding sphere +# sdf(p) = max(sdf_gyroid, sdf_sphere) # CSG intersection +# +# where p in [-1, 1]^3 is the voxel's world-space position. +# +# Each frame, the render kernel emits one ray per pixel from an orbiting +# camera, marches the volume in fixed voxel-sized steps (up to ~256), and on intersection +# computes a normal by central differences of tex3D, then applies a simple +# diffuse + ambient + specular shade. Misses fall back to a vertical sky +# gradient. +# +# STARTUP (one-shot bake) +# ~~~~~~~~~~~~~~~~~~~~~~~ +# 1. Allocate 3D Array (128^3, FLOAT32 x1, surface_load_store=True). +# 2. Bind it as a SurfaceObject. +# 3. Launch `bake_sdf`: one thread per voxel writes the SDF via surf3Dwrite. +# 4. Close the SurfaceObject; the Array stays alive. +# +# EACH FRAME +# ~~~~~~~~~~ +# 1. resource.map() -> CUDA device pointer into the OpenGL PBO. +# 2. Launch `render_sdf` (one thread per pixel). It samples the SDF via the +# long-lived TextureObject (LINEAR + CLAMP + normalized coords) using +# tex3D. RGBA8 lands directly in the PBO. +# 3. Unmap, GPU-side copy PBO -> texture, draw fullscreen quad. +# +# Controls +# ======== +# Left mouse drag orbit camera (dx -> yaw, dy -> pitch) +# Mouse wheel zoom (camera distance) +# R reset camera (yaw=0, pitch=0.3, dist=2.5) +# Escape / close quit +# +# The window title shows yaw, pitch, distance, FPS, and ms/frame. 
+# + +# /// script +# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"] +# /// + +import ctypes +import math +import sys +import time + +import numpy as np + +from cuda.core import ( + AddressMode, + Array, + ArrayFormat, + Device, + FilterMode, + GraphicsResource, + LaunchConfig, + Program, + ProgramOptions, + ReadMode, + ResourceDescriptor, + SurfaceObject, + TextureDescriptor, + TextureObject, + launch, +) + +# --------------------------------------------------------------------------- +# Configuration (feel free to change these) +# --------------------------------------------------------------------------- +WIDTH = 800 +HEIGHT = 600 +VOLUME_SIZE = 128 # 128^3 voxels; bake cost is one-shot. + +# Camera defaults / clamps. +RESET_YAW = 0.0 +RESET_PITCH = 0.3 +RESET_DIST = 2.5 +PITCH_MIN = -1.45 # stay inside (-pi/2, pi/2) so the up-vector stays sane. +PITCH_MAX = 1.45 +DIST_MIN = 1.2 +DIST_MAX = 8.0 + + +# ============================= Helper functions ============================= +# +# The functions below set up CUDA and OpenGL. If you're here to learn about +# 3D Array / TextureObject / SurfaceObject, skip ahead to main() -- the +# interesting part is there. These helpers exist so that main() reads like a +# short story instead of a wall of boilerplate. +# ============================================================================ + + +def _check_compute_capability(dev): + """3D arrays + bindless surface/texture objects require sm_30+.""" + cc = dev.compute_capability + if cc.major < 3: + print( + f"This example requires compute capability >= 3.0, " + f"got sm_{cc.major}{cc.minor}.", + file=sys.stderr, + ) + sys.exit(1) + + +def setup_cuda(): + """Compile the two kernels and return (device, stream, kernels).""" + dev = Device(0) + dev.set_current() + _check_compute_capability(dev) + stream = dev.create_stream() + + # C++ is required so the templated tex3D / surf3Dwrite + # overloads resolve. 
extern "C" on the kernel symbols keeps the function + # names unmangled even when the rest of the TU is compiled as C++. + program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}") + prog = Program(KERNEL_SOURCE, code_type="c++", options=program_options) + mod = prog.compile( + "cubin", + name_expressions=("bake_sdf", "render_sdf"), + ) + kernels = { + "bake": mod.get_kernel("bake_sdf"), + "render": mod.get_kernel("render_sdf"), + } + return dev, stream, kernels + + +def make_volume_array(): + """Allocate the 3D SDF volume. Single-channel float, surface-capable.""" + return Array.from_descriptor( + shape=(VOLUME_SIZE, VOLUME_SIZE, VOLUME_SIZE), + format=ArrayFormat.FLOAT32, + num_channels=1, + surface_load_store=True, + ) + + +def make_volume_texture(arr): + """Bind `arr` as a TextureObject configured for LINEAR + CLAMP + normalized. + + Normalized coords let the kernel sample as (u, v, w) in [0, 1]; CLAMP at + the boundaries matches the rendering logic that bails out as soon as the + march leaves the volume's [-1, 1]^3 box, so out-of-range sampling never + pollutes a real hit. + """ + res_desc = ResourceDescriptor.from_array(arr) + tex_desc = TextureDescriptor( + address_mode=AddressMode.CLAMP, + filter_mode=FilterMode.LINEAR, + read_mode=ReadMode.ELEMENT_TYPE, + normalized_coords=True, + ) + return TextureObject.from_descriptor(resource=res_desc, texture_descriptor=tex_desc) + + +def bake_volume(stream, kernels, arr): + """Run the one-shot bake kernel that fills the volume with the SDF. + + The SurfaceObject lives only for the duration of this call; once the bake + is enqueued and the kernel has captured the bindless handle into its + arguments, we sync the stream before letting the SurfaceObject close. + The Array itself outlives this scope -- it's the long-lived backing store + for the render-loop TextureObject. 
+ """ + with SurfaceObject.from_array(arr) as bake_surf: + block = (8, 8, 8) + grid = ( + (VOLUME_SIZE + block[0] - 1) // block[0], + (VOLUME_SIZE + block[1] - 1) // block[1], + (VOLUME_SIZE + block[2] - 1) // block[2], + ) + launch( + stream, + LaunchConfig(grid=grid, block=block), + kernels["bake"], + np.uint64(bake_surf.handle), + np.int32(VOLUME_SIZE), + ) + # Synchronize before the SurfaceObject context exits so the bindless + # handle is still valid while the kernel runs. + stream.sync() + + +def create_window(): + """Open a pyglet window and return (window, gl_module, pyglet).""" + try: + import pyglet + from pyglet.gl import gl as _gl + except ImportError: + print( + "This example requires pyglet >= 2.0.\nInstall it with: pip install pyglet", + file=sys.stderr, + ) + sys.exit(1) + + window = pyglet.window.Window( + WIDTH, + HEIGHT, + caption="cuda.core 3D Array - SDF Volume Ray-Marcher", + vsync=False, + ) + return window, _gl, pyglet + + +def create_display_resources(gl, width, height): + """Standard GL boilerplate: shader, fullscreen quad, empty texture. + + Not CUDA-specific; identical to the other gl_interop_* examples. + Returns (shader_program, vertex_array_id, texture_id). 
+ """ + from pyglet.graphics.shader import Shader, ShaderProgram + + vert = Shader(VERTEX_SHADER_SOURCE, "vertex") + frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment") + shader_prog = ShaderProgram(vert, frag) + + quad_verts = np.array( + [ + # x, y, s, t (position + texture coordinate) + -1, -1, 0, 0, + 1, -1, 1, 0, + 1, 1, 1, 1, + -1, -1, 0, 0, + 1, 1, 1, 1, + -1, 1, 0, 1, + ], + dtype=np.float32, + ) + + vao = ctypes.c_uint(0) + gl.glGenVertexArrays(1, ctypes.byref(vao)) + gl.glBindVertexArray(vao.value) + + vbo = ctypes.c_uint(0) + gl.glGenBuffers(1, ctypes.byref(vbo)) + gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value) + gl.glBufferData( + gl.GL_ARRAY_BUFFER, + quad_verts.nbytes, + quad_verts.ctypes.data_as(ctypes.c_void_p), + gl.GL_STATIC_DRAW, + ) + + stride = 4 * 4 # 4 floats * 4 bytes each + pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position") + gl.glEnableVertexAttribArray(pos_loc) + gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0)) + + tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord") + gl.glEnableVertexAttribArray(tc_loc) + gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8)) + + gl.glBindVertexArray(0) + + tex = ctypes.c_uint(0) + gl.glGenTextures(1, ctypes.byref(tex)) + gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value) + gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_LINEAR) + gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_LINEAR) + gl.glTexImage2D( + gl.GL_TEXTURE_2D, + 0, + gl.GL_RGBA8, + width, + height, + 0, + gl.GL_RGBA, + gl.GL_UNSIGNED_BYTE, + None, + ) + + return shader_prog, vao.value, tex.value + + +def create_pixel_buffer(gl, width, height): + """Create a Pixel Buffer Object (PBO) -- the CUDA/GL bridge. + + Returns (pbo_gl_name, size_in_bytes). 
+ """ + pbo = ctypes.c_uint(0) + gl.glGenBuffers(1, ctypes.byref(pbo)) + gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo.value) + nbytes = width * height * 4 # RGBA8 + gl.glBufferData(gl.GL_PIXEL_UNPACK_BUFFER, nbytes, None, gl.GL_DYNAMIC_DRAW) + gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0) + return pbo.value, nbytes + + +def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height): + """Copy pixel data from the PBO into the GL texture (GPU-to-GPU).""" + gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo_id) + gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id) + gl.glTexSubImage2D( + gl.GL_TEXTURE_2D, + 0, + 0, + 0, + width, + height, + gl.GL_RGBA, + gl.GL_UNSIGNED_BYTE, + None, + ) + gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0) + + +def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id): + """Draw the texture to the screen using the fullscreen quad.""" + gl.glUseProgram(shader_prog.id) + gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id) + gl.glBindVertexArray(vao_id) + gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6) + gl.glBindVertexArray(0) + gl.glUseProgram(0) + + +# ================================== main() ================================== + + +def main(): + # --- Step 1: Set up CUDA (compile kernels, create stream) --- + dev, stream, kernels = setup_cuda() + + # --- Step 2: Allocate the 3D SDF volume and bake it once --- + # The Array is the long-lived backing store; it must outlive the + # render loop. The SurfaceObject is only needed for the one-shot bake + # and is closed before we ever bind a TextureObject to the same Array. + arr = make_volume_array() + bake_volume(stream, kernels, arr) + + # --- Step 3: Bind the volume as a trilinear TextureObject --- + # LINEAR + CLAMP + normalized_coords gives us free hardware trilinear + # filtering, which is exactly what we want for both the SDF samples + # in the ray march and the normal-finite-difference samples. 
+ volume_tex = make_volume_texture(arr) + + # --- Step 4: Open a window and set up the CUDA/GL bridge --- + window, gl, pyglet = create_window() + shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT) + pbo_id, _ = create_pixel_buffer(gl, WIDTH, HEIGHT) + resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard") + + # --- Step 5: Render loop state --- + # Camera is orbit-style: yaw and pitch are angles, dist is the orbit + # radius. The render kernel turns these into a (origin, basis) and + # constructs per-pixel rays itself. + cam = { + "yaw": RESET_YAW, + "pitch": RESET_PITCH, + "dist": RESET_DIST, + } + frame_count = [0] + fps_time = [time.monotonic()] + last_fps = [0.0] + last_frame_ms = [0.0] + + block = (16, 16, 1) + grid = ( + (WIDTH + block[0] - 1) // block[0], + (HEIGHT + block[1] - 1) // block[1], + 1, + ) + config = LaunchConfig(grid=grid, block=block) + + @window.event + def on_draw(): + window.clear() + + # (a) Map the PBO so CUDA can write into it. + with resource.map(stream=stream) as buf: + # (b) Launch the ray-march kernel. The camera params are passed + # as scalars; the kernel computes the orbit eye position and + # per-pixel ray direction itself. + launch( + stream, + config, + kernels["render"], + buf.handle, + np.int32(WIDTH), + np.int32(HEIGHT), + np.uint64(volume_tex.handle), + np.float32(cam["yaw"]), + np.float32(cam["pitch"]), + np.float32(cam["dist"]), + ) + # (c) Unmap happens automatically; cuGraphicsUnmapResources serializes + # the CUDA work against subsequent OpenGL use. 
+ + copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT) + draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id) + + frame_count[0] += 1 + now = time.monotonic() + if now - fps_time[0] >= 0.5: + last_fps[0] = frame_count[0] / (now - fps_time[0]) + last_frame_ms[0] = 1000.0 / last_fps[0] if last_fps[0] > 0 else 0.0 + frame_count[0] = 0 + fps_time[0] = now + window.set_caption( + "cuda.core 3D Array - SDF Volume Ray-Marcher " + f"yaw={cam['yaw']:+.2f} pitch={cam['pitch']:+.2f} " + f"dist={cam['dist']:.2f} " + f"{last_fps[0]:.0f} FPS {last_frame_ms[0]:.2f} ms/frame" + ) + + @window.event + def on_mouse_drag(x, y, dx, dy, buttons, modifiers): + # Left-click drag orbits the camera. dx -> yaw (sign convention chosen + # so that dragging right rotates the scene right); dy -> pitch (drag + # up tilts the camera up). + if not (buttons & pyglet.window.mouse.LEFT): + return + ORBIT_SCALE = 0.005 + cam["yaw"] += dx * ORBIT_SCALE + cam["pitch"] += dy * ORBIT_SCALE + # Clamp pitch so the up-vector never flips (we use world-up (0,1,0)). + if cam["pitch"] < PITCH_MIN: + cam["pitch"] = PITCH_MIN + elif cam["pitch"] > PITCH_MAX: + cam["pitch"] = PITCH_MAX + + @window.event + def on_mouse_scroll(x, y, scroll_x, scroll_y): + # Scroll wheel zoom: geometric so each tick feels uniform regardless + # of current distance. Positive scroll_y (wheel up) zooms in. + if scroll_y == 0: + return + cam["dist"] *= 0.9 ** scroll_y + if cam["dist"] < DIST_MIN: + cam["dist"] = DIST_MIN + elif cam["dist"] > DIST_MAX: + cam["dist"] = DIST_MAX + + @window.event + def on_key_press(symbol, modifiers): + key = pyglet.window.key + if symbol == key.ESCAPE: + window.close() + elif symbol == key.R: + cam["yaw"] = RESET_YAW + cam["pitch"] = RESET_PITCH + cam["dist"] = RESET_DIST + + @window.event + def on_close(): + # Release CUDA resources in reverse construction order. The GL + # objects clean up via pyglet on window close. 
+ resource.close() + volume_tex.close() + arr.close() + stream.close() + + pyglet.app.run(interval=0) + + +# ======================== GPU code (CUDA + GLSL) ============================ +# +# Two CUDA C++ kernels are concatenated into one program string so they share +# a single NVRTC compile. +# +# bake_sdf -- one thread per voxel. Computes the SDF of an +# "abs(gyroid) - 0.20" surface intersected with a bounding +# sphere, then writes the scalar via surf3Dwrite. NOTE: +# surf3Dwrite's x coordinate is in BYTES, y and z in +# elements -- a classic CUDA gotcha. +# +# render_sdf -- one thread per screen pixel. Builds the orbit-camera ray, +# fixed-step-marches the volume via tex3D on a trilinear- +# filtered, normalized-coord TextureObject, and shades the +# hit with diffuse + ambient + specular. Misses return a +# sky gradient. Writes RGBA8 directly into the OpenGL PBO. +# +# GLSL shaders at the very bottom just draw a textured quad. Nothing CUDA- +# specific there. +# +# ============================================================================ + +KERNEL_SOURCE = r""" +// -------------------------------------------------------------------------- +// Small inline helpers. Keeping them __device__ __forceinline__ encourages +// the compiler to drop them inline and avoids any cross-TU linkage worries. +// -------------------------------------------------------------------------- +__device__ __forceinline__ float clampf(float v, float a, float b) { + return fminf(fmaxf(v, a), b); +} + +__device__ __forceinline__ float dot3(float ax, float ay, float az, + float bx, float by, float bz) { + return ax * bx + ay * by + az * bz; +} + +__device__ __forceinline__ float length3(float x, float y, float z) { + return sqrtf(x * x + y * y + z * z); +} + +// -------------------------------------------------------------------------- +// bake_sdf: one thread per voxel writes the SDF of a gyroid-intersect-sphere +// into a single-channel float 3D Array via a SurfaceObject. 
+// +// surf is bound to a (size^3, FLOAT32 x 1) Array allocated with +// surface_load_store=True. +// surf3Dwrite's x coordinate is in BYTES (multiply by sizeof(float)); +// y and z are in elements. Off-by-one on the byte conversion silently +// corrupts every other column, so it's worth flagging explicitly. +// -------------------------------------------------------------------------- +extern "C" __global__ +void bake_sdf(cudaSurfaceObject_t surf, int size) { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + int z = blockIdx.z * blockDim.z + threadIdx.z; + if (x >= size || y >= size || z >= size) return; + + // Map the voxel index to world-space p in [-1, 1]^3 (texel centers). + float fx = ((float)x + 0.5f) / (float)size; + float fy = ((float)y + 0.5f) / (float)size; + float fz = ((float)z + 0.5f) / (float)size; + float px = fx * 2.0f - 1.0f; + float py = fy * 2.0f - 1.0f; + float pz = fz * 2.0f - 1.0f; + + // Gyroid frequency: 3 cycles across [-1, 1] gives a busy but not noisy + // surface at 128^3 resolution. tau = 2 * pi * frequency. + const float TAU = 6.2831853071795864f * 3.0f; + + float sx = sinf(px * TAU), cx = cosf(px * TAU); + float sy = sinf(py * TAU), cy = cosf(py * TAU); + float sz = sinf(pz * TAU), cz = cosf(pz * TAU); + float gyroid = sx * cy + sy * cz + sz * cx; + // Slab thickness: the gyroid SDF is non-Lipschitz (its gradient scales + // with TAU ~= 19), so the stored values along the surface are dense but + // unreliable as a true distance metric. A wider slab (0.20 vs the + // canonical 0.05) gives the fixed-step ray marcher in render_sdf enough + // hit candidates per ray to render real geometry instead of mostly sky. 
+ float sdf_gyroid = fabsf(gyroid) - 0.20f; // slab around iso-zero + float sdf_sphere = length3(px, py, pz) - 0.9f; // bounding sphere + float sdf = fmaxf(sdf_gyroid, sdf_sphere); // CSG intersection + + // surf3Dwrite: x in BYTES (cast sizeof to int so 32-bit arithmetic works + // even when x is large), y/z in elements. + surf3Dwrite(sdf, surf, x * (int)sizeof(float), y, z); +} + +// -------------------------------------------------------------------------- +// SDF sampler: tex3D wants normalized coords in [0, 1]; the volume covers +// [-1, 1] in world space, so we remap with `(p + 1) * 0.5`. Returns the +// raw stored SDF (a signed distance in world units). +// -------------------------------------------------------------------------- +__device__ __forceinline__ float sample_sdf(cudaTextureObject_t tex, + float px, float py, float pz) { + return tex3D(tex, + (px + 1.0f) * 0.5f, + (py + 1.0f) * 0.5f, + (pz + 1.0f) * 0.5f); +} + +// -------------------------------------------------------------------------- +// render_sdf: one thread per screen pixel. Builds the orbit camera, marches +// a ray through the SDF volume, and writes a shaded RGBA8 pixel to the PBO. +// +// Camera math (orbit, look-at origin, world-up (0, 1, 0)): +// eye = dist * (cos(pitch)*cos(yaw), sin(pitch), cos(pitch)*sin(yaw)) +// fwd = normalize(target - eye) (target = origin) +// right = normalize(cross(fwd, up)) +// up' = cross(right, fwd) +// For a pixel at (u, v) in NDC ([-1, 1] x [-1, 1] with v=1 at the top), +// dir = normalize(fwd + tan(fov/2) * (aspect * u * right + v * up')) +// +// Ray-march: +// Fixed-step march: t += STEP, where STEP is set to roughly one voxel. The +// gyroid SDF is non-Lipschitz, which makes classical sphere tracing +// (t += sdf(p)) overshoot through thin slabs and miss almost every ray. A +// uniform voxel-sized step is robust and cheap because the SDF is just a +// tex3D lookup. We declare a HIT when sdf < HIT_EPS. 
+// +// Bounds bail: outside the [-1, 1]^3 box, return the sky. +// Normal: 6-sample central differences with eps ~ 1.5/VOLUME_SIZE so the +// offsets are just over one voxel apart -- short enough to capture +// local surface direction, long enough that trilinear filtering +// actually moves the result. +// -------------------------------------------------------------------------- +extern "C" __global__ +void render_sdf(unsigned char* output, + int width, + int height, + cudaTextureObject_t tex, + float yaw, + float pitch, + float dist) { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= width || y >= height) return; + + // ---- Build the orbit camera basis ---------------------------------- + float cp = cosf(pitch), sp = sinf(pitch); + float cy = cosf(yaw), sy = sinf(yaw); + + // Eye on a sphere of radius `dist` around the origin. + float ex = dist * cp * cy; + float ey = dist * sp; + float ez = dist * cp * sy; + + // fwd = normalize(target - eye), target = origin -> fwd = -eye / |eye|. + float fl = length3(ex, ey, ez); + // Guard against the (clamped) dist being zero (not reachable, but cheap). + if (fl < 1e-6f) fl = 1e-6f; + float fx = -ex / fl, fy = -ey / fl, fz = -ez / fl; + + // right = normalize(cross(fwd, world_up)), world_up = (0, 1, 0). + // cross((fx,fy,fz), (0,1,0)) = (fy*0 - fz*1, fz*0 - fx*0, fx*1 - fy*0) + // = (-fz, 0, fx) + float rx = -fz; + float ry = 0.0f; + float rz = fx; + float rl = length3(rx, ry, rz); + if (rl < 1e-6f) rl = 1e-6f; + rx /= rl; ry /= rl; rz /= rl; + + // up' = cross(right, fwd). With right purely in the xz-plane, this is a + // proper orthonormal up; recompute to keep the basis consistent. + float ux = ry * fz - rz * fy; + float uy = rz * fx - rx * fz; + float uz = rx * fy - ry * fx; + + // ---- Per-pixel ray direction --------------------------------------- + // NDC with v=1 at the TOP. 
With our PBO layout (y=0 written first -> + // ends up at the bottom of the on-screen texture courtesy of the GL + // shader's [0, 1] texcoord), v = 2*v_norm - 1 already maps row 0 of the + // PBO to v = -1 (bottom of the image), which matches the camera's + // up'-axis convention. No flip needed. + float u_ndc = 2.0f * ((float)x + 0.5f) / (float)width - 1.0f; + float v_ndc = 2.0f * ((float)y + 0.5f) / (float)height - 1.0f; + + const float FOV_Y = 0.7853981633974483f; // 45 degrees + const float TAN_HALF = 0.41421356237309515f; // tanf(FOV_Y / 2) + float aspect = (float)width / (float)height; + + float dx = fx + u_ndc * aspect * TAN_HALF * rx + v_ndc * TAN_HALF * ux; + float dy = fy + u_ndc * aspect * TAN_HALF * ry + v_ndc * TAN_HALF * uy; + float dz = fz + u_ndc * aspect * TAN_HALF * rz + v_ndc * TAN_HALF * uz; + float dl = length3(dx, dy, dz); + if (dl < 1e-6f) dl = 1e-6f; + dx /= dl; dy /= dl; dz /= dl; + + // ---- Ray vs. the [-1, 1]^3 box (slab method) ----------------------- + // The camera always sits outside the volume (DIST_MIN >= 1.2 and the + // orbit puts at least one component of the eye outside [-1, 1] for + // typical framings), so we must first advance `t` to the AABB entry + // before any in-volume sampling is meaningful. tNear is the entry + // distance (clamped to >= 0 so we don't march backwards if the eye is + // inside the box for some configuration); tFar is the exit distance. + // If the slab interval is empty (tNear > tFar), the ray misses outright. + float inv_dx = 1.0f / (fabsf(dx) > 1e-8f ? dx : (dx >= 0 ? 1e-8f : -1e-8f)); + float inv_dy = 1.0f / (fabsf(dy) > 1e-8f ? dy : (dy >= 0 ? 1e-8f : -1e-8f)); + float inv_dz = 1.0f / (fabsf(dz) > 1e-8f ? dz : (dz >= 0 ? 
1e-8f : -1e-8f)); + float t1x = (-1.0f - ex) * inv_dx, t2x = ( 1.0f - ex) * inv_dx; + float t1y = (-1.0f - ey) * inv_dy, t2y = ( 1.0f - ey) * inv_dy; + float t1z = (-1.0f - ez) * inv_dz, t2z = ( 1.0f - ez) * inv_dz; + float tNear = fmaxf(fmaxf(fminf(t1x, t2x), fminf(t1y, t2y)), fminf(t1z, t2z)); + float tFar = fminf(fminf(fmaxf(t1x, t2x), fmaxf(t1y, t2y)), fmaxf(t1z, t2z)); + + bool hit = false; + float hx = 0.0f, hy = 0.0f, hz = 0.0f; + + if (tFar > fmaxf(tNear, 0.0f)) { + // ---- Fixed-step march through the SDF volume from the AABB entry + // Sphere tracing relies on a Lipschitz-1 SDF: the magnitude of the + // sample tells you a safe distance you can step without crossing + // the surface. But the gyroid SDF here, |sx*cy + sy*cz + sz*cx| + // - 0.20, has a gradient scaling with TAU ~= 19, so the stored + // magnitude vastly over-reports the true distance. Sphere tracing + // would routinely overshoot thin slab regions, leaving most rays + // missing geometry that's actually there. A fixed-step march is + // cheap (the SDF is just a tex3D lookup) and robust: each step + // advances by one voxel, so any positive crossing of the iso-zero + // surface lands inside a thin window where HIT_EPS catches it. + // + // 2 worldspace units / 256 steps = ~0.008 / step, slightly under + // one voxel at 128^3 resolution. + const int MAX_STEPS = 256; + const float STEP = 1.0f / 128.0f; + const float HIT_EPS = 1.0e-3f; + // Bias slightly inside the box so the very first sample isn't on + // the boundary (CLAMP addressing makes the boundary sample valid, + // but starting just inside avoids one wasted iteration). 
+ float t = fmaxf(tNear, 0.0f) + 1e-4f; + float t_exit = tFar; + + #pragma unroll 1 + for (int i = 0; i < MAX_STEPS; ++i) { + float pxw = ex + t * dx; + float pyw = ey + t * dy; + float pzw = ez + t * dz; + + float s = sample_sdf(tex, pxw, pyw, pzw); + if (s < HIT_EPS) { + hit = true; + hx = pxw; hy = pyw; hz = pzw; + break; + } + t += STEP; + if (t > t_exit) break; + } + } + + // ---- Shade ----------------------------------------------------------- + float r, g, b; + if (hit) { + // Central-difference normal in world space. Each sample step is + // ~1.17 voxels: short enough to capture local geometry, long enough + // that trilinear filtering meaningfully moves the result. + const float NEPS = 1.5f / 128.0f; + float nx = sample_sdf(tex, hx + NEPS, hy, hz) - + sample_sdf(tex, hx - NEPS, hy, hz); + float ny = sample_sdf(tex, hx, hy + NEPS, hz) - + sample_sdf(tex, hx, hy - NEPS, hz); + float nz = sample_sdf(tex, hx, hy, hz + NEPS) - + sample_sdf(tex, hx, hy, hz - NEPS); + float nl = length3(nx, ny, nz); + if (nl < 1e-6f) nl = 1e-6f; + nx /= nl; ny /= nl; nz /= nl; + + // Fixed key light (normalized world direction). + const float LX = 0.5773502691896258f; // (1,1,-1)/sqrt(3) + const float LY = 0.5773502691896258f; + const float LZ = -0.5773502691896258f; + float diff = fmaxf(0.0f, dot3(nx, ny, nz, LX, LY, LZ)); + + // Specular: Blinn-Phong half-vector exponent. View dir = -ray dir. + float vx = -dx, vy = -dy, vz = -dz; + float hx2 = LX + vx, hy2 = LY + vy, hz2 = LZ + vz; + float hl = length3(hx2, hy2, hz2); + if (hl < 1e-6f) hl = 1e-6f; + hx2 /= hl; hy2 /= hl; hz2 /= hl; + float ndoth = fmaxf(0.0f, dot3(nx, ny, nz, hx2, hy2, hz2)); + float spec = powf(ndoth, 32.0f); + + // Base albedo varies with the hit position so the gyroid lattice + // reads as a single material with smooth variation, not flat plastic. 
+ float base_r = 0.55f + 0.30f * nx; + float base_g = 0.50f + 0.30f * ny; + float base_b = 0.70f + 0.30f * nz; + + const float AMBIENT = 0.18f; + r = base_r * (AMBIENT + 0.82f * diff) + 0.6f * spec; + g = base_g * (AMBIENT + 0.82f * diff) + 0.6f * spec; + b = base_b * (AMBIENT + 0.82f * diff) + 0.7f * spec; + } else { + // Sky: dark blue at the top, near-black at the bottom. The PBO's row + // 0 is the bottom of the on-screen image (see the v_ndc comment), + // so we use the y coordinate of the ray direction (close to v_ndc + // in screen space) for the gradient. + float sky = 0.5f * (dy + 1.0f); // [0, 1] roughly + sky = clampf(sky, 0.0f, 1.0f); + r = 0.02f + 0.06f * sky; + g = 0.03f + 0.10f * sky; + b = 0.05f + 0.20f * sky; + } + + r = clampf(r, 0.0f, 1.0f); + g = clampf(g, 0.0f, 1.0f); + b = clampf(b, 0.0f, 1.0f); + + int idx = (y * width + x) * 4; + output[idx + 0] = (unsigned char)(r * 255.0f); + output[idx + 1] = (unsigned char)(g * 255.0f); + output[idx + 2] = (unsigned char)(b * 255.0f); + output[idx + 3] = 255; +} +""" + +# GLSL shaders -- these just display a texture on a fullscreen rectangle. +# Nothing CUDA-specific here. + +VERTEX_SHADER_SOURCE = """#version 330 core +in vec2 position; +in vec2 texcoord; +out vec2 v_texcoord; +void main() { + gl_Position = vec4(position, 0.0, 1.0); + v_texcoord = texcoord; +} +""" + +FRAGMENT_SHADER_SOURCE = """#version 330 core +in vec2 v_texcoord; +out vec4 fragColor; +uniform sampler2D tex; +void main() { + fragColor = texture(tex, v_texcoord); +} +""" + + +if __name__ == "__main__": + main() diff --git a/cuda_core/examples/gl_interop_texture_filter.py b/cuda_core/examples/gl_interop_texture_filter.py new file mode 100644 index 00000000000..82c880a8943 --- /dev/null +++ b/cuda_core/examples/gl_interop_texture_filter.py @@ -0,0 +1,607 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +# ################################################################################ +# +# This example demonstrates cuda.core.TextureObject hardware filtering by +# comparing FilterMode.POINT and FilterMode.LINEAR side by side on the same +# source CUDA Array. Requires pyglet. +# +# ################################################################################ + +# What this example teaches +# ========================= +# How to back two TextureObjects with the SAME CUDA Array and observe the +# difference between POINT (nearest-texel) and LINEAR (bilinear) filtering +# under user-controlled zoom and pan. Also shows how the address mode +# (WRAP / CLAMP / MIRROR / BORDER) is baked into the texture descriptor at +# creation time, so changing it at runtime means rebuilding the textures. +# +# How it works +# ============ +# A single 256x256 RGBA8 Array holds a procedurally-generated test pattern +# (high-contrast checkerboard, diagonals, gradient stripe). Two +# TextureObjects are built on top of that Array: +# +# Array (256x256 RGBA UINT8) +# / \ +# tex_point tex_linear +# FilterMode.POINT FilterMode.LINEAR +# AddressMode.WRAP AddressMode.WRAP +# ReadMode.NORMALIZED_FLOAT ReadMode.NORMALIZED_FLOAT +# +# Each frame, a single CUDA kernel runs over a 1024x512 OpenGL PBO: +# +# - Left half of the screen samples tex_point. +# - Right half samples tex_linear. +# - Both halves use the same (zoom, pan) -> texture-space mapping, so the +# two views show the same content with different filtering. +# - A 2-pixel vertical white line marks the divider. +# +# Because ReadMode.NORMALIZED_FLOAT is used, tex2D() returns each +# channel as a float in [0, 1]; the kernel multiplies by 255 and writes +# unsigned bytes back into the PBO. +# +# The PBO is then copied to a GL texture and drawn on a fullscreen quad, +# identical to the plasma example. +# +# What you should see +# =================== +# A 1024x512 window split down the middle. 
def make_pattern(width, height):
    """Generate the RGBA8 source image used for the filter comparison.

    The result is a C-contiguous ``uint8`` array of shape
    ``(height, width, 4)`` with channels ordered R, G, B, A and alpha set to
    255 everywhere. Features are painted in this order (later ones
    overwrite earlier ones where they overlap):

    1. A black/white checkerboard with 8x8-pixel cells.
    2. Two one-pixel-wide red diagonals (``x == y`` and ``x == width-1-y``).
    3. A horizontal strip starting at ``height // 4`` whose green channel
       ramps up and blue channel ramps down from left to right.
    4. Two thin horizontal bars (yellow, then cyan) near ``3 * height / 4``.
    """
    rows = np.arange(height)[:, None]
    cols = np.arange(width)[None, :]

    pattern = np.empty((height, width, 4), dtype=np.uint8)

    # Checkerboard: cell parity selects 0 (black) or 255 (white) for R, G, B.
    checker = (((cols // 8) + (rows // 8)) & 1).astype(np.uint8) * 255
    pattern[..., :3] = checker[..., None]
    pattern[..., 3] = 255

    # Main diagonal and anti-diagonal in opaque red.
    on_diagonal = (cols == rows) | (cols == (width - 1 - rows))
    pattern[on_diagonal] = (255, 0, 0, 255)

    # Gradient strip: at least 4 rows tall, clipped to the image height.
    strip_top = height // 4
    strip_bottom = min(strip_top + max(4, height // 32), height)
    ramp = np.linspace(0, 255, width, dtype=np.uint8)
    strip = pattern[strip_top:strip_bottom]  # view: writes land in `pattern`
    strip[..., 0] = 0
    strip[..., 1] = ramp
    strip[..., 2] = 255 - ramp
    strip[..., 3] = 255

    # Two "text-like" bars; slicing clips them if they run off the image.
    bar_top = (3 * height) // 4
    pattern[bar_top:bar_top + 4, width // 8:(width * 3) // 8] = (255, 255, 0, 255)
    pattern[bar_top + 8:bar_top + 12, (width * 5) // 8:(width * 7) // 8] = (
        0, 255, 255, 255,
    )

    return np.ascontiguousarray(pattern)
def setup_cuda(kernel_source):
    """Compile ``kernel_source`` and prepare everything needed to launch it.

    Makes device 0 current, creates a stream on it, compiles the source to a
    cubin for that device's architecture and looks up the
    ``split_screen_sample`` kernel. The launch configuration uses 16x16
    thread blocks with enough blocks to cover WIDTH x HEIGHT pixels.

    Returns:
        A ``(device, stream, kernel, launch_config)`` tuple.
    """
    device = Device(0)
    device.set_current()
    stream = device.create_stream()

    # Compiled as C++ so the templated tex2D overload resolves.
    options = ProgramOptions(std="c++17", arch=f"sm_{device.arch}")
    module = Program(kernel_source, code_type="c++", options=options).compile(
        "cubin", name_expressions=("split_screen_sample",)
    )
    kernel = module.get_kernel("split_screen_sample")

    # Ceil-divide so partially filled blocks at the right/top edges are
    # still launched.
    threads_x, threads_y = 16, 16
    blocks_x = -(-WIDTH // threads_x)
    blocks_y = -(-HEIGHT // threads_y)
    launch_config = LaunchConfig(
        grid=(blocks_x, blocks_y, 1),
        block=(threads_x, threads_y, 1),
    )
    return device, stream, kernel, launch_config
+ + Standard OpenGL boilerplate for a textured fullscreen quad, identical in + structure to the plasma example. Returns (shader_program, vao_id, tex_id). + """ + from pyglet.graphics.shader import Shader, ShaderProgram + + vert = Shader(VERTEX_SHADER_SOURCE, "vertex") + frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment") + shader_prog = ShaderProgram(vert, frag) + + # Fullscreen quad (two triangles). Each vertex: x, y, s, t. + quad_verts = np.array( + [ + -1, -1, 0, 0, + 1, -1, 1, 0, + 1, 1, 1, 1, + -1, -1, 0, 0, + 1, 1, 1, 1, + -1, 1, 0, 1, + ], + dtype=np.float32, + ) + + vao = ctypes.c_uint(0) + gl.glGenVertexArrays(1, ctypes.byref(vao)) + gl.glBindVertexArray(vao.value) + + vbo = ctypes.c_uint(0) + gl.glGenBuffers(1, ctypes.byref(vbo)) + gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value) + gl.glBufferData( + gl.GL_ARRAY_BUFFER, + quad_verts.nbytes, + quad_verts.ctypes.data_as(ctypes.c_void_p), + gl.GL_STATIC_DRAW, + ) + + stride = 4 * 4 + pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position") + gl.glEnableVertexAttribArray(pos_loc) + gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0)) + tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord") + gl.glEnableVertexAttribArray(tc_loc) + gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8)) + gl.glBindVertexArray(0) + + # Empty GL texture; filled each frame from the PBO. + tex = ctypes.c_uint(0) + gl.glGenTextures(1, ctypes.byref(tex)) + gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value) + # Use nearest filtering on the display texture so the example's own + # POINT/LINEAR comparison is not muddied by GL's sampler. 
def main():
    """Run the interactive POINT-vs-LINEAR texture filtering demo.

    Compiles the sampling kernel, opens the pyglet window, registers the
    OpenGL PBO with CUDA, uploads the test pattern into a CUDA Array, builds
    the two TextureObjects and then runs the pyglet event loop.

    Controls: left-drag pans, scroll wheel zooms, M cycles the address mode,
    R resets the view, Escape (or closing the window) exits.

    CUDA objects are released through an ExitStack so they are closed on
    every exit path (normal close, Escape, or an exception), in reverse
    order of creation: textures, Array, graphics resource, stream.
    """
    import contextlib

    # --- Step 1: Set up CUDA (compile kernel, create stream) ---
    dev, stream, kernel, config = setup_cuda(KERNEL_SOURCE)

    with contextlib.ExitStack() as cleanup:
        # Registered first, therefore closed last.
        cleanup.callback(stream.close)

        # The hardware-texture path needs at least compute capability 3.x
        # (it's available essentially everywhere modern, but check anyway so
        # the failure is friendly).
        if dev.compute_capability.major < 3:
            print(
                f"This example requires compute capability >= 3.0, "
                f"got {dev.compute_capability.major}.{dev.compute_capability.minor}.",
                file=sys.stderr,
            )
            sys.exit(1)

        # --- Step 2: Open a window ---
        window, gl, pyglet = create_window()

        # --- Step 3: Create GL resources (shader, quad, display texture) ---
        shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT)

        # --- Step 4: Create the Pixel Buffer Object (PBO) ---
        pbo_id, _nbytes = create_pixel_buffer(gl, WIDTH, HEIGHT)

        # --- Step 5: Register the PBO with CUDA ---
        resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")
        resource_open = True

        def close_resource():
            # Idempotent: called from the window handlers (while the GL
            # context is still alive) and again when the ExitStack unwinds.
            nonlocal resource_open
            if resource_open:
                resource_open = False
                resource.close()

        cleanup.callback(close_resource)

        # --- Step 6: Allocate the source Array and upload the test pattern ---
        # The Array must outlive every TextureObject built on it, so it is
        # entered into the ExitStack before the texture cleanup is registered.
        arr = cleanup.enter_context(
            Array.from_descriptor(
                shape=(SRC_W, SRC_H),
                format=ArrayFormat.UINT8,
                num_channels=4,
            )
        )
        pattern = make_pattern(SRC_W, SRC_H)
        # Sanity: 256 * 256 * 4 bytes = 262144.
        assert pattern.nbytes == arr.size_bytes, (
            f"pattern bytes ({pattern.nbytes}) != array bytes ({arr.size_bytes})"
        )
        arr.copy_from(pattern, stream=stream)
        stream.sync()  # upload must finish before kernel reads

        # --- Step 7: Build initial POINT + LINEAR textures (WRAP mode). ---
        # The address mode is baked into the descriptor at creation time:
        # cycling modes means closing and recreating these objects, so they
        # live in mutable closure state rather than in a `with` block.
        tex_state = {
            "mode_idx": 0,
            "tex_point": None,
            "tex_linear": None,
        }

        def close_textures():
            # Clear each slot before closing it so a texture is never
            # closed twice, even if close() itself raises.
            for slot in ("tex_linear", "tex_point"):
                tex = tex_state[slot]
                tex_state[slot] = None
                if tex is not None:
                    tex.close()

        # Registered after the Array, so textures are closed before it.
        cleanup.callback(close_textures)

        def release_cuda_resources():
            # Release everything that should go away while the GL context
            # still exists; the Array and stream follow when main() unwinds.
            close_textures()
            close_resource()

        def rebuild_textures():
            # Close previous textures (if any) before creating new ones so
            # handles are not leaked when cycling the address mode.
            close_textures()
            mode = ADDRESS_MODES[tex_state["mode_idx"]]
            tex_state["tex_point"], tex_state["tex_linear"] = make_textures(arr, mode)

        rebuild_textures()

        # --- Step 8: View state (zoom + pan), tight initial framing. ---
        # zoom = pixels_per_texel. zoom=3 -> roughly 3x magnification, which
        # makes POINT vs LINEAR obvious without any user input.
        view = {
            "zoom": 3.0,
            "pan_x": SRC_W * 0.5,
            "pan_y": SRC_H * 0.5,
        }

        def reset_view():
            view["zoom"] = 3.0
            view["pan_x"] = SRC_W * 0.5
            view["pan_y"] = SRC_H * 0.5

        # --- Step 9: Render loop ---
        frame_count = 0
        fps_time = time.monotonic()

        def current_mode_name():
            return ADDRESS_MODES[tex_state["mode_idx"]].name

        @window.event
        def on_draw():
            nonlocal frame_count, fps_time
            tex_point = tex_state["tex_point"]
            tex_linear = tex_state["tex_linear"]
            if tex_point is None or tex_linear is None:
                # Resources were already released (shutdown in progress).
                return
            window.clear()

            # (a) Map the PBO so CUDA can write to it.
            with resource.map(stream=stream) as buf:
                # (b) Launch the split-screen sampling kernel.
                launch(
                    stream,
                    config,
                    kernel,
                    np.uint64(tex_point.handle),
                    np.uint64(tex_linear.handle),
                    buf.handle,
                    np.int32(WIDTH),
                    np.int32(HEIGHT),
                    np.float32(view["zoom"]),
                    np.float32(view["pan_x"]),
                    np.float32(view["pan_y"]),
                    np.int32(SRC_W),
                    np.int32(SRC_H),
                )
            # (c) Unmap happens automatically when the `with` block exits.

            # (d) PBO -> GL texture (GPU-to-GPU).
            copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT)

            # (e) Draw the texture to the screen.
            draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id)

            frame_count += 1
            now = time.monotonic()
            if now - fps_time >= 1.0:
                fps = frame_count / (now - fps_time)
                window.set_caption(
                    f"TextureObject Filter - POINT | LINEAR "
                    f"[address={current_mode_name()}, zoom={view['zoom']:.2f}x, "
                    f"{fps:.0f} FPS]"
                )
                frame_count = 0
                fps_time = now

        # --- Mouse: drag to pan, scroll to zoom ------------------------------
        @window.event
        def on_mouse_drag(x, y, dx, dy, buttons, modifiers):
            if not (buttons & pyglet.window.mouse.LEFT):
                return
            # One screen pixel = 1/zoom texels in source space. The kernel
            # maps PBO row y to src_y = pan_y + (y - h/2) / zoom and PBO row
            # 0 is drawn at the bottom of the window, so src_y grows upward
            # on screen just like pyglet's dy. Both axes therefore use the
            # same sign: the image follows the cursor.
            view["pan_x"] -= dx / view["zoom"]
            view["pan_y"] -= dy / view["zoom"]

        @window.event
        def on_mouse_scroll(x, y, scroll_x, scroll_y):
            # Geometric zoom; clamp to a sensible range.
            factor = 1.1 ** scroll_y
            new_zoom = view["zoom"] * factor
            view["zoom"] = max(0.1, min(32.0, new_zoom))

        # --- Keyboard: M cycles address mode, R resets view ------------------
        @window.event
        def on_key_press(symbol, modifiers):
            key = pyglet.window.key
            if symbol == key.M:
                tex_state["mode_idx"] = (tex_state["mode_idx"] + 1) % len(ADDRESS_MODES)
                rebuild_textures()
            elif symbol == key.R:
                reset_view()
            elif symbol == key.ESCAPE:
                # Release before closing the window, while the GL context
                # backing the registered PBO is still alive.
                release_cuda_resources()
                window.close()

        @window.event
        def on_close():
            # Release CUDA resources in reverse order of creation.
            release_cuda_resources()

        pyglet.app.run(interval=0)
+ float src_x = pan_x + (local_x - (float)half_w * 0.5f) / zoom; + float src_y = pan_y + ((float)y - (float)h * 0.5f) / zoom; + + float4 sample; + if (x < half_w) { + sample = tex2D(point_tex, src_x, src_y); + } else { + sample = tex2D(linear_tex, src_x, src_y); + } + + int idx = (y * w + x) * 4; + out[idx + 0] = (unsigned char)(sample.x * 255.0f); + out[idx + 1] = (unsigned char)(sample.y * 255.0f); + out[idx + 2] = (unsigned char)(sample.z * 255.0f); + out[idx + 3] = (unsigned char)(sample.w * 255.0f); +} +""" + +VERTEX_SHADER_SOURCE = """#version 330 core +in vec2 position; +in vec2 texcoord; +out vec2 v_texcoord; +void main() { + gl_Position = vec4(position, 0.0, 1.0); + v_texcoord = texcoord; +} +""" + +FRAGMENT_SHADER_SOURCE = """#version 330 core +in vec2 v_texcoord; +out vec4 fragColor; +uniform sampler2D tex; +void main() { + fragColor = texture(tex, v_texcoord); +} +""" + + +if __name__ == "__main__": + main() diff --git a/cuda_core/examples/texture_sample.py b/cuda_core/examples/texture_sample.py new file mode 100644 index 00000000000..fc5b05f086f --- /dev/null +++ b/cuda_core/examples/texture_sample.py @@ -0,0 +1,220 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +# ################################################################################ +# +# This example demonstrates building a 2D CUDA Array, binding it as a +# bindless TextureObject, and sampling it from a kernel with both POINT-exact +# and LINEAR-interpolated coordinates. +# +# Texture coordinate convention (non-normalized): each texel (i, j) is centered +# at (i + 0.5, j + 0.5). So tex2D(tex, 0.5, 0.5) returns texel (0, 0) exactly, +# while tex2D(tex, 1.0, 0.5) returns the linear blend of texels (0, 0) and (1, 0). +# All test coordinates below are chosen with that half-pixel offset in mind. 
+# +# ################################################################################ + +# /// script +# dependencies = ["cuda_bindings", "cuda_core", "nvidia-cuda-nvrtc"] +# /// + +import numpy as np + +from cuda.core import ( + AddressMode, + Array, + ArrayFormat, + Device, + FilterMode, + LaunchConfig, + LegacyPinnedMemoryResource, + Program, + ProgramOptions, + ReadMode, + ResourceDescriptor, + TextureDescriptor, + TextureObject, + launch, +) + +# Kernel reads N (x, y) coordinates from `coords` (interleaved float pairs) and +# writes tex2D(tex, x, y) to out[i]. Compiled as C++ so the templated +# tex2D overload resolves. +code = r""" +extern "C" __global__ +void sample_texture(cudaTextureObject_t tex, + float *out, + const float *coords, + int n) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) return; + float x = coords[2 * i + 0]; + float y = coords[2 * i + 1]; + out[i] = tex2D(tex, x, y); +} +""" + + +def main(): + dev = Device() + dev.set_current() + stream = dev.create_stream() + + coords_buf = None + out_buf = None + pinned_mr = LegacyPinnedMemoryResource() + try: + # Allocate a 2D Array: shape=(W, H), single-channel float32. + # Note: Array.from_descriptor takes shape=(width, height), so the host + # buffer fed into copy_from must be laid out as H rows of W elements + # (row-major), i.e. host_pattern.shape == (H, W). + width, height = 16, 16 + with Array.from_descriptor( + shape=(width, height), + format=ArrayFormat.FLOAT32, + num_channels=1, + ) as arr: + # Plant a known pattern: pattern[y, x] = x + 100*y. + # Cast to float32 so the byte count matches the array's storage. + ys, xs = np.meshgrid( + np.arange(height, dtype=np.float32), + np.arange(width, dtype=np.float32), + indexing="ij", + ) + pattern = (xs + 100.0 * ys).astype(np.float32) + assert pattern.shape == (height, width) + arr.copy_from(pattern, stream=stream) + + # Build a linear-filtering, clamped, non-normalized texture. 
+ res_desc = ResourceDescriptor.from_array(arr) + tex_desc = TextureDescriptor( + address_mode=AddressMode.CLAMP, + filter_mode=FilterMode.LINEAR, + read_mode=ReadMode.ELEMENT_TYPE, + normalized_coords=False, + ) + with TextureObject.from_descriptor( + resource=res_desc, texture_descriptor=tex_desc + ) as tex: + _run_kernel_and_verify( + dev, stream, tex, pattern, width, height, pinned_mr + ) + finally: + stream.close() + + +def _run_kernel_and_verify(dev, stream, tex, pattern, width, height, pinned_mr): + """Kernel launch + correctness check, isolated so the with-blocks in main() + stay readable. Owns its own pinned-buffer cleanup.""" + coords_buf = None + out_buf = None + try: + # Build the test coordinate list: + # - Texel-center samples should return the exact planted value. + # - Half-integer samples land between texels and exercise LINEAR + # filtering -- they should equal the average of the surrounding + # texels. + center_samples = [ + (0.5, 0.5), # -> pattern[0, 0] = 0 + (3.5, 0.5), # -> pattern[0, 3] = 3 + (0.5, 4.5), # -> pattern[4, 0] = 400 + (7.5, 9.5), # -> pattern[9, 7] = 907 + (15.5, 15.5), # -> pattern[15, 15] = 1515 + ] + half_samples = [ + # (1.0, 0.5): blend of texels (0, 0) and (1, 0) -> 0.5 + (1.0, 0.5), + # (0.5, 1.0): blend of texels (0, 0) and (0, 1) -> 50.0 + (0.5, 1.0), + # (1.0, 1.0): blend of the 2x2 block at (0..1, 0..1) -> 50.5 + (1.0, 1.0), + # (4.0, 5.0): blend of the 2x2 block at (3..4, 4..5) -> 453.5 + (4.0, 5.0), + ] + coords = np.array(center_samples + half_samples, dtype=np.float32) + n = coords.shape[0] + coords_flat = coords.reshape(-1) + coords_nbytes = int(coords_flat.nbytes) + out_nbytes = n * np.dtype(np.float32).itemsize + + # Use pinned host memory for inputs and outputs. Pinned allocations are + # GPU-accessible (zero-copy), so the kernel can read coords directly + # and we can read results without a separate device->host copy. 
+ coords_buf = pinned_mr.allocate(coords_nbytes) + out_buf = pinned_mr.allocate(out_nbytes) + coords_view = np.from_dlpack(coords_buf).view(dtype=np.float32) + out_view = np.from_dlpack(out_buf).view(dtype=np.float32) + coords_view[:] = coords_flat + out_view[:] = 0.0 + + # Compile the kernel as C++ (templated tex2D requires this). + program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}") + prog = Program(code, code_type="c++", options=program_options) + mod = prog.compile("cubin", name_expressions=("sample_texture",)) + kernel = mod.get_kernel("sample_texture") + + block = 64 + grid = (n + block - 1) // block + config = LaunchConfig(grid=grid, block=block) + # cudaTextureObject_t is a 64-bit handle; pass it as uint64 to be + # unambiguous (a bare Python int would also work since intptr_t is + # 8 bytes on 64-bit platforms). + launch( + stream, + config, + kernel, + np.uint64(tex.handle), + out_buf, + coords_buf, + np.int32(n), + ) + stream.sync() + results = np.asarray(out_view) + + # Verify texel-center samples (POINT-exact regardless of filter mode). + n_center = len(center_samples) + for i, (x, y) in enumerate(center_samples): + expected = (x - 0.5) + 100.0 * (y - 0.5) + got = float(results[i]) + assert np.isclose(got, expected, atol=1e-4), ( + f"center sample {i} at ({x}, {y}): expected {expected}, got {got}" + ) + + # Verify half-integer samples against the analytic mean of the 4 + # surrounding texels. Allow a small tolerance for the 1/256 fixed-point + # weight quantization that hardware filtering performs. + for j, (x, y) in enumerate(half_samples): + idx = n_center + j + # Surrounding integer texel coordinates: (xi, yi), (xi+1, yi), + # (xi, yi+1), (xi+1, yi+1). With x = xi + 1, y = yi + 1 (e.g. + # (1.0, 1.0)) the four neighbors are (0,0)..(1,1). 
+ xi = int(np.floor(x - 0.5)) + yi = int(np.floor(y - 0.5)) + tx = (x - 0.5) - xi + ty = (y - 0.5) - yi + corners = [] + for dy in (0, 1): + for dx in (0, 1): + xv = min(max(xi + dx, 0), width - 1) + yv = min(max(yi + dy, 0), height - 1) + corners.append(pattern[yv, xv]) + v00, v10, v01, v11 = corners + expected = (1 - tx) * (1 - ty) * v00 + tx * (1 - ty) * v10 + (1 - tx) * ty * v01 + tx * ty * v11 + got = float(results[idx]) + assert np.isclose(got, expected, atol=1e-2), ( + f"half sample {j} at ({x}, {y}): expected {expected}, got {got}" + ) + + print("Texture sampling example completed successfully.") + print(f" texel-center samples verified: {n_center}") + print(f" half-integer samples verified: {len(half_samples)}") + finally: + if coords_buf is not None: + coords_buf.close() + if out_buf is not None: + out_buf.close() + + +if __name__ == "__main__": + main() diff --git a/cuda_core/tests/example_tests/test_basic_examples.py b/cuda_core/tests/example_tests/test_basic_examples.py index 31b9f86e0a1..e1666114cc9 100644 --- a/cuda_core/tests/example_tests/test_basic_examples.py +++ b/cuda_core/tests/example_tests/test_basic_examples.py @@ -82,6 +82,15 @@ def has_recent_memory_pool_support() -> bool: SYSTEM_REQUIREMENTS = { "memory_pool_resources.py": has_recent_memory_pool_support, "gl_interop_plasma.py": has_display, + "gl_interop_fire.py": has_display, + "gl_interop_image_show.py": has_display, + "gl_interop_lenia.py": has_display, + "gl_interop_mandelbrot.py": has_display, + "gl_interop_mipmap_lod.py": has_display, + "gl_interop_ocean.py": has_display, + "gl_interop_reaction_diffusion.py": has_display, + "gl_interop_sdf_volume.py": has_display, + "gl_interop_texture_filter.py": has_display, "pytorch_example.py": lambda: ( has_compute_capability_9_or_higher() and is_x86_64() ), # PyTorch only provides CUDA support for x86_64 diff --git a/cuda_core/tests/test_texture_surface.py b/cuda_core/tests/test_texture_surface.py new file mode 100644 index 
00000000000..00e67ed2398 --- /dev/null +++ b/cuda_core/tests/test_texture_surface.py @@ -0,0 +1,968 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +import gc + +import pytest + +import cuda.core +from cuda.core import ( + AddressMode, + Array, + ArrayFormat, + Device, + FilterMode, + MipmappedArray, + ReadMode, + ResourceDescriptor, + SurfaceObject, + TextureDescriptor, + TextureObject, +) + + +def test_array_init_disabled(): + with pytest.raises(RuntimeError, match=r"^Array cannot be instantiated directly"): + cuda.core._array.Array() + + +def test_texture_object_init_disabled(): + with pytest.raises(RuntimeError, match=r"^TextureObject cannot be instantiated directly"): + cuda.core._texture.TextureObject() + + +def test_surface_object_init_disabled(): + with pytest.raises(RuntimeError, match=r"^SurfaceObject cannot be instantiated directly"): + cuda.core._surface.SurfaceObject() + + +def test_resource_descriptor_init_disabled(): + with pytest.raises(RuntimeError, match=r"^ResourceDescriptor cannot be instantiated"): + ResourceDescriptor() + + +def test_array_2d_create_and_properties(init_cuda): + arr = Array.from_descriptor( + shape=(32, 16), format=ArrayFormat.FLOAT32, num_channels=1 + ) + try: + assert arr.shape == (32, 16) + assert arr.format == ArrayFormat.FLOAT32 + assert arr.num_channels == 1 + assert arr.element_size == 4 + assert arr.size_bytes == 32 * 16 * 4 + assert arr.surface_load_store is False + assert arr.handle != 0 + assert isinstance(arr.device, Device) + finally: + arr.close() + + +def test_array_3d_with_surface_flag(init_cuda): + arr = Array.from_descriptor( + shape=(8, 8, 4), + format=ArrayFormat.UINT8, + num_channels=4, + surface_load_store=True, + ) + try: + assert arr.shape == (8, 8, 4) + assert arr.surface_load_store is True + assert arr.element_size == 4 + finally: + arr.close() + + +def test_array_rejects_bad_channels(init_cuda): + with 
pytest.raises(ValueError, match="num_channels"): + Array.from_descriptor(shape=(8,), format=ArrayFormat.UINT8, num_channels=3) + + +def test_array_rejects_bad_rank(init_cuda): + with pytest.raises(ValueError, match="shape rank"): + Array.from_descriptor( + shape=(2, 2, 2, 2), format=ArrayFormat.UINT8, num_channels=1 + ) + + +def test_array_roundtrip_copy(init_cuda): + import array as _array + + device = Device() + stream = device.create_stream() + arr = Array.from_descriptor( + shape=(16,), format=ArrayFormat.UINT32, num_channels=1 + ) + try: + src = _array.array("I", list(range(16))) + dst = _array.array("I", [0] * 16) + arr.copy_from(src, stream=stream) + arr.copy_to(dst, stream=stream) + stream.sync() + # Round-trip recovers data; src must not be mutated by copy_from. + assert list(dst) == list(range(16)) + assert list(src) == list(range(16)) + finally: + arr.close() + stream.close() + + +def test_array_copy_rejects_undersized_host_buffer(init_cuda): + import array as _array + + device = Device() + stream = device.create_stream() + arr = Array.from_descriptor( + shape=(16,), format=ArrayFormat.UINT32, num_channels=1 + ) + try: + # arr is 16 * 4 = 64 bytes; pass an 8-element (32-byte) host buffer. + too_small = _array.array("I", [0] * 8) + with pytest.raises(ValueError, match="smaller than the array extent"): + arr.copy_from(too_small, stream=stream) + with pytest.raises(ValueError, match="smaller than the array extent"): + arr.copy_to(too_small, stream=stream) + finally: + arr.close() + stream.close() + + +def test_array_copy_rejects_undersized_device_buffer(init_cuda): + device = Device() + stream = device.create_stream() + arr = Array.from_descriptor( + shape=(16,), format=ArrayFormat.UINT32, num_channels=1 + ) + # arr is 64 bytes; allocate a 32-byte device buffer. 
+ small_buf = device.memory_resource.allocate(32, stream=device.default_stream) + try: + with pytest.raises(ValueError, match="smaller than the array extent"): + arr.copy_from(small_buf, stream=stream) + with pytest.raises(ValueError, match="smaller than the array extent"): + arr.copy_to(small_buf, stream=stream) + finally: + small_buf.close() + arr.close() + stream.close() + + +def test_texture_object_create(init_cuda): + arr = Array.from_descriptor( + shape=(32, 16), format=ArrayFormat.FLOAT32, num_channels=1 + ) + try: + res = ResourceDescriptor.from_array(arr) + tex_desc = TextureDescriptor( + address_mode=AddressMode.CLAMP, + filter_mode=FilterMode.LINEAR, + read_mode=ReadMode.ELEMENT_TYPE, + normalized_coords=True, + ) + tex = TextureObject.from_descriptor(resource=res, texture_descriptor=tex_desc) + try: + assert tex.handle != 0 + assert tex.resource is res + assert tex.texture_descriptor is tex_desc + finally: + tex.close() + finally: + arr.close() + + +def test_surface_object_create(init_cuda): + arr = Array.from_descriptor( + shape=(8, 8), + format=ArrayFormat.UINT8, + num_channels=4, + surface_load_store=True, + ) + try: + surf = SurfaceObject.from_array(arr) + try: + assert surf.handle != 0 + assert isinstance(surf.resource, ResourceDescriptor) + finally: + surf.close() + finally: + arr.close() + + +def test_surface_requires_ldst_flag(init_cuda): + arr = Array.from_descriptor( + shape=(8, 8), format=ArrayFormat.UINT8, num_channels=4 + ) + try: + with pytest.raises(ValueError, match="surface_load_store=True"): + SurfaceObject.from_array(arr) + finally: + arr.close() + + +def test_address_mode_normalization(init_cuda): + # Direct unit test of the private normalizer: a scalar should expand to a + # 3-tuple; a shorter tuple should be padded by repeating the last entry. 
+ from cuda.core._texture import _normalize_address_modes + + assert _normalize_address_modes(AddressMode.WRAP) == ( + AddressMode.WRAP, AddressMode.WRAP, AddressMode.WRAP, + ) + assert _normalize_address_modes((AddressMode.WRAP, AddressMode.CLAMP)) == ( + AddressMode.WRAP, AddressMode.CLAMP, AddressMode.CLAMP, + ) + assert _normalize_address_modes( + (AddressMode.WRAP, AddressMode.CLAMP, AddressMode.MIRROR) + ) == (AddressMode.WRAP, AddressMode.CLAMP, AddressMode.MIRROR) + + # Smoke test: a 2-entry tuple is also accepted end-to-end. + arr = Array.from_descriptor( + shape=(8, 8, 4), format=ArrayFormat.FLOAT32, num_channels=1 + ) + try: + res = ResourceDescriptor.from_array(arr) + tex_desc = TextureDescriptor( + address_mode=(AddressMode.WRAP, AddressMode.CLAMP) + ) + tex = TextureObject.from_descriptor(resource=res, texture_descriptor=tex_desc) + try: + assert tex.handle != 0 + finally: + tex.close() + finally: + arr.close() + + +# --- Linear / pitch2D resource descriptors ----------------------------------- + +def _alloc_device_buffer(device, nbytes): + """Allocate a device Buffer using the device's default memory resource.""" + return device.memory_resource.allocate(nbytes, stream=device.default_stream) + + +def test_resource_descriptor_from_linear_defaults_size(init_cuda): + device = Device() + buf = _alloc_device_buffer(device, 4096) + try: + res = ResourceDescriptor.from_linear( + buf, format=ArrayFormat.FLOAT32, num_channels=1 + ) + assert res.kind == "linear" + assert res.format == ArrayFormat.FLOAT32 + assert res.num_channels == 1 + assert res.source is buf + # repr should include the kind/format hint + assert "linear" in repr(res) + finally: + buf.close() + + +def test_resource_descriptor_from_linear_size_override(init_cuda): + device = Device() + buf = _alloc_device_buffer(device, 4096) + try: + res = ResourceDescriptor.from_linear( + buf, format=ArrayFormat.UINT32, num_channels=1, size_bytes=2048 + ) + assert res._size_bytes == 2048 + finally: + 
buf.close() + + +def test_resource_descriptor_from_linear_rejects_oversize(init_cuda): + device = Device() + buf = _alloc_device_buffer(device, 1024) + try: + with pytest.raises(ValueError, match="exceeds buffer.size"): + ResourceDescriptor.from_linear( + buf, format=ArrayFormat.UINT8, num_channels=1, size_bytes=2048 + ) + finally: + buf.close() + + +def test_resource_descriptor_from_linear_rejects_bad_channels(init_cuda): + device = Device() + buf = _alloc_device_buffer(device, 1024) + try: + with pytest.raises(ValueError, match="num_channels"): + ResourceDescriptor.from_linear( + buf, format=ArrayFormat.UINT8, num_channels=3 + ) + finally: + buf.close() + + +def test_resource_descriptor_from_linear_rejects_non_buffer(): + with pytest.raises(TypeError, match="Buffer"): + ResourceDescriptor.from_linear( + object(), format=ArrayFormat.UINT8, num_channels=1 + ) + + +def test_resource_descriptor_from_linear_rejects_zero_size(init_cuda): + device = Device() + buf = _alloc_device_buffer(device, 1024) + try: + with pytest.raises(ValueError, match="at least one element"): + ResourceDescriptor.from_linear( + buf, format=ArrayFormat.UINT32, num_channels=1, size_bytes=0 + ) + finally: + buf.close() + + +def test_resource_descriptor_from_linear_rejects_non_multiple(init_cuda): + device = Device() + buf = _alloc_device_buffer(device, 1024) + try: + # UINT32 x 1 channel = 4 bytes/element; 10 bytes is not a multiple. 
+ with pytest.raises(ValueError, match="multiple of element size"): + ResourceDescriptor.from_linear( + buf, format=ArrayFormat.UINT32, num_channels=1, size_bytes=10 + ) + finally: + buf.close() + + +def test_texture_object_from_linear(init_cuda): + """A linear-backed texture should bind even though sampling fields are + effectively ignored by the driver.""" + device = Device() + # 1024 float elements + buf = _alloc_device_buffer(device, 1024 * 4) + try: + res = ResourceDescriptor.from_linear( + buf, format=ArrayFormat.FLOAT32, num_channels=1 + ) + tex = TextureObject.from_descriptor(resource=res, texture_descriptor=TextureDescriptor()) + try: + assert tex.handle != 0 + assert tex.resource is res + finally: + tex.close() + finally: + buf.close() + + +def test_resource_descriptor_from_pitch2d_validates_pitch(init_cuda): + device = Device() + buf = _alloc_device_buffer(device, 64 * 1024) + try: + # element_size = 4 (UINT32 * 1 channel); width=16 -> min_pitch=64 + with pytest.raises(ValueError, match="pitch_bytes"): + ResourceDescriptor.from_pitch2d( + buf, + format=ArrayFormat.UINT32, + num_channels=1, + width=16, + height=8, + pitch_bytes=32, # < 64 = width*element_size + ) + finally: + buf.close() + + +def test_resource_descriptor_from_pitch2d_validates_buffer_size(init_cuda): + device = Device() + buf = _alloc_device_buffer(device, 4096) + try: + with pytest.raises(ValueError, match="exceeds buffer.size"): + ResourceDescriptor.from_pitch2d( + buf, + format=ArrayFormat.UINT8, + num_channels=4, + width=64, + height=128, + pitch_bytes=512, # 512 * 128 = 65536 > 4096 + ) + finally: + buf.close() + + +def test_texture_object_from_pitch2d(init_cuda): + """A pitch2D-backed texture should bind given driver-aligned pitch.""" + from cuda.bindings import driver + + device = Device() + # Query the device's required texture pitch alignment (typically 32-512). 
+ err, align = driver.cuDeviceGetAttribute( + driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, + device.device_id, + ) + assert int(err) == 0 + pitch = max(int(align), 256) + height = 16 + buf = _alloc_device_buffer(device, pitch * height) + try: + res = ResourceDescriptor.from_pitch2d( + buf, + format=ArrayFormat.UINT8, + num_channels=4, + width=32, + height=height, + pitch_bytes=pitch, + ) + assert res.kind == "pitch2d" + assert "pitch2d" in repr(res) + tex = TextureObject.from_descriptor(resource=res, texture_descriptor=TextureDescriptor()) + try: + assert tex.handle != 0 + finally: + tex.close() + finally: + buf.close() + + +def test_surface_rejects_linear_and_pitch2d(init_cuda): + device = Device() + buf = _alloc_device_buffer(device, 4096) + try: + res_lin = ResourceDescriptor.from_linear( + buf, format=ArrayFormat.UINT32, num_channels=1 + ) + with pytest.raises(ValueError, match="array-backed"): + SurfaceObject.from_descriptor(resource=res_lin) + + res_p2 = ResourceDescriptor.from_pitch2d( + buf, + format=ArrayFormat.UINT8, + num_channels=4, + width=8, + height=8, + pitch_bytes=64, + ) + with pytest.raises(ValueError, match="array-backed"): + SurfaceObject.from_descriptor(resource=res_p2) + finally: + buf.close() + + +# --- MipmappedArray ---------------------------------------------------------- + +def test_mipmapped_array_init_disabled(): + with pytest.raises( + RuntimeError, match=r"^MipmappedArray cannot be instantiated directly" + ): + cuda.core._mipmapped_array.MipmappedArray() + + +def test_mipmapped_array_from_descriptor_2d(init_cuda): + mip = MipmappedArray.from_descriptor( + shape=(64, 32), + format=ArrayFormat.FLOAT32, + num_channels=1, + num_levels=4, + ) + try: + assert mip.shape == (64, 32) + assert mip.format == ArrayFormat.FLOAT32 + assert mip.num_channels == 1 + assert mip.num_levels == 4 + assert mip.surface_load_store is False + assert mip.handle != 0 + assert isinstance(mip.device, Device) + finally: + mip.close() + 


def test_mipmapped_array_get_level_zero_matches_shape(init_cuda):
    """Level 0 of a MipmappedArray is an Array with the base shape, format and channels."""
    shape = (64, 32)
    mip = MipmappedArray.from_descriptor(
        shape=shape,
        format=ArrayFormat.UINT8,
        num_channels=4,
        num_levels=4,
    )
    try:
        lvl0 = mip.get_level(0)
        try:
            assert isinstance(lvl0, Array)
            # Level 0 must match the base shape and rank.
            assert lvl0.shape == shape
            assert lvl0.format == ArrayFormat.UINT8
            assert lvl0.num_channels == 4
            assert lvl0.handle != 0
        finally:
            lvl0.close()
    finally:
        mip.close()


def test_mipmapped_array_get_level_halves_dims(init_cuda):
    """Each level's shape is the base shape right-shifted by the level, floored at 1."""
    shape = (64, 32)
    num_levels = 4
    mip = MipmappedArray.from_descriptor(
        shape=shape,
        format=ArrayFormat.UINT8,
        num_channels=1,
        num_levels=num_levels,
    )
    try:
        for level in range(num_levels):
            lvl = mip.get_level(level)
            try:
                # Each dim halves per level, with a floor of 1; rank is preserved.
                expected = tuple(max(1, dim >> level) for dim in shape)
                assert lvl.shape == expected, (
                    f"level={level}: expected {expected}, got {lvl.shape}"
                )
            finally:
                lvl.close()
    finally:
        mip.close()


def test_mipmapped_array_get_level_out_of_range(init_cuda):
    """get_level rejects both level == num_levels and negative levels."""
    mip = MipmappedArray.from_descriptor(
        shape=(16, 16),
        format=ArrayFormat.UINT8,
        num_channels=1,
        num_levels=2,
    )
    try:
        with pytest.raises(ValueError, match="num_levels"):
            mip.get_level(mip.num_levels)
        with pytest.raises(ValueError, match=">= 0"):
            mip.get_level(-1)
    finally:
        mip.close()


def test_mipmapped_array_rejects_zero_levels(init_cuda):
    """num_levels=0 is rejected at creation time."""
    with pytest.raises(ValueError, match="num_levels"):
        MipmappedArray.from_descriptor(
            shape=(8, 8),
            format=ArrayFormat.UINT8,
            num_channels=1,
            num_levels=0,
        )


def test_resource_descriptor_from_mipmapped_array(init_cuda):
    """from_mipmapped_array reports kind 'mipmapped_array' and keeps the source object."""
    mip = MipmappedArray.from_descriptor(
        shape=(32, 16),
        format=ArrayFormat.FLOAT32,
        num_channels=1,
        num_levels=3,
    )
    try:
        res = ResourceDescriptor.from_mipmapped_array(mip)
        assert res.kind == "mipmapped_array"
        assert res.source is mip
    finally:
        mip.close()


def test_resource_descriptor_from_mipmapped_array_rejects_non_mipmap():
    """A non-MipmappedArray argument raises TypeError."""
    with pytest.raises(TypeError, match="MipmappedArray"):
        ResourceDescriptor.from_mipmapped_array(object())


def test_texture_object_from_mipmapped_array(init_cuda):
    """A texture can be created over a mipmapped array with explicit mipmap settings."""
    mip = MipmappedArray.from_descriptor(
        shape=(32, 32),
        format=ArrayFormat.FLOAT32,
        num_channels=1,
        num_levels=3,
    )
    try:
        res = ResourceDescriptor.from_mipmapped_array(mip)
        # Use non-default mipmap params so the driver exercises that path.
        tex_desc = TextureDescriptor(
            address_mode=AddressMode.CLAMP,
            filter_mode=FilterMode.LINEAR,
            normalized_coords=True,
            mipmap_filter_mode=FilterMode.LINEAR,
            mipmap_level_bias=0.0,
            min_mipmap_level_clamp=0.0,
            max_mipmap_level_clamp=float(mip.num_levels - 1),
        )
        tex = TextureObject.from_descriptor(resource=res, texture_descriptor=tex_desc)
        try:
            assert tex.handle != 0
            assert tex.resource is res
        finally:
            tex.close()
    finally:
        mip.close()


def test_surface_rejects_mipmapped_array(init_cuda):
    """SurfaceObject.from_descriptor refuses a mipmapped-array resource."""
    mip = MipmappedArray.from_descriptor(
        shape=(16, 16),
        format=ArrayFormat.UINT8,
        num_channels=4,
        num_levels=2,
        surface_load_store=True,
    )
    try:
        res = ResourceDescriptor.from_mipmapped_array(mip)
        with pytest.raises(ValueError, match="array-backed"):
            SurfaceObject.from_descriptor(resource=res)
    finally:
        mip.close()


def test_mipmapped_array_level_keeps_parent_alive(init_cuda):
    """Dropping the local parent reference must not invalidate the level Array;
    the level holds an internal strong ref back to the MipmappedArray.

    cdef classes don't natively support weakref, so we verify the parent
    reference by inspecting the level Array's gc referents.
    """
    mip = MipmappedArray.from_descriptor(
        shape=(16, 16),
        format=ArrayFormat.UINT8,
        num_channels=1,
        num_levels=3,
    )
    parent_id = id(mip)
    lvl = mip.get_level(1)
    # try/finally so the level is closed even when an assertion below fails,
    # matching the cleanup discipline of the other tests in this file.
    try:
        # Drop our local reference and force GC; the parent must survive because
        # the level Array holds a strong ref via the internal _parent_ref slot.
        del mip
        gc.collect()

        # The handle is still valid storage; the level still tracks the parent.
        assert lvl.handle != 0
        referents = gc.get_referents(lvl)
        parents = [r for r in referents if isinstance(r, MipmappedArray)]
        assert len(parents) == 1, (
            f"level Array should reference exactly one MipmappedArray parent, got "
            f"{parents!r}"
        )
        assert id(parents[0]) == parent_id, (
            "level Array's parent ref is not the original MipmappedArray"
        )
    finally:
        # Closing the level drops its parent ref. Don't access the parent past
        # this point; cuMipmappedArrayDestroy may then run.
        lvl.close()


# --- Negative-path validation tests ------------------------------------------

def _assert_texture_creation_rejected(texture_descriptor, exc_type, match):
    """Assert that TextureObject.from_descriptor rejects ``texture_descriptor``.

    Allocates an 8x8 single-channel FLOAT32 Array, wraps it in a
    ResourceDescriptor, and expects ``TextureObject.from_descriptor`` to raise
    ``exc_type`` with a message matching ``match``. The Array is closed on
    every path.
    """
    arr = Array.from_descriptor(
        shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1
    )
    try:
        res = ResourceDescriptor.from_array(arr)
        with pytest.raises(exc_type, match=match):
            TextureObject.from_descriptor(
                resource=res, texture_descriptor=texture_descriptor
            )
    finally:
        arr.close()


def test_array_from_descriptor_rejects_bad_format(init_cuda):
    """A plain int is not accepted as ``format``."""
    with pytest.raises(TypeError, match="format must be an ArrayFormat"):
        Array.from_descriptor(shape=(8,), format=0, num_channels=1)


def test_array_from_descriptor_rejects_non_iterable_shape(init_cuda):
    """A scalar ``shape`` raises TypeError."""
    with pytest.raises(TypeError, match="shape must be a tuple"):
        Array.from_descriptor(shape=8, format=ArrayFormat.UINT8, num_channels=1)


def test_array_from_descriptor_rejects_zero_dim(init_cuda):
    """A zero extent is reported against its index in ``shape``."""
    with pytest.raises(ValueError, match=r"shape\[1\] must be >= 1"):
        Array.from_descriptor(
            shape=(8, 0), format=ArrayFormat.UINT8, num_channels=1
        )


def test_array_copy_rejects_non_stream(init_cuda):
    """copy_from / copy_to raise TypeError when ``stream`` is not a Stream."""
    import array as _array

    arr = Array.from_descriptor(
        shape=(8,), format=ArrayFormat.UINT8, num_channels=1
    )
    try:
        buf = _array.array("B", [0] * 8)
        with pytest.raises(TypeError, match="stream must be a Stream"):
            arr.copy_from(buf, stream="not-a-stream")
        with pytest.raises(TypeError, match="stream must be a Stream"):
            arr.copy_to(buf, stream="not-a-stream")
    finally:
        arr.close()


def test_resource_descriptor_from_pitch2d_rejects_non_buffer():
    """from_pitch2d raises TypeError for a non-Buffer first argument."""
    with pytest.raises(TypeError, match="buffer must be a Buffer"):
        ResourceDescriptor.from_pitch2d(
            object(),
            format=ArrayFormat.UINT8,
            num_channels=1,
            width=8,
            height=8,
            pitch_bytes=64,
        )


def test_resource_descriptor_from_pitch2d_rejects_bad_format(init_cuda):
    """from_pitch2d raises TypeError when ``format`` is a plain int."""
    device = Device()
    buf = _alloc_device_buffer(device, 4096)
    try:
        with pytest.raises(TypeError, match="format must be an ArrayFormat"):
            ResourceDescriptor.from_pitch2d(
                buf,
                format=0,
                num_channels=1,
                width=8,
                height=8,
                pitch_bytes=64,
            )
    finally:
        buf.close()


def test_resource_descriptor_from_pitch2d_rejects_bad_channels(init_cuda):
    """from_pitch2d raises ValueError for num_channels=3."""
    device = Device()
    buf = _alloc_device_buffer(device, 4096)
    try:
        with pytest.raises(ValueError, match="num_channels"):
            ResourceDescriptor.from_pitch2d(
                buf,
                format=ArrayFormat.UINT8,
                num_channels=3,
                width=8,
                height=8,
                pitch_bytes=64,
            )
    finally:
        buf.close()


def test_resource_descriptor_from_pitch2d_rejects_zero_dims(init_cuda):
    """from_pitch2d raises ValueError for a zero width and for a zero height."""
    device = Device()
    buf = _alloc_device_buffer(device, 4096)
    try:
        with pytest.raises(ValueError, match="width"):
            ResourceDescriptor.from_pitch2d(
                buf,
                format=ArrayFormat.UINT8,
                num_channels=1,
                width=0,
                height=8,
                pitch_bytes=64,
            )
        with pytest.raises(ValueError, match="height"):
            ResourceDescriptor.from_pitch2d(
                buf,
                format=ArrayFormat.UINT8,
                num_channels=1,
                width=8,
                height=0,
                pitch_bytes=64,
            )
    finally:
        buf.close()


def test_mipmapped_array_rejects_bad_format(init_cuda):
    """MipmappedArray.from_descriptor raises TypeError for an int ``format``."""
    with pytest.raises(TypeError, match="format must be an ArrayFormat"):
        MipmappedArray.from_descriptor(
            shape=(8, 8), format=0, num_channels=1, num_levels=2
        )


def test_mipmapped_array_rejects_bad_channels(init_cuda):
    """MipmappedArray.from_descriptor raises ValueError for num_channels=3."""
    with pytest.raises(ValueError, match="num_channels"):
        MipmappedArray.from_descriptor(
            shape=(8, 8), format=ArrayFormat.UINT8, num_channels=3, num_levels=2
        )


def test_mipmapped_array_rejects_zero_dim(init_cuda):
    """A zero extent is reported against its index in ``shape``."""
    with pytest.raises(ValueError, match=r"shape\[0\] must be >= 1"):
        MipmappedArray.from_descriptor(
            shape=(0, 8), format=ArrayFormat.UINT8, num_channels=1, num_levels=1
        )


def test_texture_object_rejects_non_resource_descriptor(init_cuda):
    """``resource`` must be a ResourceDescriptor."""
    with pytest.raises(TypeError, match="resource must be a ResourceDescriptor"):
        TextureObject.from_descriptor(
            resource=object(), texture_descriptor=TextureDescriptor()
        )


def test_texture_object_rejects_non_texture_descriptor(init_cuda):
    """``texture_descriptor`` must be a TextureDescriptor."""
    _assert_texture_creation_rejected(
        "nope", TypeError, "texture_descriptor must be a TextureDescriptor"
    )


def test_texture_object_rejects_bad_filter_mode(init_cuda):
    """An int ``filter_mode`` is rejected when the texture is created."""
    td = TextureDescriptor(filter_mode=0)  # int, not FilterMode
    _assert_texture_creation_rejected(
        td, TypeError, "filter_mode must be a FilterMode"
    )


def test_texture_object_rejects_bad_read_mode(init_cuda):
    """An int ``read_mode`` is rejected when the texture is created."""
    td = TextureDescriptor(read_mode=0)  # int, not ReadMode
    _assert_texture_creation_rejected(td, TypeError, "read_mode must be a ReadMode")


def test_texture_object_rejects_bad_mipmap_filter_mode(init_cuda):
    """An int ``mipmap_filter_mode`` is rejected when the texture is created."""
    td = TextureDescriptor(mipmap_filter_mode=0)  # int, not FilterMode
    _assert_texture_creation_rejected(
        td, TypeError, "mipmap_filter_mode must be a FilterMode"
    )


def test_texture_object_rejects_negative_anisotropy(init_cuda):
    """max_anisotropy=-1 is rejected when the texture is created."""
    td = TextureDescriptor(max_anisotropy=-1)
    _assert_texture_creation_rejected(td, ValueError, "max_anisotropy")


def test_texture_object_rejects_bad_border_color_length(init_cuda):
    """A 2-entry ``border_color`` is rejected when the texture is created."""
    td = TextureDescriptor(border_color=(0.0, 0.0))  # length 2, not 4
    _assert_texture_creation_rejected(td, ValueError, "border_color must have 4")


def test_address_mode_rejects_non_addressmode_scalar(init_cuda):
    """A bare int ``address_mode`` is rejected when the texture is created."""
    td = TextureDescriptor(address_mode=42)  # int, not AddressMode / iterable
    _assert_texture_creation_rejected(td, TypeError, "address_mode")


def test_address_mode_rejects_empty_tuple(init_cuda):
    """An empty ``address_mode`` tuple is rejected when the texture is created."""
    td = TextureDescriptor(address_mode=())
    _assert_texture_creation_rejected(
        td, ValueError, "address_mode tuple must have 1-3"
    )


def test_address_mode_rejects_too_long_tuple(init_cuda):
    """A 4-entry ``address_mode`` tuple is rejected when the texture is created."""
    td = TextureDescriptor(
        address_mode=(
            AddressMode.WRAP, AddressMode.WRAP, AddressMode.WRAP, AddressMode.WRAP
        )
    )
    _assert_texture_creation_rejected(
        td, ValueError, "address_mode tuple must have 1-3"
    )


def test_address_mode_rejects_non_addressmode_entry(init_cuda):
    """A non-AddressMode tuple entry is reported against its index."""
    td = TextureDescriptor(address_mode=(AddressMode.WRAP, "bad", AddressMode.CLAMP))
    _assert_texture_creation_rejected(td, TypeError, r"address_mode\[1\]")


def test_texture_object_keeps_backing_array_alive(init_cuda):
    """Dropping the local references to the backing Array and the
    ResourceDescriptor must NOT invalidate an existing TextureObject. The
    TextureObject holds a strong ref through its _source_ref slot."""
    arr = Array.from_descriptor(
        shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1
    )
    res = ResourceDescriptor.from_array(arr)
    tex = TextureObject.from_descriptor(
        resource=res, texture_descriptor=TextureDescriptor()
    )
    # try/finally so the texture handle is released even when an assertion fails.
    try:
        # Verify the keepalive chain via gc referents: TextureObject -> _source_ref
        # -> ResourceDescriptor -> _source -> Array. We can only walk one level
        # at a time, so check tex's referents include the ResourceDescriptor.
        arr_id = id(arr)
        res_id = id(res)
        del arr, res
        gc.collect()

        referents = gc.get_referents(tex)
        res_refs = [r for r in referents if id(r) == res_id]
        assert len(res_refs) == 1, (
            f"TextureObject should still reference the ResourceDescriptor; "
            f"got referents {referents!r}"
        )
        res_back = res_refs[0]
        arr_refs = [r for r in gc.get_referents(res_back) if id(r) == arr_id]
        assert len(arr_refs) == 1, "ResourceDescriptor should still reference its Array"

        # tex.handle should still be valid (non-zero).
        assert tex.handle != 0
    finally:
        tex.close()


def test_surface_object_keeps_backing_array_alive(init_cuda):
    """Dropping the local Array reference must not invalidate a SurfaceObject
    created from it; the Array stays reachable through the surface's
    ResourceDescriptor."""
    arr = Array.from_descriptor(
        shape=(8, 8),
        format=ArrayFormat.UINT8,
        num_channels=4,
        surface_load_store=True,
    )
    surf = SurfaceObject.from_array(arr)
    # try/finally so the surface handle is released even when an assertion fails.
    try:
        arr_id = id(arr)
        del arr
        gc.collect()

        # The surface keeps the ResourceDescriptor alive, which keeps the Array
        # alive. We verify the chain end-to-end the same way as the texture case.
        referents = gc.get_referents(surf)
        res_objs = [r for r in referents if isinstance(r, ResourceDescriptor)]
        assert len(res_objs) == 1
        arr_refs = [r for r in gc.get_referents(res_objs[0]) if id(r) == arr_id]
        assert len(arr_refs) == 1, (
            "SurfaceObject should still reference its backing Array via the ResourceDescriptor"
        )
        assert surf.handle != 0
    finally:
        surf.close()