diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py
index f2d7c85b62e..9769a39977f 100644
--- a/cuda_core/cuda/core/__init__.py
+++ b/cuda_core/cuda/core/__init__.py
@@ -78,6 +78,17 @@ class _PatchedProperty(metaclass=_PatchedPropMeta):
     WorkqueueResource,
     WorkqueueResourceOptions,
 )
+from cuda.core._array import Array, ArrayFormat
+from cuda.core._mipmapped_array import MipmappedArray
+from cuda.core._texture import (
+    AddressMode,
+    FilterMode,
+    ReadMode,
+    ResourceDescriptor,
+    TextureDescriptor,
+    TextureObject,
+)
+from cuda.core._surface import SurfaceObject
 from cuda.core._event import Event, EventOptions
 from cuda.core._graphics import GraphicsResource
 from cuda.core._launch_config import LaunchConfig
diff --git a/cuda_core/cuda/core/_array.pxd b/cuda_core/cuda/core/_array.pxd
new file mode 100644
index 00000000000..73529cac48e
--- /dev/null
+++ b/cuda_core/cuda/core/_array.pxd
@@ -0,0 +1,25 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from libc.stdint cimport intptr_t
+from cuda.bindings cimport cydriver
+
+
+cdef class Array:
+
+    cdef:
+        cydriver.CUarray _handle     # set to NULL by close()
+        tuple _shape                 # (w,), (w, h), or (w, h, d)
+        cydriver.CUarray_format _format
+        unsigned int _num_channels   # 1, 2, or 4
+        int _device_id               # device id recorded at construction
+        intptr_t _context            # current-context pointer recorded at construction
+        bint _owning                 # if True, close()/__dealloc__ call cuArrayDestroy
+        bint _surface_load_store     # True when CUDA_ARRAY3D_SURFACE_LDST is set
+        # Optional strong reference to a parent owner (e.g. a MipmappedArray
+        # whose level this Array views). When set, the parent must outlive
+        # this Array because the underlying CUarray belongs to the parent.
+        object _parent_ref
+
+    cpdef close(self)
diff --git a/cuda_core/cuda/core/_array.pyx b/cuda_core/cuda/core/_array.pyx
new file mode 100644
index 00000000000..7d02dcd5d21
--- /dev/null
+++ b/cuda_core/cuda/core/_array.pyx
@@ -0,0 +1,439 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+cimport cpython
+from libc.stdint cimport intptr_t
+from libc.string cimport memset
+
+from cuda.bindings cimport cydriver
+from cuda.core._memory._buffer cimport Buffer
+from cuda.core._stream cimport Stream
+from cuda.core._utils.cuda_utils cimport (
+    HANDLE_RETURN,
+    _get_current_context_ptr,
+    _get_current_device_id,
+)
+
+import enum
+
+
+class ArrayFormat(enum.IntEnum):
+    """Element format for a :class:`Array` allocation.
+
+    Members take the values of the driver's ``CU_AD_FORMAT_*`` (``CUarray_format``) constants.
+    """
+    UINT8   = cydriver.CU_AD_FORMAT_UNSIGNED_INT8   # 1 byte per channel
+    UINT16  = cydriver.CU_AD_FORMAT_UNSIGNED_INT16  # 2 bytes per channel
+    UINT32  = cydriver.CU_AD_FORMAT_UNSIGNED_INT32  # 4 bytes per channel
+    INT8    = cydriver.CU_AD_FORMAT_SIGNED_INT8     # 1 byte per channel
+    INT16   = cydriver.CU_AD_FORMAT_SIGNED_INT16    # 2 bytes per channel
+    INT32   = cydriver.CU_AD_FORMAT_SIGNED_INT32    # 4 bytes per channel
+    FLOAT16 = cydriver.CU_AD_FORMAT_HALF            # 2 bytes per channel
+    FLOAT32 = cydriver.CU_AD_FORMAT_FLOAT           # 4 bytes per channel
+
+
+# Bytes per single-channel element, keyed by each ArrayFormat's integer value.
+_FORMAT_ELEM_SIZE = {
+    int(ArrayFormat.UINT8):   1,
+    int(ArrayFormat.INT8):    1,
+    int(ArrayFormat.UINT16):  2,
+    int(ArrayFormat.INT16):   2,
+    int(ArrayFormat.FLOAT16): 2,
+    int(ArrayFormat.UINT32):  4,
+    int(ArrayFormat.INT32):   4,
+    int(ArrayFormat.FLOAT32): 4,
+}
+
+
+cdef void _fill_array_endpoint(
+    cydriver.CUDA_MEMCPY3D* p, Array arr, bint is_src
+) noexcept:
+    """Point one side of ``p`` at ``arr``'s CUarray, starting at origin (0, 0, 0)."""
+    if not is_src:
+        p.dstXInBytes = 0
+        p.dstY = 0
+        p.dstZ = 0
+        p.dstArray = arr._handle
+        p.dstMemoryType = cydriver.CU_MEMORYTYPE_ARRAY
+        return
+    p.srcXInBytes = 0
+    p.srcY = 0
+    p.srcZ = 0
+    p.srcArray = arr._handle
+    p.srcMemoryType = cydriver.CU_MEMORYTYPE_ARRAY
+
+
+cdef int _fill_host_endpoint(
+    cydriver.CUDA_MEMCPY3D* p, object obj, bint is_src,
+    size_t width_bytes, size_t height, size_t required,
+    cpython.Py_buffer* pybuf_out,
+) except -1:
+    """Populate the src/dst host fields of ``p`` from buffer-protocol ``obj``.
+
+    Returns 1 with a view held in ``pybuf_out`` (the caller releases it).
+    Raises TypeError/ValueError, holding no view, for unusable or short buffers.
+    """
+    cdef int flags = cpython.PyBUF_SIMPLE
+    cdef Py_ssize_t have
+    if not is_src:
+        flags |= cpython.PyBUF_WRITABLE  # a destination must be writable
+    try:  # Cython declares PyObject_GetBuffer ``except -1``: failures raise
+        cpython.PyObject_GetBuffer(obj, pybuf_out, flags)
+    except (BufferError, TypeError) as exc:
+        raise TypeError(
+            f"Source/destination must be a Buffer or a contiguous "
+            f"buffer-protocol object, got {type(obj).__name__}"
+        ) from exc
+    have = pybuf_out.len  # read before any release of the view
+    if <size_t>have < required:
+        cpython.PyBuffer_Release(pybuf_out)
+        raise ValueError(
+            f"Host buffer has {have} bytes, smaller than the array "
+            f"extent ({required} bytes)"
+        )
+    if is_src:
+        p.srcMemoryType = cydriver.CU_MEMORYTYPE_HOST
+        p.srcHost = pybuf_out.buf
+        p.srcPitch = width_bytes
+        p.srcHeight = height
+        p.srcXInBytes = 0
+        p.srcY = 0
+        p.srcZ = 0
+    else:
+        p.dstMemoryType = cydriver.CU_MEMORYTYPE_HOST
+        p.dstHost = pybuf_out.buf
+        p.dstPitch = width_bytes
+        p.dstHeight = height
+        p.dstXInBytes = 0
+        p.dstY = 0
+        p.dstZ = 0
+    return 1
+
+
+cdef int _fill_linear_endpoint(
+    cydriver.CUDA_MEMCPY3D* p,
+    object obj,
+    bint is_src,
+    size_t width_bytes,
+    size_t height,
+    size_t depth,
+    cpython.Py_buffer* pybuf_out,
+) except -1:
+    """Fill the linear src/dst side of ``p`` from a device ``Buffer`` or, failing
+    that, a host buffer-protocol object. Returns 1 if ``pybuf_out`` is held.
+    """
+    cdef size_t required = width_bytes * height * depth
+    if not isinstance(obj, Buffer):
+        return _fill_host_endpoint(
+            p, obj, is_src, width_bytes, height, required, pybuf_out
+        )
+    cdef Buffer dev_buf = <Buffer>obj
+    if <size_t>dev_buf.size < required:
+        raise ValueError(
+            f"Buffer size ({dev_buf.size} bytes) is smaller than "
+            f"the array extent ({required} bytes)"
+        )
+    cdef intptr_t dev_ptr = int(dev_buf.handle)
+    if is_src:
+        p.srcMemoryType = cydriver.CU_MEMORYTYPE_DEVICE
+        p.srcDevice = <cydriver.CUdeviceptr>dev_ptr
+        p.srcPitch = width_bytes
+        p.srcHeight = height
+        p.srcXInBytes = 0
+        p.srcY = 0
+        p.srcZ = 0
+    else:
+        p.dstMemoryType = cydriver.CU_MEMORYTYPE_DEVICE
+        p.dstDevice = <cydriver.CUdeviceptr>dev_ptr
+        p.dstPitch = width_bytes
+        p.dstHeight = height
+        p.dstXInBytes = 0
+        p.dstY = 0
+        p.dstZ = 0
+    return 0
+
+
+cdef _copy3d(Array arr, object other, object stream, bint to_array):
+    """Enqueue a whole-array ``cuMemcpy3DAsync`` between ``arr`` and ``other``.
+
+    ``to_array=True`` copies ``other`` into ``arr``; False copies ``arr`` out to
+    ``other``. A host buffer view, if taken, is released right after enqueue.
+    """
+    cdef cydriver.CUDA_MEMCPY3D desc
+    cdef cpython.Py_buffer host_view
+    cdef int view_held = 0
+    cdef intptr_t raw_stream
+    cdef cydriver.CUstream c_stream
+
+    if not isinstance(stream, Stream):
+        raise TypeError(f"stream must be a Stream, got {type(stream).__name__}")
+
+    memset(&desc, 0, sizeof(desc))
+    row_bytes, rows, slices = arr._extent_bytes()
+    desc.WidthInBytes = <size_t>row_bytes
+    desc.Height = <size_t>rows
+    desc.Depth = <size_t>slices
+
+    try:
+        # The array side only assigns struct fields and cannot fail, so it
+        # is filled first in either direction; ``other`` is the source
+        # exactly when copying into the array.
+        _fill_array_endpoint(&desc, arr, not to_array)
+        view_held = _fill_linear_endpoint(
+            &desc, other, to_array, row_bytes, rows, slices, &host_view
+        )
+
+        raw_stream = int((<Stream>stream).handle)
+        c_stream = <cydriver.CUstream><void*>raw_stream
+        with nogil:
+            HANDLE_RETURN(cydriver.cuMemcpy3DAsync(&desc, c_stream))
+    finally:
+        # NOTE(review): the copy may still be in flight when the view is
+        # released here; presumably callers keep ``other`` alive until the
+        # stream has been synchronized -- confirm and document.
+        if view_held:
+            cpython.PyBuffer_Release(&host_view)
+
+
+cdef class Array:
+    """An opaque, hardware-laid-out GPU allocation for texture/surface access.
+
+    Distinct from :class:`Buffer`: a ``CUarray`` has no exposed device pointer
+    and can only be accessed from kernels through a :class:`TextureObject` or
+    :class:`SurfaceObject`. Its memory layout is chosen by the driver for 2D/3D
+    spatial locality.
+
+    Construct via :meth:`from_descriptor`. Only plain 1D/2D/3D allocations are
+    supported in this initial version; layered/cubemap/sparse variants will
+    follow once their shape semantics are settled.
+    """
+
+    def __init__(self, *args, **kwargs):
+        raise RuntimeError(
+            "Array cannot be instantiated directly. Use Array.from_descriptor()."
+        )
+
+    @classmethod
+    def from_descriptor(cls, *, shape, format, num_channels, surface_load_store=False):
+        """Allocate a new CUDA array.
+
+        Parameters
+        ----------
+        shape : tuple of int
+            ``(width,)``, ``(width, height)``, or ``(width, height, depth)``
+            in elements.
+        format : ArrayFormat
+            Element format.
+        num_channels : int
+            Channels per element. Must be 1, 2, or 4.
+        surface_load_store : bool
+            If True, allocate with ``CUDA_ARRAY3D_SURFACE_LDST`` so the array
+            can be bound as a :class:`SurfaceObject` for kernel-side writes.
+            Default False.
+
+        Returns
+        -------
+        Array
+        """
+        if not isinstance(format, ArrayFormat):
+            raise TypeError(f"format must be an ArrayFormat, got {type(format).__name__}")
+        if isinstance(num_channels, bool) or num_channels not in (1, 2, 4):
+            raise ValueError(f"num_channels must be 1, 2, or 4, got {num_channels!r}")
+
+        try:
+            shape_t = tuple(int(s) for s in shape)
+        except TypeError as e:
+            raise TypeError(f"shape must be a tuple of ints, got {type(shape).__name__}") from e
+        if not 1 <= len(shape_t) <= 3:
+            raise ValueError(f"shape rank must be 1, 2, or 3, got {len(shape_t)}")
+        for i, dim in enumerate(shape_t):
+            if dim < 1:
+                raise ValueError(f"shape[{i}] must be >= 1, got {dim}")
+
+        cdef Array self = cls.__new__(cls)
+        self._owning = True
+        self._shape = shape_t
+        self._format = <cydriver.CUarray_format><int>format
+        self._num_channels = num_channels
+        self._surface_load_store = bool(surface_load_store)
+        self._context = _get_current_context_ptr()
+        self._device_id = _get_current_device_id()
+        self._parent_ref = None
+
+        cdef cydriver.CUarray_format c_format = <cydriver.CUarray_format><int>format
+        cdef cydriver.CUDA_ARRAY3D_DESCRIPTOR desc3d
+        cdef cydriver.CUDA_ARRAY_DESCRIPTOR desc2d
+        cdef int rank = len(shape_t)
+        cdef unsigned int flags = (
+            cydriver.CUDA_ARRAY3D_SURFACE_LDST if surface_load_store else 0
+        )
+
+        # cuArrayCreate (2D path) does not accept flags; use the 3D descriptor
+        # whenever any flag is set or shape is 3D.
+        if rank == 3 or flags != 0:
+            memset(&desc3d, 0, sizeof(desc3d))
+            desc3d.Width = <size_t>shape_t[0]
+            desc3d.Height = <size_t>(shape_t[1] if rank >= 2 else 0)
+            desc3d.Depth = <size_t>(shape_t[2] if rank >= 3 else 0)
+            desc3d.Format = c_format
+            desc3d.NumChannels = <unsigned int>num_channels
+            desc3d.Flags = flags
+            with nogil:
+                HANDLE_RETURN(cydriver.cuArray3DCreate(&self._handle, &desc3d))
+        else:
+            memset(&desc2d, 0, sizeof(desc2d))
+            desc2d.Width = <size_t>shape_t[0]
+            desc2d.Height = <size_t>(shape_t[1] if rank == 2 else 0)
+            desc2d.Format = c_format
+            desc2d.NumChannels = <unsigned int>num_channels
+            with nogil:
+                HANDLE_RETURN(cydriver.cuArrayCreate(&self._handle, &desc2d))
+
+        return self
+
+    @classmethod
+    def _from_handle(cls, intptr_t handle, bint owning, *, device_id=None):
+        """Wrap an externally-allocated ``CUarray``.
+
+        Intended for graphics interop (``cuGraphicsSubResourceGetMappedArray``)
+        where the array is owned by the graphics API. With ``owning=False``,
+        :meth:`close` and ``__dealloc__`` will not free the handle. Shape,
+        format, and channel count are queried from the driver.
+        """
+        cdef Array self = cls.__new__(cls)
+        self._handle = <cydriver.CUarray><void*>handle
+        self._owning = owning
+        self._context = _get_current_context_ptr()
+        self._device_id = _get_current_device_id() if device_id is None else int(device_id)
+        self._parent_ref = None
+
+        cdef cydriver.CUDA_ARRAY3D_DESCRIPTOR desc
+        with nogil:
+            HANDLE_RETURN(cydriver.cuArray3DGetDescriptor(&desc, self._handle))
+
+        if desc.Depth > 0:
+            self._shape = (int(desc.Width), int(desc.Height), int(desc.Depth))
+        elif desc.Height > 0:
+            self._shape = (int(desc.Width), int(desc.Height))
+        else:
+            self._shape = (int(desc.Width),)
+        self._format = desc.Format
+        self._num_channels = desc.NumChannels
+        self._surface_load_store = bool(desc.Flags & cydriver.CUDA_ARRAY3D_SURFACE_LDST)
+        return self
+
+    @property
+    def handle(self):
+        """The underlying ``CUarray`` as an integer (0 after :meth:`close`)."""
+        return <intptr_t>self._handle
+
+    @property
+    def shape(self):
+        """Allocation shape, in elements."""
+        return self._shape
+
+    @property
+    def format(self):
+        """The element :class:`ArrayFormat`."""
+        return ArrayFormat(self._format)
+
+    @property
+    def num_channels(self):
+        """Channels per element (1, 2, or 4)."""
+        return self._num_channels
+
+    @property
+    def element_size(self):
+        """Bytes per element (format size * channels)."""
+        return _FORMAT_ELEM_SIZE[self._format] * self._num_channels
+
+    @property
+    def device(self):
+        """The :class:`Device` this array was allocated on."""
+        from cuda.core._device import Device
+        return Device(self._device_id)
+
+    @property
+    def surface_load_store(self):
+        """True if this array was created with ``CUDA_ARRAY3D_SURFACE_LDST``
+        and can be bound as a :class:`SurfaceObject`."""
+        return self._surface_load_store
+
+    def _extent_bytes(self):
+        """Return (width_bytes, height, depth) for cuMemcpy3D, with height/depth
+        normalized to >=1 for lower-rank arrays."""
+        cdef int rank = len(self._shape)
+        cdef size_t w = <size_t>self._shape[0] * <size_t>(
+            _FORMAT_ELEM_SIZE[self._format] * self._num_channels
+        )
+        cdef size_t h = <size_t>(self._shape[1] if rank >= 2 else 1)
+        cdef size_t d = <size_t>(self._shape[2] if rank >= 3 else 1)
+        return w, h, d
+
+    def copy_from(self, src, *, stream):
+        """Copy a full-array's worth of data into this array.
+
+        Parameters
+        ----------
+        src : Buffer or buffer-protocol object
+            Source data. Must contain at least ``self.size_bytes`` bytes
+            of contiguous data.
+        stream : Stream
+            Stream to issue the copy on.
+        """
+        _copy3d(self, src, stream, to_array=True)
+
+    def copy_to(self, dst, *, stream):
+        """Copy a full-array's worth of data out of this array.
+
+        Parameters
+        ----------
+        dst : Buffer or writable buffer-protocol object
+            Destination. Must have at least ``self.size_bytes`` bytes of
+            writable, contiguous space.
+        stream : Stream
+            Stream to issue the copy on.
+        """
+        _copy3d(self, dst, stream, to_array=False)
+
+    @property
+    def size_bytes(self):
+        """Total bytes of array storage (``prod(shape) * element_size``)."""
+        cdef size_t n = 1
+        for s in self._shape:
+            n *= <size_t>s
+        return n * <size_t>(_FORMAT_ELEM_SIZE[self._format] * self._num_channels)
+
+    cpdef close(self):
+        """Destroy the underlying ``CUarray`` if owned; later calls do nothing."""
+        cdef cydriver.CUarray h = self._handle
+        cdef bint owning = self._owning
+        self._handle = NULL
+        # Drop the parent reference (if any) so a non-owning level Array
+        # stops pinning its MipmappedArray after close().
+        self._parent_ref = None
+        if h != NULL and owning:
+            HANDLE_RETURN(cydriver.cuArrayDestroy(h))
+
+    def __dealloc__(self):
+        # Cython destructors cannot raise; any cuArrayDestroy error here is
+        # silently dropped. Callers needing visibility should use close().
+        if self._handle != NULL and self._owning:
+            cydriver.cuArrayDestroy(self._handle)
+            self._handle = NULL
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc, tb):
+        self.close()
+
+    def __repr__(self):
+        try:  # _from_handle() may wrap a format outside ArrayFormat
+            fmt = ArrayFormat(self._format).name
+        except ValueError:
+            fmt = <int>self._format
+        return f"Array(shape={self._shape}, format={fmt}, num_channels={self._num_channels})"
diff --git a/cuda_core/cuda/core/_mipmapped_array.pxd b/cuda_core/cuda/core/_mipmapped_array.pxd
new file mode 100644
index 00000000000..52afc1968cc
--- /dev/null
+++ b/cuda_core/cuda/core/_mipmapped_array.pxd
@@ -0,0 +1,22 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from libc.stdint cimport intptr_t
+from cuda.bindings cimport cydriver
+
+
+cdef class MipmappedArray:
+
+    cdef:
+        cydriver.CUmipmappedArray _handle   # set to NULL by close()
+        tuple _shape                 # (w,), (w, h), or (w, h, d)
+        cydriver.CUarray_format _format
+        unsigned int _num_channels   # 1, 2, or 4
+        unsigned int _num_levels     # upper bound checked by get_level()
+        int _device_id               # device id recorded at construction
+        intptr_t _context            # current-context pointer recorded at construction
+        bint _owning                 # if True, close()/__dealloc__ destroy the mipmap
+        bint _surface_load_store     # True when CUDA_ARRAY3D_SURFACE_LDST was requested
+
+    cpdef close(self)
diff --git a/cuda_core/cuda/core/_mipmapped_array.pyx b/cuda_core/cuda/core/_mipmapped_array.pyx
new file mode 100644
index 00000000000..c149d907f62
--- /dev/null
+++ b/cuda_core/cuda/core/_mipmapped_array.pyx
@@ -0,0 +1,229 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+from libc.stdint cimport intptr_t
+from libc.string cimport memset
+
+from cuda.bindings cimport cydriver
+from cuda.core._array cimport Array
+from cuda.core._array import ArrayFormat
+from cuda.core._utils.cuda_utils cimport (
+    HANDLE_RETURN,
+    _get_current_context_ptr,
+    _get_current_device_id,
+)
+
+
+cdef class MipmappedArray:
+    """A mipmapped CUDA array for texture/surface access across levels.
+
+    Wraps ``CUmipmappedArray``. Each mip level is a distinct, hardware-laid-out
+    allocation accessible only via a :class:`TextureObject` (or by retrieving
+    the level's :class:`Array` and binding it as a :class:`SurfaceObject`).
+    Destroying the :class:`MipmappedArray` destroys all level arrays
+    implicitly, so the :class:`Array` instances returned by :meth:`get_level`
+    are non-owning and hold a strong reference back to their parent.
+
+    Construct via :meth:`from_descriptor`.
+    """
+
+    def __init__(self, *args, **kwargs):
+        raise RuntimeError(
+            "MipmappedArray cannot be instantiated directly. "
+            "Use MipmappedArray.from_descriptor()."
+        )
+
+    @classmethod
+    def from_descriptor(
+        cls, *, shape, format, num_channels, num_levels, surface_load_store=False
+    ):
+        """Allocate a new mipmapped CUDA array.
+
+        Parameters
+        ----------
+        shape : tuple of int
+            ``(width,)``, ``(width, height)``, or ``(width, height, depth)``
+            in elements, for the base (level 0) mip.
+        format : ArrayFormat
+            Element format.
+        num_channels : int
+            Channels per element. Must be 1, 2, or 4.
+        num_levels : int
+            Number of mip levels requested; must be >= 1. The driver documents
+            clamping this to ``1 + floor(log2(max(shape)))``, so
+            :attr:`num_levels` reports the clamped count.
+        surface_load_store : bool
+            If True, allocate with ``CUDA_ARRAY3D_SURFACE_LDST`` so individual
+            levels (obtained via :meth:`get_level`) can be bound as
+            :class:`SurfaceObject` for kernel-side writes. Default False.
+
+        Returns
+        -------
+        MipmappedArray
+        """
+        if not isinstance(format, ArrayFormat):
+            raise TypeError(f"format must be an ArrayFormat, got {type(format).__name__}")
+        if isinstance(num_channels, bool) or num_channels not in (1, 2, 4):
+            raise ValueError(f"num_channels must be 1, 2, or 4, got {num_channels!r}")
+
+        try:
+            shape_t = tuple(int(s) for s in shape)
+        except TypeError as e:
+            raise TypeError(f"shape must be a tuple of ints, got {type(shape).__name__}") from e
+        if not 1 <= len(shape_t) <= 3:
+            raise ValueError(f"shape rank must be 1, 2, or 3, got {len(shape_t)}")
+        for i, dim in enumerate(shape_t):
+            if dim < 1:
+                raise ValueError(f"shape[{i}] must be >= 1, got {dim}")
+
+        levels = int(num_levels)
+        if levels < 1:
+            raise ValueError(f"num_levels must be >= 1, got {levels}")
+        # n.bit_length() == 1 + floor(log2(n)) for n >= 1: the documented driver
+        # cap. Record the count that can exist, not the raw request.
+        actual_levels = min(levels, max(shape_t).bit_length())
+
+        cdef MipmappedArray self = cls.__new__(cls)
+        self._owning = True
+        self._shape = shape_t
+        self._format = <cydriver.CUarray_format><int>format
+        self._num_channels = num_channels
+        self._num_levels = <unsigned int>actual_levels
+        self._surface_load_store = bool(surface_load_store)
+        self._context = _get_current_context_ptr()
+        self._device_id = _get_current_device_id()
+
+        cdef cydriver.CUDA_ARRAY3D_DESCRIPTOR desc3d
+        cdef int rank = len(shape_t)
+        cdef unsigned int flags = (
+            cydriver.CUDA_ARRAY3D_SURFACE_LDST if surface_load_store else 0
+        )
+        cdef unsigned int c_levels = <unsigned int>levels
+
+        # Mipmap creation uses the 3D descriptor regardless of rank; lower-rank
+        # shapes use Height=0/Depth=0 sentinels, matching cuArray3DCreate.
+        memset(&desc3d, 0, sizeof(desc3d))
+        desc3d.Width = <size_t>shape_t[0]
+        desc3d.Height = <size_t>(shape_t[1] if rank >= 2 else 0)
+        desc3d.Depth = <size_t>(shape_t[2] if rank >= 3 else 0)
+        desc3d.Format = self._format
+        desc3d.NumChannels = <unsigned int>num_channels
+        desc3d.Flags = flags
+        with nogil:
+            HANDLE_RETURN(cydriver.cuMipmappedArrayCreate(&self._handle, &desc3d, c_levels))
+
+        return self
+
+    def get_level(self, level):
+        """Return a non-owning :class:`Array` view of the given mip level.
+
+        Parameters
+        ----------
+        level : int
+            Mip level index in ``[0, num_levels)``.
+
+        Returns
+        -------
+        Array
+            A non-owning :class:`Array` wrapping the level's ``CUarray``.
+            The :class:`MipmappedArray` is kept alive for the lifetime of the
+            returned :class:`Array`; the underlying storage is released only
+            when this :class:`MipmappedArray` is destroyed.
+        """
+        lvl = int(level)
+        if lvl < 0:
+            raise ValueError(f"level must be >= 0, got {lvl}")
+        if lvl >= <int>self._num_levels:
+            raise ValueError(
+                f"level ({lvl}) must be < num_levels ({self._num_levels})"
+            )
+
+        cdef cydriver.CUarray level_handle
+        cdef unsigned int c_level = <unsigned int>lvl
+        with nogil:
+            HANDLE_RETURN(
+                cydriver.cuMipmappedArrayGetLevel(&level_handle, self._handle, c_level)
+            )
+
+        # Wrap as a non-owning Array; the level's underlying CUarray belongs
+        # to this MipmappedArray and must not be destroyed independently.
+        arr = Array._from_handle(
+            <intptr_t>level_handle, False, device_id=self._device_id
+        )
+        # Strong ref back to the parent so the mipmap outlives the level view.
+        (<Array>arr)._parent_ref = self
+        return arr
+
+    @property
+    def handle(self):
+        """The underlying ``CUmipmappedArray`` as an integer."""
+        return <intptr_t>self._handle
+
+    @property
+    def shape(self):
+        """Base-level (level 0) allocation shape, in elements."""
+        return self._shape
+
+    @property
+    def format(self):
+        """The element :class:`ArrayFormat`."""
+        return ArrayFormat(self._format)
+
+    @property
+    def num_channels(self):
+        """Channels per element (1, 2, or 4)."""
+        return self._num_channels
+
+    @property
+    def num_levels(self):
+        """Number of mip levels."""
+        return int(self._num_levels)
+
+    @property
+    def surface_load_store(self):
+        """True if this mipmap (and each of its levels) was created with
+        ``CUDA_ARRAY3D_SURFACE_LDST`` and can back a :class:`SurfaceObject`."""
+        return self._surface_load_store
+
+    @property
+    def device(self):
+        """The :class:`Device` this mipmap was allocated on."""
+        from cuda.core._device import Device
+        return Device(self._device_id)
+
+    cpdef close(self):
+        """Destroy the underlying ``CUmipmappedArray`` if owned.
+
+        After ``close()`` any level :class:`Array` returned by :meth:`get_level`
+        becomes invalid; callers must not access them.
+        """
+        cdef cydriver.CUmipmappedArray h = self._handle
+        cdef bint owning = self._owning
+        self._handle = NULL
+        if h != NULL and owning:
+            HANDLE_RETURN(cydriver.cuMipmappedArrayDestroy(h))
+
+    def __dealloc__(self):
+        # Cython destructors cannot raise; any cuMipmappedArrayDestroy error
+        # here is silently dropped. Callers needing visibility should use
+        # close().
+        if self._handle != NULL and self._owning:
+            cydriver.cuMipmappedArrayDestroy(self._handle)
+            self._handle = NULL
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc, tb):
+        self.close()
+
+    def __repr__(self):
+        return (
+            f"MipmappedArray(shape={self._shape}, "
+            f"format={ArrayFormat(self._format).name}, "
+            f"num_channels={self._num_channels}, "
+            f"num_levels={self._num_levels})"
+        )
diff --git a/cuda_core/cuda/core/_surface.pxd b/cuda_core/cuda/core/_surface.pxd
new file mode 100644
index 00000000000..ba7791d5172
--- /dev/null
+++ b/cuda_core/cuda/core/_surface.pxd
@@ -0,0 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from libc.stdint cimport intptr_t
+from cuda.bindings cimport cydriver
+
+
+cdef class SurfaceObject:
+
+    cdef:
+        cydriver.CUsurfObject _handle   # reset to 0 by close()
+        object _source_ref      # ResourceDescriptor the surface was built from; None after close()
+        int _device_id          # device id current when the surface was created
+        intptr_t _context       # context pointer current when the surface was created
+
+    cpdef close(self)
diff --git a/cuda_core/cuda/core/_surface.pyx b/cuda_core/cuda/core/_surface.pyx
new file mode 100644
index 00000000000..62cdecc9a01
--- /dev/null
+++ b/cuda_core/cuda/core/_surface.pyx
@@ -0,0 +1,133 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+from libc.stdint cimport intptr_t
+from libc.string cimport memset
+
+from cuda.bindings cimport cydriver
+from cuda.core._array cimport Array
+from cuda.core._texture import ResourceDescriptor
+from cuda.core._utils.cuda_utils cimport (
+    HANDLE_RETURN,
+    _get_current_context_ptr,
+    _get_current_device_id,
+)
+
+
+cdef class SurfaceObject:
+    """A bindless surface handle for kernel-side typed load/store.
+
+    Wraps ``cuSurfObjectCreate``. Unlike a :class:`TextureObject`, a surface
+    has no sampling state (no filtering, no addressing modes, no normalization);
+    kernels read and write through it using integer pixel coordinates.
+
+    The backing :class:`Array` must have been created with
+    ``surface_load_store=True`` and is kept alive for the lifetime of this
+    object to prevent dangling handles.
+
+    Construct via :meth:`from_array` or :meth:`from_descriptor`. Passes to
+    kernels as a 64-bit handle (via the ``handle`` property).
+    """
+
+    def __init__(self, *args, **kwargs):
+        raise RuntimeError(
+            "SurfaceObject cannot be instantiated directly. "
+            "Use SurfaceObject.from_array() or SurfaceObject.from_descriptor()."
+        )
+
+    @classmethod
+    def from_array(cls, array):
+        """Create a surface object directly from an :class:`Array`.
+
+        The array must have been created with ``surface_load_store=True``.
+        """
+        if not isinstance(array, Array):
+            raise TypeError(f"array must be an Array, got {type(array).__name__}")
+        return cls.from_descriptor(resource=ResourceDescriptor.from_array(array))
+
+    @classmethod
+    def from_descriptor(cls, *, resource):
+        """Create a surface object from a :class:`ResourceDescriptor`.
+
+        Parameters
+        ----------
+        resource : ResourceDescriptor
+            Must wrap an :class:`Array` allocated with
+            ``surface_load_store=True``. Linear/pitch2d resources are not
+            valid surface backings.
+        """
+        if not isinstance(resource, ResourceDescriptor):
+            raise TypeError(f"resource must be a ResourceDescriptor, got {type(resource).__name__}")
+        if resource.kind != "array":
+            raise ValueError(
+                f"SurfaceObject requires an array-backed ResourceDescriptor, "
+                f"got kind={resource.kind!r}"
+            )
+
+        cdef Array arr = <Array>resource.source
+        if not arr.surface_load_store:
+            raise ValueError(
+                "Array must be created with surface_load_store=True to be "
+                "bound as a SurfaceObject"
+            )
+
+        cdef cydriver.CUDA_RESOURCE_DESC res_desc
+        memset(&res_desc, 0, sizeof(res_desc))
+        res_desc.resType = cydriver.CU_RESOURCE_TYPE_ARRAY
+        res_desc.res.array.hArray = arr._handle
+
+        cdef SurfaceObject self = cls.__new__(cls)
+        # Hold the descriptor (whose ``source`` is the Array) while the surface lives.
+        self._source_ref = resource
+        self._context = _get_current_context_ptr()
+        self._device_id = _get_current_device_id()
+
+        with nogil:
+            HANDLE_RETURN(cydriver.cuSurfObjectCreate(&self._handle, &res_desc))
+        return self
+
+    @property
+    def handle(self):
+        """The underlying ``CUsurfObject`` as an integer (64-bit kernel arg)."""
+        return <intptr_t>self._handle
+
+    @property
+    def resource(self):
+        """The :class:`ResourceDescriptor` this surface was built from."""
+        return self._source_ref
+
+    @property
+    def device(self):
+        """The :class:`Device` that was current when this surface was created."""
+        from cuda.core._device import Device
+        return Device(self._device_id)
+
+    cpdef close(self):
+        """Destroy the ``CUsurfObject``, then release the backing resource."""
+        cdef cydriver.CUsurfObject h = self._handle
+        self._handle = 0
+        try:
+            if h != 0:
+                HANDLE_RETURN(cydriver.cuSurfObjectDestroy(h))
+        finally:
+            # Release only after the surface is gone, so the Array outlives it.
+            self._source_ref = None
+
+    def __dealloc__(self):
+        # Cython destructors cannot raise; any cuSurfObjectDestroy error is
+        # silently dropped. Callers needing visibility should use close().
+        if self._handle != 0:
+            cydriver.cuSurfObjectDestroy(self._handle)
+            self._handle = 0
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc, tb):
+        self.close()
+
+    def __repr__(self):
+        return f"SurfaceObject(handle=0x{<intptr_t>self._handle:x})"
diff --git a/cuda_core/cuda/core/_texture.pxd b/cuda_core/cuda/core/_texture.pxd
new file mode 100644
index 00000000000..4d2d5004069
--- /dev/null
+++ b/cuda_core/cuda/core/_texture.pxd
@@ -0,0 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from libc.stdint cimport intptr_t
+from cuda.bindings cimport cydriver
+
+
+cdef class TextureObject:
+    # C-level attribute layout for the TextureObject implemented in _texture.pyx.
+    cdef:
+        cydriver.CUtexObject _handle  # driver texture handle; reset to 0 by close()
+        object _source_ref      # ResourceDescriptor; keeps the backing memory referenced
+        object _texture_desc    # original TextureDescriptor for introspection
+        int _device_id          # result of _get_current_device_id() at creation
+        intptr_t _context       # result of _get_current_context_ptr() at creation
+
+    cpdef close(self)
diff --git a/cuda_core/cuda/core/_texture.pyx b/cuda_core/cuda/core/_texture.pyx
new file mode 100644
index 00000000000..6ccffcadbb1
--- /dev/null
+++ b/cuda_core/cuda/core/_texture.pyx
@@ -0,0 +1,572 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+from libc.stdint cimport intptr_t
+from libc.string cimport memset
+
+from cuda.bindings cimport cydriver
+from cuda.core._array cimport Array
+from cuda.core._array import ArrayFormat, _FORMAT_ELEM_SIZE
+from cuda.core._memory._buffer cimport Buffer
+from cuda.core._mipmapped_array cimport MipmappedArray
+from cuda.core._mipmapped_array import MipmappedArray as _PyMipmappedArray
+from cuda.core._utils.cuda_utils cimport (
+    HANDLE_RETURN,
+    _get_current_context_ptr,
+    _get_current_device_id,
+)
+
+import enum
+from dataclasses import dataclass
+
+
+# Driver texture-descriptor flag bits (CU_TRSF_*), OR-ed into CUDA_TEXTURE_DESC.flags.
+_TRSF_READ_AS_INTEGER = 0x01  # set when read_mode is ReadMode.ELEMENT_TYPE
+_TRSF_NORMALIZED_COORDINATES = 0x02  # TextureDescriptor.normalized_coords
+_TRSF_SRGB = 0x10  # TextureDescriptor.srgb
+_TRSF_DISABLE_TRILINEAR_OPTIMIZATION = 0x20  # TextureDescriptor.disable_trilinear_optimization
+_TRSF_SEAMLESS_CUBEMAP = 0x40  # TextureDescriptor.seamless_cubemap
+
+
+class AddressMode(enum.IntEnum):
+    """Boundary behavior for out-of-range texture coordinates (``CU_TR_ADDRESS_MODE_*``)."""
+    WRAP   = cydriver.CU_TR_ADDRESS_MODE_WRAP
+    CLAMP  = cydriver.CU_TR_ADDRESS_MODE_CLAMP
+    MIRROR = cydriver.CU_TR_ADDRESS_MODE_MIRROR
+    BORDER = cydriver.CU_TR_ADDRESS_MODE_BORDER
+
+
+class FilterMode(enum.IntEnum):
+    """Texel sampling mode (``CU_TR_FILTER_MODE_*``); also used for mipmap-level filtering."""
+    POINT  = cydriver.CU_TR_FILTER_MODE_POINT
+    LINEAR = cydriver.CU_TR_FILTER_MODE_LINEAR
+
+
+class ReadMode(enum.IntEnum):
+    """How sampled values are returned to the kernel.
+
+    - ``ELEMENT_TYPE``: return the raw element value (integer formats stay
+      integer, float stays float).
+    - ``NORMALIZED_FLOAT``: integer formats are promoted to a normalized
+      ``float`` in ``[0, 1]`` (unsigned) or ``[-1, 1]`` (signed).
+      Float formats are unaffected.
+    """
+    ELEMENT_TYPE     = 0  # TextureObject sets CU_TRSF_READ_AS_INTEGER for this mode
+    NORMALIZED_FLOAT = 1  # that flag is left clear for this mode
+
+
+class ResourceDescriptor:
+    """Describes the memory backing a :class:`TextureObject`.
+
+    Construct via the ``from_*`` classmethods (direct construction raises):
+
+    - :meth:`from_array` wraps a :class:`Array` (works for both
+      :class:`TextureObject` and :class:`SurfaceObject`).
+    - :meth:`from_mipmapped_array` wraps a :class:`MipmappedArray` (textures only).
+    - :meth:`from_linear` wraps a :class:`Buffer` as a typed 1D fetch; no
+      filtering, normalized coordinates, or addressing modes.
+    - :meth:`from_pitch2d` wraps a :class:`Buffer` as a row-pitched 2D image.
+      Supports filtering and 2D addressing, but only 2D access.
+
+    Only array-backed descriptors can back a :class:`SurfaceObject`, and the
+    :class:`Array` must be allocated with ``surface_load_store=True``.
+    """
+
+    __slots__ = (
+        "_kind", "_source",
+        "_format", "_num_channels",
+        "_size_bytes",
+        "_width", "_height", "_pitch_bytes",
+    )
+
+    def __init__(self, *args, **kwargs):
+        raise RuntimeError(
+            "ResourceDescriptor cannot be instantiated directly. "
+            "Use ResourceDescriptor.from_* factories."
+        )
+
+    @classmethod
+    def from_array(cls, array):
+        """Build a resource descriptor backed by a :class:`Array`."""
+        if not isinstance(array, Array):
+            raise TypeError(f"array must be an Array, got {type(array).__name__}")
+        self = cls.__new__(cls)
+        self._kind = "array"
+        self._source = array
+        self._format = None
+        self._num_channels = None
+        self._size_bytes = None
+        self._width = None
+        self._height = None
+        self._pitch_bytes = None
+        return self
+
+    @classmethod
+    def from_mipmapped_array(cls, mipmapped_array):
+        """Build a resource descriptor backed by a :class:`MipmappedArray`.
+
+        Suitable for binding to a :class:`TextureObject` for mipmapped
+        sampling. Not valid as a :class:`SurfaceObject` backing: surfaces
+        require a single :class:`Array` level (obtain via
+        :meth:`MipmappedArray.get_level`).
+        """
+        if not isinstance(mipmapped_array, _PyMipmappedArray):
+            raise TypeError(
+                f"mipmapped_array must be a MipmappedArray, got "
+                f"{type(mipmapped_array).__name__}"
+            )
+        self = cls.__new__(cls)
+        self._kind = "mipmapped_array"
+        self._source = mipmapped_array
+        self._format = None
+        self._num_channels = None
+        self._size_bytes = None
+        self._width = None
+        self._height = None
+        self._pitch_bytes = None
+        return self
+
+    @classmethod
+    def from_linear(cls, buffer, *, format, num_channels, size_bytes=None):
+        """Build a resource descriptor for a linear (typed 1D) texture fetch.
+
+        Parameters
+        ----------
+        buffer : Buffer
+            Device-memory backing. Must remain alive for the lifetime of any
+            :class:`TextureObject` built from this descriptor.
+        format : ArrayFormat
+            Element format.
+        num_channels : int
+            Channels per element. Must be 1, 2, or 4.
+        size_bytes : int, optional
+            Bytes to bind (default ``buffer.size``). Must not exceed it and must be
+            a positive multiple of the element size (format size x channels).
+
+        Notes
+        -----
+        Texture objects built from a linear resource ignore the
+        :class:`TextureDescriptor` addressing/filtering fields — kernels read
+        through a typed 1D fetch with bounds checking only.
+        """
+        if not isinstance(buffer, Buffer):
+            raise TypeError(f"buffer must be a Buffer, got {type(buffer).__name__}")
+        if not isinstance(format, ArrayFormat):
+            raise TypeError(f"format must be an ArrayFormat, got {type(format).__name__}")
+        if isinstance(num_channels, bool) or num_channels not in (1, 2, 4):
+            raise ValueError(f"num_channels must be 1, 2, or 4, got {num_channels!r}")
+
+        buf_size = int(buffer.size)
+        elem = _FORMAT_ELEM_SIZE[int(format)] * int(num_channels)
+        if size_bytes is None:
+            size = buf_size
+        else:
+            size = int(size_bytes)
+            if size > buf_size:
+                raise ValueError(
+                    f"size_bytes ({size}) exceeds buffer.size ({buf_size})"
+                )
+        if size < elem:
+            raise ValueError(
+                f"size_bytes ({size}) must be at least one element ({elem} bytes)"
+            )
+        if size % elem != 0:
+            raise ValueError(
+                f"size_bytes ({size}) must be a multiple of element size "
+                f"({elem} bytes for {format.name} x {num_channels})"
+            )
+
+        self = cls.__new__(cls)
+        self._kind = "linear"
+        self._source = buffer
+        self._format = int(format)
+        self._num_channels = int(num_channels)
+        self._size_bytes = size
+        self._width = None
+        self._height = None
+        self._pitch_bytes = None
+        return self
+
+    @classmethod
+    def from_pitch2d(
+        cls, buffer, *, format, num_channels, width, height, pitch_bytes
+    ):
+        """Build a resource descriptor for a row-pitched 2D image.
+
+        Parameters
+        ----------
+        buffer : Buffer
+            Device-memory backing. Must remain alive for the lifetime of any
+            :class:`TextureObject` built from this descriptor.
+        format : ArrayFormat
+            Element format.
+        num_channels : int
+            Channels per element. Must be 1, 2, or 4.
+        width : int
+            Image width, in elements.
+        height : int
+            Image height, in rows.
+        pitch_bytes : int
+            Distance between consecutive rows, in bytes. Must be at least
+            ``width * format_size * num_channels`` and meet the driver's
+            ``CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT``.
+        """
+        if not isinstance(buffer, Buffer):
+            raise TypeError(f"buffer must be a Buffer, got {type(buffer).__name__}")
+        if not isinstance(format, ArrayFormat):
+            raise TypeError(f"format must be an ArrayFormat, got {type(format).__name__}")
+        if isinstance(num_channels, bool) or num_channels not in (1, 2, 4):
+            raise ValueError(f"num_channels must be 1, 2, or 4, got {num_channels!r}")
+
+        w, h, p = int(width), int(height), int(pitch_bytes)
+        if w < 1:
+            raise ValueError(f"width must be >= 1, got {w}")
+        if h < 1:
+            raise ValueError(f"height must be >= 1, got {h}")
+        elem = _FORMAT_ELEM_SIZE[int(format)] * int(num_channels)
+        min_pitch = w * elem
+        if p < min_pitch:
+            raise ValueError(
+                f"pitch_bytes ({p}) must be >= width * element_size ({min_pitch})"
+            )
+        if p * h > int(buffer.size):
+            raise ValueError(
+                f"pitch_bytes * height ({p * h}) exceeds buffer.size ({int(buffer.size)})"
+            )
+
+        self = cls.__new__(cls)
+        self._kind = "pitch2d"
+        self._source = buffer
+        self._format = int(format)
+        self._num_channels = int(num_channels)
+        self._size_bytes = None
+        self._width = w
+        self._height = h
+        self._pitch_bytes = p
+        return self
+
+    @property
+    def kind(self):
+        """Backing kind: ``"array"``, ``"mipmapped_array"``, ``"linear"``, or ``"pitch2d"``."""
+        return self._kind
+
+    @property
+    def source(self):
+        """The wrapped :class:`Array`, :class:`MipmappedArray`, or :class:`Buffer`."""
+        return self._source
+
+    @property
+    def format(self):
+        """The element :class:`ArrayFormat` (``None`` for array and mipmapped-array kinds)."""
+        return None if self._format is None else ArrayFormat(self._format)
+
+    @property
+    def num_channels(self):
+        """Channels per element (``None`` for array and mipmapped-array kinds)."""
+        return self._num_channels
+
+    @property
+    def size_bytes(self):
+        """Bytes bound for a linear resource (``None`` for other kinds)."""
+        return self._size_bytes
+
+    @property
+    def width(self):
+        """Pitch2D image width, in elements (``None`` for other kinds)."""
+        return self._width
+
+    @property
+    def height(self):
+        """Pitch2D image height, in rows (``None`` for other kinds)."""
+        return self._height
+
+    @property
+    def pitch_bytes(self):
+        """Pitch2D row pitch, in bytes (``None`` for other kinds)."""
+        return self._pitch_bytes
+
+    def __repr__(self):
+        if self._kind == "linear":
+            return (
+                f"ResourceDescriptor(kind='linear', format={self.format.name}, "
+                f"num_channels={self._num_channels}, size_bytes={self._size_bytes})"
+            )
+        if self._kind == "pitch2d":
+            return (
+                f"ResourceDescriptor(kind='pitch2d', format={self.format.name}, "
+                f"num_channels={self._num_channels}, "
+                f"width={self._width}, height={self._height}, "
+                f"pitch_bytes={self._pitch_bytes})"
+            )
+        return f"ResourceDescriptor(kind={self._kind!r})"
+
+
+@dataclass
+class TextureDescriptor:
+    """Sampling state for a :class:`TextureObject` (mirrors ``CUDA_TEXTURE_DESC``).
+
+    Attributes
+    ----------
+    address_mode : AddressMode or tuple of AddressMode
+        Boundary behavior per axis. A single :class:`AddressMode` applies to all
+        axes; a tuple of 1-3 entries is padded by repeating its last entry.
+    filter_mode : FilterMode
+        Texel sampling mode. Default ``POINT``.
+    read_mode : ReadMode
+        How sampled integer values are returned. Default ``ELEMENT_TYPE``.
+    normalized_coords : bool
+        If True, coordinates are in ``[0, 1]`` instead of pixel indices.
+    srgb : bool
+        If True, perform sRGB → linear conversion on read (8-bit formats only).
+    disable_trilinear_optimization : bool
+        If True, request exact trilinear filtering.
+    seamless_cubemap : bool
+        If True, enable seamless cubemap edge filtering.
+    max_anisotropy : int
+        Maximum anisotropy; 0 disables anisotropic filtering. Must be >= 0.
+    mipmap_filter_mode : FilterMode
+        Filtering between mipmap levels. Default ``POINT``.
+    mipmap_level_bias : float
+        Copied to ``CUDA_TEXTURE_DESC.mipmapLevelBias``. Default ``0.0``.
+    min_mipmap_level_clamp, max_mipmap_level_clamp : float
+        Copied to the driver's min/max mipmap level clamps. Default ``0.0``.
+    border_color : tuple of float or None
+        Exactly four floats, used when ``address_mode`` includes ``BORDER``;
+        ``None`` means all zero.
+    """
+
+    address_mode: object = AddressMode.CLAMP
+    filter_mode: FilterMode = FilterMode.POINT
+    read_mode: ReadMode = ReadMode.ELEMENT_TYPE
+    normalized_coords: bool = False
+    srgb: bool = False
+    disable_trilinear_optimization: bool = False
+    seamless_cubemap: bool = False
+    max_anisotropy: int = 0
+    mipmap_filter_mode: FilterMode = FilterMode.POINT
+    mipmap_level_bias: float = 0.0
+    min_mipmap_level_clamp: float = 0.0
+    max_mipmap_level_clamp: float = 0.0
+    border_color: tuple | None = None
+
+
+def _normalize_address_modes(address_mode):
+    """Expand ``address_mode`` into exactly three per-axis AddressMode values.
+
+    A lone AddressMode applies to every axis; a 1-3 item iterable is padded by
+    repeating its final item. Anything else raises TypeError or ValueError.
+    """
+    if isinstance(address_mode, AddressMode):
+        return (address_mode,) * 3
+    try:
+        entries = tuple(address_mode)
+    except TypeError as err:
+        message = "address_mode must be an AddressMode or a tuple of AddressMode"
+        raise TypeError(message) from err
+    count = len(entries)
+    if count < 1 or count > 3:
+        raise ValueError(f"address_mode tuple must have 1-3 entries, got {count}")
+    for axis, entry in enumerate(entries):
+        if isinstance(entry, AddressMode):
+            continue
+        kind = type(entry).__name__
+        raise TypeError(f"address_mode[{axis}] must be an AddressMode, got {kind}")
+    return entries + (entries[-1],) * (3 - count)
+
+
+cdef class TextureObject:
+    """A bindless texture handle for kernel-side sampled reads.
+
+    Wraps ``cuTexObjectCreate``. The :class:`ResourceDescriptor` (and through
+    it the backing :class:`Array`, :class:`MipmappedArray`, or :class:`Buffer`)
+    stays referenced until :meth:`close` to avoid dangling handles.
+
+    Construct via :meth:`from_descriptor`; usable as a context manager. Passes
+    to kernels as a 64-bit handle (via the ``handle`` property).
+    """
+
+    def __init__(self, *args, **kwargs):  # from_descriptor() goes through cls.__new__ instead
+        raise RuntimeError(
+            "TextureObject cannot be instantiated directly. "
+            "Use TextureObject.from_descriptor()."
+        )
+
+    @classmethod
+    def from_descriptor(cls, *, resource, texture_descriptor):
+        """Create a texture object from a resource + sampling descriptor.
+
+        Parameters
+        ----------
+        resource : ResourceDescriptor
+        texture_descriptor : TextureDescriptor
+        """
+        if not isinstance(resource, ResourceDescriptor):
+            raise TypeError(
+                f"resource must be a ResourceDescriptor, got "
+                f"{type(resource).__name__}"
+            )
+        if not isinstance(texture_descriptor, TextureDescriptor):
+            raise TypeError(
+                f"texture_descriptor must be a TextureDescriptor, got "
+                f"{type(texture_descriptor).__name__}"
+            )
+
+        cdef cydriver.CUDA_RESOURCE_DESC res_desc
+        cdef cydriver.CUDA_TEXTURE_DESC tex_desc
+        memset(&res_desc, 0, sizeof(res_desc))  # start from all-zero driver structs
+        memset(&tex_desc, 0, sizeof(tex_desc))
+
+        # --- Resource descriptor: fill the res_desc.res.* member matching resource.kind ---
+        cdef Array arr
+        cdef MipmappedArray mip
+        cdef Buffer buf
+        cdef intptr_t devptr
+        if resource.kind == "array":
+            arr = <Array>resource.source  # type already checked by ResourceDescriptor.from_array
+            res_desc.resType = cydriver.CU_RESOURCE_TYPE_ARRAY
+            res_desc.res.array.hArray = arr._handle
+        elif resource.kind == "mipmapped_array":
+            mip = <MipmappedArray>resource.source  # checked by from_mipmapped_array
+            res_desc.resType = cydriver.CU_RESOURCE_TYPE_MIPMAPPED_ARRAY
+            res_desc.res.mipmap.hMipmappedArray = mip._handle
+        elif resource.kind == "linear":
+            buf = <Buffer>resource.source
+            devptr = int(buf.handle)  # integer form of the pointer, cast to CUdeviceptr below
+            res_desc.resType = cydriver.CU_RESOURCE_TYPE_LINEAR
+            res_desc.res.linear.devPtr = <cydriver.CUdeviceptr>devptr
+            res_desc.res.linear.format = <cydriver.CUarray_format><int>resource._format
+            res_desc.res.linear.numChannels = <unsigned int>resource._num_channels
+            res_desc.res.linear.sizeInBytes = <size_t>resource._size_bytes
+        elif resource.kind == "pitch2d":
+            buf = <Buffer>resource.source
+            devptr = int(buf.handle)  # integer form of the pointer, cast to CUdeviceptr below
+            res_desc.resType = cydriver.CU_RESOURCE_TYPE_PITCH2D
+            res_desc.res.pitch2D.devPtr = <cydriver.CUdeviceptr>devptr
+            res_desc.res.pitch2D.format = <cydriver.CUarray_format><int>resource._format
+            res_desc.res.pitch2D.numChannels = <unsigned int>resource._num_channels
+            res_desc.res.pitch2D.width = <size_t>resource._width
+            res_desc.res.pitch2D.height = <size_t>resource._height
+            res_desc.res.pitch2D.pitchInBytes = <size_t>resource._pitch_bytes
+        else:
+            raise NotImplementedError(
+                f"ResourceDescriptor kind {resource.kind!r} is not yet supported"
+            )
+
+        # --- Texture descriptor: validate each field, then copy it into tex_desc ---
+        modes = _normalize_address_modes(texture_descriptor.address_mode)  # always 3 entries
+        tex_desc.addressMode[0] = <cydriver.CUaddress_mode><int>modes[0]
+        tex_desc.addressMode[1] = <cydriver.CUaddress_mode><int>modes[1]
+        tex_desc.addressMode[2] = <cydriver.CUaddress_mode><int>modes[2]
+
+        if not isinstance(texture_descriptor.filter_mode, FilterMode):
+            raise TypeError(
+                f"filter_mode must be a FilterMode, got "
+                f"{type(texture_descriptor.filter_mode).__name__}"
+            )
+        tex_desc.filterMode = <cydriver.CUfilter_mode><int>texture_descriptor.filter_mode
+
+        if not isinstance(texture_descriptor.read_mode, ReadMode):
+            raise TypeError(
+                f"read_mode must be a ReadMode, got "
+                f"{type(texture_descriptor.read_mode).__name__}"
+            )
+
+        cdef unsigned int flags = 0
+        # CU_TRSF_READ_AS_INTEGER suppresses normalization, so it maps to
+        # ReadMode.ELEMENT_TYPE.
+        if texture_descriptor.read_mode == ReadMode.ELEMENT_TYPE:
+            flags |= _TRSF_READ_AS_INTEGER
+        if texture_descriptor.normalized_coords:
+            flags |= _TRSF_NORMALIZED_COORDINATES
+        if texture_descriptor.srgb:
+            flags |= _TRSF_SRGB
+        if texture_descriptor.disable_trilinear_optimization:
+            flags |= _TRSF_DISABLE_TRILINEAR_OPTIMIZATION
+        if texture_descriptor.seamless_cubemap:
+            flags |= _TRSF_SEAMLESS_CUBEMAP
+        tex_desc.flags = flags
+
+        if texture_descriptor.max_anisotropy < 0:  # reject before the unsigned cast below
+            raise ValueError("max_anisotropy must be >= 0")
+        tex_desc.maxAnisotropy = <unsigned int>texture_descriptor.max_anisotropy
+
+        if not isinstance(texture_descriptor.mipmap_filter_mode, FilterMode):
+            raise TypeError(
+                f"mipmap_filter_mode must be a FilterMode, got "
+                f"{type(texture_descriptor.mipmap_filter_mode).__name__}"
+            )
+        tex_desc.mipmapFilterMode = <cydriver.CUfilter_mode><int>texture_descriptor.mipmap_filter_mode
+        tex_desc.mipmapLevelBias = <float>texture_descriptor.mipmap_level_bias
+        tex_desc.minMipmapLevelClamp = <float>texture_descriptor.min_mipmap_level_clamp
+        tex_desc.maxMipmapLevelClamp = <float>texture_descriptor.max_mipmap_level_clamp
+
+        cdef int i
+        if texture_descriptor.border_color is None:
+            for i in range(4):  # explicit zero border (the struct was already zeroed by memset)
+                tex_desc.borderColor[i] = 0.0
+        else:
+            bc = tuple(texture_descriptor.border_color)  # accepts any iterable of four values
+            if len(bc) != 4:
+                raise ValueError(
+                    f"border_color must have 4 elements, got {len(bc)}"
+                )
+            for i in range(4):
+                tex_desc.borderColor[i] = <float>bc[i]
+
+        cdef TextureObject self = cls.__new__(cls)  # bypasses __init__, which always raises
+        self._source_ref = resource  # keeps the descriptor (and its source) referenced
+        self._texture_desc = texture_descriptor
+        self._context = _get_current_context_ptr()  # raises RuntimeError if no context is current
+        self._device_id = _get_current_device_id()
+
+        with nogil:  # the driver call receives &self._handle for the new texture handle
+            HANDLE_RETURN(
+                cydriver.cuTexObjectCreate(&self._handle, &res_desc, &tex_desc, NULL)
+            )
+        return self
+
+    @property
+    def handle(self):
+        """The ``CUtexObject`` as an integer (64-bit kernel arg); ``0`` once closed."""
+        return <intptr_t>self._handle
+
+    @property
+    def resource(self):
+        """The :class:`ResourceDescriptor` this texture was built from (``None`` once closed)."""
+        return self._source_ref
+
+    @property
+    def texture_descriptor(self):
+        """The :class:`TextureDescriptor` this texture was built from."""
+        return self._texture_desc
+
+    @property
+    def device(self):  # Device wrapper for the ordinal captured when the texture was created
+        from cuda.core._device import Device  # imported at call time, not at module import
+        return Device(self._device_id)
+
+    cpdef close(self):
+        """Destroy the underlying ``CUtexObject``; repeated calls are no-ops."""
+        cdef cydriver.CUtexObject h = self._handle
+        self._handle = 0  # cleared first so __dealloc__ never destroys it a second time
+        self._source_ref = None  # drop the reference held on the resource descriptor
+        if h != 0:
+            HANDLE_RETURN(cydriver.cuTexObjectDestroy(h))  # driver errors are raised here
+
+    def __dealloc__(self):
+        # Cython destructors cannot raise; any cuTexObjectDestroy error is
+        # silently dropped. Callers needing visibility should use close().
+        if self._handle != 0:  # nonzero only when close() has not already run
+            cydriver.cuTexObjectDestroy(self._handle)
+            self._handle = 0
+
+    def __enter__(self):  # context-manager entry: hands back the texture itself
+        return self
+
+    def __exit__(self, exc_type, exc, tb):  # always closes; returns None, so exceptions propagate
+        self.close()
+
+    def __repr__(self):  # shows the raw handle in hex (0x0 once closed)
+        return f"TextureObject(handle=0x{<intptr_t>self._handle:x})"
diff --git a/cuda_core/cuda/core/_utils/cuda_utils.pxd b/cuda_core/cuda/core/_utils/cuda_utils.pxd
index 4562cd71355..a8115aaf3f9 100644
--- a/cuda_core/cuda/core/_utils/cuda_utils.pxd
+++ b/cuda_core/cuda/core/_utils/cuda_utils.pxd
@@ -4,7 +4,7 @@
 
 cimport cpython
 from cpython.object cimport PyObject
-from libc.stdint cimport int64_t, int32_t, uint8_t, uint16_t, uint32_t
+from libc.stdint cimport int64_t, int32_t, intptr_t, uint8_t, uint16_t, uint32_t
 
 from cuda.bindings cimport cydriver, cynvrtc, cynvvm, cynvjitlink
 
@@ -25,6 +25,12 @@ cdef int HANDLE_RETURN_NVJITLINK(
     cynvjitlink.nvJitLinkHandle handle, cynvjitlink.nvJitLinkResult err) except?-1 nogil
 
 
+# Helpers for retrieving the calling thread's current CUDA context and device.
+# Failures surface as Python exceptions; 0 and -1 are the Cython error sentinels.
+cdef intptr_t _get_current_context_ptr() except? 0
+cdef int _get_current_device_id() except? -1
+
+
 # TODO: stop exposing these within the codebase?
 cpdef int _check_driver_error(cydriver.CUresult error) except?-1 nogil
 cpdef int _check_runtime_error(error) except?-1
diff --git a/cuda_core/cuda/core/_utils/cuda_utils.pyx b/cuda_core/cuda/core/_utils/cuda_utils.pyx
index 1bcfa524884..9ffaf3531ff 100644
--- a/cuda_core/cuda/core/_utils/cuda_utils.pyx
+++ b/cuda_core/cuda/core/_utils/cuda_utils.pyx
@@ -66,6 +66,27 @@ cdef int HANDLE_RETURN(cydriver.CUresult err) except?-1 nogil:
     return 0
 
 
+cdef intptr_t _get_current_context_ptr() except? 0:
+    """Return the current thread's bound CUcontext as an intptr_t.
+
+    Raises ``RuntimeError`` if no context is current.
+    """
+    cdef cydriver.CUcontext ctx
+    with nogil:  # the driver query runs without the GIL
+        HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx))
+    if ctx == NULL:  # the query itself succeeded but reported no context
+        raise RuntimeError("an active CUDA context is required")
+    return <intptr_t>ctx
+
+
+cdef int _get_current_device_id() except? -1:
+    """Return the current thread's bound CUdevice ordinal."""
+    cdef cydriver.CUdevice dev
+    with nogil:  # the driver query runs without the GIL
+        HANDLE_RETURN(cydriver.cuCtxGetDevice(&dev))  # a failing result code is raised here
+    return <int>dev  # the texture/surface wrappers later hand this to Device(...)
+
+
 cdef int HANDLE_RETURN_NVRTC(cynvrtc.nvrtcProgram prog, cynvrtc.nvrtcResult err) except?-1 nogil:
     """Handle NVRTC result codes, raising NVRTCError with program log on failure."""
     if err == cynvrtc.nvrtcResult.NVRTC_SUCCESS:
diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst
index 0a88a5bd4b6..7c1d33e3393 100644
--- a/cuda_core/docs/source/api.rst
+++ b/cuda_core/docs/source/api.rst
@@ -159,6 +159,40 @@ Tensor Memory Accelerator (TMA)
    TensorMapDescriptorOptions
 
 
+Textures and surfaces
+---------------------
+
+CUDA arrays back bindless texture and surface objects for kernel-side sampled
+reads and typed load/store. :class:`Array` is allocated through
+:meth:`Array.from_descriptor` and bound through a :class:`ResourceDescriptor`
+factory; linear (1D) and row-pitched 2D :class:`Buffer` views as well as
+mipmapped allocations (:class:`MipmappedArray`) are also supported as texture
+backings.
+
+.. autosummary::
+   :toctree: generated/
+
+   :template: autosummary/cyclass.rst
+
+   Array
+   MipmappedArray
+   ResourceDescriptor
+   TextureObject
+   SurfaceObject
+
+   :template: dataclass.rst
+
+   TextureDescriptor
+
+.. autosummary::
+   :toctree: generated/
+
+   ArrayFormat
+   AddressMode
+   FilterMode
+   ReadMode
+
+
 CUDA compilation toolchain
 --------------------------
 
diff --git a/cuda_core/examples/gl_interop_fire.py b/cuda_core/examples/gl_interop_fire.py
new file mode 100644
index 00000000000..c8f2c9165b6
--- /dev/null
+++ b/cuda_core/examples/gl_interop_fire.py
@@ -0,0 +1,774 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This example demonstrates cuda.core.Array, TextureObject, and SurfaceObject
+# in combination with GraphicsResource for CUDA/OpenGL interop: a classic
+# "Doom-style" procedural fire effect. A scalar heat field lives on a
+# ping-ponged UINT8 CUDA Array pair; each frame the field is advected upward with a
+# horizontal jitter and a small decay, then colorized through a 1D fire-palette
+# TextureObject straight into an OpenGL PBO. Requires pyglet.
+#
+# ################################################################################
+
+# What this example teaches
+# =========================
+# - How to combine a 2D UINT8 Array (the heat field) and a 1D RGBA8 Array (the
+#   color palette) under the same texture/surface API.
+# - How to ping-pong a scalar field via Array + SurfaceObject writes and
+#   TextureObject reads, similar to the reaction-diffusion example but with a
+#   single channel.
+# - How to use TextureObject(NORMALIZED_FLOAT) on a UINT8 palette so a
+#   tex1D<float4> lookup returns RGBA in [0, 1] -- no manual unpacking needed.
+# - How to wire mouse / keyboard events into a CUDA simulation without
+#   blocking the event loop.
+#
+# How it works
+# ============
+# The heat field is WIDTH x HEIGHT integer intensities in [0, 36]. Each frame we:
+#
+#   1. step kernel: for every pixel,
+#        - if y is the bottom row, write full heat when ambient injection is
+#          on ("the embers") and zero when it is off;
+#        - if the mouse button is held, paint a hot disc near the cursor;
+#        - otherwise read a horizontally-jittered sample from the row "below"
+#          (i.e. one texel toward the bottom of the screen) and subtract a
+#          small decay. This is what creates the upward-flickering motion.
+#   2. colorize kernel: per pixel, sample the heat, look it up in a 1D RGBA8
+#      fire palette via tex1D<float4>, and write RGBA bytes into the PBO.
+#
+#   PING-PONG (two single-channel UINT8 Arrays)
+#   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#   +-------------+   tex2D (UINT8)   +-------------+
+#   |   heat_a    | ----------------> |             |
+#   |  (UINT8 x1) |                   |  step_fire  |
+#   +-------------+                   |   kernel    |
+#                                     |             |
+#   +-------------+   surf2Dwrite     |             |
+#   |   heat_b    | <---------------- |             |
+#   |  (UINT8 x1) |                   +-------------+
+#   +-------------+
+#       (swap)
+#
+# Orientation
+# -----------
+# OpenGL displays texel row 0 at the bottom of the window. The fullscreen quad
+# in create_display_resources() flips t so that kernel y=0 lands at the TOP of
+# the screen -- this lets the kernel keep the intuitive "inject at y = h-1,
+# advect from y+1 -> y" convention while the visible flames rise upward.
+# Mouse coordinates from pyglet (y=0 at window bottom) are flipped to the
+# kernel's y-down convention on entry.
+#
+# surf2Dwrite x-in-bytes
+# ----------------------
+# `surf2Dwrite` takes the x coordinate in BYTES, not in elements. The heat
+# surface here is single-channel UINT8 (1 byte per texel), so the byte offset
+# equals x; a float surface would need `x * sizeof(float)` = `x * 4` instead.
+#
+# What you should see
+# ===================
+# A flickering wall of doom-style fire rising from the bottom of the window.
+# Hold the mouse button and drag to paint a torch of heat at the cursor.
+# Press SPACE to toggle the ambient embers along the bottom row (the fire
+# will die out when ambient is OFF). Press R to clear the heat field.
+# Press Escape or close the window to exit. The window title shows FPS and
+# whether ambient injection is currently on.
+#
+
+# /// script
+# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"]
+# ///
+
+import ctypes
+import sys
+import time
+
+import numpy as np
+
+from cuda.core import (
+    AddressMode,
+    Array,
+    ArrayFormat,
+    Device,
+    FilterMode,
+    GraphicsResource,
+    LaunchConfig,
+    Program,
+    ProgramOptions,
+    ReadMode,
+    ResourceDescriptor,
+    SurfaceObject,
+    TextureDescriptor,
+    TextureObject,
+    launch,
+)
+
+# ---------------------------------------------------------------------------
+# Simulation parameters (feel free to change these)
+# ---------------------------------------------------------------------------
+# Window dimensions (what the user sees).
+WINDOW_WIDTH = 640
+WINDOW_HEIGHT = 480
+
+# Simulation dimensions (the heat-field grid). Doom's actual screen was
+# 320x200; we use 320x100 so the canonical decay rate of ~1 intensity unit
+# per row (random {0, 1, 2}, average 1) produces flames that reach ~36% of
+# the screen height -- the recognizable "tall licking flames" look.
+# NEAREST-filtered upscale to the 640x480 window stretches vertically 4.8x,
+# giving the chunky retro pixel-doubled appearance.
+WIDTH = 320
+HEIGHT = 100
+
+# Canonical Doom fire palette: 37 hand-tuned colors (intensity 0..36 -> RGB).
+# Source: https://github.com/tiagomenegaz/doom-fire (and Fabien Sanglard's
+# analysis of the original PSX Doom fire effect).
+PALETTE_SIZE = 37
+MAX_INTENSITY = 36
+TORCH_RADIUS = 12  # torch disc radius in sim pixels; step_fire hard-codes the same 12
+
+
+# ============================= Helper functions =============================
+#
+# The functions below set up CUDA and OpenGL. If you're here to learn about
+# Array/TextureObject/SurfaceObject, skip ahead to main() -- the interesting
+# part is there. These helpers exist so that main() reads like a short story
+# instead of a wall of boilerplate.
+# ============================================================================
+
+
+def setup_cuda():
+    """Select device 0, compile both kernels, and return (device, stream, kernels, configs)."""
+    dev = Device(0)
+    dev.set_current()
+
+    # SurfaceObject requires surface load/store, which has existed since SM 2.0,
+    # but bindless surface objects (cuSurfObjectCreate) require SM 3.0+.
+    cc = dev.compute_capability
+    if cc.major < 3:
+        print(
+            "This example requires a GPU with compute capability >= 3.0 for "
+            f"bindless surface objects. Found sm_{cc.major}{cc.minor}.",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    stream = dev.create_stream()
+
+    # Compile as C++ so the templated tex1D<float4> / tex2D<unsigned char>
+    # overloads used by the kernels resolve.
+    program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
+    prog = Program(KERNEL_SOURCE, code_type="c++", options=program_options)
+    mod = prog.compile(
+        "cubin",
+        name_expressions=("step_fire", "colorize_fire"),
+    )
+
+    kernels = {
+        "step": mod.get_kernel("step_fire"),
+        "colorize": mod.get_kernel("colorize_fire"),
+    }
+
+    block = (16, 16, 1)
+    grid = (
+        (WIDTH + block[0] - 1) // block[0],  # ceil-div so partial tiles are covered
+        (HEIGHT + block[1] - 1) // block[1],
+        1,
+    )
+    config = LaunchConfig(grid=grid, block=block)
+    # Both kernels are pixel-parallel over a WIDTH x HEIGHT grid.
+    configs = {"step": config, "colorize": config}
+
+    return dev, stream, kernels, configs
+
+
+def create_window():
+    """Open a pyglet window and return (window, gl_module, pyglet); exit(1) if pyglet is missing."""
+    try:
+        import pyglet
+        from pyglet.gl import gl as _gl
+    except ImportError:
+        print(
+            "This example requires pyglet >= 2.0.\nInstall it with:  pip install pyglet",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    window = pyglet.window.Window(
+        WINDOW_WIDTH,
+        WINDOW_HEIGHT,
+        caption="cuda.core Array/Texture/Surface - Doom Fire",
+        vsync=False,
+    )
+    return window, _gl, pyglet
+
+
+def create_display_resources(gl, width, height):
+    """Create the GL objects needed to show a texture on screen.
+
+    Standard OpenGL boilerplate for a textured fullscreen quad backed by a
+    `width` x `height` RGBA8 texture. The texcoord `t` is flipped versus the
+    plasma example so that kernel y=0 lands at the TOP of the screen, letting
+    the fire kernel keep the intuitive "inject at the largest y, advect
+    upward" convention while the visible flames rise toward the top.
+
+    Returns (shader_program, vertex_array_id, texture_id).
+    """
+    from pyglet.graphics.shader import Shader, ShaderProgram
+
+    vert = Shader(VERTEX_SHADER_SOURCE, "vertex")
+    frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment")
+    shader_prog = ShaderProgram(vert, frag)
+
+    # Fullscreen quad (two triangles covering the entire window). Note the
+    # flipped t coordinates compared to gl_interop_plasma: (-1, -1) gets t=1
+    # so screen-bottom samples the kernel's largest-y row.
+    quad_verts = np.array(
+        [
+            # x,  y,    s, t      (position + texture coordinate)
+            -1, -1, 0, 1,
+             1, -1, 1, 1,
+             1,  1, 1, 0,
+            -1, -1, 0, 1,
+             1,  1, 1, 0,
+            -1,  1, 0, 0,
+        ],
+        dtype=np.float32,
+    )
+
+    vao = ctypes.c_uint(0)
+    gl.glGenVertexArrays(1, ctypes.byref(vao))
+    gl.glBindVertexArray(vao.value)
+
+    vbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(vbo))
+    gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value)
+    gl.glBufferData(
+        gl.GL_ARRAY_BUFFER,
+        quad_verts.nbytes,
+        quad_verts.ctypes.data_as(ctypes.c_void_p),
+        gl.GL_STATIC_DRAW,
+    )
+
+    stride = 4 * 4  # 4 floats * 4 bytes each = 16 bytes per vertex
+    pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position")
+    gl.glEnableVertexAttribArray(pos_loc)
+    gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0))
+
+    tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord")
+    gl.glEnableVertexAttribArray(tc_loc)
+    gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8))
+
+    gl.glBindVertexArray(0)
+
+    # Empty texture (filled each frame from the PBO).
+    tex = ctypes.c_uint(0)
+    gl.glGenTextures(1, ctypes.byref(tex))
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value)
+    # NEAREST upscale: makes the low-res simulation render with crisp,
+    # blocky pixels instead of bilinear-blended mush. Critical to the
+    # Doom-fire look.
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_NEAREST)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_NEAREST)
+    gl.glTexImage2D(
+        gl.GL_TEXTURE_2D,
+        0,
+        gl.GL_RGBA8,
+        width,
+        height,
+        0,
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,
+    )
+
+    return shader_prog, vao.value, tex.value
+
+
+def create_pixel_buffer(gl, width, height):
+    """Create a Pixel Buffer Object (PBO) -- the bridge between CUDA and OpenGL.
+
+    Allocates width * height * 4 bytes (RGBA8); returns (pbo_gl_name, size_in_bytes).
+    """
+    pbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(pbo))
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo.value)
+    nbytes = width * height * 4  # RGBA, 1 byte per channel
+    gl.glBufferData(gl.GL_PIXEL_UNPACK_BUFFER, nbytes, None, gl.GL_DYNAMIC_DRAW)
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)
+    return pbo.value, nbytes
+
+
+def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height):
+    """Upload width x height RGBA8 pixels from the PBO into the GL texture (GPU-to-GPU)."""
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo_id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glTexSubImage2D(
+        gl.GL_TEXTURE_2D,
+        0,
+        0,
+        0,
+        width,
+        height,
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,  # None = read from the currently bound PBO, not from CPU
+    )
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)  # unbind the PBO
+
+
+def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id):
+    """Draw texture `tex_id` to the screen with the fullscreen-quad VAO and shader."""
+    gl.glUseProgram(shader_prog.id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glBindVertexArray(vao_id)
+    gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6)  # 6 vertices = the quad's two triangles
+    gl.glBindVertexArray(0)
+    gl.glUseProgram(0)
+
+
+def make_heat_arrays():
+    """Allocate two single-channel UINT8 ping-pong Arrays for the heat field.
+
+    Intensity is an integer in [0, 36] indexing the canonical Doom palette.
+    UINT8 is exactly one byte per texel -- surf2Dwrite x-coord = x * 1.
+    """
+    arr_a = Array.from_descriptor(
+        shape=(WIDTH, HEIGHT),
+        format=ArrayFormat.UINT8,
+        num_channels=1,
+        surface_load_store=True,  # main() wraps both Arrays in SurfaceObjects
+    )
+    arr_b = Array.from_descriptor(
+        shape=(WIDTH, HEIGHT),
+        format=ArrayFormat.UINT8,
+        num_channels=1,
+        surface_load_store=True,
+    )
+    return arr_a, arr_b
+
+
+def make_heat_texture(arr):
+    """Bind `arr` as a TextureObject configured for POINT + CLAMP reads.
+
+    POINT filtering is what gives Doom fire its chunky retro look. LINEAR
+    smooths the per-frame horizontal jitter into a uniform glow that
+    doesn't read as fire.
+    """
+    res_desc = ResourceDescriptor.from_array(arr)
+    tex_desc = TextureDescriptor(
+        address_mode=AddressMode.CLAMP,
+        filter_mode=FilterMode.POINT,
+        read_mode=ReadMode.ELEMENT_TYPE,  # kernels read raw bytes via tex2D<unsigned char>
+        # Non-normalized: the step kernel addresses texels in pixel space.
+        normalized_coords=False,
+    )
+    return TextureObject.from_descriptor(resource=res_desc, texture_descriptor=tex_desc)
+
+
+def build_fire_palette():
+    """Return the Doom fire palette as a (PALETTE_SIZE, 4) uint8 RGBA array.
+
+    The 37 entries map intensity 0 (black) -> 36 (white); each row is indexed
+    by the integer intensity in the heat field, and alpha is always 255.
+
+    Source: Fabien Sanglard's PSX Doom analysis, reproduced in
+    https://github.com/tiagomenegaz/doom-fire.
+    """
+    rgb = [
+        (  7,   7,   7), ( 31,   7,   7), ( 47,  15,   7), ( 71,  15,   7),
+        ( 87,  23,   7), (103,  31,   7), (119,  31,   7), (143,  39,   7),
+        (159,  47,   7), (175,  63,   7), (191,  71,   7), (199,  71,   7),
+        (223,  79,   7), (223,  87,   7), (223,  87,   7), (215,  95,   7),
+        (215,  95,   7), (215, 103,  15), (207, 111,  15), (207, 119,  15),
+        (207, 127,  15), (207, 135,  23), (199, 135,  23), (199, 143,  23),
+        (199, 151,  31), (191, 159,  31), (191, 159,  31), (191, 167,  39),
+        (191, 167,  39), (191, 175,  47), (183, 175,  47), (183, 183,  47),
+        (183, 183,  55), (207, 207, 111), (223, 223, 159), (239, 239, 199),
+        (255, 255, 255),
+    ]
+    # Index 0 (the "no fire" color) is rendered as pure black so dead pixels
+    # don't glow. The canonical (7, 7, 7) reads as a dim background which is
+    # less dramatic against the dark window.
+    rgb[0] = (0, 0, 0)
+    assert len(rgb) == PALETTE_SIZE
+    rgba = np.empty((PALETTE_SIZE, 4), dtype=np.uint8)
+    rgba[:, :3] = np.array(rgb, dtype=np.uint8)
+    rgba[:, 3] = 255
+    return rgba
+
+
+def make_palette_array_and_texture(stream):
+    """Allocate the 1D RGBA8 palette Array, upload, and bind as a texture.
+
+    Returns (palette_array, palette_texture). Both must be closed by the
+    caller (or used inside `with` blocks).
+    """
+    palette = build_fire_palette()  # shape (PALETTE_SIZE, 4), uint8
+    arr = Array.from_descriptor(
+        shape=(PALETTE_SIZE,),
+        format=ArrayFormat.UINT8,
+        num_channels=4,
+    )
+    # 1D Array bytes match a flat (PALETTE_SIZE * 4) uint8 buffer.
+    arr.copy_from(np.ascontiguousarray(palette), stream=stream)
+
+    res_desc = ResourceDescriptor.from_array(arr)
+    tex_desc = TextureDescriptor(
+        address_mode=AddressMode.CLAMP,
+        # POINT keeps the palette stops as discrete color bands -- the
+        # classic Doom fire palette is indexed, not gradient-blended.
+        filter_mode=FilterMode.POINT,
+        # NORMALIZED_FLOAT: tex1D<float4> returns each UINT8 channel as a
+        # float in [0, 1], so the colorize kernel can scale it back to a byte
+        # and store it without manual unpacking.
+        read_mode=ReadMode.NORMALIZED_FLOAT,
+        # Normalized: the colorize kernel feeds (intensity + 0.5) / 37 as the
+        # LUT coordinate, i.e. the center of palette texel `intensity`, which
+        # the POINT filter above returns unblended.
+        normalized_coords=True,
+    )
+    tex = TextureObject.from_descriptor(resource=res_desc, texture_descriptor=tex_desc)
+    return arr, tex
+
+
+# ================================== main() ==================================
+
+
+def main():
+    # --- Step 1: Set up CUDA (compile kernels, create stream) ---
+    dev, stream, kernels, configs = setup_cuda()
+
+    # --- Step 2: Open a window ---
+    window, gl, pyglet = create_window()
+
+    # --- Step 3: Create GL resources for drawing a texture to screen ---
+    #     (Standard OpenGL boilerplate -- not CUDA-specific.)
+    shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT)
+
+    # --- Step 4: Create the Pixel Buffer Object (PBO) ---
+    pbo_id, _ = create_pixel_buffer(gl, WIDTH, HEIGHT)
+
+    # --- Step 5: Register the PBO with CUDA ---
+    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")
+
+    # --- Step 6: Allocate heat-field Arrays, palette Array, and the four
+    #             bindless handles (textures + surfaces). We hold them open
+    #             for the lifetime of the window and release in on_close(),
+    #             matching the reaction-diffusion example. (Using `with`
+    #             blocks here would close everything before the pyglet event
+    #             loop has a chance to use them.)
+    arr_a, arr_b = make_heat_arrays()
+    palette_arr, palette_tex = make_palette_array_and_texture(stream)
+    tex_a = make_heat_texture(arr_a)
+    tex_b = make_heat_texture(arr_b)
+    surf_a = SurfaceObject.from_array(arr_a)
+    surf_b = SurfaceObject.from_array(arr_b)
+
+    # The heat field is born zeroed by Array.from_descriptor. No seed pass.
+    state = {
+        "current": "a",            # which array holds the latest heat field
+        "frame_index": 0,           # passed into the step kernel as `t`
+        "ambient": True,            # SPACE toggles bottom-row injection
+        "mouse_down": False,
+        "mouse_x": 0,
+        "mouse_y": 0,
+    }
+
+    def current_read_write():
+        if state["current"] == "a":
+            return tex_a, surf_b, "b"  # read a, write b, next current = b
+        return tex_b, surf_a, "a"
+
+    def clear_field():
+        """Zero both heat arrays and seed the bottom row at full intensity.
+
+        Array.copy_from is the simplest reset path -- a dedicated clear
+        kernel would be faster but is unnecessary for an interactive demo.
+        The bottom row is set to MAX_INTENSITY so the very first frame
+        already has a fire source to advect from.
+        """
+        seed = np.zeros((HEIGHT, WIDTH), dtype=np.uint8)
+        seed[HEIGHT - 1, :] = MAX_INTENSITY  # canonical Doom fire source
+        arr_a.copy_from(np.ascontiguousarray(seed), stream=stream)
+        arr_b.copy_from(np.ascontiguousarray(seed), stream=stream)
+        state["current"] = "a"
+
+    # Seed at startup so frame 1 already has a source row.
+    clear_field()
+    stream.sync()
+
+    # --- Step 7: Render loop ---
+    start_time = time.monotonic()
+    frame_count = 0
+    fps_time = start_time
+
+    @window.event
+    def on_key_press(symbol, _modifiers):
+        key = pyglet.window.key
+        if symbol == key.ESCAPE:
+            window.close()
+            return
+        if symbol == key.SPACE:
+            state["ambient"] = not state["ambient"]
+            return
+        if symbol == key.R:
+            clear_field()
+            return
+
+    # Map window coords (WINDOW_WIDTH x WINDOW_HEIGHT, y=0 at bottom) to
+    # simulation coords (WIDTH x HEIGHT, y=0 at top).
+    def _window_to_sim(x, y):
+        sx = int(x * WIDTH / WINDOW_WIDTH)
+        sy = int((WINDOW_HEIGHT - 1 - y) * HEIGHT / WINDOW_HEIGHT)
+        return sx, sy
+
+    @window.event
+    def on_mouse_press(x, y, _button, _modifiers):
+        state["mouse_down"] = True
+        state["mouse_x"], state["mouse_y"] = _window_to_sim(x, y)
+
+    @window.event
+    def on_mouse_release(_x, _y, _button, _modifiers):
+        state["mouse_down"] = False
+
+    @window.event
+    def on_mouse_drag(x, y, _dx, _dy, _buttons, _modifiers):
+        state["mouse_down"] = True
+        state["mouse_x"], state["mouse_y"] = _window_to_sim(x, y)
+
+    @window.event
+    def on_draw():
+        nonlocal frame_count, fps_time
+
+        window.clear()
+
+        # (a) Advance the heat field by one step.
+        tex_read, surf_write, next_current = current_read_write()
+        launch(
+            stream,
+            configs["step"],
+            kernels["step"],
+            np.uint64(tex_read.handle),
+            np.uint64(surf_write.handle),
+            np.int32(WIDTH),
+            np.int32(HEIGHT),
+            np.uint32(state["frame_index"]),
+            np.int32(state["mouse_x"]),
+            np.int32(state["mouse_y"]),
+            np.int32(1 if state["mouse_down"] else 0),
+            np.int32(1 if state["ambient"] else 0),
+        )
+        state["current"] = next_current
+        state["frame_index"] += 1
+
+        # (b) Colorize the latest state into the OpenGL PBO.
+        tex_heat = tex_a if state["current"] == "a" else tex_b
+        with resource.map(stream=stream) as buf:
+            launch(
+                stream,
+                configs["colorize"],
+                kernels["colorize"],
+                np.uint64(tex_heat.handle),
+                np.uint64(palette_tex.handle),
+                buf.handle,
+                np.int32(WIDTH),
+                np.int32(HEIGHT),
+            )
+        # Unmap happens automatically when the `with` block exits.
+
+        # (c) Tell OpenGL to copy the PBO contents into our texture.
+        copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT)
+
+        # (d) Draw the texture to the screen.
+        draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id)
+
+        # FPS counter (shown in window title)
+        frame_count += 1
+        now = time.monotonic()
+        if now - fps_time >= 1.0:
+            fps = frame_count / (now - fps_time)
+            ambient_label = "on" if state["ambient"] else "off"
+            window.set_caption(
+                "cuda.core Array/Texture/Surface - Doom Fire"
+                f" ({WIDTH}x{HEIGHT}, {fps:.0f} FPS,"
+                f" ambient {ambient_label})"
+            )
+            frame_count = 0
+            fps_time = now
+
+    @window.event
+    def on_close():
+        # Release everything we opened, in reverse order. Each of these is a
+        # context manager too, but pyglet owns the event loop here so we
+        # release explicitly to be deterministic about ordering.
+        resource.close()
+        tex_a.close()
+        tex_b.close()
+        surf_a.close()
+        surf_b.close()
+        palette_tex.close()
+        palette_arr.close()
+        arr_a.close()
+        arr_b.close()
+        stream.close()
+
+    pyglet.app.run(interval=0)
+
+
+# ======================== GPU code (CUDA + GLSL) ============================
+#
+# These source strings are kept at the bottom of the file so they don't
+# distract from the Python logic above. The important things to know:
+#
+#   - KERNEL_SOURCE contains two CUDA C++ kernels:
+#       * step_fire     -- advances the heat field. Reads previous state via a
+#                          TextureObject (POINT + CLAMP, non-normalized) and
+#                          writes the next state via a SurfaceObject. Bakes
+#                          the bottom-row injection, mouse torch, and upward
+#                          jittered advection into a single pass.
+#       * colorize_fire -- per pixel: read heat from the heat TextureObject,
+#                          look up the fire palette via tex1D<float4>, write
+#                          RGBA bytes to the OpenGL PBO.
+#
+#   - VERTEX_SHADER_SOURCE / FRAGMENT_SHADER_SOURCE are GLSL. They draw a
+#     texture onto a rectangle covering the entire window. The quad's t
+#     coordinate is flipped versus the plasma example so that y=0 maps to the
+#     top of the screen (see create_display_resources for why).
+#
+# ============================================================================
+
+KERNEL_SOURCE = r"""
+// Small, deterministic, GPU-friendly hash. Returns a value in [0, 1).
+// Used both for bottom-row ember intensity and for the per-pixel jitter that
+// gives the fire its characteristic horizontal flicker.
+__device__ __forceinline__ float hash3(unsigned int x, unsigned int y,
+                                       unsigned int t) {
+    unsigned int h = x * 374761393u + y * 668265263u + t * 2246822519u;
+    h = (h ^ (h >> 13)) * 1274126177u;
+    h ^= (h >> 16);
+    return (float)(h & 0x00ffffffu) / (float)0x01000000u;
+}
+
+// Canonical Doom-fire step (gather form of the original scatter algorithm).
+//
+// Reference scatter (one cell per JS source row):
+//     decay = random in {0, 1, 2}
+//     below = state[x, y+1]
+//     new = max(0, below - decay)
+//     state[x - decay, y] = new        // writes LEFT of source -> leftward lean
+//
+// Equivalent gather (one CUDA thread per destination cell):
+//     decay = hash(x, y, t) in {0, 1, 2}
+//     below = state[x + decay, y+1]    // reads from the right-shifted source
+//     new = max(0, below - decay)
+//     state[x, y] = new
+//
+// The right-shifted gather reads the same data the leftward-shifted scatter
+// would have produced.
+
+extern "C"
+__global__
+void step_fire(cudaTextureObject_t tex_read,
+               cudaSurfaceObject_t surf_write,
+               int width, int height,
+               unsigned int t,
+               int mouse_x, int mouse_y, int mouse_active,
+               int ambient_on) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    const int MAX_I = 36;
+
+    // 1) Mouse torch: a hot disc painted at the cursor (overrides everything).
+    if (mouse_active) {
+        int dx = x - mouse_x;
+        int dy = y - mouse_y;
+        if (dx * dx + dy * dy <= 12 * 12) {  // matches host TORCH_RADIUS
+            surf2Dwrite((unsigned char)MAX_I, surf_write, x, y);
+            return;
+        }
+    }
+
+    // 2) Bottom row is the steady fire source. Hardcoded to MAX_I when the
+    //    ambient ember bed is on; zero otherwise (lets the fire die down).
+    if (y == height - 1) {
+        surf2Dwrite((unsigned char)(ambient_on ? MAX_I : 0),
+                    surf_write, x, y);
+        return;
+    }
+
+    // 3) Gather from the row below with random {0, 1, 2} horizontal shift
+    //    and matching intensity decay -- the canonical Doom-fire update.
+    float jitter_h = hash3((unsigned int)x, (unsigned int)y, t);
+    int decay = (int)(jitter_h * 3.0f);             // 0, 1, or 2
+    int src_x = x + decay;
+    if (src_x >= width) src_x = width - 1;
+    unsigned char below = tex2D<unsigned char>(tex_read,
+                                               (float)src_x + 0.5f,
+                                               (float)y + 1.5f);
+    int new_i = (int)below - decay;
+    if (new_i < 0) new_i = 0;
+
+    // UINT8 is 1 byte, so surf2Dwrite's x argument is already the byte offset.
+    surf2Dwrite((unsigned char)new_i, surf_write, x, y);
+}
+
+extern "C"
+__global__
+void colorize_fire(cudaTextureObject_t tex_heat,
+                   cudaTextureObject_t palette_tex,
+                   unsigned char* output,
+                   int width, int height) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    // Heat texture is UINT8 + ELEMENT_TYPE: tex2D<unsigned char> returns the
+    // raw intensity byte (0..36).
+    unsigned char h = tex2D<unsigned char>(tex_heat,
+                                           (float)x + 0.5f,
+                                           (float)y + 0.5f);
+
+    // Palette texture is 1D normalized RGBA8 with POINT filtering and 37
+    // entries. Index i lands at coord (i + 0.5) / 37 -- the texel center,
+    // which POINT samples exactly.
+    const float palette_size = 37.0f;
+    float u = ((float)h + 0.5f) / palette_size;
+    float4 c = tex1D<float4>(palette_tex, u);
+
+    int idx = (y * width + x) * 4;
+    output[idx + 0] = (unsigned char)(c.x * 255.0f);
+    output[idx + 1] = (unsigned char)(c.y * 255.0f);
+    output[idx + 2] = (unsigned char)(c.z * 255.0f);
+    output[idx + 3] = 255;
+}
+"""
+
+# GLSL shaders -- a pass-through vertex stage plus a fragment stage that
+# samples `tex`, i.e. a textured fullscreen rectangle. Nothing CUDA-specific.
+
+VERTEX_SHADER_SOURCE = """#version 330 core
+in vec2 position;
+in vec2 texcoord;
+out vec2 v_texcoord;
+void main() {
+    gl_Position = vec4(position, 0.0, 1.0);
+    v_texcoord = texcoord;
+}
+"""
+
+FRAGMENT_SHADER_SOURCE = """#version 330 core
+in vec2 v_texcoord;
+out vec4 fragColor;
+uniform sampler2D tex;
+void main() {
+    fragColor = texture(tex, v_texcoord);
+}
+"""
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/examples/gl_interop_image_show.py b/cuda_core/examples/gl_interop_image_show.py
new file mode 100644
index 00000000000..4bdd55e1569
--- /dev/null
+++ b/cuda_core/examples/gl_interop_image_show.py
@@ -0,0 +1,428 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# Minimal "Hello World" for the cuda.core texture/surface stack.
+#
+# Allocates a small `Array`, fills it with a procedural image once, binds it
+# as a `TextureObject`, and uses a single CUDA kernel to sample that texture
+# at every screen pixel (with a scale + rotation transform) and write the
+# result into an OpenGL PBO for display.
+#
+# Nothing else: no `SurfaceObject`, no ping-pong, no simulation, no mipmaps.
+# If you have never touched the new APIs before, open this file first.
+#
+# ################################################################################
+#
+# What this example teaches
+# =========================
+# - Allocate an `Array` and upload data into it with `Array.copy_from`.
+# - Build a `TextureObject` from a `ResourceDescriptor` + `TextureDescriptor`.
+# - The visual difference between `FilterMode.POINT` and `FilterMode.LINEAR`
+#   (press F to toggle live).
+# - That filter mode is baked into the `TextureDescriptor` at creation time,
+#   so changing it requires destroying and rebuilding the `TextureObject`.
+#
+# How it works
+# ============
+#   Startup (once):
+#     +-------------------+   copy_from   +----------+
+#     | host numpy image  | ------------> |  Array   |  (UINT8 RGBA, 64x64)
+#     +-------------------+               +----+-----+
+#                                              |
+#                                              v
+#                                       +-------------+
+#                                       | TextureObj  |  (filter mode = POINT)
+#                                       +-------------+
+#
+#   Each frame:
+#     - kernel `sample_image` reads from the TextureObject at a transformed
+#       (u, v) per screen pixel and writes RGBA bytes to the GL PBO.
+#     - OpenGL copies the PBO into a screen texture and draws it.
+#
+# What you should see
+# ===================
+# A 64x64 procedural test pattern (checkerboard + colored gradient stripes +
+# diagonal lines) magnified to fill the window. Press F to switch between
+# POINT (blocky) and LINEAR (smooth) sampling; the difference is immediately
+# visible. Press R to start/stop a slow rotation. Esc to quit.
+#
+
+# /// script
+# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"]
+# ///
+
+import ctypes
+import math
+import sys
+import time
+
+import numpy as np
+
+from cuda.core import (
+    AddressMode,
+    Array,
+    ArrayFormat,
+    Device,
+    FilterMode,
+    GraphicsResource,
+    LaunchConfig,
+    Program,
+    ProgramOptions,
+    ReadMode,
+    ResourceDescriptor,
+    TextureDescriptor,
+    TextureObject,
+    launch,
+)
+
+# ---------------------------------------------------------------------------
+# Constants
+# ---------------------------------------------------------------------------
+WIDTH = 640  # window, PBO and screen-texture width in pixels
+HEIGHT = 480  # window, PBO and screen-texture height in pixels
+IMAGE_SIZE = 64  # the source Array is IMAGE_SIZE x IMAGE_SIZE RGBA8
+
+
+# ============================= Helper functions =============================
+
+
+def make_test_image(size):
+    """Build a (size, size, 4) uint8 RGBA test pattern.
+
+    Designed so the filter-mode difference is obvious: a hard-edged
+    checkerboard, a vertical color gradient strip over the left third, and
+    two red diagonal hairlines (POINT keeps them crisp; LINEAR blends them).
+    Every pixel is opaque. `size` may be any non-negative int: below 8 the
+    checker cell is clamped to one pixel instead of dividing by zero.
+    """
+    img = np.zeros((size, size, 4), dtype=np.uint8)
+    # 8x8 black/white checkerboard, built from per-axis cell indices rather
+    # than a size*size Python loop.
+    cell_idx = np.arange(size) // max(1, size // 8)
+    odd = ((cell_idx[None, :] + cell_idx[:, None]) & 1).astype(bool)
+    img[odd, :3] = 255
+    # vertical gradient strip down the left third (red fades out, green in)
+    strip = size // 3
+    img[:, :strip, 0] = np.linspace(255, 0, size, dtype=np.uint8)[:, None]
+    img[:, :strip, 1] = np.linspace(0, 255, size, dtype=np.uint8)[:, None]
+    img[:, :strip, 2] = 128
+    # two diagonal red hairlines: the main diagonal and a copy 4 px to its right
+    on_diag = np.arange(size)
+    img[on_diag, on_diag] = (255, 0, 0, 255)
+    shifted = np.arange(max(size - 4, 0))
+    img[shifted, shifted + 4] = (255, 0, 0, 255)
+    img[:, :, 3] = 255  # opaque
+    return img
+
+
+def setup_cuda():
+    """Make device 0 current, compile `sample_image`, and size its launch.
+
+    Returns (device, stream, kernel, launch_config); the launch grid is made
+    of 16x16 thread blocks, rounded up to cover WIDTH x HEIGHT pixels.
+    """
+    device = Device(0)
+    device.set_current()
+    work_stream = device.create_stream()
+
+    options = ProgramOptions(std="c++17", arch=f"sm_{device.arch}")
+    module = Program(KERNEL_SOURCE, code_type="c++", options=options).compile(
+        "cubin", name_expressions=("sample_image",)
+    )
+    sample_kernel = module.get_kernel("sample_image")
+    threads_x, threads_y = 16, 16
+    blocks_x, blocks_y = -(-WIDTH // threads_x), -(-HEIGHT // threads_y)  # ceiling division
+    launch_config = LaunchConfig(grid=(blocks_x, blocks_y, 1), block=(threads_x, threads_y, 1))
+    return device, work_stream, sample_kernel, launch_config
+
+
+def create_window():
+    """Open the example's pyglet window.
+
+    pyglet is imported lazily so that a missing install produces a short
+    message on stderr and exit status 1.
+
+    Returns (window, gl_module, pyglet_module).
+    """
+    try:
+        import pyglet
+        from pyglet.gl import gl as _gl
+    except ImportError:
+        hint = "This example requires pyglet >= 2.0.\nInstall it with:  pip install pyglet"
+        print(hint, file=sys.stderr)
+        sys.exit(1)
+
+    title = "cuda.core Array + TextureObject - Image Show"
+    window = pyglet.window.Window(WIDTH, HEIGHT, caption=title, vsync=False)
+    return window, _gl, pyglet
+
+
+def create_display_resources(gl, width, height):
+    """Standard pyglet boilerplate; returns (shader_prog, vao_id, tex_id) for the fullscreen quad."""
+    from pyglet.graphics.shader import Shader, ShaderProgram
+
+    vert = Shader(VERTEX_SHADER_SOURCE, "vertex")
+    frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment")
+    shader_prog = ShaderProgram(vert, frag)
+    # Fullscreen quad as two triangles; each row below is one vertex: x, y, s, t.
+    quad_verts = np.array(
+        [
+            -1, -1, 0, 0,
+             1, -1, 1, 0,
+             1,  1, 1, 1,
+            -1, -1, 0, 0,
+             1,  1, 1, 1,
+            -1,  1, 0, 1,
+        ],
+        dtype=np.float32,
+    )
+    # The VAO records the buffer binding and attribute layout set up below.
+    vao = ctypes.c_uint(0)
+    gl.glGenVertexArrays(1, ctypes.byref(vao))
+    gl.glBindVertexArray(vao.value)
+
+    vbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(vbo))
+    gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value)
+    gl.glBufferData(
+        gl.GL_ARRAY_BUFFER,
+        quad_verts.nbytes,
+        quad_verts.ctypes.data_as(ctypes.c_void_p),
+        gl.GL_STATIC_DRAW,
+    )
+    # Interleaved layout: position at byte offset 0, texcoord at byte offset 8.
+    stride = 4 * 4  # 4 float32 values per vertex, 4 bytes each
+    pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position")
+    gl.glEnableVertexAttribArray(pos_loc)
+    gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0))
+    tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord")
+    gl.glEnableVertexAttribArray(tc_loc)
+    gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8))
+    gl.glBindVertexArray(0)
+    # Screen texture: storage only (data pointer is None); filled from the PBO later.
+    tex = ctypes.c_uint(0)
+    gl.glGenTextures(1, ctypes.byref(tex))
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_NEAREST)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_NEAREST)
+    gl.glTexImage2D(
+        gl.GL_TEXTURE_2D, 0, gl.GL_RGBA8, width, height, 0,
+        gl.GL_RGBA, gl.GL_UNSIGNED_BYTE, None,
+    )
+    return shader_prog, vao.value, tex.value
+
+
+def create_pixel_buffer(gl, width, height):
+    """Allocate an uninitialized width x height RGBA8 PBO and return its GL name."""
+    target = gl.GL_PIXEL_UNPACK_BUFFER
+    buffer_name = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(buffer_name))
+    gl.glBindBuffer(target, buffer_name.value)
+    gl.glBufferData(target, 4 * width * height, None, gl.GL_DYNAMIC_DRAW)  # 4 bytes per pixel
+    gl.glBindBuffer(target, 0)
+    return buffer_name.value
+
+
+def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height):
+    """Upload the contents of PBO `pbo_id` into the GL texture `tex_id`."""
+    unpack = gl.GL_PIXEL_UNPACK_BUFFER
+    gl.glBindBuffer(unpack, pbo_id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    # With a PBO bound, the trailing None means "read from the PBO", not host memory.
+    gl.glTexSubImage2D(gl.GL_TEXTURE_2D, 0, 0, 0, width, height, gl.GL_RGBA, gl.GL_UNSIGNED_BYTE, None)
+    gl.glBindBuffer(unpack, 0)
+
+
+def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id):  # draws tex_id with the quad stored in vao_id
+    gl.glUseProgram(shader_prog.id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glBindVertexArray(vao_id)
+    gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6)  # 6 vertices = the quad's two triangles
+    gl.glBindVertexArray(0)  # unbind the VAO and the program again
+    gl.glUseProgram(0)
+
+
+def make_texture(arr, filter_mode):
+    """Wrap `arr` in a new `TextureObject` that samples with `filter_mode`.
+
+    The filter mode is fixed once the object exists, so a caller that wants
+    another mode closes the old object and calls this helper again.
+    """
+    source = ResourceDescriptor.from_array(arr)
+    # UINT8 source + NORMALIZED_FLOAT means tex2D<float4> returns each channel
+    # as a float in [0, 1]; the kernel scales that back to bytes for the PBO.
+    sampling = TextureDescriptor(
+        normalized_coords=True,
+        read_mode=ReadMode.NORMALIZED_FLOAT,
+        filter_mode=filter_mode,
+        address_mode=AddressMode.CLAMP,
+    )
+    return TextureObject.from_descriptor(resource=source, texture_descriptor=sampling)
+
+
+# ================================== main() ==================================
+
+
+def main():
+    """Run the interactive demo: F toggles the filter, R toggles rotation, Esc quits."""
+    # --- Step 1: Set up CUDA (compile kernel, create stream) ---
+    dev, stream, kernel, config = setup_cuda()
+
+    # --- Step 2: Open a window ---
+    window, gl, pyglet = create_window()
+
+    # --- Step 3: Create GL resources (shader, fullscreen quad, screen tex) ---
+    shader_prog, quad_vao, screen_tex = create_display_resources(gl, WIDTH, HEIGHT)
+
+    # --- Step 4: Create the PBO that CUDA will write into ---
+    pbo_id = create_pixel_buffer(gl, WIDTH, HEIGHT)
+    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")
+
+    # --- Step 5: Allocate the source `Array` and upload the test pattern ---
+    arr = Array.from_descriptor(shape=(IMAGE_SIZE, IMAGE_SIZE), format=ArrayFormat.UINT8, num_channels=4)
+    host_image = make_test_image(IMAGE_SIZE)
+    arr.copy_from(np.ascontiguousarray(host_image), stream=stream)
+    stream.sync()
+
+    # --- Step 6: Bind the Array as a TextureObject (initially POINT) ---
+    state = {"filter": FilterMode.POINT, "rotate": False, "angle": 0.0}
+    tex = make_texture(arr, state["filter"])
+
+    @window.event
+    def on_key_press(symbol, _modifiers):
+        key = pyglet.window.key
+        nonlocal tex
+        if symbol == key.ESCAPE:
+            window.close()
+        elif symbol == key.F:
+            # Filter mode is baked at TextureObject creation time. Swapping
+            # it means closing the old one and building a new one.
+            state["filter"] = FilterMode.LINEAR if state["filter"] == FilterMode.POINT else FilterMode.POINT
+            # `launch` is asynchronous: wait for the last frame's kernel so
+            # the texture handle is not destroyed while still being sampled.
+            stream.sync()
+            tex.close()
+            tex = make_texture(arr, state["filter"])
+        elif symbol == key.R:
+            state["rotate"] = not state["rotate"]
+
+    # --- Step 7: Render loop ---
+    start = time.monotonic()
+    last_t = start
+    frame_count = 0
+    fps_time = start
+
+    @window.event
+    def on_draw():
+        nonlocal frame_count, fps_time, last_t
+        now = time.monotonic()
+        if state["rotate"]:
+            state["angle"] += (now - last_t) * 0.5  # rad/sec
+        last_t = now
+
+        window.clear()
+        with resource.map(stream=stream) as buf:
+            launch(
+                stream,
+                config,
+                kernel,
+                np.uint64(tex.handle),
+                buf.handle,
+                np.int32(WIDTH),
+                np.int32(HEIGHT),
+                np.float32(state["angle"]),
+            )
+        copy_pbo_to_texture(gl, pbo_id, screen_tex, WIDTH, HEIGHT)
+        draw_fullscreen_quad(gl, shader_prog, quad_vao, screen_tex)
+
+        frame_count += 1
+        if now - fps_time >= 1.0:
+            fps = frame_count / (now - fps_time)
+            window.set_caption(
+                f"cuda.core Array + TextureObject - Image Show "
+                f"(filter={state['filter'].name}, "
+                f"rotate={'on' if state['rotate'] else 'off'}, "
+                f"{fps:.0f} FPS)"
+            )
+            frame_count = 0
+            fps_time = now
+
+    @window.event
+    def on_close():
+        # `launch` is asynchronous, so drain the stream before releasing the
+        # objects that queued kernels may still reference.
+        stream.sync()
+        tex.close()
+        arr.close()
+        resource.close()
+        stream.close()
+
+    pyglet.app.run(interval=0)
+
+
+# ============================== GPU code (kernel) ============================
+# Compiled at startup by setup_cuda(); launched once per frame from on_draw().
+KERNEL_SOURCE = r"""
+extern "C"
+__global__
+void sample_image(cudaTextureObject_t tex,
+                  unsigned char* output,
+                  int width, int height,
+                  float angle) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    // Center the screen pixel around (0, 0) in [-aspect, aspect] x [-1, 1].
+    float aspect = (float)width / (float)height;
+    float sx = ((float)x / (float)width  - 0.5f) * 2.0f * aspect;
+    float sy = ((float)y / (float)height - 0.5f) * 2.0f;
+
+    // Inverse-rotate the screen point: rotating the image by +angle means
+    // each output pixel reads from the source rotated by -angle.
+    float c = cosf(-angle), s = sinf(-angle);
+    float rx = c * sx - s * sy;
+    float ry = s * sx + c * sy;
+
+    // Map rotated screen point to the [0, 1] x [0, 1] texture domain so the
+    // image (drawn centered, fitting ~75% of the window height) lands on it.
+    const float scale = 0.75f;
+    float u = (rx / (2.0f * scale)) + 0.5f;
+    float v = (ry / (2.0f * scale)) + 0.5f;
+
+    // AddressMode.CLAMP means out-of-range u/v sample the edge texel.
+    float4 col = tex2D<float4>(tex, u, v);
+
+    int idx = (y * width + x) * 4;
+    output[idx + 0] = (unsigned char)(col.x * 255.0f);
+    output[idx + 1] = (unsigned char)(col.y * 255.0f);
+    output[idx + 2] = (unsigned char)(col.z * 255.0f);
+    output[idx + 3] = 255;
+}
+"""
+# GLSL shaders (nothing CUDA-specific): they draw `tex` on the fullscreen quad.
+VERTEX_SHADER_SOURCE = """#version 330 core
+in vec2 position;
+in vec2 texcoord;
+out vec2 v_texcoord;
+void main() {
+    gl_Position = vec4(position, 0.0, 1.0);
+    v_texcoord = texcoord;
+}
+"""
+
+FRAGMENT_SHADER_SOURCE = """#version 330 core
+in vec2 v_texcoord;
+out vec4 fragColor;
+uniform sampler2D tex;
+void main() {
+    fragColor = texture(tex, v_texcoord);
+}
+"""
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/examples/gl_interop_lenia.py b/cuda_core/examples/gl_interop_lenia.py
new file mode 100644
index 00000000000..c1772514a70
--- /dev/null
+++ b/cuda_core/examples/gl_interop_lenia.py
@@ -0,0 +1,805 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This example demonstrates cuda.core.Array, TextureObject, and SurfaceObject
+# in combination with GraphicsResource for CUDA/OpenGL interop. A Lenia
+# continuous cellular automaton is ping-ponged between two CUDA arrays each
+# frame: a TextureObject provides smooth (LINEAR + WRAP) sampled reads through
+# a large bell-shaped neighborhood kernel, and a SurfaceObject provides typed
+# writes. The final state is colorized straight into an OpenGL PBO. Requires
+# pyglet.
+#
+# ################################################################################
+
+# What this example teaches
+# =========================
+# - How to drive a wide-radius convolution from a TextureObject configured for
+#   LINEAR + WRAP + normalized coordinates. The same Array is then bound as a
+#   SurfaceObject for the typed write back, requiring `surface_load_store=True`
+#   at allocation time.
+# - How a single-channel `float` Array differs from the multi-channel layout
+#   used in the Gray-Scott example: `num_channels=1`, `tex2D<float>` reads, and
+#   a 4-byte x-stride in `surf2Dwrite`.
+# - How to host-precompute a normalization constant for a stencil with a
+#   variable-shape support (the bell-curve neighborhood), then pass it as a
+#   plain float kernel argument.
+#
+# How it works
+# ============
+# Lenia (Bert Wang-Chak Chan, 2018) generalizes Conway's Game of Life to
+# continuous space, time, and state. Each cell holds a real value in [0, 1].
+# Per step, every cell:
+#
+#   1. Integrates a smooth bell-shaped neighborhood kernel K against the
+#      current state to produce a "potential" U:
+#
+#          U(x) = sum over offsets (dx, dy) inside a disk of radius R of
+#                  K(|(dx, dy)|) * state(x + (dx, dy))
+#                 divided by  sum of K  (host-precomputed).
+#
+#      K(r) = exp(-((r / R) - mu_K)^2 / (2 * sigma_K^2)) for r <= R.
+#
+#   2. Applies the growth function G and updates the state:
+#
+#          state_new = clamp(state_old + dt * (2 * exp(-(U - mu)^2 /
+#                            (2 * sigma^2)) - 1),  0,  1).
+#
+# Two single-channel `float` arrays are ping-ponged each frame: a
+# TextureObject reads one (sampled with LINEAR + WRAP so the disk wraps
+# toroidally) and a SurfaceObject writes the other.
+#
+#   PING-PONG (two arrays, swap each step)
+#   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#   +--------------+   tex2D<float>    +------------------+
+#   |   arr_a      | ----------------> |                  |
+#   |    state     |                   |  convolve_lenia  |
+#   +--------------+                   |     kernel       |
+#                                      |  (+ growth fn)   |
+#   +--------------+   surf2Dwrite     |                  |
+#   |   arr_b      | <---------------- |                  |
+#   |    state     |                   +------------------+
+#   +--------------+
+#       (swap)
+#
+# After the step we run a separate `colorize_lenia` kernel that samples the
+# new state and writes RGBA bytes straight into the OpenGL PBO via
+# GraphicsResource. No data ever travels across the PCIe bus during the frame.
+#
+# Why LINEAR + WRAP + normalized coords?
+# --------------------------------------
+# Lenia's neighborhood radius (R = 13) is wide enough that boundary handling
+# really matters. AddressMode.WRAP gives a toroidal world for free, and it is
+# only supported in normalized coordinate mode (see the CUDA Programming
+# Guide). LINEAR filtering is essentially free on the hardware. Note that
+# reads at integer offsets land exactly on texel centers, where LINEAR
+# returns the stored value unblended. Sample coordinates are
+# `(x + dx + 0.5) / W`; values < 0 or > 1 are fine, WRAP handles them.
+#
+# Channel byte width in surf2Dwrite
+# ---------------------------------
+# `surf2Dwrite` takes the x coordinate in BYTES, not in elements. For a
+# single-channel `float` surface that means `x * sizeof(float)` = `x * 4`.
+# (The Gray-Scott example uses 8 because it stores `float2`.)
+#
+# One step per frame
+# ------------------
+# Each step convolves a (2R+1)^2 = 729-tap neighborhood for every pixel, which
+# is much heavier than a Gray-Scott 5-point Laplacian. With dt = 0.1 the
+# dynamics are slow enough that one step per displayed frame is plenty. There
+# is no `N_STEPS` loop.
+#
+# What you should see
+# ===================
+# A window showing soft, glider-like blobs drifting across the field on a
+# teal-on-black palette. Press R to reseed with a new Gaussian blob, 1 to
+# clear the field, and Escape to exit. The window title shows the current
+# FPS.
+#
+
+# /// script
+# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"]
+# ///
+
+import ctypes
+import math
+import sys
+import time
+
+import numpy as np
+
+from cuda.core import (
+    AddressMode,
+    Array,
+    ArrayFormat,
+    Device,
+    FilterMode,
+    GraphicsResource,
+    LaunchConfig,
+    Program,
+    ProgramOptions,
+    ReadMode,
+    ResourceDescriptor,
+    SurfaceObject,
+    TextureDescriptor,
+    TextureObject,
+    launch,
+)
+
+# ---------------------------------------------------------------------------
+# Simulation parameters (feel free to change these)
+# ---------------------------------------------------------------------------
+WIDTH = 256
+HEIGHT = 256
+
+# Neighborhood / kernel shape
+R = 13  # convolution radius in pixels (texture-space)
+MU_K = 0.5  # bell center for the neighborhood weight K(r/R)
+SIGMA_K = 0.15  # bell width for K
+
+# Growth function shape
+MU = 0.15  # bell center for the growth function G(U)
+SIGMA = 0.015  # bell width for G
+
+DT = 0.1  # time step
+
+# Initial blob radius and peak for the Gaussian seed.
+# The radius must be large relative to the neighborhood radius R=13 so the
+# kernel-integrated potential U lands near the growth bell's center mu=0.15.
+# With SEED_RADIUS=36, U at the blob's centre starts near mu and the field
+# survives the first step; smaller seeds collapse to zero within one frame
+# because U is far outside the narrow (sigma=0.015) growth bell.
+SEED_RADIUS = 36.0
+SEED_PEAK = 0.5
+
+# Seed modes (kept in sync with the seed_blob kernel)
+SEED_MODE_CLEAR = 0
+SEED_MODE_BLOB = 1
+
+
+# ============================= Helper functions =============================
+#
+# The functions below set up CUDA and OpenGL. If you're here to learn about
+# Array/TextureObject/SurfaceObject, skip ahead to main() -- the interesting
+# part is there. These helpers exist so that main() reads like a short story
+# instead of a wall of boilerplate.
+# ============================================================================
+
+
+def compute_kernel_norm(radius, mu_k, sigma_k):
+    """Return 1 / sum(K) over the disk of the given radius, as a float32.
+
+    Every integer offset (dx, dy) with r = |(dx, dy)| <= radius contributes
+    K = exp(-(r / radius - mu_k)^2 / (2 * sigma_k^2)); offsets in the corners
+    of the (2R+1)x(2R+1) box lie outside the disk and are skipped.
+    Raises RuntimeError when the weights sum to zero.
+    """
+    gauss_scale = 1.0 / (2.0 * sigma_k * sigma_k)
+    per_radius = 1.0 / float(radius)
+    offsets = range(-radius, radius + 1)
+    weight_sum = 0.0
+    for dy in offsets:
+        for dx in offsets:
+            dist = math.sqrt(dx * dx + dy * dy)
+            if dist <= radius:
+                shifted = dist * per_radius - mu_k
+                weight_sum += math.exp(-(shifted * shifted) * gauss_scale)
+    if weight_sum <= 0.0:
+        raise RuntimeError("kernel normalization sum collapsed to zero")
+    return np.float32(1.0 / weight_sum)
+
+
+def setup_cuda():
+    """Prepare device 0 and build the three Lenia kernels.
+
+    Exits with status 1, after a message on stderr, when the GPU's compute
+    capability is below 3.0. Otherwise returns (device, stream, kernels,
+    configs), where `kernels` and `configs` are dicts keyed by "step",
+    "colorize" and "seed".
+    """
+    device = Device(0)
+    device.set_current()
+
+    # Surface load/store itself dates back to SM 2.0, but the bindless
+    # surface objects used here (cuSurfObjectCreate) need SM 3.0 or newer.
+    capability = device.compute_capability
+    if capability.major < 3:
+        message = (
+            "This example requires a GPU with compute capability >= 3.0 for "
+            f"bindless surface objects. Found sm_{capability.major}{capability.minor}."
+        )
+        print(message, file=sys.stderr)
+        sys.exit(1)
+
+    work_stream = device.create_stream()
+
+    # Build as C++ so that the templated tex2D<float> overload resolves.
+    entry_points = {
+        "step": "convolve_lenia",
+        "colorize": "colorize_lenia",
+        "seed": "seed_blob",
+    }
+    options = ProgramOptions(std="c++17", arch=f"sm_{device.arch}")
+    program = Program(KERNEL_SOURCE, code_type="c++", options=options)
+    module = program.compile("cubin", name_expressions=tuple(entry_points.values()))
+    kernels = {role: module.get_kernel(symbol) for role, symbol in entry_points.items()}
+
+    # Every kernel handles one pixel per thread over WIDTH x HEIGHT, so a
+    # single launch configuration of 16x16 blocks serves all of them.
+    threads = (16, 16, 1)
+    blocks = (
+        -(-WIDTH // threads[0]),  # ceiling division
+        -(-HEIGHT // threads[1]),
+        1,
+    )
+    shared_config = LaunchConfig(grid=blocks, block=threads)
+    configs = dict.fromkeys(kernels, shared_config)
+
+    return device, work_stream, kernels, configs
+
+
+def create_window():
+    """Create the WIDTH x HEIGHT pyglet window for the Lenia display.
+
+    The pyglet import happens here, inside the function; if it fails, an
+    install hint is printed to stderr and the process exits with status 1.
+
+    Returns (window, gl_module, pyglet).
+    """
+    try:
+        import pyglet
+        from pyglet.gl import gl as _gl
+    except ImportError:
+        hint = "This example requires pyglet >= 2.0.\nInstall it with:  pip install pyglet"
+        print(hint, file=sys.stderr)
+        sys.exit(1)
+
+    title = "cuda.core Array/Texture/Surface - Lenia"
+    window = pyglet.window.Window(WIDTH, HEIGHT, caption=title, vsync=False)
+    return window, _gl, pyglet
+
+
+def create_display_resources(gl, width, height):
+    """Create the GL objects needed to show a texture on screen.
+
+    This sets up a shader program, a fullscreen quad, and an empty texture.
+    None of this is CUDA-specific -- it's standard OpenGL boilerplate for
+    rendering a textured quad.
+
+    Returns (shader_program, vertex_array_id, texture_id). The shader_program
+    is a pyglet ShaderProgram object (must be kept alive).
+    """
+    from pyglet.graphics.shader import Shader, ShaderProgram
+
+    # Shader program -- just passes texture coordinates through
+    vert = Shader(VERTEX_SHADER_SOURCE, "vertex")
+    frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment")
+    shader_prog = ShaderProgram(vert, frag)
+
+    # Fullscreen quad (two triangles covering the entire window)
+    quad_verts = np.array(
+        [
+            # One value per line; every 4 consecutive values are one vertex: x, y, s, t
+            -1,  # triangle 1, vertex 0: x
+            -1,
+            0,
+            0,
+            1,  # triangle 1, vertex 1: x
+            -1,
+            1,
+            0,
+            1,  # triangle 1, vertex 2: x
+            1,
+            1,
+            1,
+            -1,  # triangle 2, vertex 0: x
+            -1,
+            0,
+            0,
+            1,  # triangle 2, vertex 1: x
+            1,
+            1,
+            1,
+            -1,  # triangle 2, vertex 2: x
+            1,
+            0,
+            1,
+        ],
+        dtype=np.float32,
+    )
+
+    vao = ctypes.c_uint(0)
+    gl.glGenVertexArrays(1, ctypes.byref(vao))
+    gl.glBindVertexArray(vao.value)
+
+    vbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(vbo))
+    gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value)
+    gl.glBufferData(
+        gl.GL_ARRAY_BUFFER,
+        quad_verts.nbytes,
+        quad_verts.ctypes.data_as(ctypes.c_void_p),
+        gl.GL_STATIC_DRAW,
+    )
+
+    stride = 4 * 4  # 4 floats * 4 bytes each = 16 bytes per vertex
+    pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position")
+    gl.glEnableVertexAttribArray(pos_loc)
+    gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0))
+    # texcoord follows the two position floats, i.e. at byte offset 8
+    tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord")
+    gl.glEnableVertexAttribArray(tc_loc)
+    gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8))
+
+    gl.glBindVertexArray(0)
+
+    # Empty texture (will be filled each frame from the PBO)
+    tex = ctypes.c_uint(0)
+    gl.glGenTextures(1, ctypes.byref(tex))
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_LINEAR)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_LINEAR)
+    gl.glTexImage2D(
+        gl.GL_TEXTURE_2D,
+        0,
+        gl.GL_RGBA8,
+        width,
+        height,
+        0,
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,  # no initial pixel data: allocate storage only
+    )
+
+    return shader_prog, vao.value, tex.value
+
+
+def create_pixel_buffer(gl, width, height):
+    """Allocate the Pixel Buffer Object (PBO) shared by CUDA and OpenGL.
+
+    OpenGL reads texture uploads from this GPU-side buffer; once the same
+    buffer is registered with CUDA, a kernel can write pixels straight into
+    it. Storage is reserved for width * height RGBA8 pixels, uninitialized.
+
+    Returns (pbo_gl_name, size_in_bytes).
+    """
+    buffer_name = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(buffer_name))
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, buffer_name.value)
+    size_in_bytes = 4 * width * height  # RGBA, one byte per channel
+    gl.glBufferData(gl.GL_PIXEL_UNPACK_BUFFER, size_in_bytes, None, gl.GL_DYNAMIC_DRAW)
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)
+    return buffer_name.value, size_in_bytes
+
+
+def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height):
+    """Refresh the GL texture from the PBO (a GPU-to-GPU copy).
+
+    While a buffer is bound to GL_PIXEL_UNPACK_BUFFER, the final `None`
+    passed to glTexSubImage2D makes GL read from that PBO rather than from
+    CPU memory. The PBO binding is cleared again before returning.
+    """
+    unpack_target = gl.GL_PIXEL_UNPACK_BUFFER
+    gl.glBindBuffer(unpack_target, pbo_id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    # Overwrite mip level 0 starting at the origin, width x height texels,
+    # interpreting the source bytes as 8-bit RGBA.
+    region = (0, 0, width, height)
+    pixel_layout = (gl.GL_RGBA, gl.GL_UNSIGNED_BYTE)
+    gl.glTexSubImage2D(gl.GL_TEXTURE_2D, 0, *region, *pixel_layout, None)
+    gl.glBindBuffer(unpack_target, 0)
+
+
+def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id):
+    """Draw texture `tex_id` with `shader_prog`, using the quad stored in `vao_id`."""
+    gl.glUseProgram(shader_prog.id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glBindVertexArray(vao_id)
+    gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6)  # 6 vertices = the quad's two triangles
+    gl.glBindVertexArray(0)  # unbind the VAO and the program again
+    gl.glUseProgram(0)
+
+
+def make_state_arrays():
+    """Create the pair of ping-pong state arrays.
+
+    Each is a WIDTH x HEIGHT single-channel FLOAT32 `Array`. They are
+    allocated with `surface_load_store=True`, the flag that allows one Array
+    to back a TextureObject (sampled reads) as well as a SurfaceObject
+    (typed writes).
+    """
+    # Both arrays share one layout, so describe it once.
+    layout = {
+        "shape": (WIDTH, HEIGHT),
+        "format": ArrayFormat.FLOAT32,
+        "num_channels": 1,
+        "surface_load_store": True,
+    }
+
+    first = Array.from_descriptor(**layout)
+    second = Array.from_descriptor(**layout)
+    return first, second
+
+
+def make_texture(arr):
+    """Create a TextureObject over `arr`: LINEAR filter, WRAP addressing, element-type reads."""
+    source = ResourceDescriptor.from_array(arr)
+    # WRAP/MIRROR addressing modes require normalized coordinates.
+    sampling = TextureDescriptor(
+        normalized_coords=True,
+        read_mode=ReadMode.ELEMENT_TYPE,
+        filter_mode=FilterMode.LINEAR,
+        address_mode=AddressMode.WRAP,
+    )
+    return TextureObject.from_descriptor(resource=source, texture_descriptor=sampling)
+
+
+def seed_state(stream, kernels, configs, write_surf, mode, seed_value):
+    """Queue the `seed` kernel to overwrite the array behind `write_surf`.
+
+    With `mode = SEED_MODE_CLEAR` the field is zeroed; with
+    `mode = SEED_MODE_BLOB` a Gaussian blob (peak ~SEED_PEAK) is placed at
+    the center, jittered by `seed_value` so that repeated reseeds differ.
+
+    `write_surf` must be a SurfaceObject that outlives this call. `launch`
+    is asynchronous, so a surface created and closed around the call could
+    be destroyed before the kernel has actually run against it.
+    """
+    seed_config, seed_kernel = configs["seed"], kernels["seed"]
+    # Explicit NumPy scalar types pin the width of every kernel argument;
+    # the surface handle is passed as a 64-bit integer.
+    kernel_args = (
+        np.uint64(write_surf.handle),
+        np.int32(WIDTH),
+        np.int32(HEIGHT),
+        np.int32(mode),
+        np.uint32(seed_value),
+        np.float32(SEED_RADIUS),
+        np.float32(SEED_PEAK),
+    )
+    launch(stream, seed_config, seed_kernel, *kernel_args)
+
+
+# ================================== main() ==================================
+
+
+def main():
+    # --- Step 1: Set up CUDA (compile kernels, create stream) ---
+    dev, stream, kernels, configs = setup_cuda()
+
+    # --- Step 2: Open a window ---
+    window, gl, pyglet = create_window()
+
+    # --- Step 3: Create GL resources for drawing a texture to screen ---
+    #     (Standard OpenGL boilerplate -- not CUDA-specific.)
+    shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT)
+
+    # --- Step 4: Create the Pixel Buffer Object (PBO) ---
+    #     The PBO is GPU memory owned by OpenGL. It's the bridge between the
+    #     two worlds: CUDA writes into it, OpenGL reads from it.
+    pbo_id, _ = create_pixel_buffer(gl, WIDTH, HEIGHT)
+
+    # --- Step 5: Register the PBO with CUDA ---
+    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")
+
+    # --- Step 6: Allocate the two ping-pong state Arrays ---
+    #     Both are single-channel `float` with `surface_load_store=True` so
+    #     they can be bound as SurfaceObjects.
+    arr_a, arr_b = make_state_arrays()
+
+    # --- Step 7: Pre-create the four bindless handles ---
+    #     Creating these once is much cheaper than rebuilding them every
+    #     step. The simulation loop just picks which read/write pair to use.
+    tex_a = make_texture(arr_a)
+    tex_b = make_texture(arr_b)
+    surf_a = SurfaceObject.from_array(arr_a)
+    surf_b = SurfaceObject.from_array(arr_b)
+
+    # --- Step 8: Precompute the bell-curve normalization constant ---
+    #     The neighborhood weight K(r) is unnormalized in the kernel; we
+    #     divide by sum(K) so the convolution is a weighted mean rather than
+    #     an unbounded integral. Doing this on the host once at startup is
+    #     much cheaper than redoing it on the device every step.
+    inv_weight_sum = compute_kernel_norm(R, MU_K, SIGMA_K)
+
+    # --- Step 9: Seed an initial Gaussian blob into arr_a (writes via surf_a) ---
+    seed_state(stream, kernels, configs, surf_a, SEED_MODE_BLOB, seed_value=0)
+    # After seeding, `arr_a` is the "current" state.
+    state = {"current": "a", "seed": 0}
+
+    # --- Step 10: Render loop ---
+    start_time = time.monotonic()
+    frame_count = 0
+    fps_time = start_time
+
+    def current_read_write():
+        # Returns (texture to read, surface to write, name of the next current).
+        reading_a = state["current"] == "a"
+        return (tex_a, surf_b, "b") if reading_a else (tex_b, surf_a, "a")
+
+    @window.event
+    def on_key_press(symbol, _modifiers):
+        # ESC quits. R and 1 both rewrite arr_a through surf_a, so each of
+        # them makes "a" the current state again once the seed kernel has
+        # been queued.
+        key = pyglet.window.key
+        if symbol == key.ESCAPE:
+            window.close()
+        elif symbol == key.R:
+            # Reseed with a fresh Gaussian blob; the seed is bumped first so
+            # the jitter pattern differs on every reseed.
+            state["seed"] += 1
+            seed_state(stream, kernels, configs, surf_a, SEED_MODE_BLOB, state["seed"])
+            state["current"] = "a"
+        elif symbol == key._1:
+            # Clear the field -- handy for confirming the simulation stays
+            # quiet when the state is all zeros.
+            seed_state(stream, kernels, configs, surf_a, SEED_MODE_CLEAR, 0)
+            state["current"] = "a"
+
+    @window.event
+    def on_draw():
+        nonlocal frame_count, fps_time
+
+        window.clear()
+
+        # (a) Run one Lenia step. The convolution kernel reads the current
+        #     state via a TextureObject (WRAP addressing makes the field
+        #     toroidal), evaluates the growth function, and writes the new
+        #     state via the other array's SurfaceObject. One step per frame
+        #     is intentional: dt = 0.1 is small, and the (2R+1)^2 = 729-tap
+        #     stencil is heavy enough that going faster would not help.
+        tex_read, surf_write, next_current = current_read_write()
+        launch(
+            stream,
+            configs["step"],
+            kernels["step"],
+            np.uint64(tex_read.handle),
+            np.uint64(surf_write.handle),
+            np.int32(WIDTH),
+            np.int32(HEIGHT),
+            np.int32(R),
+            np.float32(MU_K),
+            np.float32(SIGMA_K),
+            np.float32(MU),
+            np.float32(SIGMA),
+            np.float32(DT),
+            inv_weight_sum,
+        )
+        state["current"] = next_current
+
+        # (b) Colorize the state just written (now "current") into the GL PBO.
+        tex_read = tex_a if state["current"] == "a" else tex_b
+        with resource.map(stream=stream) as buf:
+            launch(
+                stream,
+                configs["colorize"],
+                kernels["colorize"],
+                np.uint64(tex_read.handle),
+                buf.handle,
+                np.int32(WIDTH),
+                np.int32(HEIGHT),
+            )
+        # Unmap happens automatically when the `with` block exits.
+
+        # (c) Tell OpenGL to copy the PBO contents into our texture.
+        copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT)
+
+        # (d) Draw the texture to the screen.
+        draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id)
+
+        # FPS counter (shown in window title)
+        frame_count += 1
+        now = time.monotonic()
+        if now - fps_time >= 1.0:
+            fps = frame_count / (now - fps_time)
+            window.set_caption(
+                "cuda.core Array/Texture/Surface - Lenia"
+                f" ({WIDTH}x{HEIGHT}, R={R}, {fps:.0f} FPS)"
+            )
+            frame_count = 0
+            fps_time = now
+
+    @window.event
+    def on_close():
+        # Launches are async: drain the stream before destroying the handles
+        # its kernels use. pyglet owns the event loop, so release explicitly.
+        stream.sync()
+        resource.close()
+        tex_a.close()
+        tex_b.close()
+        surf_a.close()
+        surf_b.close()
+        arr_a.close()
+        arr_b.close()
+        stream.close()
+
+    pyglet.app.run(interval=0)
+
+
+# ======================== GPU code (CUDA + GLSL) ============================
+#
+# These source strings are kept at the bottom of the file so they don't
+# distract from the Python logic above. The important things to know:
+#
+#   - KERNEL_SOURCE contains three CUDA C++ kernels:
+#       * seed_blob       -- sets the initial state via SurfaceObject writes.
+#                            Either clears the field (mode = 0) or paints a
+#                            Gaussian blob centered in the field (mode = 1).
+#       * convolve_lenia  -- reads previous state via TextureObject (with
+#                            LINEAR + WRAP bilinear filtering), integrates a
+#                            bell-shaped neighborhood K(r/R) to produce the
+#                            potential U, applies the growth function G(U),
+#                            and writes the next state via SurfaceObject.
+#       * colorize_lenia  -- reads the new state via TextureObject and writes
+#                            RGBA bytes into the OpenGL PBO using a simple
+#                            teal-on-black gradient.
+#
+#   - VERTEX_SHADER_SOURCE / FRAGMENT_SHADER_SOURCE are GLSL. They draw a
+#     texture onto a rectangle covering the entire window. Nothing interesting.
+#
+# ============================================================================
+
+KERNEL_SOURCE = r"""
+// All kernels run one thread per output pixel and bounds-check at the top.
+// `surf2Dwrite` takes the x offset in BYTES; for a single-channel float
+// surface that means `x * sizeof(float)` = `x * 4`.
+
+extern "C"
+__global__
+void seed_blob(cudaSurfaceObject_t surf,
+               int width, int height,
+               int mode,
+               unsigned int seed,
+               float radius,
+               float peak) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    float value = 0.0f;
+    if (mode == 1) {
+        // Gaussian blob centered in the field with a small deterministic
+        // jitter that breaks symmetry differently on each reseed.
+        float cx = (float)(width  / 2);
+        float cy = (float)(height / 2);
+        float dx = (float)x - cx;
+        float dy = (float)y - cy;
+        float r2 = dx * dx + dy * dy;
+        float inv = 1.0f / (radius * radius);
+        value = peak * expf(-r2 * inv);
+
+        unsigned int h = (unsigned int)x * 374761393u +
+                         (unsigned int)y * 668265263u + seed * 2246822519u;
+        h = (h ^ (h >> 13)) * 1274126177u;
+        h = h ^ (h >> 16);
+        float noise = (h & 0xffffu) / 65535.0f;  // in [0, 1]
+        value += 0.02f * (noise - 0.5f);
+        if (value < 0.0f) value = 0.0f;
+        if (value > 1.0f) value = 1.0f;
+    }
+
+    // float is 4 bytes; surf2Dwrite takes the x offset in BYTES.
+    surf2Dwrite(value, surf, x * (int)sizeof(float), y);
+}
+
+extern "C"
+__global__
+void convolve_lenia(cudaTextureObject_t tex,
+                    cudaSurfaceObject_t surf,
+                    int width, int height,
+                    int R,
+                    float mu_k, float sigma_k,
+                    float mu, float sigma,
+                    float dt,
+                    float inv_weight_sum) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    // Normalized texture coordinates: WRAP addressing requires them. The
+    // (x + dx + 0.5) / W idiom places the sample at the texel center; values
+    // outside [0, 1] are fine because WRAP wraps them toroidally.
+    float inv_w = 1.0f / (float)width;
+    float inv_h = 1.0f / (float)height;
+    float inv_R = 1.0f / (float)R;
+    float inv_two_sigma_k2 = 1.0f / (2.0f * sigma_k * sigma_k);
+    float inv_two_sigma2   = 1.0f / (2.0f * sigma     * sigma);
+
+    // Integrate the bell-shaped weight K(r/R) against the current state.
+    float U = 0.0f;
+    for (int dy = -R; dy <= R; ++dy) {
+        for (int dx = -R; dx <= R; ++dx) {
+            float fdx = (float)dx;
+            float fdy = (float)dy;
+            float r2 = fdx * fdx + fdy * fdy;
+            float r  = sqrtf(r2);
+            if (r > (float)R) continue;   // restrict to the disk
+            float rn = r * inv_R - mu_k;
+            float w  = expf(-(rn * rn) * inv_two_sigma_k2);
+
+            float sx = ((float)x + fdx + 0.5f) * inv_w;
+            float sy = ((float)y + fdy + 0.5f) * inv_h;
+            float s  = tex2D<float>(tex, sx, sy);
+            U += w * s;
+        }
+    }
+    U *= inv_weight_sum;   // host-precomputed 1 / sum(K)
+
+    // Read the current cell value (point sample at the texel center).
+    float sx0 = ((float)x + 0.5f) * inv_w;
+    float sy0 = ((float)y + 0.5f) * inv_h;
+    float state = tex2D<float>(tex, sx0, sy0);
+
+    // Growth function G(U) = 2 * exp(-(U - mu)^2 / (2 * sigma^2)) - 1,
+    // mapping U near mu to +1 (grow) and U far from mu to -1 (shrink).
+    float du = U - mu;
+    float G  = 2.0f * expf(-(du * du) * inv_two_sigma2) - 1.0f;
+
+    float new_state = state + dt * G;
+    if (new_state < 0.0f) new_state = 0.0f;
+    if (new_state > 1.0f) new_state = 1.0f;
+
+    surf2Dwrite(new_state, surf, x * (int)sizeof(float), y);
+}
+
+extern "C"
+__global__
+void colorize_lenia(cudaTextureObject_t tex,
+                    unsigned char* output,
+                    int width, int height) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    float inv_w = 1.0f / (float)width;
+    float inv_h = 1.0f / (float)height;
+    float cx = ((float)x + 0.5f) * inv_w;
+    float cy = ((float)y + 0.5f) * inv_h;
+
+    float v = tex2D<float>(tex, cx, cy);
+    if (v < 0.0f) v = 0.0f;
+    if (v > 1.0f) v = 1.0f;
+
+    // Linear interpolation from a deep teal at v = 0 to a bright teal at
+    // v = 1. Two stops -- simple, easy to read, no LUT required.
+    //   (0, 15, 30, 255)  ->  (50, 200, 180, 255)
+    float r = (  0.0f + v * ( 50.0f -   0.0f));
+    float g = ( 15.0f + v * (200.0f -  15.0f));
+    float b = ( 30.0f + v * (180.0f -  30.0f));
+
+    int idx = (y * width + x) * 4;
+    output[idx + 0] = (unsigned char)r;
+    output[idx + 1] = (unsigned char)g;
+    output[idx + 2] = (unsigned char)b;
+    output[idx + 3] = 255;
+}
+"""
+
+# GLSL shaders -- these just display a texture on a fullscreen rectangle.
+# Nothing CUDA-specific here.
+
+VERTEX_SHADER_SOURCE = """#version 330 core
+in vec2 position;
+in vec2 texcoord;
+out vec2 v_texcoord;
+void main() {
+    gl_Position = vec4(position, 0.0, 1.0);
+    v_texcoord = texcoord;
+}
+"""
+
+FRAGMENT_SHADER_SOURCE = """#version 330 core
+in vec2 v_texcoord;
+out vec4 fragColor;
+uniform sampler2D tex;
+void main() {
+    fragColor = texture(tex, v_texcoord);
+}
+"""
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/examples/gl_interop_mandelbrot.py b/cuda_core/examples/gl_interop_mandelbrot.py
new file mode 100644
index 00000000000..11abca54c22
--- /dev/null
+++ b/cuda_core/examples/gl_interop_mandelbrot.py
@@ -0,0 +1,692 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This example demonstrates cuda.core.Array and TextureObject used as a *color
+# lookup table* (palette LUT) for a real-time Mandelbrot deep-zoom explorer.
+# A CUDA kernel computes smooth iteration counts and uses tex1D<float4> with
+# LINEAR + CLAMP + NORMALIZED_FLOAT sampling to read a 256-entry RGBA palette,
+# writing the final RGBA bytes straight into an OpenGL PBO via GraphicsResource.
+# Requires pyglet.
+#
+# ################################################################################
+
+# What this example teaches
+# =========================
+# - How to use a 1D cuda.core.Array as a palette and bind it via a
+#   TextureObject for hardware-filtered color lookups inside a kernel.
+# - How LINEAR + AddressMode.CLAMP + ReadMode.NORMALIZED_FLOAT + normalized
+#   coordinates give you a free `texture(palette, t)` style sampler that
+#   returns a float4 in [0, 1] regardless of the underlying storage format.
+# - How to drive a real-time interactive viewer: mouse pan, scroll-wheel zoom
+#   anchored at the cursor, and key-driven iteration cap.
+#
+# How it works
+# ============
+# The Mandelbrot set is defined by iterating z -> z^2 + c starting from
+# z = 0; pixels are colored by how quickly z escapes the disk of radius 2.
+#
+#     +---------+   ResourceDescriptor.from_array
+#     |  Array  | --------------------------------+
+#     | float4  |                                 v
+#     | size 256|                       +-------------------+
+#     +---------+                       |   TextureObject   |
+#       ^  copy_from(host)              |  (palette LUT)    |
+#       |                               +---------+---------+
+#     host palette                                |
+#     (numpy float32x4, 256 stops)                |
+#                                                 v
+#                                  tex1D<float4>(palette, t)
+#                                                 |
+#                                                 v
+#                                     +-----------------------+
+#                                     |  mandelbrot kernel    |
+#                                     |  (one thread / pixel) |
+#                                     +-----------+-----------+
+#                                                 |
+#                                                 v   GraphicsResource.map
+#                                     +-----------------------+
+#                                     |   OpenGL PBO (RGBA8)  |
+#                                     +-----------------------+
+#
+# Smooth iteration count
+# ----------------------
+# A plain integer escape count produces ugly banded colors. With a bailout
+# radius R = 2 (escape when |z|^2 > 4), we use the standard smooth formula:
+#
+#     mu = iter + 1 - log(log(|z|)) / log(2)
+#
+# At the escape step |z| > 2, so log(|z|) > log(2) > 0 and log(log(|z|)) is
+# finite. We compute this in double and cast to float for the palette lookup.
+#
+# Cursor-anchored zoom
+# --------------------
+# On scroll, we want the world point under the mouse cursor to remain under
+# the cursor after the zoom. We capture (wx, wy) under the cursor with the
+# old scale, multiply the scale by 0.9 (zoom in) or 1.1 (zoom out), then
+# back-solve cx, cy so the same screen pixel still maps to (wx, wy):
+#
+#     cx_new = wx - (mouse_x - W/2) * scale_new
+#     cy_new = wy - (mouse_y - H/2) * scale_new
+#
+# Why double precision for cx, cy, scale?
+# ---------------------------------------
+# Float32 runs out of mantissa bits around 1e6x zoom; double gets you to
+# roughly 1e13x before the pixel grid coarsens visibly. The kernel takes
+# cx, cy, scale as doubles and only narrows to float for the color lookup.
+#
+# Address mode note
+# -----------------
+# We use AddressMode.CLAMP. The `fmodf(mu * 0.02f, 1.0f)` cycling formula
+# already keeps the palette index in [0, 1), so no lookup is ever out of
+# range; CLAMP and WRAP can differ only within half a texel of either end
+# of the palette, where WRAP would blend the first and last entries.
+#
+# What you should see
+# ===================
+# A window showing the Mandelbrot set. Drag with the left mouse button to
+# pan, scroll the wheel to zoom in/out at the cursor, press R to reset the
+# view, and `[`/`]` to lower/raise the iteration cap. The window title shows
+# the current zoom level, center, max_iter, and FPS. Close the window or
+# press Escape to exit.
+#
+
+# /// script
+# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"]
+# ///
+
+import ctypes
+import sys
+import time
+
+import numpy as np
+
+from cuda.core import (
+    AddressMode,
+    Array,
+    ArrayFormat,
+    Device,
+    FilterMode,
+    GraphicsResource,
+    LaunchConfig,
+    Program,
+    ProgramOptions,
+    ReadMode,
+    ResourceDescriptor,
+    TextureDescriptor,
+    TextureObject,
+    launch,
+)
+
+# ---------------------------------------------------------------------------
+# Window and viewer parameters (feel free to change these)
+# ---------------------------------------------------------------------------
+WIDTH = 1024
+HEIGHT = 768
+PALETTE_SIZE = 256
+
+# Default view: classic Mandelbrot framing centered slightly left of origin.
+DEFAULT_CX = -0.5
+DEFAULT_CY = 0.0
+DEFAULT_SCALE = 4.0 / HEIGHT  # world-units per pixel (4-unit-tall view)
+DEFAULT_MAX_ITER = 512
+
+# Bounds for [/] iteration adjust.
+MIN_MAX_ITER = 64
+MAX_MAX_ITER = 8192
+ITER_STEP = 64
+
+
+# ============================= Helper functions =============================
+#
+# The functions below set up CUDA and OpenGL. If you're here to learn about
+# Array/TextureObject as a palette LUT, skip ahead to main() -- the interesting
+# part is there. These helpers exist so that main() reads like a short story
+# instead of a wall of boilerplate.
+# ============================================================================
+
+
+def setup_cuda():
+    """Select device 0, compile the kernel, and return (device, stream, kernel, config)."""
+    device = Device(0)
+    device.set_current()
+
+    # Bindless texture objects (cuTexObjectCreate) require SM 3.0+.
+    capability = device.compute_capability
+    if capability.major < 3:
+        message = (
+            "This example requires a GPU with compute capability >= 3.0 for "
+            f"bindless texture objects. Found sm_{capability.major}{capability.minor}."
+        )
+        print(message, file=sys.stderr)
+        sys.exit(1)
+
+    stream = device.create_stream()
+
+    # Compile as C++ so the templated tex1D<float4> overload resolves.
+    options = ProgramOptions(std="c++17", arch=f"sm_{device.arch}")
+    program = Program(KERNEL_SOURCE, code_type="c++", options=options)
+    module = program.compile("cubin", name_expressions=("mandelbrot",))
+    kernel = module.get_kernel("mandelbrot")
+
+    # 16x16 thread blocks; the grid is rounded up so it covers every pixel.
+    block_x, block_y = 16, 16
+    launch_grid = (
+        -(-WIDTH // block_x),  # ceil(WIDTH / block_x)
+        -(-HEIGHT // block_y),  # ceil(HEIGHT / block_y)
+        1,
+    )
+    config = LaunchConfig(grid=launch_grid, block=(block_x, block_y, 1))
+
+    return device, stream, kernel, config
+
+
+def create_window():
+    """Open the pyglet viewer window and return (window, gl_module, pyglet).
+
+    pyglet is imported here so a missing install yields a readable error.
+    """
+    try:
+        import pyglet
+        from pyglet.gl import gl as gl_module
+    except ImportError:
+        hint = "This example requires pyglet >= 2.0.\nInstall it with:  pip install pyglet"
+        print(hint, file=sys.stderr)
+        sys.exit(1)
+
+    window_options = {
+        "caption": "cuda.core Array/Texture - Mandelbrot Deep Zoom",
+        "vsync": False,
+    }
+    viewer = pyglet.window.Window(WIDTH, HEIGHT, **window_options)
+    return viewer, gl_module, pyglet
+
+
+def create_display_resources(gl, width, height):
+    """Create the GL objects needed to show a texture on screen.
+
+    This sets up a shader program, a fullscreen quad, and an empty texture.
+    None of this is CUDA-specific -- it's standard OpenGL boilerplate for
+    rendering a textured quad.
+
+    Returns (shader_program, vertex_array_id, texture_id). The shader_program
+    is a pyglet ShaderProgram object (must be kept alive).
+    """
+    from pyglet.graphics.shader import Shader, ShaderProgram
+
+    # Shader program -- just passes texture coordinates through
+    vert = Shader(VERTEX_SHADER_SOURCE, "vertex")
+    frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment")
+    shader_prog = ShaderProgram(vert, frag)
+
+    # Fullscreen quad (two triangles covering the entire window)
+    quad_verts = np.array(
+        [
+            # Each vertex is 4 values: x, y (position), then s, t (texcoord).
+            -1,  # triangle 1, vertex 0: bottom-left
+            -1,
+            0,
+            0,
+            1,  # triangle 1, vertex 1: bottom-right
+            -1,
+            1,
+            0,
+            1,  # triangle 1, vertex 2: top-right
+            1,
+            1,
+            1,
+            -1,  # triangle 2, vertex 0: bottom-left
+            -1,
+            0,
+            0,
+            1,  # triangle 2, vertex 1: top-right
+            1,
+            1,
+            1,
+            -1,  # triangle 2, vertex 2: top-left
+            1,
+            0,
+            1,
+        ],
+        dtype=np.float32,
+    )
+
+    vao = ctypes.c_uint(0)
+    gl.glGenVertexArrays(1, ctypes.byref(vao))
+    gl.glBindVertexArray(vao.value)
+
+    vbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(vbo))
+    gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value)
+    gl.glBufferData(
+        gl.GL_ARRAY_BUFFER,
+        quad_verts.nbytes,
+        quad_verts.ctypes.data_as(ctypes.c_void_p),
+        gl.GL_STATIC_DRAW,
+    )
+
+    stride = 4 * 4  # 4 floats * 4 bytes each = 16 bytes per vertex
+    pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position")
+    gl.glEnableVertexAttribArray(pos_loc)
+    gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0))
+
+    tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord")
+    gl.glEnableVertexAttribArray(tc_loc)
+    gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8))
+
+    gl.glBindVertexArray(0)
+
+    # Empty texture (will be filled each frame from the PBO)
+    tex = ctypes.c_uint(0)
+    gl.glGenTextures(1, ctypes.byref(tex))
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_LINEAR)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_LINEAR)
+    gl.glTexImage2D(
+        gl.GL_TEXTURE_2D,
+        0,
+        gl.GL_RGBA8,
+        width,
+        height,
+        0,
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,  # no initial pixel data is supplied here
+    )
+
+    return shader_prog, vao.value, tex.value
+
+
+def create_pixel_buffer(gl, width, height):
+    """Allocate the Pixel Buffer Object (PBO) that CUDA and OpenGL share.
+
+    A PBO is a GPU-side buffer that OpenGL can read from when uploading
+    pixels to a texture. Once this same buffer is registered with CUDA, a
+    CUDA kernel can write directly into it.
+
+    Returns (pbo_gl_name, size_in_bytes).
+    """
+    size_in_bytes = 4 * width * height  # RGBA, 1 byte per channel
+    pbo_name = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(pbo_name))
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo_name.value)
+    gl.glBufferData(gl.GL_PIXEL_UNPACK_BUFFER, size_in_bytes, None, gl.GL_DYNAMIC_DRAW)
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)
+    return pbo_name.value, size_in_bytes
+
+
+def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height):
+    """Upload the PBO's pixels into the GL texture (a GPU-to-GPU copy).
+
+    The PBO is bound as the unpack buffer only for the duration of the
+    upload and is unbound again before returning.
+    """
+    unpack_target = gl.GL_PIXEL_UNPACK_BUFFER
+    gl.glBindBuffer(unpack_target, pbo_id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+
+    # Mip level 0, offset (0, 0), full width x height. The final None means
+    # "read from the currently bound PBO", not from a CPU pointer.
+    region = (0, 0, 0, width, height)
+    pixel_layout = (gl.GL_RGBA, gl.GL_UNSIGNED_BYTE)
+    gl.glTexSubImage2D(gl.GL_TEXTURE_2D, *region, *pixel_layout, None)
+    gl.glBindBuffer(unpack_target, 0)
+
+
+def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id):
+    """Draw texture `tex_id` with `shader_prog` using the quad stored in `vao_id`."""
+    gl.glUseProgram(shader_prog.id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glBindVertexArray(vao_id)
+    gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6)  # 6 vertices = the quad's two triangles
+    gl.glBindVertexArray(0)  # unbind the VAO and program again before returning
+    gl.glUseProgram(0)
+
+
+def build_palette():
+    """Build a PALETTE_SIZE-entry RGBA float32 palette by lerping through color stops.
+
+    Returns a flat numpy array of shape (PALETTE_SIZE * 4,) dtype=float32
+    suitable for Array.copy_from(). Each color channel is in [0, 1].
+    """
+    # Hand-picked stops: deep blue -> cyan -> yellow -> orange -> red ->
+    # magenta -> black (the final stop is used by points that hit max_iter
+    # and don't escape).
+    stops = np.array(
+        [
+            [0.00, 0.02, 0.05, 0.30, 1.0],  # deep blue (columns: position, R, G, B, A)
+            [0.16, 0.10, 0.50, 0.90, 1.0],  # cyan
+            [0.42, 1.00, 0.95, 0.20, 1.0],  # yellow
+            [0.58, 1.00, 0.55, 0.10, 1.0],  # orange
+            [0.74, 0.95, 0.10, 0.10, 1.0],  # red
+            [0.90, 0.65, 0.10, 0.85, 1.0],  # magenta
+            [1.00, 0.00, 0.00, 0.00, 1.0],  # black
+        ],
+        dtype=np.float32,
+    )
+
+    pal = np.empty((PALETTE_SIZE, 4), dtype=np.float32)
+    positions = stops[:, 0]
+    colors = stops[:, 1:]
+    for i in range(PALETTE_SIZE):
+        t = i / (PALETTE_SIZE - 1)  # palette coordinate in [0, 1]
+        # Find the segment [positions[j], positions[j + 1]] that brackets t.
+        j = int(np.searchsorted(positions, t, side="right")) - 1
+        j = max(0, min(j, len(positions) - 2))  # t == 1.0 stays in the last segment
+        t0 = positions[j]
+        t1 = positions[j + 1]
+        seg = (t - t0) / (t1 - t0) if t1 > t0 else 0.0  # fraction along the segment
+        pal[i] = colors[j] + seg * (colors[j + 1] - colors[j])
+
+    # Flatten to (PALETTE_SIZE * 4,) so the byte layout matches a
+    # float4 x PALETTE_SIZE 1D Array.
+    return np.ascontiguousarray(pal.reshape(-1), dtype=np.float32)
+
+
+def make_palette_texture(arr):
+    """Return a TextureObject over `arr`: LINEAR filter, CLAMP addressing, normalized coords."""
+    res_desc = ResourceDescriptor.from_array(arr)
+    tex_desc = TextureDescriptor(
+        address_mode=AddressMode.CLAMP,
+        filter_mode=FilterMode.LINEAR,
+        # NORMALIZED_FLOAT matters for integer storage such as UINT8; for the
+        # FLOAT32 texels used here it is expected to be a no-op (NOTE(review):
+        # confirm). It is set to document the intent for UINT8 palettes.
+        read_mode=ReadMode.NORMALIZED_FLOAT,
+        normalized_coords=True,
+    )
+    return TextureObject.from_descriptor(resource=res_desc, texture_descriptor=tex_desc)
+
+
+# ================================== main() ==================================
+
+
+def main():
+    # --- Step 1: Set up CUDA (compile kernel, create stream) ---
+    dev, stream, kernel, config = setup_cuda()
+
+    # --- Step 2: Open a window ---
+    window, gl, pyglet = create_window()
+
+    # --- Step 3: Create GL resources for drawing a texture to screen ---
+    #     (Standard OpenGL boilerplate -- not CUDA-specific.)
+    shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT)
+
+    # --- Step 4: Create the Pixel Buffer Object (PBO) ---
+    #     The PBO is GPU memory owned by OpenGL. It's the bridge between the
+    #     two worlds: CUDA writes into it, OpenGL reads from it.
+    pbo_id, _ = create_pixel_buffer(gl, WIDTH, HEIGHT)
+
+    # --- Step 5: Register the PBO with CUDA ---
+    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")
+
+    # --- Step 6: Build and upload the palette LUT ---
+    #     One 1D Array, 256 entries of float4 RGBA. The host-side palette is
+    #     a flat numpy float32 array; copy_from() does an async H2D copy, so
+    #     we sync the stream once afterwards to make sure the data has landed
+    #     before we start sampling from it in the render loop.
+    host_palette = build_palette()
+    palette_arr = Array.from_descriptor(
+        shape=(PALETTE_SIZE,),
+        format=ArrayFormat.FLOAT32,
+        num_channels=4,
+    )
+    palette_arr.copy_from(host_palette, stream=stream)
+    stream.sync()
+
+    # --- Step 7: Bind the palette Array as a TextureObject (LUT) ---
+    palette_tex = make_palette_texture(palette_arr)
+
+    # --- Step 8: Render loop ---
+    start_time = time.monotonic()
+    frame_count = 0
+    fps_time = start_time
+
+    # View state. cx, cy, scale are kept in Python floats (double precision)
+    # and converted to np.float64 on each kernel launch.
+    view = {
+        "cx": float(DEFAULT_CX),
+        "cy": float(DEFAULT_CY),
+        "scale": float(DEFAULT_SCALE),
+        "max_iter": int(DEFAULT_MAX_ITER),
+        # Pan-drag state (left mouse button).
+        "dragging": False,
+    }
+
+    def screen_to_world(mouse_x, mouse_y):
+        """Return the world-space point shown at pyglet mouse position (x, y).
+
+        Both pyglet's window and the rendered texture put their origin at the
+        bottom-left, so the mapping needs no y-flip.
+        """
+        scale = view["scale"]
+        offset_x, offset_y = mouse_x - WIDTH / 2.0, mouse_y - HEIGHT / 2.0
+        return view["cx"] + offset_x * scale, view["cy"] + offset_y * scale
+
+    @window.event
+    def on_key_press(symbol, _modifiers):
+        # ESC quits, R restores the default view, `[` / `]` step the
+        # iteration cap down / up within [MIN_MAX_ITER, MAX_MAX_ITER].
+        key = pyglet.window.key
+        if symbol == key.ESCAPE:
+            window.close()
+        elif symbol == key.R:
+            view.update(
+                cx=float(DEFAULT_CX),
+                cy=float(DEFAULT_CY),
+                scale=float(DEFAULT_SCALE),
+                max_iter=int(DEFAULT_MAX_ITER),
+            )
+        elif symbol == key.BRACKETLEFT:
+            view["max_iter"] = max(MIN_MAX_ITER, view["max_iter"] - ITER_STEP)
+        elif symbol == key.BRACKETRIGHT:
+            view["max_iter"] = min(MAX_MAX_ITER, view["max_iter"] + ITER_STEP)
+
+    @window.event
+    def on_mouse_press(_x, _y, button, _modifiers):
+        if button == pyglet.window.mouse.LEFT:
+            view["dragging"] = True  # left button went down: a pan may follow
+
+    @window.event
+    def on_mouse_release(_x, _y, button, _modifiers):
+        if button == pyglet.window.mouse.LEFT:
+            view["dragging"] = False  # left button released: the pan is over
+
+    @window.event
+    def on_mouse_drag(_x, _y, dx, dy, buttons, _modifiers):
+        if buttons & pyglet.window.mouse.LEFT:
+            # Pan: shift the view center opposite to the cursor motion on both
+            # axes so the world point under the cursor stays under it. This is
+            # the inverse of screen_to_world(), which applies no y-flip.
+            view["cx"] -= dx * view["scale"]
+            view["cy"] -= dy * view["scale"]
+
+    @window.event
+    def on_mouse_scroll(x, y, _scroll_x, scroll_y):
+        if scroll_y == 0:
+            return  # horizontal-only scroll: leave the zoom untouched
+        # Cursor-anchored zoom: rescale, then re-pin (wx, wy) under pixel (x, y).
+        wx, wy = screen_to_world(x, y)
+        view["scale"] *= 0.9 if scroll_y > 0 else 1.1
+        view["cx"] = wx - (x - WIDTH / 2.0) * view["scale"]
+        view["cy"] = wy - (y - HEIGHT / 2.0) * view["scale"]
+
+    @window.event
+    def on_draw():
+        nonlocal frame_count, fps_time
+
+        window.clear()
+
+        # (a) Map the PBO so CUDA can write to it. This gives us a Buffer
+        #     whose .handle is a device pointer pointing into the GL PBO.
+        with resource.map(stream=stream) as buf:
+            launch(
+                stream,
+                config,
+                kernel,
+                np.uint64(palette_tex.handle),  # bindless texture handle
+                buf.handle,                     # output PBO (RGBA8)
+                np.int32(WIDTH),
+                np.int32(HEIGHT),
+                np.float64(view["cx"]),
+                np.float64(view["cy"]),
+                np.float64(view["scale"]),
+                np.int32(view["max_iter"]),
+            )
+        # Unmap happens automatically when the `with` block exits.
+
+        # (b) Tell OpenGL to copy the PBO contents into our texture.
+        copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT)
+
+        # (c) Draw the texture to the screen.
+        draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id)
+
+        # FPS counter (shown in window title)
+        frame_count += 1
+        now = time.monotonic()
+        if now - fps_time >= 1.0:
+            fps = frame_count / (now - fps_time)
+            zoom = 1.0 / view["scale"] if view["scale"] > 0 else 0.0
+            window.set_caption(
+                "cuda.core Array/Texture - Mandelbrot"
+                f" | zoom {zoom:.3e}x"
+                f" | center ({view['cx']:.6f}, {view['cy']:.6f})"
+                f" | iter {view['max_iter']}"
+                f" | {fps:.0f} FPS"
+            )
+            frame_count = 0
+            fps_time = now
+
+    @window.event
+    def on_close():
+        # Release everything we opened, in reverse order. Each of these is a
+        # context manager too, but pyglet owns the event loop here so we
+        # release explicitly.
+        resource.close()
+        palette_tex.close()
+        palette_arr.close()
+        stream.close()
+
+    pyglet.app.run(interval=0)
+
+
+# ======================== GPU code (CUDA + GLSL) ============================
+#
+# These source strings are kept at the bottom of the file so they don't
+# distract from the Python logic above. The important things to know:
+#
+#   - KERNEL_SOURCE is a single CUDA C++ kernel `mandelbrot` that computes a
+#     smooth iteration count per pixel and looks up the color via
+#     tex1D<float4>(palette, t). Coordinates and the scale factor are doubles
+#     to support deep zooms; only the color lookup runs in single precision.
+#
+#   - VERTEX_SHADER_SOURCE / FRAGMENT_SHADER_SOURCE are GLSL. They draw a
+#     texture onto a rectangle covering the entire window. Nothing interesting.
+#
+# ============================================================================
+
+KERNEL_SOURCE = r"""
+// Mandelbrot deep-zoom kernel with a TextureObject palette LUT.
+//
+// Each thread computes one pixel. Coordinates and scale are doubles so the
+// zoom doesn't quantize at modest depth. Once we have the smooth iteration
+// count we narrow to float and use tex1D<float4> to read the palette.
+
+extern "C"
+__global__
+void mandelbrot(cudaTextureObject_t palette,
+                unsigned char* output,
+                int width, int height,
+                double cx, double cy, double scale,
+                int max_iter) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    // Map pixel -> complex plane (doubles).
+    double c_re = cx + ((double)x - 0.5 * (double)width)  * scale;
+    double c_im = cy + ((double)y - 0.5 * (double)height) * scale;
+
+    // Standard escape iteration with bailout radius 2 (compare squared norm
+    // against 4 to skip the sqrt in the inner loop).
+    double zr = 0.0;
+    double zi = 0.0;
+    double zr2 = 0.0;
+    double zi2 = 0.0;
+    int iter = 0;
+    while (iter < max_iter && (zr2 + zi2) <= 4.0) {
+        zi = 2.0 * zr * zi + c_im;
+        zr = zr2 - zi2 + c_re;
+        zr2 = zr * zr;
+        zi2 = zi * zi;
+        ++iter;
+    }
+
+    unsigned char r, g, b;
+    if (iter >= max_iter) {
+        // Inside the set (or close enough): solid black.
+        r = 0;
+        g = 0;
+        b = 0;
+    } else {
+        // Smooth iteration count:
+        //   mu = iter + 1 - log(log(|z|)) / log(2)
+        //      = iter + 1 - log(0.5 * log(|z|^2)) / log(2)
+        // At escape, |z|^2 > 4, so 0.5 * log(|z|^2) > log(2) > 0 -- the
+        // outer log is well-defined. Compute in double, narrow to float
+        // for the palette lookup.
+        double log_zn = 0.5 * log(zr2 + zi2);
+        double nu = log(log_zn) / log(2.0);
+        float mu = (float)((double)(iter + 1) - nu);
+
+        // Cycle through the palette: 0.02 controls how quickly we wrap
+        // through the gradient as the iteration count climbs.
+        float t = fmodf(mu * 0.02f, 1.0f);
+        if (t < 0.0f) t += 1.0f;  // fmodf can return negative for negative mu
+
+        float4 rgba = tex1D<float4>(palette, t);
+
+        // Clamp before narrowing to bytes.
+        float fr = rgba.x; if (fr < 0.0f) fr = 0.0f; if (fr > 1.0f) fr = 1.0f;
+        float fg = rgba.y; if (fg < 0.0f) fg = 0.0f; if (fg > 1.0f) fg = 1.0f;
+        float fb = rgba.z; if (fb < 0.0f) fb = 0.0f; if (fb > 1.0f) fb = 1.0f;
+        r = (unsigned char)(fr * 255.0f);
+        g = (unsigned char)(fg * 255.0f);
+        b = (unsigned char)(fb * 255.0f);
+    }
+
+    int idx = (y * width + x) * 4;
+    output[idx + 0] = r;
+    output[idx + 1] = g;
+    output[idx + 2] = b;
+    output[idx + 3] = 255;
+}
+"""
+
+# GLSL shaders -- these just display a texture on a fullscreen rectangle.
+# Nothing CUDA-specific here.
+
+VERTEX_SHADER_SOURCE = """#version 330 core
+in vec2 position;
+in vec2 texcoord;
+out vec2 v_texcoord;
+void main() {
+    gl_Position = vec4(position, 0.0, 1.0);
+    v_texcoord = texcoord;
+}
+"""
+
+FRAGMENT_SHADER_SOURCE = """#version 330 core
+in vec2 v_texcoord;
+out vec4 fragColor;
+uniform sampler2D tex;
+void main() {
+    fragColor = texture(tex, v_texcoord);
+}
+"""
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/examples/gl_interop_mipmap_lod.py b/cuda_core/examples/gl_interop_mipmap_lod.py
new file mode 100644
index 00000000000..38b09513464
--- /dev/null
+++ b/cuda_core/examples/gl_interop_mipmap_lod.py
@@ -0,0 +1,717 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This example demonstrates the new cuda.core texture/surface stack:
+# MipmappedArray, SurfaceObject, and a TextureObject that does trilinear
+# (LINEAR mipmap + LINEAR filter) sampling with user-controlled LOD bias.
+# Requires pyglet.
+#
+# ################################################################################
+
+# What this example teaches
+# =========================
+# How to allocate a mipmap pyramid as a single MipmappedArray, populate each
+# level from a CUDA kernel by binding it as a SurfaceObject, and then sample
+# the whole pyramid from a TextureObject with manual LOD bias.
+#
+# How it works
+# ============
+# A mipmap pyramid is a stack of progressively-halved images of the same
+# texture. The base level (level 0) holds the highest-resolution version; each
+# subsequent level is a 2x2 box-filtered downsample of the level below it:
+#
+#     level 0: 512 x 512   <- highest detail
+#     level 1: 256 x 256
+#     level 2: 128 x 128
+#     ...
+#     level 9:   1 x 1     <- a single average color
+#
+# At sample time, the GPU picks the mip level that best matches the on-screen
+# size of the texel, optionally blending between adjacent levels (trilinear).
+# Selecting a coarser level than the "right" one is called a positive LOD bias
+# and produces a softer/blurrier image; a negative bias selects finer levels
+# (sharper but more aliased when undersampled).
+#
+#   +----------------------+       +-----------------------+
+#   |   MipmappedArray     |       |   TextureObject       |
+#   | (single allocation,  | <---  | (samples the whole    |
+#   |  10 mip levels)      |       |  pyramid w/ trilinear |
+#   +----------------------+       |  filtering)           |
+#         ^      ^                 +-----------------------+
+#         |      |
+#         |      +---- one SurfaceObject per level, used at BUILD time only
+#         |            to let a kernel write pixels into that level.
+#         |
+#         +----------- get_level(L) returns a NON-OWNING Array view of level L;
+#                      the storage belongs to the parent MipmappedArray.
+#
+#   STARTUP -- one-time mipmap build
+#   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#   1. Allocate MipmappedArray (10 levels, float4 RGBA, surface_load_store=True).
+#   2. Level 0: launch `seed_base` kernel -> SurfaceObject -> high-frequency
+#      procedural pattern.
+#   3. For L = 1..num_levels-1: launch `downsample` kernel:
+#        - reads level L-1 through a TextureObject (POINT-filtered)
+#        - writes level L   through a SurfaceObject
+#        - 4-sample box average of the parent's 2x2 footprint.
+#
+#   PER FRAME (render loop)
+#   ~~~~~~~~~~~~~~~~~~~~~~~
+#   The display TextureObject samples the whole pyramid with `tex2DLod`,
+#   where the LOD is computed per-pixel as `log2(zoom) + lod_bias`. The result
+#   is written to a GL PBO via GraphicsResource, then drawn as a textured quad.
+#
+# What you should see
+# ===================
+# A 512x512 procedural pattern (concentric rings + diagonal grid) shown
+# stretched across the window. Use the mouse wheel to zoom in/out (this
+# implicitly changes the LOD), and use the bracket keys `[` / `]` to add a
+# manual LOD bias on top of that. Press `R` to reset.
+#
+#   Mouse wheel       zoom in / out
+#   [                 LOD bias -= 0.25  (sharper, more aliased)
+#   ]                 LOD bias += 0.25  (blurrier, samples a coarser level)
+#   R                 reset zoom + bias
+#   Escape / close    quit
+#
+# The window title shows the current zoom, manual bias, and effective LOD.
+# Close the window or press Escape to exit.
+#
+
+# /// script
+# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"]
+# ///
+
+import ctypes
+import math
+import sys
+import time
+
+import numpy as np
+
+from cuda.core import (
+    AddressMode,
+    Array,
+    ArrayFormat,
+    Device,
+    FilterMode,
+    GraphicsResource,
+    LaunchConfig,
+    MipmappedArray,
+    Program,
+    ProgramOptions,
+    ReadMode,
+    ResourceDescriptor,
+    SurfaceObject,
+    TextureDescriptor,
+    TextureObject,
+    launch,
+)
+
+# ---------------------------------------------------------------------------
+# Configuration (feel free to change these)
+# ---------------------------------------------------------------------------
+WIDTH = 800
+HEIGHT = 600
+BASE_SIZE = 512  # Texture base-level edge length (must be a power of two).
+LOD_BIAS_STEP = 0.25
+
+
+# ============================= Helper functions =============================
+#
+# The functions below set up CUDA, OpenGL, and the mipmap pyramid. If you're
+# here to learn about MipmappedArray / SurfaceObject / mipmapped TextureObject,
+# you can skip straight to main() -- the interesting part is there. These
+# helpers exist so that main() reads like a short story.
+# ============================================================================
+
+
+def _check_compute_capability(dev):
+    """Exit with status 1 unless `dev` is sm_30+ (surfaces and mipmaps need it)."""
+    cc = dev.compute_capability
+    if cc.major >= 3:
+        return
+    message = (
+        f"This example requires compute capability >= 3.0, got sm_{cc.major}{cc.minor}."
+    )
+    print(message, file=sys.stderr)
+    sys.exit(1)
+
+
+def setup_cuda():
+    """Select device 0, build the CUDA program, and look up its kernels.
+
+    Returns
+    -------
+    (dev, stream, kernels, arch_str)
+        kernels maps each of "seed_base", "downsample", and "display" to the
+        kernel of that name; arch_str is the "sm_XY" string compiled for.
+    """
+    device = Device(0)
+    device.set_current()
+    _check_compute_capability(device)
+    work_stream = device.create_stream()
+
+    # One arch string is shared by the compile options and the return value.
+    arch_str = f"sm_{device.arch}"
+    kernel_names = ("seed_base", "downsample", "display")
+
+    options = ProgramOptions(std="c++17", arch=arch_str)
+    program = Program(KERNEL_SOURCE, code_type="c++", options=options)
+    module = program.compile("cubin", name_expressions=kernel_names)
+
+    # Dict insertion order follows kernel_names, as in a hand-written literal.
+    kernels = {name: module.get_kernel(name) for name in kernel_names}
+    return device, work_stream, kernels, arch_str
+
+
+def build_mipmap_pyramid(mip, num_levels, stream, kernels):
+    """Populate every level of `mip` using SurfaceObject writes.
+
+    Strategy
+    --------
+    * Level 0 is filled directly by `seed_base`, which writes a procedural
+      pattern through a SurfaceObject bound to level 0.
+    * Each subsequent level L is filled by `downsample`, which reads level L-1
+      through a POINT-filtered TextureObject and box-averages a 2x2 footprint
+      into level L through a SurfaceObject.
+    * All kernels go to one stream, so they run in order. The stream is still
+      synced before each temporary texture/surface object is closed.
+    """
+    # ---- Level 0: seed the base image -------------------------------------
+    base_arr = mip.get_level(0)  # non-owning view; do NOT use a `with` block
+    with SurfaceObject.from_array(base_arr) as base_surf:
+        block = (16, 16, 1)
+        grid = (
+            (BASE_SIZE + block[0] - 1) // block[0],
+            (BASE_SIZE + block[1] - 1) // block[1],
+            1,
+        )
+        launch(
+            stream,
+            LaunchConfig(grid=grid, block=block),
+            kernels["seed_base"],
+            np.uint64(base_surf.handle),
+            np.int32(BASE_SIZE),
+            np.int32(BASE_SIZE),
+        )
+        # launch() only enqueues the kernel; finish it before base_surf closes.
+        stream.sync()
+
+    # ---- Levels 1..N-1: box-filter downsample ------------------------------
+    # Each iteration reads level (L-1) through a temporary TextureObject and
+    # writes level L through a temporary SurfaceObject. The stream is synced
+    # inside the `with` block so neither handle is closed while still in use.
+    src_tex_desc = TextureDescriptor(
+        address_mode=AddressMode.CLAMP,
+        filter_mode=FilterMode.POINT,        # explicit per-texel reads
+        read_mode=ReadMode.ELEMENT_TYPE,
+        normalized_coords=False,             # integer pixel coordinates
+    )
+    for level in range(1, num_levels):
+        parent_size = BASE_SIZE >> (level - 1)
+        level_size = BASE_SIZE >> level
+        if level_size < 1:
+            break
+
+        src_arr = mip.get_level(level - 1)
+        dst_arr = mip.get_level(level)
+        src_res = ResourceDescriptor.from_array(src_arr)
+        with TextureObject.from_descriptor(
+            resource=src_res, texture_descriptor=src_tex_desc
+        ) as src_tex, SurfaceObject.from_array(dst_arr) as dst_surf:
+            block = (16, 16, 1)
+            grid = (
+                (level_size + block[0] - 1) // block[0],
+                (level_size + block[1] - 1) // block[1],
+                1,
+            )
+            launch(
+                stream,
+                LaunchConfig(grid=grid, block=block),
+                kernels["downsample"],
+                np.uint64(src_tex.handle),
+                np.uint64(dst_surf.handle),
+                np.int32(parent_size),
+                np.int32(level_size),
+            )
+            # Finish the kernel before src_tex / dst_surf are closed on exit.
+            stream.sync()
+
+    # Every level was already synced above; this final sync is a cheap no-op
+    # that keeps "the pyramid is complete on return" explicit.
+    stream.sync()
+
+
+def create_window():
+    """Open a WIDTH x HEIGHT pyglet window; return (window, gl_module, pyglet)."""
+    try:  # imported here so a missing pyglet produces the message below
+        import pyglet
+        from pyglet.gl import gl as _gl
+    except ImportError:
+        print(
+            "This example requires pyglet >= 2.0.\nInstall it with:  pip install pyglet",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    window = pyglet.window.Window(
+        WIDTH,
+        HEIGHT,
+        caption="MipmappedArray Example - Mipmap LOD viewer",
+        vsync=False,
+    )
+    return window, _gl, pyglet
+
+
+def create_display_resources(gl, width, height):
+    """Standard GL boilerplate: a shader program, a fullscreen quad, and an
+    empty texture that we'll repeatedly fill from a PBO. Not CUDA-specific.
+
+    Returns (shader_program, vertex_array_id, texture_id).
+    """
+    from pyglet.graphics.shader import Shader, ShaderProgram
+
+    vert = Shader(VERTEX_SHADER_SOURCE, "vertex")
+    frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment")
+    shader_prog = ShaderProgram(vert, frag)
+
+    quad_verts = np.array(
+        [
+            # x,  y,    s, t      (position + texture coordinate)
+            -1, -1, 0, 0,
+             1, -1, 1, 0,
+             1,  1, 1, 1,
+            -1, -1, 0, 0,
+             1,  1, 1, 1,
+            -1,  1, 0, 1,
+        ],
+        dtype=np.float32,
+    )
+
+    vao = ctypes.c_uint(0)
+    gl.glGenVertexArrays(1, ctypes.byref(vao))
+    gl.glBindVertexArray(vao.value)
+
+    vbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(vbo))
+    gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value)
+    gl.glBufferData(
+        gl.GL_ARRAY_BUFFER,
+        quad_verts.nbytes,
+        quad_verts.ctypes.data_as(ctypes.c_void_p),
+        gl.GL_STATIC_DRAW,
+    )
+
+    stride = 4 * 4  # 4 floats * 4 bytes each
+    pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position")
+    gl.glEnableVertexAttribArray(pos_loc)
+    gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0))
+
+    tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord")
+    gl.glEnableVertexAttribArray(tc_loc)
+    gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8))
+
+    gl.glBindVertexArray(0)
+
+    tex = ctypes.c_uint(0)
+    gl.glGenTextures(1, ctypes.byref(tex))
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_LINEAR)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_LINEAR)
+    gl.glTexImage2D(
+        gl.GL_TEXTURE_2D,
+        0,
+        gl.GL_RGBA8,
+        width,
+        height,
+        0,
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,
+    )
+
+    return shader_prog, vao.value, tex.value
+
+
+def create_pixel_buffer(gl, width, height):
+    """Allocate an uninitialized RGBA8 pixel-unpack buffer of width x height.
+
+    Returns (pbo_gl_name, size_in_bytes).
+    """
+    size_in_bytes = 4 * width * height  # four bytes per RGBA8 pixel
+    handle = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(handle))
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, handle.value)
+    gl.glBufferData(gl.GL_PIXEL_UNPACK_BUFFER, size_in_bytes, None, gl.GL_DYNAMIC_DRAW)
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)
+    return handle.value, size_in_bytes
+
+
+def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height):
+    """Upload the PBO's pixels into level 0 of the GL texture (GPU-to-GPU)."""
+    unpack_target = gl.GL_PIXEL_UNPACK_BUFFER
+    texture_target = gl.GL_TEXTURE_2D
+    gl.glBindBuffer(unpack_target, pbo_id)
+    gl.glBindTexture(texture_target, tex_id)
+    # While a PBO is bound to GL_PIXEL_UNPACK_BUFFER the final argument is an
+    # offset into that buffer, so None means "start at byte 0".
+    mip_level, x_offset, y_offset = 0, 0, 0
+    gl.glTexSubImage2D(
+        texture_target, mip_level, x_offset, y_offset,
+        width, height,
+        gl.GL_RGBA, gl.GL_UNSIGNED_BYTE,
+        None,
+    )
+    gl.glBindBuffer(unpack_target, 0)
+
+
+def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id):
+    """Draw texture `tex_id` with `shader_prog` using the quad stored in `vao_id`."""
+    gl.glUseProgram(shader_prog.id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glBindVertexArray(vao_id)
+    gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6)  # 6 vertices = 2 triangles
+    gl.glBindVertexArray(0)  # unbind the VAO, then the program
+    gl.glUseProgram(0)
+
+
+# ================================== main() ==================================
+
+
+def main():
+    # --- Step 1: Set up CUDA (compile kernels, create stream) ---
+    dev, stream, kernels, _arch = setup_cuda()
+
+    # --- Step 2: Allocate the mipmap pyramid and build every level ---
+    #     surface_load_store=True is required for kernel-side writes.
+    num_levels = int(math.log2(BASE_SIZE)) + 1
+    mip = MipmappedArray.from_descriptor(
+        shape=(BASE_SIZE, BASE_SIZE),
+        format=ArrayFormat.FLOAT32,
+        num_channels=4,
+        num_levels=num_levels,
+        surface_load_store=True,
+    )
+    build_mipmap_pyramid(mip, num_levels, stream, kernels)
+
+    # --- Step 3: Bind the WHOLE pyramid as a trilinear-filtered texture ---
+    #     Normalized coordinates (0..1) make zoom-by-uv simple. The texture
+    #     descriptor's mipmap_level_bias stays 0.0; the display kernel
+    #     receives the user-controlled bias as a kernel argument and folds
+    #     it into the tex2DLod call (avoids rebuilding the TextureObject
+    #     whenever the user changes the bias).
+    display_tex_desc = TextureDescriptor(
+        address_mode=AddressMode.WRAP,
+        filter_mode=FilterMode.LINEAR,
+        read_mode=ReadMode.ELEMENT_TYPE,
+        normalized_coords=True,
+        mipmap_filter_mode=FilterMode.LINEAR,    # trilinear
+        mipmap_level_bias=0.0,
+        min_mipmap_level_clamp=0.0,
+        max_mipmap_level_clamp=float(num_levels - 1),
+    )
+    display_tex = TextureObject.from_descriptor(
+        resource=ResourceDescriptor.from_mipmapped_array(mip),
+        texture_descriptor=display_tex_desc,
+    )
+
+    # --- Step 4: Open a window and set up the GL/CUDA bridge ---
+    window, gl, pyglet = create_window()
+    shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT)
+    pbo_id, _ = create_pixel_buffer(gl, WIDTH, HEIGHT)
+    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")
+
+    # --- Step 5: Render loop state ---
+    # `zoom` scales the sampled UV range: zoom > 1 fits more of the (tiled)
+    # texture into the window, so it appears smaller and coarser mip levels
+    # are selected (positive LOD); zoom < 1 magnifies it and the LOD clamps
+    # to level 0. `lod_bias` is a manual offset added on top.
+    state = {"zoom": 1.0, "lod_bias": 0.0}
+    start_time = time.monotonic()
+    frame_count = [0]
+    fps_time = [start_time]
+
+    block = (16, 16, 1)
+    grid = (
+        (WIDTH + block[0] - 1) // block[0],
+        (HEIGHT + block[1] - 1) // block[1],
+        1,
+    )
+    config = LaunchConfig(grid=grid, block=block)
+
+    def effective_lod():
+        # Same formula the display kernel uses, clamped to the legal range so
+        # the window title matches what the GPU actually sees.
+        raw = math.log2(max(state["zoom"], 1e-6)) + state["lod_bias"]
+        return max(0.0, min(float(num_levels - 1), raw))
+
+    @window.event
+    def on_draw():
+        window.clear()
+
+        # (a) Map the PBO so CUDA can write into it.
+        with resource.map(stream=stream) as buf:
+            # (b) Launch the display kernel -- samples the mipmap and writes RGBA.
+            launch(
+                stream,
+                config,
+                kernels["display"],
+                buf.handle,
+                np.int32(WIDTH),
+                np.int32(HEIGHT),
+                np.uint64(display_tex.handle),
+                np.float32(state["zoom"]),
+                np.float32(state["lod_bias"]),
+                np.float32(float(num_levels - 1)),
+            )
+        # (c) Unmap happens automatically; cuGraphicsUnmapResources serializes
+        #     the CUDA work against subsequent OpenGL use.
+
+        copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT)
+        draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id)
+
+        frame_count[0] += 1
+        now = time.monotonic()
+        if now - fps_time[0] >= 1.0:
+            fps = frame_count[0] / (now - fps_time[0])
+            window.set_caption(
+                f"MipmappedArray LOD viewer "
+                f"({WIDTH}x{HEIGHT}, {fps:.0f} FPS) -- "
+                f"zoom={state['zoom']:.2f}, "
+                f"bias={state['lod_bias']:+.2f}, "
+                f"LOD={effective_lod():.2f}"
+            )
+            frame_count[0] = 0
+            fps_time[0] = now
+
+    @window.event
+    def on_mouse_scroll(x, y, scroll_x, scroll_y):
+        # One wheel step changes zoom by ~12.5%. Clamped to keep LOD in range.
+        if scroll_y == 0:
+            return
+        factor = 1.125 ** scroll_y
+        state["zoom"] = max(1.0 / 64.0, min(64.0, state["zoom"] * factor))
+
+    @window.event
+    def on_key_press(symbol, modifiers):
+        key = pyglet.window.key
+        if symbol == key.BRACKETLEFT:
+            state["lod_bias"] = max(
+                -float(num_levels), state["lod_bias"] - LOD_BIAS_STEP
+            )
+        elif symbol == key.BRACKETRIGHT:
+            state["lod_bias"] = min(
+                float(num_levels), state["lod_bias"] + LOD_BIAS_STEP
+            )
+        elif symbol == key.R:
+            state["zoom"] = 1.0
+            state["lod_bias"] = 0.0
+
+    @window.event
+    def on_close():
+        # Release CUDA-side resources in reverse construction order. GL
+        # objects clean up via pyglet on window close.
+        resource.close()
+        display_tex.close()
+        mip.close()
+        stream.close()
+
+    pyglet.app.run(interval=0)
+
+
+# ======================== GPU code (CUDA + GLSL) ============================
+#
+# Three CUDA kernels are concatenated into one program string so they share a
+# single NVRTC compile. All three operate on float4 RGBA pixels.
+#
+#   seed_base   -- writes a high-frequency procedural pattern to level 0 via a
+#                  SurfaceObject. NOTE: surf2Dwrite's x-coordinate is in BYTES,
+#                  not in elements, so we multiply by sizeof(float4) every time.
+#
+#   downsample  -- reads level L-1 through a POINT-filtered TextureObject and
+#                  writes the 2x2 box average to level L through a SurfaceObject.
+#                  tex2D with non-normalized coords needs the +0.5 half-texel
+#                  offset to hit exact texel centers.
+#
+#   display     -- samples the WHOLE mipmap pyramid with tex2DLod, where the
+#                  per-thread LOD is `clamp(log2(zoom) + lod_bias, 0, maxLod)`.
+#                  Writes 8-bit RGBA into the PBO.
+#
+# GLSL shaders at the very bottom just draw a textured quad. Nothing CUDA-
+# specific there.
+#
+# ============================================================================
+
+KERNEL_SOURCE = r"""
+// --------------------------------------------------------------------------
+// Helper: clamp a float to [a, b].
+// --------------------------------------------------------------------------
+__device__ __forceinline__ float clampf(float v, float a, float b) {
+    return fminf(fmaxf(v, a), b);
+}
+
+// CUDA does not ship a builtin "fract" so we provide one (used by seed_base).
+__device__ __forceinline__ float fracf(float v) {
+    return v - floorf(v);
+}
+
+// --------------------------------------------------------------------------
+// seed_base: write a procedural high-frequency pattern to level 0.
+//
+// surf is a SurfaceObject bound to the level-0 Array (float4 RGBA). The
+// pattern is a colorful blend of concentric rings, a diagonal grid, and a
+// radial sweep, designed to have plenty of fine detail so the difference
+// between mip levels is visually obvious.
+// --------------------------------------------------------------------------
+extern "C" __global__
+void seed_base(cudaSurfaceObject_t surf, int width, int height) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    float u = ((float)x + 0.5f) / (float)width;
+    float v = ((float)y + 0.5f) / (float)height;
+
+    // Concentric rings centered on the image.
+    float cx = u - 0.5f;
+    float cy = v - 0.5f;
+    float r = sqrtf(cx * cx + cy * cy);
+    float rings = 0.5f + 0.5f * sinf(r * 80.0f);
+
+    // Diagonal grid -- thin lines about every 1/16 of the image.
+    float gx = fabsf(fracf(u * 16.0f) - 0.5f);
+    float gy = fabsf(fracf(v * 16.0f) - 0.5f);
+    float grid = (gx < 0.05f || gy < 0.05f) ? 1.0f : 0.0f;
+
+    // Angular sweep gives the rings some color variation.
+    float theta = atan2f(cy, cx);
+    float sweep = 0.5f + 0.5f * sinf(theta * 6.0f);
+
+    // Combine into an RGBA color. Keep values in [0, 1].
+    float red   = clampf(rings * (0.4f + 0.6f * sweep) + 0.3f * grid, 0.0f, 1.0f);
+    float green = clampf(rings * (0.6f - 0.4f * sweep) + 0.3f * grid, 0.0f, 1.0f);
+    float blue  = clampf(0.4f + 0.4f * sweep + 0.5f * grid,            0.0f, 1.0f);
+    float alpha = 1.0f;
+
+    float4 px = make_float4(red, green, blue, alpha);
+
+    // Surface writes index x in BYTES (this is the classic gotcha).
+    surf2Dwrite<float4>(px, surf, x * (int)sizeof(float4), y);
+}
+
+// --------------------------------------------------------------------------
+// downsample: box-filter a 2x2 footprint of the parent level into one texel.
+//
+// src is a POINT-filtered TextureObject bound to level (L-1).
+// dst is a SurfaceObject bound to level L.
+// dst_size is the edge length of the (square) level L.
+// src_size (= 2 * dst_size) is the edge length of level L-1; it is passed
+// for convenience only and is not used by the kernel.
+//
+// Texture coordinates: tex2D with non-normalized coords returns texel (i, j)
+// when sampled at (i + 0.5, j + 0.5). So for output texel (x, y) the four
+// parent texels live at parent-coords (2x + 0.5, 2y + 0.5), (2x + 1.5, ...).
+// --------------------------------------------------------------------------
+extern "C" __global__
+void downsample(cudaTextureObject_t src,
+                cudaSurfaceObject_t dst,
+                int src_size,
+                int dst_size) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= dst_size || y >= dst_size) return;
+
+    float fx = 2.0f * (float)x;
+    float fy = 2.0f * (float)y;
+
+    float4 a = tex2D<float4>(src, fx + 0.5f, fy + 0.5f);
+    float4 b = tex2D<float4>(src, fx + 1.5f, fy + 0.5f);
+    float4 c = tex2D<float4>(src, fx + 0.5f, fy + 1.5f);
+    float4 d = tex2D<float4>(src, fx + 1.5f, fy + 1.5f);
+
+    float4 px;
+    px.x = 0.25f * (a.x + b.x + c.x + d.x);
+    px.y = 0.25f * (a.y + b.y + c.y + d.y);
+    px.z = 0.25f * (a.z + b.z + c.z + d.z);
+    px.w = 0.25f * (a.w + b.w + c.w + d.w);
+
+    // Silence unused-variable warning for the convenience parameter.
+    (void)src_size;
+
+    surf2Dwrite<float4>(px, dst, x * (int)sizeof(float4), y);
+}
+
+// --------------------------------------------------------------------------
+// display: per-pixel mipmap sample with manual LOD bias.
+//
+// tex is a TextureObject built from the whole MipmappedArray (LINEAR +
+// LINEAR mipmap filter, normalized coords). For each output pixel we compute
+// a single per-thread LOD from `zoom` and `lod_bias`, then sample with
+// tex2DLod. Output is written as RGBA8 into a linear byte buffer.
+// --------------------------------------------------------------------------
+extern "C" __global__
+void display(unsigned char *output,
+             int width,
+             int height,
+             cudaTextureObject_t tex,
+             float zoom,
+             float lod_bias,
+             float max_lod) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    // Normalized window coords in [0, 1].
+    float u = ((float)x + 0.5f) / (float)width;
+    float v = ((float)y + 0.5f) / (float)height;
+
+    // Zoom around the window center so the user sees the effect symmetrically.
+    u = (u - 0.5f) * zoom + 0.5f;
+    v = (v - 0.5f) * zoom + 0.5f;
+
+    // LOD: zoom > 1 means the window spans more of the texture (each pixel
+    // covers more texels), which corresponds to selecting a coarser
+    // (higher) mip level. log2(zoom) yields exactly that. lod_bias is added
+    // on top, and the final value is clamped to the legal range.
+    float lod = log2f(fmaxf(zoom, 1e-6f)) + lod_bias;
+    lod = clampf(lod, 0.0f, max_lod);
+
+    float4 c = tex2DLod<float4>(tex, u, v, lod);
+
+    int idx = (y * width + x) * 4;
+    output[idx + 0] = (unsigned char)(clampf(c.x, 0.0f, 1.0f) * 255.0f);
+    output[idx + 1] = (unsigned char)(clampf(c.y, 0.0f, 1.0f) * 255.0f);
+    output[idx + 2] = (unsigned char)(clampf(c.z, 0.0f, 1.0f) * 255.0f);
+    output[idx + 3] = 255;
+}
+"""
+
+# GLSL shaders -- these just display a texture on a fullscreen rectangle.
+# Nothing CUDA-specific here.
+
+VERTEX_SHADER_SOURCE = """#version 330 core
+in vec2 position;
+in vec2 texcoord;
+out vec2 v_texcoord;
+void main() {
+    gl_Position = vec4(position, 0.0, 1.0);
+    v_texcoord = texcoord;
+}
+"""
+
+FRAGMENT_SHADER_SOURCE = """#version 330 core
+in vec2 v_texcoord;
+out vec4 fragColor;
+uniform sampler2D tex;
+void main() {
+    fragColor = texture(tex, v_texcoord);
+}
+"""
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/examples/gl_interop_ocean.py b/cuda_core/examples/gl_interop_ocean.py
new file mode 100644
index 00000000000..177e7b8d320
--- /dev/null
+++ b/cuda_core/examples/gl_interop_ocean.py
@@ -0,0 +1,836 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This example demonstrates cuda.core.Array, TextureObject, and SurfaceObject
+# in combination with GraphicsResource for CUDA/OpenGL interop. A real-time
+# Gerstner-wave ocean is rebuilt every frame: a heightmap Array is rewritten
+# through a SurfaceObject, sampled through a TextureObject with LINEAR + WRAP
+# filtering for normal estimation, and shaded with Phong + Fresnel sky
+# reflection straight into an OpenGL PBO. Requires pyglet.
+#
+# ################################################################################
+
+# What this example teaches
+# =========================
+# - How to use a CUDA Array as a typed heightmap that is simultaneously
+#   written by one kernel (via SurfaceObject) and sampled by another (via
+#   TextureObject) within the same frame.
+# - How LINEAR filtering + WRAP addressing + normalized coordinates gives
+#   essentially-free bilinear neighbor lookups for finite-difference normal
+#   estimation on a tiling heightmap.
+# - How to compose Array/TextureObject/SurfaceObject with GraphicsResource so
+#   the entire render path never leaves the GPU.
+#
+# How it works
+# ============
+# Gerstner waves are a sum of N moving sinusoids with directional vectors --
+# a classic ocean approximation that looks shockingly close to FFT ocean at a
+# glance without any external library dependencies. For each heightmap texel:
+#
+#     h(x, z, t) = sum_i  A_i * sin( D_i . (x, z) * k_i  -  w_i * t  +  phi_i )
+#
+# where k_i = 2*pi / wavelength_i and w_i = sqrt(g * k_i) is the dispersion
+# relation for deep-water gravity waves. We bake 12 waves with hand-picked
+# directions / wavelengths / amplitudes / phases into the kernel as constant
+# arrays. Weather presets just scale amplitude and speed at the host level.
+#
+#   PER FRAME (all on GPU)
+#   ~~~~~~~~~~~~~~~~~~~~~~
+#   +-----------------+   surf2Dwrite   +--------------+
+#   |   update_height | --------------> |  heightmap   |
+#   |     kernel      |                 |    Array     |
+#   +-----------------+                 |  (FLOAT32)   |
+#                                       +--------------+
+#                                              |
+#                                              | tex2D<float> (LINEAR + WRAP)
+#                                              v
+#                                       +-----------------+    write RGBA8
+#                                       |  render_ocean   | ----------------> PBO
+#                                       |     kernel      |
+#                                       +-----------------+
+#
+# Why LINEAR + WRAP + normalized coords?
+# --------------------------------------
+# WRAP / MIRROR addressing modes require normalized coordinates (see the CUDA
+# Programming Guide). The ocean naturally tiles, so WRAP gives free seamless
+# horizon repetition. LINEAR filtering means our four-tap finite-difference
+# normal estimate gets bilinear interpolation between texels for free, which
+# smooths the lighting noticeably without a single extra ALU instruction.
+#
+# Channel byte width in surf2Dwrite
+# ---------------------------------
+# surf2Dwrite takes the x coordinate in BYTES, not in elements. For a
+# single-channel float surface that means `x * sizeof(float)` = `x * 4`.
+# Passing an element index instead silently writes to the wrong texels.
+#
+# What you should see
+# ===================
+# A window showing a real-time animated ocean rendered with Phong shading and
+# a Fresnel-modulated sky reflection. Drag with the left mouse button to
+# orbit, scroll to zoom, press 1/2/3 to switch weather presets (calm /
+# breezy / stormy), press P to pause animation, Escape to exit. Window title
+# shows preset name and FPS.
+#
+
+# /// script
+# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"]
+# ///
+
+import ctypes
+import math
+import sys
+import time
+
+import numpy as np
+
+from cuda.core import (
+    AddressMode,
+    Array,
+    ArrayFormat,
+    Device,
+    FilterMode,
+    GraphicsResource,
+    LaunchConfig,
+    Program,
+    ProgramOptions,
+    ReadMode,
+    ResourceDescriptor,
+    SurfaceObject,
+    TextureDescriptor,
+    TextureObject,
+    launch,
+)
+
+# ---------------------------------------------------------------------------
+# Window and heightmap dimensions (feel free to change these)
+# ---------------------------------------------------------------------------
+WIDTH = 1024  # window / PBO width in pixels
+HEIGHT = 768  # window / PBO height in pixels
+GRID = 512  # heightmap resolution (GRID x GRID texels)
+
+# Weather presets, keyed by the digit key that selects them:
+# (amplitude_scale, speed_scale, label). Both scales are passed to the
+# update_height kernel as plain multipliers on every wave's amplitude and
+# angular frequency, so a single compiled binary can render every preset.
+PRESETS = {
+    "1": (0.35, 0.7, "calm"),
+    "2": (1.00, 1.0, "breezy"),
+    "3": (1.85, 1.4, "stormy"),
+}
+DEFAULT_PRESET = "2"
+
+# Initial camera (orbit-around-origin) parameters.
+INITIAL_YAW = 0.6        # radians around world-y
+INITIAL_PITCH = 0.35     # radians above the horizon (small positive = looking down)
+INITIAL_DISTANCE = 5.0   # camera distance from origin
+PITCH_LIMIT = 1.4        # clamp |pitch| to keep basis non-degenerate (< pi/2)
+ZOOM_MIN = 1.5           # closest allowed camera distance
+ZOOM_MAX = 30.0          # farthest allowed camera distance
+
+
+# ============================= Helper functions =============================
+#
+# The functions below set up CUDA and OpenGL. If you're here to learn about
+# Array/TextureObject/SurfaceObject, skip ahead to main() -- the interesting
+# part is there. These helpers exist so that main() reads like a short story
+# instead of a wall of boilerplate.
+# ============================================================================
+
+
+def setup_cuda():
+    """Compile the CUDA kernels and return (device, stream, kernels, configs).
+
+    The two kernels live on different grids:
+      - update_height runs over the heightmap (GRID x GRID texels).
+      - render_ocean  runs over output pixels  (WIDTH x HEIGHT).
+    """
+    dev = Device(0)
+    dev.set_current()
+
+    # SurfaceObject requires surface load/store, which has existed since SM 2.0,
+    # but bindless surface objects (cuSurfObjectCreate) require SM 3.0+.
+    cc = dev.compute_capability
+    if cc.major < 3:
+        print(
+            "This example requires a GPU with compute capability >= 3.0 for "
+            f"bindless surface objects. Found sm_{cc.major}{cc.minor}.",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    stream = dev.create_stream()
+
+    # C++ compile so the templated tex2D<float> overload resolves.
+    program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
+    prog = Program(KERNEL_SOURCE, code_type="c++", options=program_options)
+    mod = prog.compile(
+        "cubin",
+        name_expressions=("update_height", "render_ocean"),
+    )
+
+    kernels = {
+        "update": mod.get_kernel("update_height"),
+        "render": mod.get_kernel("render_ocean"),
+    }
+
+    block = (16, 16, 1)
+    update_grid = (
+        (GRID + block[0] - 1) // block[0],
+        (GRID + block[1] - 1) // block[1],
+        1,
+    )
+    render_grid = (
+        (WIDTH + block[0] - 1) // block[0],
+        (HEIGHT + block[1] - 1) // block[1],
+        1,
+    )
+    configs = {
+        "update": LaunchConfig(grid=update_grid, block=block),
+        "render": LaunchConfig(grid=render_grid, block=block),
+    }
+    return dev, stream, kernels, configs
+
+
+def create_window():
+    """Open a pyglet window and return (window, gl_module, pyglet)."""
+    try:
+        import pyglet
+        from pyglet.gl import gl as _gl
+    except ImportError:
+        print(
+            "This example requires pyglet >= 2.0.\nInstall it with:  pip install pyglet",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    window = pyglet.window.Window(
+        WIDTH,
+        HEIGHT,
+        caption="cuda.core Array/Texture/Surface - Gerstner Ocean",
+        vsync=False,
+    )
+    return window, _gl, pyglet
+
+
+def create_display_resources(gl, width, height):
+    """Create the GL objects needed to show a texture on screen.
+
+    Standard OpenGL boilerplate -- not CUDA-specific. Returns
+    (shader_program, vao_id, tex_id). The shader_program is a pyglet
+    ShaderProgram object (must be kept alive).
+    """
+    from pyglet.graphics.shader import Shader, ShaderProgram
+
+    vert = Shader(VERTEX_SHADER_SOURCE, "vertex")
+    frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment")
+    shader_prog = ShaderProgram(vert, frag)
+
+    # Fullscreen quad (two triangles covering the entire window).
+    quad_verts = np.array(
+        [
+            -1, -1, 0, 0,
+             1, -1, 1, 0,
+             1,  1, 1, 1,
+            -1, -1, 0, 0,
+             1,  1, 1, 1,
+            -1,  1, 0, 1,
+        ],
+        dtype=np.float32,
+    )
+
+    vao = ctypes.c_uint(0)
+    gl.glGenVertexArrays(1, ctypes.byref(vao))
+    gl.glBindVertexArray(vao.value)
+
+    vbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(vbo))
+    gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value)
+    gl.glBufferData(
+        gl.GL_ARRAY_BUFFER,
+        quad_verts.nbytes,
+        quad_verts.ctypes.data_as(ctypes.c_void_p),
+        gl.GL_STATIC_DRAW,
+    )
+
+    stride = 4 * 4
+    pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position")
+    gl.glEnableVertexAttribArray(pos_loc)
+    gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0))
+    tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord")
+    gl.glEnableVertexAttribArray(tc_loc)
+    gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8))
+    gl.glBindVertexArray(0)
+
+    tex = ctypes.c_uint(0)
+    gl.glGenTextures(1, ctypes.byref(tex))
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_LINEAR)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_LINEAR)
+    gl.glTexImage2D(
+        gl.GL_TEXTURE_2D, 0, gl.GL_RGBA8, width, height, 0,
+        gl.GL_RGBA, gl.GL_UNSIGNED_BYTE, None,
+    )
+    return shader_prog, vao.value, tex.value
+
+
+def create_pixel_buffer(gl, width, height):
+    """Create a Pixel Buffer Object (PBO) sized for one RGBA8 frame."""
+    pbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(pbo))
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo.value)
+    nbytes = width * height * 4
+    gl.glBufferData(gl.GL_PIXEL_UNPACK_BUFFER, nbytes, None, gl.GL_DYNAMIC_DRAW)
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)
+    return pbo.value, nbytes
+
+
+def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height):
+    """Copy pixel data from the PBO into the GL texture (GPU-to-GPU)."""
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo_id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glTexSubImage2D(
+        gl.GL_TEXTURE_2D, 0, 0, 0, width, height,
+        gl.GL_RGBA, gl.GL_UNSIGNED_BYTE, None,
+    )
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)
+
+
+def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id):
+    """Draw the texture to the screen using the fullscreen quad."""
+    gl.glUseProgram(shader_prog.id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glBindVertexArray(vao_id)
+    gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6)
+    gl.glBindVertexArray(0)
+    gl.glUseProgram(0)
+
+
+def make_heightmap_array():
+    """Allocate the single-channel float heightmap Array."""
+    return Array.from_descriptor(
+        shape=(GRID, GRID),
+        format=ArrayFormat.FLOAT32,
+        num_channels=1,
+        surface_load_store=True,
+    )
+
+
+def make_height_texture(arr):
+    """Bind `arr` as a TextureObject configured for LINEAR + WRAP + normalized."""
+    res_desc = ResourceDescriptor.from_array(arr)
+    tex_desc = TextureDescriptor(
+        address_mode=AddressMode.WRAP,
+        filter_mode=FilterMode.LINEAR,
+        read_mode=ReadMode.ELEMENT_TYPE,
+        # WRAP/MIRROR addressing modes require normalized coordinates.
+        normalized_coords=True,
+    )
+    return TextureObject.from_descriptor(resource=res_desc, texture_descriptor=tex_desc)
+
+
+def orbit_camera_position(yaw, pitch, distance):
+    """Convert (yaw, pitch, distance) to a world-space camera position.
+
+    The camera orbits the origin looking at it. World up is +y. Pitch is the
+    angle above the xz-plane: pitch=0 puts the camera on the horizon,
+    pitch=+1.4 nearly directly overhead.
+    """
+    cp = math.cos(pitch)
+    sp = math.sin(pitch)
+    cy = math.cos(yaw)
+    sy = math.sin(yaw)
+    cam_x = distance * cp * sy
+    cam_y = distance * sp
+    cam_z = distance * cp * cy
+    return cam_x, cam_y, cam_z
+
+
+# ================================== main() ==================================
+
+
+def main():
+    # --- Step 1: Set up CUDA (compile kernels, create stream) ---
+    dev, stream, kernels, configs = setup_cuda()
+
+    # --- Step 2: Open a window ---
+    window, gl, pyglet = create_window()
+
+    # --- Step 3: Create GL resources for drawing a texture to screen ---
+    shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT)
+
+    # --- Step 4: Create the Pixel Buffer Object (PBO) ---
+    pbo_id, _ = create_pixel_buffer(gl, WIDTH, HEIGHT)
+
+    # --- Step 5: Register the PBO with CUDA ---
+    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")
+
+    # --- Step 6: Allocate the heightmap Array and build its texture/surface ---
+    #     We pre-create both the TextureObject (read path) and the
+    #     SurfaceObject (write path) once and reuse them every frame. Creating
+    #     them inside the per-frame loop would work but adds per-frame overhead
+    #     and risks lifetime issues with async kernel launches.
+    height_arr = make_heightmap_array()
+    height_tex = make_height_texture(height_arr)
+    height_surf = SurfaceObject.from_array(height_arr)
+
+    # --- Step 7: Camera + animation state ---
+    state = {
+        "preset": DEFAULT_PRESET,
+        "yaw": INITIAL_YAW,
+        "pitch": INITIAL_PITCH,
+        "distance": INITIAL_DISTANCE,
+        "drag": False,
+        "paused": False,
+        "t_anim": 0.0,
+        "t_prev": time.monotonic(),
+    }
+
+    # --- Step 8: Render loop ---
+    frame_count = 0
+    fps_time = state["t_prev"]
+
+    @window.event
+    def on_draw():
+        nonlocal frame_count, fps_time
+
+        window.clear()
+
+        # Advance animation time only when not paused, so pausing freezes the
+        # ocean exactly where it was rather than letting it lurch when resumed.
+        now = time.monotonic()
+        dt = now - state["t_prev"]
+        state["t_prev"] = now
+        if not state["paused"]:
+            state["t_anim"] += dt
+        t = state["t_anim"]
+
+        amp_scale, speed_scale, _label = PRESETS[state["preset"]]
+
+        # (a) Rebuild the heightmap for time t.
+        launch(
+            stream,
+            configs["update"],
+            kernels["update"],
+            np.uint64(height_surf.handle),
+            np.int32(GRID),
+            np.int32(GRID),
+            np.float32(t),
+            np.float32(amp_scale),
+            np.float32(speed_scale),
+        )
+
+        # (b) Render the scene: sample the heightmap through the texture,
+        #     estimate normals via finite differences, shade with Phong +
+        #     Fresnel sky reflection, write RGBA8 into the OpenGL PBO.
+        cam_x, cam_y, cam_z = orbit_camera_position(
+            state["yaw"], state["pitch"], state["distance"]
+        )
+        with resource.map(stream=stream) as buf:
+            launch(
+                stream,
+                configs["render"],
+                kernels["render"],
+                np.uint64(height_tex.handle),
+                buf.handle,
+                np.int32(WIDTH),
+                np.int32(HEIGHT),
+                np.float32(cam_x),
+                np.float32(cam_y),
+                np.float32(cam_z),
+                np.float32(t),
+            )
+        # Unmap happens automatically when the `with` block exits.
+
+        # (c) PBO -> GL texture (GPU-to-GPU).
+        copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT)
+
+        # (d) Draw the texture to the screen.
+        draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id)
+
+        # FPS counter (shown in window title)
+        frame_count += 1
+        if now - fps_time >= 1.0:
+            fps = frame_count / (now - fps_time)
+            label = PRESETS[state["preset"]][2]
+            paused = " [paused]" if state["paused"] else ""
+            window.set_caption(
+                "cuda.core Array/Texture/Surface - Gerstner Ocean"
+                f" [{label}]{paused} ({WIDTH}x{HEIGHT}, {fps:.0f} FPS)"
+            )
+            frame_count = 0
+            fps_time = now
+
+    # --- Mouse: drag to orbit, scroll to zoom ------------------------------
+    @window.event
+    def on_mouse_press(x, y, button, modifiers):
+        if button == pyglet.window.mouse.LEFT:
+            state["drag"] = True
+
+    @window.event
+    def on_mouse_release(x, y, button, modifiers):
+        if button == pyglet.window.mouse.LEFT:
+            state["drag"] = False
+
+    @window.event
+    def on_mouse_drag(x, y, dx, dy, buttons, modifiers):
+        if not (buttons & pyglet.window.mouse.LEFT):
+            return
+        # Rotate yaw on horizontal drag, pitch on vertical drag. The yaw
+        # direction matches the camera moving with the cursor.
+        state["yaw"] -= dx * 0.005
+        state["pitch"] -= dy * 0.005
+        # Clamp pitch to keep the camera basis non-degenerate (never look
+        # straight down/up the world-y axis).
+        if state["pitch"] > PITCH_LIMIT:
+            state["pitch"] = PITCH_LIMIT
+        if state["pitch"] < -PITCH_LIMIT:
+            state["pitch"] = -PITCH_LIMIT
+
+    @window.event
+    def on_mouse_scroll(x, y, scroll_x, scroll_y):
+        # Geometric zoom in camera distance; clamp to a sensible range.
+        factor = 1.1 ** (-scroll_y)
+        new_d = state["distance"] * factor
+        state["distance"] = max(ZOOM_MIN, min(ZOOM_MAX, new_d))
+
+    # --- Keyboard: 1/2/3 weather presets, P pauses, Escape exits ----------
+    @window.event
+    def on_key_press(symbol, modifiers):
+        key = pyglet.window.key
+        if symbol == key.ESCAPE:
+            window.close()
+            return
+        if symbol == key.P:
+            state["paused"] = not state["paused"]
+            return
+        for digit_key, name in (
+            (key._1, "1"),
+            (key._2, "2"),
+            (key._3, "3"),
+        ):
+            if symbol == digit_key:
+                state["preset"] = name
+                return
+
+    @window.event
+    def on_close():
+        # Release CUDA resources in reverse order of creation.
+        resource.close()
+        height_tex.close()
+        height_surf.close()
+        height_arr.close()
+        stream.close()
+
+    pyglet.app.run(interval=0)
+
+
+# ======================== GPU code (CUDA + GLSL) ============================
+#
+# KERNEL_SOURCE contains two CUDA C++ kernels:
+#   - update_height: per-heightmap-texel. Sums 12 Gerstner waves and writes
+#                    one float per texel via SurfaceObject.
+#   - render_ocean:  per-screen-pixel. Builds a camera ray, intersects the
+#                    ocean plane (y=0), samples the heightmap via
+#                    TextureObject (LINEAR + WRAP), estimates the normal via
+#                    finite differences, and shades with Phong + Fresnel sky
+#                    reflection. Misses go to a vertical sky gradient.
+#
+# VERTEX_SHADER_SOURCE / FRAGMENT_SHADER_SOURCE are plain GLSL that draws a
+# texture on a fullscreen quad -- nothing CUDA-specific.
+# ============================================================================
+
+KERNEL_SOURCE = r"""
+// ---------------------------------------------------------------------------
+// Wave bank: 12 Gerstner-ish waves with hand-picked parameters.
+//
+// Wavelengths span 0.05 .. 1.0 world units. Amplitudes decrease with
+// frequency so that long swells dominate and short ripples ride on top
+// (a rough Phillips/JONSWAP-style envelope, but coarsely hand-tuned for
+// visual punch rather than physical accuracy).
+//
+// Directions are unit vectors spread unevenly over the half-circle dirz >= 0
+// to avoid the streaky-grid look you get from evenly-spaced directions.
+// ---------------------------------------------------------------------------
+__constant__ float c_dirx[12] = {      // x component of each wave direction
+    1.000f,  0.866f,  0.500f,  0.000f, -0.500f, -0.866f,
+   -1.000f, -0.940f, -0.500f,  0.174f,  0.643f,  0.940f
+};
+__constant__ float c_dirz[12] = {      // z component of each wave direction
+    0.000f,  0.500f,  0.866f,  1.000f,  0.866f,  0.500f,
+    0.000f,  0.342f,  0.866f,  0.985f,  0.766f,  0.342f
+};
+__constant__ float c_wavelen[12] = {   // wavelength, world units (descending)
+    1.000f, 0.730f, 0.520f, 0.380f, 0.260f, 0.190f,
+    0.140f, 0.105f, 0.085f, 0.070f, 0.058f, 0.050f
+};
+__constant__ float c_amp[12] = {       // amplitude before amp_scale is applied
+    0.080f, 0.060f, 0.045f, 0.034f, 0.025f, 0.018f,
+    0.013f, 0.010f, 0.0075f, 0.0055f, 0.0040f, 0.0030f
+};
+__constant__ float c_phase[12] = {     // phase offset added inside sinf()
+    0.00f, 1.20f, 2.10f, 0.40f, 3.70f, 5.10f,
+    2.65f, 4.85f, 1.55f, 6.05f, 3.20f, 0.95f
+};
+
+// Deep-water dispersion relation: omega = sqrt(g * k), where k = 2*pi / lambda.
+__device__ __forceinline__ float angular_freq(float wavelength) {
+    const float GRAVITY = 9.81f;
+    const float wave_number = 6.2831853f / wavelength;
+    return sqrtf(GRAVITY * wave_number);
+}
+
+// Side length, in world units, of the square patch that one heightmap covers.
+// sample_height() divides by this to obtain normalized (WRAP) texture coords.
+__device__ __forceinline__ float tile_extent() { return 4.0f; }
+
+// ---------------------------------------------------------------------------
+// Minimal 3-component float vector plus the few operations the shader needs.
+// ---------------------------------------------------------------------------
+struct V3 { float x, y, z; };
+
+__device__ __forceinline__ V3 v3(float x, float y, float z) {
+    V3 out = {x, y, z}; return out;
+}
+__device__ __forceinline__ V3 v_add(V3 lhs, V3 rhs) {
+    return v3(lhs.x + rhs.x, lhs.y + rhs.y, lhs.z + rhs.z);
+}
+__device__ __forceinline__ V3 v_sub(V3 lhs, V3 rhs) {
+    return v3(lhs.x - rhs.x, lhs.y - rhs.y, lhs.z - rhs.z);
+}
+__device__ __forceinline__ V3 v_scale(V3 v, float factor) {
+    return v3(v.x * factor, v.y * factor, v.z * factor);
+}
+__device__ __forceinline__ float v_dot(V3 lhs, V3 rhs) {
+    return lhs.x * rhs.x + lhs.y * rhs.y + lhs.z * rhs.z;
+}
+__device__ __forceinline__ V3 v_cross(V3 lhs, V3 rhs) {
+    return v3(lhs.y * rhs.z - lhs.z * rhs.y,
+              lhs.z * rhs.x - lhs.x * rhs.z,
+              lhs.x * rhs.y - lhs.y * rhs.x);
+}
+// Squared length is floored at 1e-20 so a zero vector maps to zero, not NaN.
+__device__ __forceinline__ V3 v_normalize(V3 v) {
+    return v_scale(v, rsqrtf(fmaxf(v_dot(v, v), 1e-20f)));
+}
+
+// ---------------------------------------------------------------------------
+// update_height: each thread computes one heightmap texel.
+//
+// Sums the 12 waves at the texel's world position; amp_scale / speed_scale
+// select the weather preset. Each wave vector is snapped to a whole number
+// of cycles per tile in x and z: the texture is sampled with WRAP, so any
+// other wave vector would leave a visible seam at every tile edge.
+// ---------------------------------------------------------------------------
+extern "C" __global__
+void update_height(cudaSurfaceObject_t surf,
+                   int width, int height,
+                   float t,
+                   float amp_scale, float speed_scale) {
+    int ix = blockIdx.x * blockDim.x + threadIdx.x;
+    int iy = blockIdx.y * blockDim.y + threadIdx.y;
+    if (ix >= width || iy >= height) return;
+
+    // Map the texel center (ix + 0.5, iy + 0.5) to a world position in one tile.
+    const float te = tile_extent();
+    const float wx = ((float)ix + 0.5f) / (float)width * te;
+    const float wz = ((float)iy + 0.5f) / (float)height * te;
+    const float k_unit = 6.2831853f / te;   // wave number of one cycle per tile
+
+    float h = 0.0f;
+    #pragma unroll
+    for (int i = 0; i < 12; ++i) {
+        float cycles = te / c_wavelen[i];   // wavelengths that fit in one tile
+        float kx = rintf(c_dirx[i] * cycles) * k_unit;
+        float kz = rintf(c_dirz[i] * cycles) * k_unit;
+        float w = angular_freq(c_wavelen[i]) * speed_scale;
+        h += c_amp[i] * sinf(kx * wx + kz * wz - w * t + c_phase[i]);
+    }
+
+    // Single-channel float surface: the x coordinate is a byte offset.
+    surf2Dwrite(h * amp_scale, surf, ix * (int)sizeof(float), iy);
+}
+
+// ---------------------------------------------------------------------------
+// Height lookup at world position (wx, wz). The texture uses normalized
+// coordinates with WRAP addressing, so scaling by 1 / tile_extent() is the
+// whole mapping; positions outside one tile wrap around.
+// ---------------------------------------------------------------------------
+__device__ __forceinline__ float sample_height(cudaTextureObject_t tex,
+                                               float wx, float wz) {
+    const float to_uv = 1.0f / tile_extent();   // world units -> normalized coords
+    return tex2D<float>(tex, to_uv * wx, to_uv * wz);
+}
+
+// ---------------------------------------------------------------------------
+// Sky color for a ray whose direction has y component `up_angle` (in
+// [-1, 1]): a blend from a pale horizon to a deeper zenith blue.
+// ---------------------------------------------------------------------------
+__device__ __forceinline__ V3 sky_color(float up_angle) {
+    const V3 HORIZON = v3(0.70f, 0.82f, 0.92f);   // soft pale blue
+    const V3 ZENITH  = v3(0.18f, 0.34f, 0.62f);   // deeper blue overhead
+    // Rays pointing below the horizon are clamped to the horizon color.
+    const float elevation = fmaxf(0.0f, fminf(1.0f, up_angle));
+    // Exponent < 1 bends the ramp instead of blending linearly in elevation.
+    const float blend = powf(elevation, 0.6f);
+    V3 from_horizon = v_scale(HORIZON, 1.0f - blend);
+    V3 from_zenith = v_scale(ZENITH, blend);
+    return v_add(from_horizon, from_zenith);
+}
+
+// ---------------------------------------------------------------------------
+// render_ocean: each thread shades one screen pixel.
+//
+// 1. Reconstruct the camera basis from cam_pos (orbiting origin, world-up).
+// 2. Build a perspective ray through the pixel (row 0 = bottom of the image).
+// 3. Intersect ray with y = 0 plane; if it misses, return sky gradient.
+// 4. Sample heightmap at hit point; finite-difference for the normal.
+// 5. Phong diffuse + specular, blended with Fresnel sky reflection.
+// 6. Write RGBA8 into `out`, the mapped OpenGL PBO (w * h * 4 bytes).
+// ---------------------------------------------------------------------------
+extern "C" __global__
+void render_ocean(cudaTextureObject_t tex,
+                  unsigned char* out,
+                  int w, int h,
+                  float cam_x, float cam_y, float cam_z,
+                  float /*t*/) {
+    int px = blockIdx.x * blockDim.x + threadIdx.x;
+    int py = blockIdx.y * blockDim.y + threadIdx.y;
+    if (px >= w || py >= h) return;
+
+    // ---- Camera basis ----
+    // Forward looks from cam_pos toward origin. World up is +y.
+    // The host clamps |pitch| below pi/2, so forward is never parallel to
+    // world-up and the cross product is well-defined. cam_y may be negative.
+    V3 cam_pos = v3(cam_x, cam_y, cam_z);
+    V3 forward = v_normalize(v_sub(v3(0.0f, 0.0f, 0.0f), cam_pos));
+    V3 world_up = v3(0.0f, 1.0f, 0.0f);
+    V3 right = v_normalize(v_cross(forward, world_up));
+    V3 cam_up = v_cross(right, forward);
+
+    // ---- Pixel ray (perspective) ----
+    float aspect = (float)w / (float)h;
+    float fov = 1.0472f;                 // 60 degrees vertical FoV
+    float scale = tanf(fov * 0.5f);
+    float ndc_x = (2.0f * ((float)px + 0.5f) / (float)w - 1.0f) * aspect * scale;
+    // Row py = 0 of the PBO becomes the BOTTOM row of the GL texture (OpenGL
+    // image origin is bottom-left), so ndc_y must increase with py.
+    float ndc_y = (2.0f * ((float)py + 0.5f) / (float)h - 1.0f) * scale;
+    V3 dir = v_normalize(v_add(v_add(forward,
+                                     v_scale(right, ndc_x)),
+                               v_scale(cam_up, ndc_y)));
+
+    // ---- Background sky if the ray misses the ocean plane ----
+    // The ocean is the y=0 plane; we only count hits with rays going downward
+    // (dir.y < 0). Anything else is sky. A small eps avoids near-horizontal
+    // rays producing absurd hit distances.
+    V3 col;
+    const float HIT_EPS = 1e-3f;
+    if (dir.y > -HIT_EPS) {
+        col = sky_color(dir.y);
+    } else {
+        // ---- Hit the ocean plane ----
+        float t_hit = -cam_y / dir.y;
+        if (t_hit <= 0.0f) {
+            // Camera under the surface -- treat as sky to avoid garbage.
+            col = sky_color(dir.y);
+        } else {
+            V3 p = v_add(cam_pos, v_scale(dir, t_hit));
+
+            // ---- Sample heightmap; estimate normal via finite differences ----
+            // The heightmap tiles every tile_extent() world units (WRAP), so
+            // we use a small world-space epsilon. Four taps -> central
+            // differences in x and z.
+            const float FD = 0.01f;
+            float h_c = sample_height(tex, p.x,       p.z);
+            float h_xp = sample_height(tex, p.x + FD, p.z);
+            float h_xm = sample_height(tex, p.x - FD, p.z);
+            float h_zp = sample_height(tex, p.x,      p.z + FD);
+            float h_zm = sample_height(tex, p.x,      p.z - FD);
+            float dh_dx = (h_xp - h_xm) / (2.0f * FD);
+            float dh_dz = (h_zp - h_zm) / (2.0f * FD);
+            // Normal of the surface y = h(x, z) is (-dh/dx, 1, -dh/dz).
+            V3 N = v_normalize(v3(-dh_dx, 1.0f, -dh_dz));
+
+            // ---- Lighting ----
+            V3 L = v_normalize(v3(0.55f, 0.65f, 0.35f));   // sun: high+side
+            V3 V = v_normalize(v_sub(cam_pos, p));         // view direction
+            // Reflect L about N: R = 2*(N.L)*N - L
+            float ndotl = fmaxf(0.0f, v_dot(N, L));
+            V3 R = v_normalize(v_sub(v_scale(N, 2.0f * v_dot(N, L)), L));
+
+            // Phong specular highlight on wave crests.
+            float spec = powf(fmaxf(0.0f, v_dot(R, V)), 32.0f);
+
+            // Diffuse: deep-sea blue-green.
+            V3 deep = v3(0.04f, 0.18f, 0.28f);
+            V3 shallow = v3(0.10f, 0.32f, 0.42f);
+            // Tiny height-based shading bias so crests look slightly brighter.
+            float tint = 0.5f + 0.5f * fmaxf(-1.0f, fminf(1.0f, h_c * 6.0f));
+            V3 base = v_add(v_scale(deep, 1.0f - tint),
+                            v_scale(shallow, tint));
+
+            // Diffuse term + ambient.
+            V3 diffuse = v_add(v_scale(base, 0.18f), v_scale(base, 0.82f * ndotl));
+
+            // Fresnel-modulated sky reflection. Sample the sky in the
+            // reflected-view direction so reflections of overhead show
+            // overhead colors, etc. View reflection: Rv = 2*(N.V)*N - V.
+            float ndotv = fmaxf(0.0f, v_dot(N, V));
+            V3 Rv = v_normalize(v_sub(v_scale(N, 2.0f * v_dot(N, V)), V));
+            V3 reflected_sky = sky_color(fmaxf(0.0f, Rv.y));
+            float F = powf(1.0f - ndotv, 5.0f);
+            // Clamp to [0, 1]; fmaxf/fminf also replace a NaN with the bound.
+            F = fminf(1.0f, fmaxf(0.0f, F));
+
+            // Blend: more reflection at grazing angles.
+            V3 lit = v_add(v_scale(diffuse, 1.0f - F),
+                           v_scale(reflected_sky, F));
+
+            // Add specular highlight (sun color).
+            V3 sun_col = v3(1.0f, 0.96f, 0.85f);
+            col = v_add(lit, v_scale(sun_col, spec));
+        }
+    }
+
+    // ---- Tonemap + write ----
+    // Simple Reinhard-ish curve keeps highlights in [0, 1].
+    col.x = col.x / (1.0f + col.x);
+    col.y = col.y / (1.0f + col.y);
+    col.z = col.z / (1.0f + col.z);
+
+    int idx = (py * w + px) * 4;
+    out[idx + 0] = (unsigned char)(fmaxf(0.0f, fminf(1.0f, col.x)) * 255.0f);
+    out[idx + 1] = (unsigned char)(fmaxf(0.0f, fminf(1.0f, col.y)) * 255.0f);
+    out[idx + 2] = (unsigned char)(fmaxf(0.0f, fminf(1.0f, col.z)) * 255.0f);
+    out[idx + 3] = 255;
+}
+"""
+
+# GLSL shaders: a pass-through vertex stage and a fragment stage that samples
+# `tex` at the interpolated texcoord. Nothing CUDA-specific here.
+
+VERTEX_SHADER_SOURCE = """#version 330 core
+in vec2 position;
+in vec2 texcoord;
+out vec2 v_texcoord;
+void main() {
+    gl_Position = vec4(position, 0.0, 1.0);
+    v_texcoord = texcoord;
+}
+"""
+
+FRAGMENT_SHADER_SOURCE = """#version 330 core
+in vec2 v_texcoord;
+out vec4 fragColor;
+uniform sampler2D tex;
+void main() {
+    fragColor = texture(tex, v_texcoord);
+}
+"""
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/examples/gl_interop_reaction_diffusion.py b/cuda_core/examples/gl_interop_reaction_diffusion.py
new file mode 100644
index 00000000000..b30603721a1
--- /dev/null
+++ b/cuda_core/examples/gl_interop_reaction_diffusion.py
@@ -0,0 +1,727 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This example demonstrates cuda.core.Array, TextureObject, and SurfaceObject
+# in combination with GraphicsResource for CUDA/OpenGL interop. A Gray-Scott
+# reaction-diffusion simulation is ping-ponged between two CUDA arrays each
+# frame: a TextureObject provides smooth (LINEAR + WRAP) sampled reads, and a
+# SurfaceObject provides typed writes. The final state is colorized straight
+# into an OpenGL PBO. Requires pyglet.
+#
+# ################################################################################
+
+# What this example teaches
+# =========================
+# - How to allocate a CUDA Array with `surface_load_store=True` so the same
+#   memory can be bound as both a TextureObject (for sampled reads) and a
+#   SurfaceObject (for typed writes).
+# - How to use FilterMode.LINEAR + AddressMode.WRAP + normalized coordinates
+#   to get free hardware bilinear interpolation on a toroidal world.
+# - How to compose Array/TextureObject/SurfaceObject with GraphicsResource so
+#   the entire simulation never leaves the GPU.
+#
+# How it works
+# ============
+# Gray-Scott is a two-species (U, V) reaction-diffusion system. At each cell
+# the rule is roughly:
+#
+#     du/dt = Du * laplacian(u) - u*v*v + F*(1 - u)
+#     dv/dt = Dv * laplacian(v) + u*v*v - (F + k)*v
+#
+# Different choices of F and k yield strikingly different patterns: coral,
+# mitosis, spots, and many more. We pack (U, V) into the two channels of a
+# `float2` Array.
+#
+#   PING-PONG (two arrays, swap each step)
+#   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#   +--------------+   tex2D<float2>   +--------------+
+#   |   arr_a      | ----------------> |              |
+#   | (U, V) state |                   |  gray_scott  |
+#   +--------------+                   |    kernel    |
+#                                      |              |
+#   +--------------+   surf2Dwrite     |              |
+#   |   arr_b      | <---------------- |              |
+#   | (U, V) state |                   +--------------+
+#   +--------------+
+#       (swap)
+#
+# Each frame we do N_STEPS iterations of the kernel above, then run a separate
+# `colorize` kernel that samples V from the final state and writes RGBA bytes
+# straight into the OpenGL PBO via GraphicsResource. No data ever travels
+# across the PCIe bus during the frame.
+#
+# Why LINEAR + WRAP + normalized coords?
+# --------------------------------------
+# Addressing modes WRAP and MIRROR are only supported with normalized
+# coordinates (see the CUDA Programming Guide and the SDK's
+# simplePitchLinearTexture sample). We use WRAP so that neighbor lookups at
+# the image edge automatically wrap around -- i.e. a torus. LINEAR filtering
+# is essentially free on the hardware; because we sample at the texel center
+# `(x + 0.5) / W` and step by exactly one texel, every fetch lands on a texel
+# center, so the stencil reads unblended texel values.
+#
+# Channel byte width in surf2Dwrite
+# ---------------------------------
+# `surf2Dwrite` takes the x coordinate in BYTES, not in elements. For a
+# `float2` surface that means `x * sizeof(float2)` = `x * 8`. Getting this
+# wrong silently corrupts every other column.
+#
+# What you should see
+# ===================
+# A window showing animated, organic-looking patterns growing and dividing
+# (think coral, spots, or mitosing cells). Press 1/2/3 to switch presets,
+# R to reseed, and Escape to exit. The window title shows the current FPS
+# and active preset.
+#
+
+# /// script
+# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"]
+# ///
+
+import ctypes
+import sys
+import time
+
+import numpy as np
+
+from cuda.core import (
+    AddressMode,
+    Array,
+    ArrayFormat,
+    Device,
+    FilterMode,
+    GraphicsResource,
+    LaunchConfig,
+    Program,
+    ProgramOptions,
+    ReadMode,
+    ResourceDescriptor,
+    SurfaceObject,
+    TextureDescriptor,
+    TextureObject,
+    launch,
+)
+
+# ---------------------------------------------------------------------------
+# Simulation parameters (feel free to change these)
+# ---------------------------------------------------------------------------
+WIDTH = 512  # simulation grid width in texels; also the window/PBO width
+HEIGHT = 512  # simulation grid height in texels; also the window/PBO height
+N_STEPS = 8  # Gray-Scott iterations per displayed frame
+DU = 0.16  # diffusion rate for U
+DV = 0.08  # diffusion rate for V
+DT = 1.0  # time step (Gray-Scott is stable at 1.0 with these D's)
+
+# Named presets keyed by digit key: (F, k, label) with feed rate F, kill rate k,
+# and a window-title label. These are classic Gray-Scott regimes from the literature.
+PRESETS = {
+    "1": (0.0545, 0.062, "coral"),
+    "2": (0.0367, 0.0649, "mitosis"),
+    "3": (0.030, 0.062, "spots"),
+}
+DEFAULT_PRESET = "1"  # key into PRESETS used until the user picks another
+
+
+# ============================= Helper functions =============================
+#
+# The functions below set up CUDA and OpenGL. If you're here to learn about
+# Array/TextureObject/SurfaceObject, skip ahead to main() -- the interesting
+# part is there. These helpers exist so that main() reads like a short story
+# instead of a wall of boilerplate.
+# ============================================================================
+
+
+def setup_cuda():
+    """Prepare the device, a stream, and the compiled kernels for the demo.
+
+    Returns (device, stream, kernels, configs); `kernels` and `configs` are
+    dicts that share the keys "step", "colorize", and "seed".
+    Exits with status 1 if the GPU's compute capability is below 3.0.
+    """
+    dev = Device(0)
+    dev.set_current()
+
+    # Surface load/store has existed since SM 2.0, but the bindless surface
+    # objects used here (cuSurfObjectCreate) require SM 3.0+.
+    capability = dev.compute_capability
+    if capability.major < 3:
+        message = (
+            "This example requires a GPU with compute capability >= 3.0 for "
+            f"bindless surface objects. Found sm_{capability.major}{capability.minor}."
+        )
+        print(message, file=sys.stderr)
+        sys.exit(1)
+
+    stream = dev.create_stream()
+
+    # Build as C++ so that the templated tex2D<float2> overload resolves.
+    options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
+    module = Program(KERNEL_SOURCE, code_type="c++", options=options).compile(
+        "cubin",
+        name_expressions=("gray_scott_step", "colorize", "seed_initial"),
+    )
+
+    # Short dict key -> name of the extern "C" kernel it refers to.
+    entry_points = {
+        "step": "gray_scott_step",
+        "colorize": "colorize",
+        "seed": "seed_initial",
+    }
+    kernels = {key: module.get_kernel(name) for key, name in entry_points.items()}
+
+    # Every kernel is pixel-parallel over the WIDTH x HEIGHT grid, so one
+    # launch config (16x16 thread blocks, ceil-divided grid) serves them all.
+    tile = (16, 16, 1)
+    tiles_x = (WIDTH + tile[0] - 1) // tile[0]
+    tiles_y = (HEIGHT + tile[1] - 1) // tile[1]
+    shared_config = LaunchConfig(grid=(tiles_x, tiles_y, 1), block=tile)
+    configs = dict.fromkeys(("step", "colorize", "seed"), shared_config)
+
+    return dev, stream, kernels, configs
+
+
+def create_window():
+    """Import pyglet lazily and open the display window.
+
+    Returns (window, gl_module, pyglet). If pyglet cannot be imported, an
+    install hint is printed to stderr and the process exits with status 1.
+    """
+    try:
+        import pyglet
+        from pyglet.gl import gl as gl_module
+    except ImportError:
+        hint = "This example requires pyglet >= 2.0.\nInstall it with:  pip install pyglet"
+        print(hint, file=sys.stderr)
+        sys.exit(1)
+
+    title = "cuda.core Array/Texture/Surface - Gray-Scott Reaction Diffusion"
+    window = pyglet.window.Window(
+        WIDTH, HEIGHT, caption=title, vsync=False
+    )
+    return window, gl_module, pyglet
+
+
+def create_display_resources(gl, width, height):
+    """Create the GL objects needed to show a texture on screen.
+
+    Builds a shader program, a fullscreen quad (VAO + VBO), and an empty
+    `width` x `height` RGBA8 texture through the GL module `gl`. None of this
+    is CUDA-specific -- it's standard OpenGL boilerplate for a textured quad.
+
+    Returns (shader_program, vertex_array_id, texture_id). The shader_program
+    is a pyglet ShaderProgram object (must be kept alive).
+    """
+    from pyglet.graphics.shader import Shader, ShaderProgram
+
+    # Shader program -- just passes texture coordinates through
+    vert = Shader(VERTEX_SHADER_SOURCE, "vertex")
+    frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment")
+    shader_prog = ShaderProgram(vert, frag)
+
+    # Fullscreen quad (two triangles covering the entire window)
+    quad_verts = np.array(
+        [
+            # Four floats per vertex, one per line: x, y (position), s, t (texcoord)
+            -1,  # vertex 0 (triangle 1): bottom-left
+            -1,
+            0,
+            0,
+            1,  # vertex 1 (triangle 1): bottom-right
+            -1,
+            1,
+            0,
+            1,  # vertex 2 (triangle 1): top-right
+            1,
+            1,
+            1,
+            -1,  # vertex 3 (triangle 2): bottom-left
+            -1,
+            0,
+            0,
+            1,  # vertex 4 (triangle 2): top-right
+            1,
+            1,
+            1,
+            -1,  # vertex 5 (triangle 2): top-left
+            1,
+            0,
+            1,
+        ],
+        dtype=np.float32,
+    )
+
+    vao = ctypes.c_uint(0)
+    gl.glGenVertexArrays(1, ctypes.byref(vao))
+    gl.glBindVertexArray(vao.value)  # attribute setup below happens with this VAO bound
+
+    vbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(vbo))
+    gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value)
+    gl.glBufferData(
+        gl.GL_ARRAY_BUFFER,
+        quad_verts.nbytes,
+        quad_verts.ctypes.data_as(ctypes.c_void_p),
+        gl.GL_STATIC_DRAW,
+    )
+
+    stride = 4 * 4  # 4 floats * 4 bytes each = 16 bytes per vertex
+    pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position")  # x, y at byte offset 0
+    gl.glEnableVertexAttribArray(pos_loc)
+    gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0))
+
+    tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord")  # s, t at byte offset 8
+    gl.glEnableVertexAttribArray(tc_loc)
+    gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8))
+
+    gl.glBindVertexArray(0)
+
+    # Empty texture (will be filled each frame from the PBO)
+    tex = ctypes.c_uint(0)
+    gl.glGenTextures(1, ctypes.byref(tex))
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_LINEAR)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_LINEAR)
+    gl.glTexImage2D(
+        gl.GL_TEXTURE_2D,
+        0,
+        gl.GL_RGBA8,
+        width,
+        height,
+        0,
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,
+    )
+
+    return shader_prog, vao.value, tex.value
+
+
+def create_pixel_buffer(gl, width, height):
+    """Create a Pixel Buffer Object (PBO) -- the bridge between CUDA and OpenGL.
+
+    A PBO is a GPU-side buffer that OpenGL can read from when uploading pixels
+    to a texture. By registering this same buffer with CUDA, the CUDA kernel
+    can write directly into it.
+
+    Returns (pbo_gl_name, size_in_bytes); the PBO is unbound again on return.
+    """
+    pbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(pbo))
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo.value)
+    nbytes = width * height * 4  # RGBA, 1 byte per channel
+    gl.glBufferData(gl.GL_PIXEL_UNPACK_BUFFER, nbytes, None, gl.GL_DYNAMIC_DRAW)  # no initial data
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)  # restore the "no PBO bound" state
+    return pbo.value, nbytes
+
+
+def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height):
+    """Copy `width` x `height` RGBA pixels from the PBO into the GL texture (GPU-to-GPU)."""
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo_id)  # source of the upload
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)  # destination of the upload
+    gl.glTexSubImage2D(
+        gl.GL_TEXTURE_2D,
+        0,
+        0,
+        0,
+        width,
+        height,
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,  # None = read from the currently bound PBO, not from CPU
+    )
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)  # unbind the PBO again
+
+
+def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id):
+    """Draw texture `tex_id` to the screen with `shader_prog` and the quad in `vao_id`."""
+    gl.glUseProgram(shader_prog.id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glBindVertexArray(vao_id)
+    gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6)  # 6 vertices = the quad's two triangles
+    gl.glBindVertexArray(0)  # unbind the VAO ...
+    gl.glUseProgram(0)  # ... and the shader program
+
+
+def make_state_arrays():
+    """Create both ping-pong `float2` arrays that hold the (U, V) state.
+
+    Returns the pair (arr_a, arr_b); the two arrays are allocated from the
+    same descriptor.
+    """
+    descriptor = {
+        "shape": (WIDTH, HEIGHT),
+        "format": ArrayFormat.FLOAT32,
+        "num_channels": 2,
+        "surface_load_store": True,
+    }
+    first = Array.from_descriptor(**descriptor)
+    second = Array.from_descriptor(**descriptor)
+    return first, second
+
+
+def make_texture(arr):
+    """Wrap `arr` in a TextureObject set up for LINEAR + WRAP + normalized coords."""
+    # WRAP/MIRROR addressing modes require normalized coordinates.
+    sampling = TextureDescriptor(
+        address_mode=AddressMode.WRAP,
+        filter_mode=FilterMode.LINEAR,
+        read_mode=ReadMode.ELEMENT_TYPE,
+        normalized_coords=True,
+    )
+    source = ResourceDescriptor.from_array(arr)
+    return TextureObject.from_descriptor(resource=source, texture_descriptor=sampling)
+
+
+def seed_state(stream, kernels, configs, write_surf, seed_value):
+    """Launch the seed kernel to reset the array behind `write_surf`.
+
+    `write_surf` must be a long-lived SurfaceObject, not a temporary one:
+    `launch` is async, so a surface created in a `with` block that exits
+    right after `launch` returns would have its handle destroyed before the
+    kernel actually runs against it.
+    """
+    # Scalars are wrapped in fixed-width NumPy types, in the order of the
+    # seed kernel's parameter list (surface, width, height, seed).
+    kernel_args = (
+        np.uint64(write_surf.handle),
+        np.int32(WIDTH),
+        np.int32(HEIGHT),
+        np.uint32(seed_value),
+    )
+    launch(stream, configs["seed"], kernels["seed"], *kernel_args)
+
+
+# ================================== main() ==================================
+
+
+def main():
+    """Run the interactive Gray-Scott demo until the window is closed.
+
+    Sets up CUDA and OpenGL, seeds the simulation, then hands control to
+    pyglet's event loop. Keys: 1/2/3 select a preset, R reseeds, Escape exits.
+    """
+    # --- Step 1: Set up CUDA (compile kernels, create stream) ---
+    dev, stream, kernels, configs = setup_cuda()
+
+    # --- Step 2: Open a window ---
+    window, gl, pyglet = create_window()
+
+    # --- Step 3: Create GL resources for drawing a texture to screen ---
+    #     (Standard OpenGL boilerplate -- not CUDA-specific.)
+    shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT)
+
+    # --- Step 4: Create the Pixel Buffer Object (PBO) ---
+    #     The PBO is GPU memory owned by OpenGL. It's the bridge between the
+    #     two worlds: CUDA writes into it, OpenGL reads from it.
+    pbo_id, _ = create_pixel_buffer(gl, WIDTH, HEIGHT)
+
+    # --- Step 5: Register the PBO with CUDA ---
+    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")
+
+    # --- Step 6: Allocate the two ping-pong state Arrays ---
+    #     Both are `float2` (channel 0 = U, channel 1 = V) with
+    #     surface_load_store=True so they can be bound as SurfaceObjects.
+    arr_a, arr_b = make_state_arrays()
+
+    # --- Step 7: Pre-create the four bindless handles ---
+    #     Creating them once is much cheaper than recreating them
+    #     every step. We keep both texture and surface handles for each
+    #     array; the simulation loop just picks which pair to use.
+    tex_a = make_texture(arr_a)
+    tex_b = make_texture(arr_b)
+    surf_a = SurfaceObject.from_array(arr_a)
+    surf_b = SurfaceObject.from_array(arr_b)
+
+    # --- Step 8: Seed the initial state into arr_a (writes via surf_a) ---
+    seed_state(stream, kernels, configs, surf_a, seed_value=0)
+    # After seeding, `arr_a` is the "current" state.
+    state = {"current": "a", "preset": DEFAULT_PRESET, "seed": 0}
+
+    # --- Step 9: Render loop ---
+    frame_count = 0
+    fps_time = time.monotonic()
+
+    def current_read_write():
+        if state["current"] == "a":
+            return tex_a, surf_b, "b"  # read a, write b, next current = b
+        return tex_b, surf_a, "a"
+
+    @window.event
+    def on_key_press(symbol, _modifiers):
+        key = pyglet.window.key
+        if symbol == key.ESCAPE:
+            window.close()
+            return
+        if symbol == key.R:
+            state["seed"] += 1
+            seed_state(stream, kernels, configs, surf_a, seed_value=state["seed"])
+            state["current"] = "a"
+            return
+        preset = {key._1: "1", key._2: "2", key._3: "3"}.get(symbol)
+        if preset is not None:
+            state["preset"] = preset
+
+    @window.event
+    def on_draw():
+        nonlocal frame_count, fps_time
+
+        window.clear()
+        F, k, _label = PRESETS[state["preset"]]
+
+        # (a) Run N_STEPS Gray-Scott iterations. Each step reads from one
+        #     array via a TextureObject (LINEAR + WRAP gives wrapping +
+        #     bilinear sampling) and writes to the other via a SurfaceObject.
+        for _ in range(N_STEPS):
+            tex_read, surf_write, next_current = current_read_write()
+            launch(
+                stream,
+                configs["step"],
+                kernels["step"],
+                np.uint64(tex_read.handle),
+                np.uint64(surf_write.handle),
+                np.int32(WIDTH),
+                np.int32(HEIGHT),
+                np.float32(DU),
+                np.float32(DV),
+                np.float32(F),
+                np.float32(k),
+                np.float32(DT),
+            )
+            state["current"] = next_current
+
+        # (b) Colorize the latest state into the OpenGL PBO.
+        tex_read = tex_a if state["current"] == "a" else tex_b
+        with resource.map(stream=stream) as buf:
+            launch(
+                stream,
+                configs["colorize"],
+                kernels["colorize"],
+                np.uint64(tex_read.handle),
+                buf.handle,
+                np.int32(WIDTH),
+                np.int32(HEIGHT),
+            )
+        # Unmap happens automatically when the `with` block exits.
+
+        # (c) Tell OpenGL to copy the PBO contents into our texture.
+        copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT)
+
+        # (d) Draw the texture to the screen.
+        draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id)
+
+        # FPS counter (shown in window title)
+        frame_count += 1
+        now = time.monotonic()
+        if now - fps_time >= 1.0:
+            fps = frame_count / (now - fps_time)
+            label = PRESETS[state["preset"]][2]
+            window.set_caption(
+                "cuda.core Array/Texture/Surface - Gray-Scott"
+                f" [{label}] ({WIDTH}x{HEIGHT}, {fps:.0f} FPS,"
+                f" {N_STEPS} steps/frame)"
+            )
+            frame_count = 0
+            fps_time = now
+
+    @window.event
+    def on_close():
+        # Launches are async, so drain the stream before destroying handles
+        # that queued kernels may still use. pyglet owns the event loop here,
+        # so we release explicitly instead of relying on context managers.
+        stream.sync()
+        resource.close()
+        tex_a.close()
+        tex_b.close()
+        surf_a.close()
+        surf_b.close()
+        arr_a.close()
+        arr_b.close()
+        stream.close()
+
+    pyglet.app.run(interval=0)
+
+
+# ======================== GPU code (CUDA + GLSL) ============================
+#
+# These source strings are kept at the bottom of the file so they don't
+# distract from the Python logic above. The important things to know:
+#
+#   - KERNEL_SOURCE contains three CUDA C++ kernels:
+#       * seed_initial   -- sets initial (U, V) state via SurfaceObject writes
+#       * gray_scott_step -- reads previous state via TextureObject (with
+#                            LINEAR + WRAP bilinear filtering) and writes the
+#                            next state via SurfaceObject. Coordinates are
+#                            normalized to [0, 1] because WRAP requires it.
+#       * colorize       -- reads the V channel via TextureObject and writes
+#                            RGBA bytes into the OpenGL PBO using a simple
+#                            three-stop "magma-ish" gradient.
+#
+#   - VERTEX_SHADER_SOURCE / FRAGMENT_SHADER_SOURCE are GLSL. They draw a
+#     texture onto a rectangle covering the entire window. Nothing interesting.
+#
+# ============================================================================
+
+KERNEL_SOURCE = r"""
+// The texture-sampling kernels compute the inverse texture dimensions from
+// their integer width/height arguments and use them to convert integer pixel
+// coordinates to normalized texture coordinates with one multiply per axis.
+
+extern "C"
+__global__
+void seed_initial(cudaSurfaceObject_t surf,
+                  int width, int height,
+                  unsigned int seed) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;   // skip threads outside the image
+
+    // Background is U = 1, V = 0. A 40x40 centered square gets V = 1, U = 0.5,
+    // and every V receives a small deterministic jitter that depends on `seed`.
+    float u = 1.0f;
+    float v = 0.0f;
+
+    int half_w = width / 2;
+    int half_h = height / 2;
+    if (x >= half_w - 20 && x < half_w + 20 &&
+        y >= half_h - 20 && y < half_h + 20) {
+        v = 1.0f;
+        // Knock U down a bit inside the seed square so V can grow.
+        u = 0.5f;
+    }
+
+    // Cheap deterministic pseudo-random noise (multiply/xor-shift hash of x, y, seed).
+    unsigned int h = (unsigned int)x * 374761393u +
+                     (unsigned int)y * 668265263u + seed * 2246822519u;
+    h = (h ^ (h >> 13)) * 1274126177u;
+    h = h ^ (h >> 16);
+    float noise = (h & 0xffffu) / 65535.0f;   // in [0, 1]
+    v += 0.02f * (noise - 0.5f);              // jitter in [-0.01, 0.01]
+    if (v < 0.0f) v = 0.0f;
+    if (v > 1.0f) v = 1.0f;
+
+    // float2 is 8 bytes; surf2Dwrite takes the x offset in BYTES.
+    surf2Dwrite(make_float2(u, v), surf, x * (int)sizeof(float2), y);
+}
+
+extern "C"
+__global__
+void gray_scott_step(cudaTextureObject_t tex,
+                     cudaSurfaceObject_t surf,
+                     int width, int height,
+                     float Du, float Dv,
+                     float F, float k_kill,
+                     float dt) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;   // skip threads outside the image
+
+    // Normalized coordinates: WRAP addressing only works in normalized mode.
+    // Each texel center sits at ((i + 0.5) / W, (j + 0.5) / H).
+    float inv_w = 1.0f / (float)width;
+    float inv_h = 1.0f / (float)height;
+    float cx = (x + 0.5f) * inv_w;
+    float cy = (y + 0.5f) * inv_h;
+
+    // 5-point Laplacian stencil. LINEAR filtering does nothing extra here
+    // because the offsets land exactly on neighboring texel centers, but the
+    // toroidal WRAP at the boundary is essential for a periodic world.
+    float2 c = tex2D<float2>(tex, cx, cy);             // center
+    float2 l = tex2D<float2>(tex, cx - inv_w, cy);     // left neighbor
+    float2 r = tex2D<float2>(tex, cx + inv_w, cy);     // right neighbor
+    float2 u_n = tex2D<float2>(tex, cx, cy - inv_h);   // neighbor at y - 1
+    float2 d_n = tex2D<float2>(tex, cx, cy + inv_h);   // neighbor at y + 1
+
+    float lap_u = (l.x + r.x + u_n.x + d_n.x) - 4.0f * c.x;   // .x channel holds U
+    float lap_v = (l.y + r.y + u_n.y + d_n.y) - 4.0f * c.y;   // .y channel holds V
+
+    float u = c.x;
+    float v = c.y;
+    float uvv = u * v * v;   // reaction term shared by both equations
+
+    float du = Du * lap_u - uvv + F * (1.0f - u);
+    float dv = Dv * lap_v + uvv - (F + k_kill) * v;
+
+    float new_u = u + dt * du;   // explicit Euler update
+    float new_v = v + dt * dv;
+
+    // Clamp to [0, 1] to keep things numerically sane after long runs.
+    if (new_u < 0.0f) new_u = 0.0f;
+    if (new_u > 1.0f) new_u = 1.0f;
+    if (new_v < 0.0f) new_v = 0.0f;
+    if (new_v > 1.0f) new_v = 1.0f;
+
+    surf2Dwrite(make_float2(new_u, new_v), surf,
+                x * (int)sizeof(float2), y);   // x offset is in bytes
+}
+
+extern "C"
+__global__
+void colorize(cudaTextureObject_t tex,
+              unsigned char* output,
+              int width, int height) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;   // skip threads outside the image
+
+    float inv_w = 1.0f / (float)width;
+    float inv_h = 1.0f / (float)height;
+    float cx = (x + 0.5f) * inv_w;   // normalized texel-center coordinates
+    float cy = (y + 0.5f) * inv_h;
+
+    float2 c = tex2D<float2>(tex, cx, cy);
+    float v = c.y;   // only the V channel drives the color
+    if (v < 0.0f) v = 0.0f;
+    if (v > 1.0f) v = 1.0f;
+
+    // Three-stop "magma-ish" gradient: dark purple -> orange -> pale yellow.
+    // Implemented as two linear interpolations stitched together at v = 0.5
+    // so the result is reasonably perceptually smooth without a lookup table.
+    float r, g, b;
+    if (v < 0.5f) {
+        float t = v * 2.0f;                  // [0, 1] over the low half
+        r = 0.05f + t * (0.85f - 0.05f);
+        g = 0.02f + t * (0.30f - 0.02f);
+        b = 0.20f + t * (0.10f - 0.20f);
+    } else {
+        float t = (v - 0.5f) * 2.0f;         // [0, 1] over the high half
+        r = 0.85f + t * (1.00f - 0.85f);
+        g = 0.30f + t * (0.95f - 0.30f);
+        b = 0.10f + t * (0.70f - 0.10f);
+    }
+
+    int idx = (y * width + x) * 4;   // 4 bytes (RGBA) per pixel, row-major
+    output[idx + 0] = (unsigned char)(r * 255.0f);
+    output[idx + 1] = (unsigned char)(g * 255.0f);
+    output[idx + 2] = (unsigned char)(b * 255.0f);
+    output[idx + 3] = 255;   // fully opaque alpha
+}
+"""
+
+# GLSL shaders -- a pass-through vertex stage and a fragment stage that samples
+# `tex`; together they show a texture on a fullscreen quad. Nothing CUDA-specific.
+
+VERTEX_SHADER_SOURCE = """#version 330 core
+in vec2 position;
+in vec2 texcoord;
+out vec2 v_texcoord;
+void main() {
+    gl_Position = vec4(position, 0.0, 1.0);
+    v_texcoord = texcoord;
+}
+"""
+
+FRAGMENT_SHADER_SOURCE = """#version 330 core
+in vec2 v_texcoord;
+out vec4 fragColor;
+uniform sampler2D tex;
+void main() {
+    fragColor = texture(tex, v_texcoord);
+}
+"""
+
+
+if __name__ == "__main__":  # run the example only when executed as a script
+    main()
diff --git a/cuda_core/examples/gl_interop_sdf_volume.py b/cuda_core/examples/gl_interop_sdf_volume.py
new file mode 100644
index 00000000000..05299cc278f
--- /dev/null
+++ b/cuda_core/examples/gl_interop_sdf_volume.py
@@ -0,0 +1,827 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This example demonstrates cuda.core's 3D Array + trilinear TextureObject by
+# baking a procedural Signed Distance Field (SDF) volume once at startup and
+# then ray-marching it every frame to render an orbitable 3D scene. The
+# SurfaceObject is used during the one-shot bake; the TextureObject (with
+# LINEAR + CLAMP + normalized coords) drives the per-frame ray march. The
+# whole pipeline stays on the GPU through GraphicsResource. Requires pyglet.
+#
+# ################################################################################
+
+# What this example teaches
+# =========================
+# - How to allocate a 3D cuda.core.Array (cuArray3DCreate under the hood) and
+#   bind it as both a SurfaceObject (for one-shot kernel writes) and a
+#   TextureObject (for hardware-accelerated trilinear sampling).
+# - How to ray-march a baked SDF volume from a CUDA kernel, sampling via
+#   tex3D<float> and writing pixels straight into an OpenGL PBO.
+# - How to wire mouse + keyboard input into a pyglet/cuda.core interop loop.
+#
+# How it works
+# ============
+# The signed distance field of a "gyroid intersected with a sphere" is baked
+# once into a 128 x 128 x 128 single-channel float volume:
+#
+#     gyroid(p)   = sin(p.x*tau)cos(p.y*tau)
+#                 + sin(p.y*tau)cos(p.z*tau)
+#                 + sin(p.z*tau)cos(p.x*tau)
+#     sdf_gyroid  = |gyroid(p)| - 0.20         # slab around the gyroid surface
+#     sdf_sphere  = length(p) - 0.9            # bounding sphere
+#     sdf(p)      = max(sdf_gyroid, sdf_sphere) # CSG intersection
+#
+# where p in [-1, 1]^3 is the voxel's world-space position.
+#
+# Each frame, the render kernel emits one ray per pixel from an orbiting
+# camera and marches the volume in fixed steps of 1/128 world unit (half a
+# voxel), up to a step cap. On a hit it computes a normal by central
+# differences of tex3D and applies a simple diffuse + ambient + specular
+# shade. Misses fall back to a vertical sky gradient.
+#
+#   STARTUP (one-shot bake)
+#   ~~~~~~~~~~~~~~~~~~~~~~~
+#   1. Allocate 3D Array (128^3, FLOAT32 x1, surface_load_store=True).
+#   2. Bind it as a SurfaceObject.
+#   3. Launch `bake_sdf`: one thread per voxel writes the SDF via surf3Dwrite.
+#   4. Close the SurfaceObject; the Array stays alive.
+#
+#   EACH FRAME
+#   ~~~~~~~~~~
+#   1. resource.map() -> CUDA device pointer into the OpenGL PBO.
+#   2. Launch `render_sdf` (one thread per pixel). It samples the SDF via the
+#      long-lived TextureObject (LINEAR + CLAMP + normalized coords) using
+#      tex3D<float>. RGBA8 lands directly in the PBO.
+#   3. Unmap, GPU-side copy PBO -> texture, draw fullscreen quad.
+#
+# Controls
+# ========
+#   Left mouse drag    orbit camera (dx -> yaw, dy -> pitch)
+#   Mouse wheel        zoom (camera distance)
+#   R                  reset camera (yaw=0, pitch=0.3, dist=2.5)
+#   Escape / close     quit
+#
+# The window title shows yaw, pitch, distance, FPS, and ms/frame.
+#
+
+# /// script
+# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"]
+# ///
+
+import ctypes
+import math
+import sys
+import time
+
+import numpy as np
+
+from cuda.core import (
+    AddressMode,
+    Array,
+    ArrayFormat,
+    Device,
+    FilterMode,
+    GraphicsResource,
+    LaunchConfig,
+    Program,
+    ProgramOptions,
+    ReadMode,
+    ResourceDescriptor,
+    SurfaceObject,
+    TextureDescriptor,
+    TextureObject,
+    launch,
+)
+
+# ---------------------------------------------------------------------------
+# Configuration (feel free to change these)
+# ---------------------------------------------------------------------------
+WIDTH = 800  # window, PBO and display-texture width in pixels
+HEIGHT = 600  # window, PBO and display-texture height in pixels
+VOLUME_SIZE = 128  # voxels per axis (128^3, baked once). NOTE: render_sdf hard-codes 128 in STEP and NEPS.
+
+# Camera defaults / clamps. The angles feed cosf/sinf in render_sdf (radians).
+RESET_YAW = 0.0  # startup camera, also restored by the R key
+RESET_PITCH = 0.3
+RESET_DIST = 2.5
+PITCH_MIN = -1.45    # stay inside (-pi/2, pi/2) so the up-vector stays sane.
+PITCH_MAX = 1.45
+DIST_MIN = 1.2  # zoom limits applied by the mouse-wheel handler
+DIST_MAX = 8.0
+
+
+# ============================= Helper functions =============================
+#
+# The functions below set up CUDA and OpenGL. If you're here to learn about
+# 3D Array / TextureObject / SurfaceObject, skip ahead to main() -- the
+# interesting part is there. These helpers exist so that main() reads like a
+# short story instead of a wall of boilerplate.
+# ============================================================================
+
+
+def _check_compute_capability(dev):
+    """3D arrays + bindless surface/texture objects require sm_30+."""
+    cc = dev.compute_capability
+    if cc.major < 3:
+        print(
+            f"This example requires compute capability >= 3.0, "
+            f"got sm_{cc.major}{cc.minor}.",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+
+def setup_cuda():
+    """Compile the two kernels and return (device, stream, kernels)."""
+    dev = Device(0)
+    dev.set_current()
+    _check_compute_capability(dev)
+    stream = dev.create_stream()
+
+    # C++ is required so the templated tex3D<float> / surf3Dwrite<float>
+    # overloads resolve. extern "C" on the kernel symbols keeps the function
+    # names unmangled even when the rest of the TU is compiled as C++.
+    program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
+    prog = Program(KERNEL_SOURCE, code_type="c++", options=program_options)
+    mod = prog.compile(
+        "cubin",
+        name_expressions=("bake_sdf", "render_sdf"),
+    )
+    kernels = {
+        "bake": mod.get_kernel("bake_sdf"),
+        "render": mod.get_kernel("render_sdf"),
+    }
+    return dev, stream, kernels
+
+
+def make_volume_array():
+    """Allocate the 3D SDF volume. Single-channel float, surface-capable."""
+    return Array.from_descriptor(
+        shape=(VOLUME_SIZE, VOLUME_SIZE, VOLUME_SIZE),
+        format=ArrayFormat.FLOAT32,
+        num_channels=1,
+        surface_load_store=True,
+    )
+
+
+def make_volume_texture(arr):
+    """Bind `arr` as a TextureObject configured for LINEAR + CLAMP + normalized.
+
+    Normalized coords let the kernel sample as (u, v, w) in [0, 1]; CLAMP at
+    the boundaries matches the rendering logic that bails out as soon as the
+    march leaves the volume's [-1, 1]^3 box, so out-of-range sampling never
+    pollutes a real hit.
+    """
+    res_desc = ResourceDescriptor.from_array(arr)
+    tex_desc = TextureDescriptor(
+        address_mode=AddressMode.CLAMP,
+        filter_mode=FilterMode.LINEAR,
+        read_mode=ReadMode.ELEMENT_TYPE,
+        normalized_coords=True,
+    )
+    return TextureObject.from_descriptor(resource=res_desc, texture_descriptor=tex_desc)
+
+
+def bake_volume(stream, kernels, arr):
+    """Run the one-shot bake kernel that fills the volume with the SDF.
+
+    The SurfaceObject lives only for the duration of this call; once the bake
+    is enqueued and the kernel has captured the bindless handle into its
+    arguments, we sync the stream before letting the SurfaceObject close.
+    The Array itself outlives this scope -- it's the long-lived backing store
+    for the render-loop TextureObject.
+    """
+    with SurfaceObject.from_array(arr) as bake_surf:
+        block = (8, 8, 8)
+        grid = (
+            (VOLUME_SIZE + block[0] - 1) // block[0],
+            (VOLUME_SIZE + block[1] - 1) // block[1],
+            (VOLUME_SIZE + block[2] - 1) // block[2],
+        )
+        launch(
+            stream,
+            LaunchConfig(grid=grid, block=block),
+            kernels["bake"],
+            np.uint64(bake_surf.handle),
+            np.int32(VOLUME_SIZE),
+        )
+        # Synchronize before the SurfaceObject context exits so the bindless
+        # handle is still valid while the kernel runs.
+        stream.sync()
+
+
+def create_window():
+    """Open a pyglet window and return (window, gl_module, pyglet)."""
+    try:
+        import pyglet
+        from pyglet.gl import gl as _gl
+    except ImportError:
+        print(
+            "This example requires pyglet >= 2.0.\nInstall it with:  pip install pyglet",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    window = pyglet.window.Window(
+        WIDTH,
+        HEIGHT,
+        caption="cuda.core 3D Array - SDF Volume Ray-Marcher",
+        vsync=False,
+    )
+    return window, _gl, pyglet
+
+
+def create_display_resources(gl, width, height):
+    """Standard GL boilerplate: shader, fullscreen quad, empty texture.
+
+    Not CUDA-specific; identical to the other gl_interop_* examples.
+    Returns (shader_program, vertex_array_id, texture_id).
+    """
+    from pyglet.graphics.shader import Shader, ShaderProgram
+
+    vert = Shader(VERTEX_SHADER_SOURCE, "vertex")
+    frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment")
+    shader_prog = ShaderProgram(vert, frag)
+
+    quad_verts = np.array(
+        [
+            # x,  y,    s, t      (position + texture coordinate)
+            -1, -1, 0, 0,
+             1, -1, 1, 0,
+             1,  1, 1, 1,
+            -1, -1, 0, 0,
+             1,  1, 1, 1,
+            -1,  1, 0, 1,
+        ],
+        dtype=np.float32,
+    )
+
+    vao = ctypes.c_uint(0)
+    gl.glGenVertexArrays(1, ctypes.byref(vao))
+    gl.glBindVertexArray(vao.value)
+
+    vbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(vbo))
+    gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value)
+    gl.glBufferData(
+        gl.GL_ARRAY_BUFFER,
+        quad_verts.nbytes,
+        quad_verts.ctypes.data_as(ctypes.c_void_p),
+        gl.GL_STATIC_DRAW,
+    )
+
+    stride = 4 * 4  # 4 floats * 4 bytes each
+    pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position")
+    gl.glEnableVertexAttribArray(pos_loc)
+    gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0))
+
+    tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord")
+    gl.glEnableVertexAttribArray(tc_loc)
+    gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8))
+
+    gl.glBindVertexArray(0)
+
+    tex = ctypes.c_uint(0)
+    gl.glGenTextures(1, ctypes.byref(tex))
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_LINEAR)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_LINEAR)
+    gl.glTexImage2D(
+        gl.GL_TEXTURE_2D,
+        0,
+        gl.GL_RGBA8,
+        width,
+        height,
+        0,
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,
+    )
+
+    return shader_prog, vao.value, tex.value
+
+
+def create_pixel_buffer(gl, width, height):
+    """Create a Pixel Buffer Object (PBO) -- the CUDA/GL bridge.
+
+    Returns (pbo_gl_name, size_in_bytes).
+    """
+    pbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(pbo))
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo.value)
+    nbytes = width * height * 4  # RGBA8
+    gl.glBufferData(gl.GL_PIXEL_UNPACK_BUFFER, nbytes, None, gl.GL_DYNAMIC_DRAW)
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)
+    return pbo.value, nbytes
+
+
+def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height):
+    """Copy pixel data from the PBO into the GL texture (GPU-to-GPU)."""
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo_id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glTexSubImage2D(
+        gl.GL_TEXTURE_2D,
+        0,
+        0,
+        0,
+        width,
+        height,
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,
+    )
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)
+
+
+def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id):
+    """Draw the texture to the screen using the fullscreen quad."""
+    gl.glUseProgram(shader_prog.id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glBindVertexArray(vao_id)
+    gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6)
+    gl.glBindVertexArray(0)
+    gl.glUseProgram(0)
+
+
+# ================================== main() ==================================
+
+
+def main():
+    # --- Step 1: Set up CUDA (compile kernels, create stream) ---
+    dev, stream, kernels = setup_cuda()
+
+    # --- Step 2: Allocate the 3D SDF volume and bake it once ---
+    #     The Array is the long-lived backing store; it must outlive the
+    #     render loop. The SurfaceObject is only needed for the one-shot bake
+    #     and is closed before we ever bind a TextureObject to the same Array.
+    arr = make_volume_array()
+    bake_volume(stream, kernels, arr)
+
+    # --- Step 3: Bind the volume as a trilinear TextureObject ---
+    #     LINEAR + CLAMP + normalized_coords gives us free hardware trilinear
+    #     filtering, which is exactly what we want for both the SDF samples
+    #     in the ray march and the normal-finite-difference samples.
+    volume_tex = make_volume_texture(arr)
+
+    # --- Step 4: Open a window and set up the CUDA/GL bridge ---
+    window, gl, pyglet = create_window()
+    shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT)
+    pbo_id, _ = create_pixel_buffer(gl, WIDTH, HEIGHT)
+    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")
+
+    # --- Step 5: Render loop state ---
+    # Camera is orbit-style: yaw and pitch are angles, dist is the orbit
+    # radius. The render kernel turns these into a (origin, basis) and
+    # constructs per-pixel rays itself.
+    cam = {
+        "yaw": RESET_YAW,
+        "pitch": RESET_PITCH,
+        "dist": RESET_DIST,
+    }
+    frame_count = [0]
+    fps_time = [time.monotonic()]
+    last_fps = [0.0]
+    last_frame_ms = [0.0]
+
+    block = (16, 16, 1)
+    grid = (
+        (WIDTH + block[0] - 1) // block[0],
+        (HEIGHT + block[1] - 1) // block[1],
+        1,
+    )
+    config = LaunchConfig(grid=grid, block=block)
+
+    @window.event
+    def on_draw():
+        window.clear()
+
+        # (a) Map the PBO so CUDA can write into it.
+        with resource.map(stream=stream) as buf:
+            # (b) Launch the ray-march kernel. The camera params are passed
+            #     as scalars; the kernel computes the orbit eye position and
+            #     per-pixel ray direction itself.
+            launch(
+                stream,
+                config,
+                kernels["render"],
+                buf.handle,
+                np.int32(WIDTH),
+                np.int32(HEIGHT),
+                np.uint64(volume_tex.handle),
+                np.float32(cam["yaw"]),
+                np.float32(cam["pitch"]),
+                np.float32(cam["dist"]),
+            )
+        # (c) Unmap happens automatically; cuGraphicsUnmapResources serializes
+        #     the CUDA work against subsequent OpenGL use.
+
+        copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT)
+        draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id)
+
+        frame_count[0] += 1
+        now = time.monotonic()
+        if now - fps_time[0] >= 0.5:
+            last_fps[0] = frame_count[0] / (now - fps_time[0])
+            last_frame_ms[0] = 1000.0 / last_fps[0] if last_fps[0] > 0 else 0.0
+            frame_count[0] = 0
+            fps_time[0] = now
+            window.set_caption(
+                "cuda.core 3D Array - SDF Volume Ray-Marcher  "
+                f"yaw={cam['yaw']:+.2f} pitch={cam['pitch']:+.2f} "
+                f"dist={cam['dist']:.2f}  "
+                f"{last_fps[0]:.0f} FPS  {last_frame_ms[0]:.2f} ms/frame"
+            )
+
+    @window.event
+    def on_mouse_drag(x, y, dx, dy, buttons, modifiers):
+        # Left-click drag orbits the camera. dx -> yaw (sign convention chosen
+        # so that dragging right rotates the scene right); dy -> pitch (drag
+        # up tilts the camera up).
+        if not (buttons & pyglet.window.mouse.LEFT):
+            return
+        ORBIT_SCALE = 0.005
+        cam["yaw"] += dx * ORBIT_SCALE
+        cam["pitch"] += dy * ORBIT_SCALE
+        # Clamp pitch so the up-vector never flips (we use world-up (0,1,0)).
+        if cam["pitch"] < PITCH_MIN:
+            cam["pitch"] = PITCH_MIN
+        elif cam["pitch"] > PITCH_MAX:
+            cam["pitch"] = PITCH_MAX
+
+    @window.event
+    def on_mouse_scroll(x, y, scroll_x, scroll_y):
+        # Scroll wheel zoom: geometric so each tick feels uniform regardless
+        # of current distance. Positive scroll_y (wheel up) zooms in.
+        if scroll_y == 0:
+            return
+        cam["dist"] *= 0.9 ** scroll_y
+        if cam["dist"] < DIST_MIN:
+            cam["dist"] = DIST_MIN
+        elif cam["dist"] > DIST_MAX:
+            cam["dist"] = DIST_MAX
+
+    @window.event
+    def on_key_press(symbol, modifiers):
+        key = pyglet.window.key
+        if symbol == key.ESCAPE:
+            window.close()
+        elif symbol == key.R:
+            cam["yaw"] = RESET_YAW
+            cam["pitch"] = RESET_PITCH
+            cam["dist"] = RESET_DIST
+
+    @window.event
+    def on_close():
+        # Release CUDA resources in reverse construction order. The GL
+        # objects clean up via pyglet on window close.
+        resource.close()
+        volume_tex.close()
+        arr.close()
+        stream.close()
+
+    pyglet.app.run(interval=0)
+
+
+# ======================== GPU code (CUDA + GLSL) ============================
+#
+# Two CUDA C++ kernels are concatenated into one program string so they share
+# a single NVRTC compile.
+#
+#   bake_sdf    -- one thread per voxel. Computes the SDF of an
+#                  "abs(gyroid) - 0.20" surface intersected with a bounding
+#                  sphere, then writes the scalar via surf3Dwrite. NOTE:
+#                  surf3Dwrite's x coordinate is in BYTES, y and z in
+#                  elements -- a classic CUDA gotcha.
+#
+#   render_sdf  -- one thread per screen pixel. Builds the orbit-camera ray,
+#                  fixed-step-marches the volume via tex3D<float> on a trilinear-
+#                  filtered, normalized-coord TextureObject, and shades the
+#                  hit with diffuse + ambient + specular. Misses return a
+#                  sky gradient. Writes RGBA8 directly into the OpenGL PBO.
+#
+# GLSL shaders at the very bottom just draw a textured quad. Nothing CUDA-
+# specific there.
+#
+# ============================================================================
+
+KERNEL_SOURCE = r"""
+// --------------------------------------------------------------------------
+// Small inline helpers. Keeping them __device__ __forceinline__ encourages
+// the compiler to drop them inline and avoids any cross-TU linkage worries.
+// --------------------------------------------------------------------------
+__device__ __forceinline__ float clampf(float v, float a, float b) {
+    return fminf(fmaxf(v, a), b);
+}
+
+__device__ __forceinline__ float dot3(float ax, float ay, float az,
+                                      float bx, float by, float bz) {
+    return ax * bx + ay * by + az * bz;
+}
+
+__device__ __forceinline__ float length3(float x, float y, float z) {
+    return sqrtf(x * x + y * y + z * z);
+}
+
+// --------------------------------------------------------------------------
+// bake_sdf: one thread per voxel writes the SDF of a gyroid-intersect-sphere
+//           into a single-channel float 3D Array via a SurfaceObject.
+//
+//   surf is bound to a (size^3, FLOAT32 x 1) Array allocated with
+//   surface_load_store=True.
+//   surf3Dwrite's x coordinate is in BYTES (multiply by sizeof(float));
+//   y and z are in elements. Off-by-one on the byte conversion silently
+//   corrupts every other column, so it's worth flagging explicitly.
+// --------------------------------------------------------------------------
+extern "C" __global__
+void bake_sdf(cudaSurfaceObject_t surf, int size) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    int z = blockIdx.z * blockDim.z + threadIdx.z;
+    if (x >= size || y >= size || z >= size) return;
+
+    // Map the voxel index to world-space p in [-1, 1]^3 (texel centers).
+    float fx = ((float)x + 0.5f) / (float)size;
+    float fy = ((float)y + 0.5f) / (float)size;
+    float fz = ((float)z + 0.5f) / (float)size;
+    float px = fx * 2.0f - 1.0f;
+    float py = fy * 2.0f - 1.0f;
+    float pz = fz * 2.0f - 1.0f;
+
+    // Gyroid frequency: 3 cycles across [-1, 1] gives a busy but not noisy
+    // surface at 128^3 resolution. tau = 2 * pi * frequency.
+    const float TAU = 6.2831853071795864f * 3.0f;
+
+    float sx = sinf(px * TAU), cx = cosf(px * TAU);
+    float sy = sinf(py * TAU), cy = cosf(py * TAU);
+    float sz = sinf(pz * TAU), cz = cosf(pz * TAU);
+    float gyroid     = sx * cy + sy * cz + sz * cx;
+    // Slab thickness: the gyroid SDF is non-Lipschitz (its gradient scales
+    // with TAU ~= 19), so the stored values along the surface are dense but
+    // unreliable as a true distance metric. A wider slab (0.20 vs the
+    // canonical 0.05) gives the fixed-step ray marcher in render_sdf enough
+    // hit candidates per ray to render real geometry instead of mostly sky.
+    float sdf_gyroid = fabsf(gyroid) - 0.20f;          // slab around iso-zero
+    float sdf_sphere = length3(px, py, pz) - 0.9f;     // bounding sphere
+    float sdf        = fmaxf(sdf_gyroid, sdf_sphere);  // CSG intersection
+
+    // surf3Dwrite: x in BYTES (cast sizeof to int so 32-bit arithmetic works
+    // even when x is large), y/z in elements.
+    surf3Dwrite<float>(sdf, surf, x * (int)sizeof(float), y, z);
+}
+
+// --------------------------------------------------------------------------
+// SDF sampler: tex3D wants normalized coords in [0, 1]; the volume covers
+// [-1, 1] in world space, so we remap with `(p + 1) * 0.5`. Returns the
+// raw stored SDF (a signed distance in world units).
+// --------------------------------------------------------------------------
+__device__ __forceinline__ float sample_sdf(cudaTextureObject_t tex,
+                                            float px, float py, float pz) {
+    return tex3D<float>(tex,
+                        (px + 1.0f) * 0.5f,
+                        (py + 1.0f) * 0.5f,
+                        (pz + 1.0f) * 0.5f);
+}
+
+// --------------------------------------------------------------------------
+// render_sdf: one thread per screen pixel. Builds the orbit camera, marches
+// a ray through the SDF volume, and writes a shaded RGBA8 pixel to the PBO.
+//
+// Camera math (orbit, look-at origin, world-up (0, 1, 0)):
+//   eye = dist * (cos(pitch)*cos(yaw), sin(pitch), cos(pitch)*sin(yaw))
+//   fwd = normalize(target - eye)         (target = origin)
+//   right = normalize(cross(fwd, up))
+//   up'   = cross(right, fwd)
+//   For a pixel at (u, v) in NDC ([-1, 1] x [-1, 1] with v=1 at the top),
+//   dir = normalize(fwd + tan(fov/2) * (aspect * u * right + v * up'))
+//
+// Ray-march:
+//   Fixed-step march: t += STEP, where STEP is set to roughly one voxel. The
+//   gyroid SDF is non-Lipschitz, which makes classical sphere tracing
+//   (t += sdf(p)) overshoot through thin slabs and miss almost every ray. A
+//   uniform voxel-sized step is robust and cheap because the SDF is just a
+//   tex3D lookup. We declare a HIT when sdf < HIT_EPS.
+//
+// Bounds bail: outside the [-1, 1]^3 box, return the sky.
+// Normal: 6-sample central differences with eps ~ 1.5/VOLUME_SIZE so the
+//         offsets are just over one voxel apart -- short enough to capture
+//         local surface direction, long enough that trilinear filtering
+//         actually moves the result.
+// --------------------------------------------------------------------------
+extern "C" __global__
+void render_sdf(unsigned char* output,
+                int width,
+                int height,
+                cudaTextureObject_t tex,
+                float yaw,
+                float pitch,
+                float dist) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    // ---- Build the orbit camera basis ----------------------------------
+    float cp = cosf(pitch), sp = sinf(pitch);
+    float cy = cosf(yaw),   sy = sinf(yaw);
+
+    // Eye on a sphere of radius `dist` around the origin.
+    float ex = dist * cp * cy;
+    float ey = dist * sp;
+    float ez = dist * cp * sy;
+
+    // fwd = normalize(target - eye), target = origin -> fwd = -eye / |eye|.
+    float fl = length3(ex, ey, ez);
+    // Guard against the (clamped) dist being zero (not reachable, but cheap).
+    if (fl < 1e-6f) fl = 1e-6f;
+    float fx = -ex / fl, fy = -ey / fl, fz = -ez / fl;
+
+    // right = normalize(cross(fwd, world_up)), world_up = (0, 1, 0).
+    // cross((fx,fy,fz), (0,1,0)) = (fy*0 - fz*1, fz*0 - fx*0, fx*1 - fy*0)
+    //                            = (-fz, 0, fx)
+    float rx = -fz;
+    float ry = 0.0f;
+    float rz = fx;
+    float rl = length3(rx, ry, rz);
+    if (rl < 1e-6f) rl = 1e-6f;
+    rx /= rl; ry /= rl; rz /= rl;
+
+    // up' = cross(right, fwd). With right purely in the xz-plane, this is a
+    // proper orthonormal up; recompute to keep the basis consistent.
+    float ux = ry * fz - rz * fy;
+    float uy = rz * fx - rx * fz;
+    float uz = rx * fy - ry * fx;
+
+    // ---- Per-pixel ray direction ---------------------------------------
+    // NDC with v=1 at the TOP. With our PBO layout (y=0 written first ->
+    // ends up at the bottom of the on-screen texture courtesy of the GL
+    // shader's [0, 1] texcoord), v = 2*v_norm - 1 already maps row 0 of the
+    // PBO to v = -1 (bottom of the image), which matches the camera's
+    // up'-axis convention. No flip needed.
+    float u_ndc = 2.0f * ((float)x + 0.5f) / (float)width  - 1.0f;
+    float v_ndc = 2.0f * ((float)y + 0.5f) / (float)height - 1.0f;
+
+    const float FOV_Y    = 0.7853981633974483f;        // 45 degrees
+    const float TAN_HALF = 0.41421356237309515f;       // tanf(FOV_Y / 2)
+    float aspect = (float)width / (float)height;
+
+    float dx = fx + u_ndc * aspect * TAN_HALF * rx + v_ndc * TAN_HALF * ux;
+    float dy = fy + u_ndc * aspect * TAN_HALF * ry + v_ndc * TAN_HALF * uy;
+    float dz = fz + u_ndc * aspect * TAN_HALF * rz + v_ndc * TAN_HALF * uz;
+    float dl = length3(dx, dy, dz);
+    if (dl < 1e-6f) dl = 1e-6f;
+    dx /= dl; dy /= dl; dz /= dl;
+
+    // ---- Ray vs. the [-1, 1]^3 box (slab method) -----------------------
+    // The camera always sits outside the volume (DIST_MIN >= 1.2 and the
+    // orbit puts at least one component of the eye outside [-1, 1] for
+    // typical framings), so we must first advance `t` to the AABB entry
+    // before any in-volume sampling is meaningful. tNear is the entry
+    // distance (clamped to >= 0 so we don't march backwards if the eye is
+    // inside the box for some configuration); tFar is the exit distance.
+    // If the slab interval is empty (tNear > tFar), the ray misses outright.
+    float inv_dx = 1.0f / (fabsf(dx) > 1e-8f ? dx : (dx >= 0 ? 1e-8f : -1e-8f));
+    float inv_dy = 1.0f / (fabsf(dy) > 1e-8f ? dy : (dy >= 0 ? 1e-8f : -1e-8f));
+    float inv_dz = 1.0f / (fabsf(dz) > 1e-8f ? dz : (dz >= 0 ? 1e-8f : -1e-8f));
+    float t1x = (-1.0f - ex) * inv_dx, t2x = ( 1.0f - ex) * inv_dx;
+    float t1y = (-1.0f - ey) * inv_dy, t2y = ( 1.0f - ey) * inv_dy;
+    float t1z = (-1.0f - ez) * inv_dz, t2z = ( 1.0f - ez) * inv_dz;
+    float tNear = fmaxf(fmaxf(fminf(t1x, t2x), fminf(t1y, t2y)), fminf(t1z, t2z));
+    float tFar  = fminf(fminf(fmaxf(t1x, t2x), fmaxf(t1y, t2y)), fmaxf(t1z, t2z));
+
+    bool  hit = false;
+    float hx = 0.0f, hy = 0.0f, hz = 0.0f;
+
+    if (tFar > fmaxf(tNear, 0.0f)) {
+        // ---- Fixed-step march through the SDF volume from the AABB entry
+        // Sphere tracing relies on a Lipschitz-1 SDF: the magnitude of the
+        // sample tells you a safe distance you can step without crossing
+        // the surface. But the gyroid SDF here, |sx*cy + sy*cz + sz*cx|
+        // - 0.20, has a gradient scaling with TAU ~= 19, so the stored
+        // magnitude vastly over-reports the true distance. Sphere tracing
+        // would routinely overshoot thin slab regions, leaving most rays
+        // missing geometry that's actually there. A fixed-step march is
+        // cheap (the SDF is just a tex3D lookup) and robust: each step
+        // advances by one voxel, so any positive crossing of the iso-zero
+        // surface lands inside a thin window where HIT_EPS catches it.
+        //
+        // 2 worldspace units / 256 steps = ~0.008 / step, slightly under
+        // one voxel at 128^3 resolution.
+        const int   MAX_STEPS = 256;
+        const float STEP      = 1.0f / 128.0f;
+        const float HIT_EPS   = 1.0e-3f;
+        // Bias slightly inside the box so the very first sample isn't on
+        // the boundary (CLAMP addressing makes the boundary sample valid,
+        // but starting just inside avoids one wasted iteration).
+        float t = fmaxf(tNear, 0.0f) + 1e-4f;
+        float t_exit = tFar;
+
+        #pragma unroll 1
+        for (int i = 0; i < MAX_STEPS; ++i) {
+            float pxw = ex + t * dx;
+            float pyw = ey + t * dy;
+            float pzw = ez + t * dz;
+
+            float s = sample_sdf(tex, pxw, pyw, pzw);
+            if (s < HIT_EPS) {
+                hit = true;
+                hx = pxw; hy = pyw; hz = pzw;
+                break;
+            }
+            t += STEP;
+            if (t > t_exit) break;
+        }
+    }
+
+    // ---- Shade -----------------------------------------------------------
+    float r, g, b;
+    if (hit) {
+        // Central-difference normal in world space. Each sample step is
+        // ~1.17 voxels: short enough to capture local geometry, long enough
+        // that trilinear filtering meaningfully moves the result.
+        const float NEPS = 1.5f / 128.0f;
+        float nx = sample_sdf(tex, hx + NEPS, hy, hz) -
+                   sample_sdf(tex, hx - NEPS, hy, hz);
+        float ny = sample_sdf(tex, hx, hy + NEPS, hz) -
+                   sample_sdf(tex, hx, hy - NEPS, hz);
+        float nz = sample_sdf(tex, hx, hy, hz + NEPS) -
+                   sample_sdf(tex, hx, hy, hz - NEPS);
+        float nl = length3(nx, ny, nz);
+        if (nl < 1e-6f) nl = 1e-6f;
+        nx /= nl; ny /= nl; nz /= nl;
+
+        // Fixed key light (normalized world direction).
+        const float LX = 0.5773502691896258f;          // (1,1,-1)/sqrt(3)
+        const float LY = 0.5773502691896258f;
+        const float LZ = -0.5773502691896258f;
+        float diff = fmaxf(0.0f, dot3(nx, ny, nz, LX, LY, LZ));
+
+        // Specular: Blinn-Phong half-vector exponent. View dir = -ray dir.
+        float vx = -dx, vy = -dy, vz = -dz;
+        float hx2 = LX + vx, hy2 = LY + vy, hz2 = LZ + vz;
+        float hl  = length3(hx2, hy2, hz2);
+        if (hl < 1e-6f) hl = 1e-6f;
+        hx2 /= hl; hy2 /= hl; hz2 /= hl;
+        float ndoth = fmaxf(0.0f, dot3(nx, ny, nz, hx2, hy2, hz2));
+        float spec = powf(ndoth, 32.0f);
+
+        // Base albedo varies with the hit position so the gyroid lattice
+        // reads as a single material with smooth variation, not flat plastic.
+        float base_r = 0.55f + 0.30f * nx;
+        float base_g = 0.50f + 0.30f * ny;
+        float base_b = 0.70f + 0.30f * nz;
+
+        const float AMBIENT = 0.18f;
+        r = base_r * (AMBIENT + 0.82f * diff) + 0.6f * spec;
+        g = base_g * (AMBIENT + 0.82f * diff) + 0.6f * spec;
+        b = base_b * (AMBIENT + 0.82f * diff) + 0.7f * spec;
+    } else {
+        // Sky: dark blue at the top, near-black at the bottom. The PBO's row
+        // 0 is the bottom of the on-screen image (see the v_ndc comment),
+        // so we use the y coordinate of the ray direction (close to v_ndc
+        // in screen space) for the gradient.
+        float sky = 0.5f * (dy + 1.0f);                // [0, 1] roughly
+        sky = clampf(sky, 0.0f, 1.0f);
+        r = 0.02f + 0.06f * sky;
+        g = 0.03f + 0.10f * sky;
+        b = 0.05f + 0.20f * sky;
+    }
+
+    r = clampf(r, 0.0f, 1.0f);
+    g = clampf(g, 0.0f, 1.0f);
+    b = clampf(b, 0.0f, 1.0f);
+
+    int idx = (y * width + x) * 4;
+    output[idx + 0] = (unsigned char)(r * 255.0f);
+    output[idx + 1] = (unsigned char)(g * 255.0f);
+    output[idx + 2] = (unsigned char)(b * 255.0f);
+    output[idx + 3] = 255;
+}
+"""
+
+# GLSL 3.30 core shaders for the display pass; nothing CUDA-specific here.
+# The vertex stage forwards position/texcoord, the fragment stage samples tex.
+
+VERTEX_SHADER_SOURCE = """#version 330 core
+in vec2 position;
+in vec2 texcoord;
+out vec2 v_texcoord;
+void main() {
+    gl_Position = vec4(position, 0.0, 1.0);
+    v_texcoord = texcoord;
+}
+"""
+
+FRAGMENT_SHADER_SOURCE = """#version 330 core
+in vec2 v_texcoord;
+out vec4 fragColor;
+uniform sampler2D tex;
+void main() {
+    fragColor = texture(tex, v_texcoord);
+}
+"""
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/examples/gl_interop_texture_filter.py b/cuda_core/examples/gl_interop_texture_filter.py
new file mode 100644
index 00000000000..82c880a8943
--- /dev/null
+++ b/cuda_core/examples/gl_interop_texture_filter.py
@@ -0,0 +1,607 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This example demonstrates cuda.core.TextureObject hardware filtering by
+# comparing FilterMode.POINT and FilterMode.LINEAR side by side on the same
+# source CUDA Array. Requires pyglet.
+#
+# ################################################################################
+
+# What this example teaches
+# =========================
+# How to back two TextureObjects with the SAME CUDA Array and observe the
+# difference between POINT (nearest-texel) and LINEAR (bilinear) filtering
+# under user-controlled zoom and pan.  Also shows how the address mode
+# (WRAP / CLAMP / MIRROR / BORDER) is baked into the texture descriptor at
+# creation time, so changing it at runtime means rebuilding the textures.
+#
+# How it works
+# ============
+# A single 256x256 RGBA8 Array holds a procedurally-generated test pattern
+# (high-contrast checkerboard, diagonals, gradient stripe).  Two
+# TextureObjects are built on top of that Array:
+#
+#       Array (256x256 RGBA UINT8)
+#       /                       \
+#   tex_point                  tex_linear
+#   FilterMode.POINT           FilterMode.LINEAR
+#   AddressMode.WRAP           AddressMode.WRAP
+#   ReadMode.NORMALIZED_FLOAT  ReadMode.NORMALIZED_FLOAT
+#
+# Each frame, a single CUDA kernel runs over a 1024x512 OpenGL PBO:
+#
+#   - Left half of the screen samples tex_point.
+#   - Right half samples tex_linear.
+#   - Both halves use the same (zoom, pan) -> texture-space mapping, so the
+#     two views show the same content with different filtering.
+#   - A 2-pixel vertical white line marks the divider.
+#
+# Because ReadMode.NORMALIZED_FLOAT is used, tex2D<float4>() returns each
+# channel as a float in [0, 1]; the kernel multiplies by 255 and writes
+# unsigned bytes back into the PBO.
+#
+# The PBO is then copied to a GL texture and drawn on a fullscreen quad,
+# identical to the plasma example.
+#
+# What you should see
+# ===================
+# A 1024x512 window split down the middle.  The left half (POINT) shows
+# blocky / pixelated magnification; the right half (LINEAR) shows smooth
+# bilinear interpolation.  Drag with the left mouse button to pan,
+# scroll to zoom, press M to cycle the texture address mode, press R to
+# reset, Escape or close the window to exit.  The current address mode
+# and FPS are shown in the window title.
+#
+
+# /// script
+# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"]
+# ///
+
+import ctypes
+import sys
+import time
+
+import numpy as np
+
+from cuda.core import (
+    AddressMode,
+    Array,
+    ArrayFormat,
+    Device,
+    FilterMode,
+    GraphicsResource,
+    LaunchConfig,
+    Program,
+    ProgramOptions,
+    ReadMode,
+    ResourceDescriptor,
+    TextureDescriptor,
+    TextureObject,
+    launch,
+)
+
+# ---------------------------------------------------------------------------
+# Window and source-image dimensions (feel free to change these)
+# ---------------------------------------------------------------------------
+WIDTH = 1024
+HEIGHT = 512
+SRC_W = 256
+SRC_H = 256
+
+# Address modes cycled by pressing the M key.
+ADDRESS_MODES = (
+    AddressMode.WRAP,
+    AddressMode.CLAMP,
+    AddressMode.MIRROR,
+    AddressMode.BORDER,
+)
+
+
+# ============================= Helper functions =============================
+#
+# The functions below set up CUDA and OpenGL.  If you're here to learn about
+# TextureObject filtering, the most interesting parts are in main() and in
+# make_pattern() / make_textures(); everything else is the same kind of
+# CUDA-GL interop boilerplate used by gl_interop_plasma.py.
+# ============================================================================
+
+
+def make_pattern(width, height):
+    """Return a contiguous (height, width, 4) uint8 RGBA test image.
+
+    The image is opaque everywhere and is painted in this order, later
+    layers overwriting earlier ones:
+      - black/white checkerboard with 8x8-texel cells (high-frequency)
+      - the two 1px-wide red diagonals
+      - a horizontal green/blue ramp strip starting at row height // 4
+      - two thin solid bars (yellow, cyan) around row 3 * height // 4
+    Sharp edges like these make POINT vs LINEAR filtering easy to tell apart.
+    """
+    rows = np.arange(height)[:, None]
+    cols = np.arange(width)[None, :]
+
+    # Checkerboard: cell parity 0 -> black, 1 -> white, alpha always opaque.
+    parity = ((cols // 8) + (rows // 8)) & 1
+    img = np.zeros((height, width, 4), dtype=np.uint8)
+    img[..., :3] = (parity.astype(np.uint8) * 255)[..., None]
+    img[..., 3] = 255
+
+    # Main diagonal and anti-diagonal in pure red.
+    on_diagonal = (cols == rows) | (cols == (width - 1 - rows))
+    img[on_diagonal] = (255, 0, 0, 255)
+
+    # Ramp strip: green rises and blue falls from left to right.
+    strip_top = height // 4
+    strip_bottom = min(strip_top + max(4, height // 32), height)
+    ramp = np.linspace(0, 255, width, dtype=np.uint8)
+    strip = img[strip_top:strip_bottom]
+    strip[..., 0] = 0
+    strip[..., 1] = ramp
+    strip[..., 2] = 255 - ramp
+    strip[..., 3] = 255
+
+    # Thin "text-like" bars: (row range, column range, colour).
+    bar_y = (3 * height) // 4
+    bars = (
+        (bar_y, bar_y + 4, width // 8, (width * 3) // 8, (255, 255, 0, 255)),
+        (bar_y + 8, bar_y + 12, (width * 5) // 8, (width * 7) // 8,
+         (0, 255, 255, 255)),
+    )
+    for y0, y1, x0, x1, rgba in bars:
+        img[y0:y1, x0:x1] = rgba
+
+    # np.zeros already yields a C-contiguous buffer, so this is a cheap no-op.
+    return np.ascontiguousarray(img)
+
+
+def make_textures(array, address_mode):
+    """Build (tex_point, tex_linear) on the given Array with the given mode.
+
+    The address mode is baked into the descriptor at cuTexObjectCreate time, so
+    we recreate both textures whenever the user cycles the mode.  Caller owns
+    the returned objects and must close() them.  If the LINEAR texture cannot
+    be created, the POINT texture is closed before the error propagates.
+    """
+    res_desc = ResourceDescriptor.from_array(array)
+
+    def build(filter_mode):
+        tex_desc = TextureDescriptor(
+            address_mode=address_mode,
+            filter_mode=filter_mode,
+            read_mode=ReadMode.NORMALIZED_FLOAT,
+            normalized_coords=False,
+        )
+        return TextureObject.from_descriptor(
+            resource=res_desc, texture_descriptor=tex_desc
+        )
+
+    tex_point = build(FilterMode.POINT)
+    try:
+        tex_linear = build(FilterMode.LINEAR)
+    except BaseException:
+        tex_point.close()
+        raise
+    return tex_point, tex_linear
+
+
+def setup_cuda(kernel_source):
+    """Build the sampling kernel; return (device, stream, kernel, launch_config)."""
+    device = Device(0)
+    device.set_current()
+    stream = device.create_stream()
+
+    # Compiled as C++ because tex2D<float4> is a templated overload.
+    options = ProgramOptions(std="c++17", arch=f"sm_{device.arch}")
+    program = Program(kernel_source, code_type="c++", options=options)
+    module = program.compile("cubin", name_expressions=("split_screen_sample",))
+    kernel = module.get_kernel("split_screen_sample")
+
+    # One thread per output pixel in 16x16 blocks; ceil-divide to cover the edges.
+    block_x, block_y = 16, 16
+    grid_x = -(-WIDTH // block_x)
+    grid_y = -(-HEIGHT // block_y)
+    launch_config = LaunchConfig(
+        grid=(grid_x, grid_y, 1), block=(block_x, block_y, 1)
+    )
+    return device, stream, kernel, launch_config
+
+
+def create_window():
+    """Open the pyglet window; return (window, gl_module, pyglet).
+
+    pyglet is imported lazily so a missing install yields a hint, not a
+    traceback; in that case the process exits with status 1.
+    """
+    try:
+        import pyglet
+        from pyglet.gl import gl as gl_module
+    except ImportError:
+        hint = "This example requires pyglet >= 2.0.\nInstall it with:  pip install pyglet"
+        print(hint, file=sys.stderr)
+        sys.exit(1)
+
+    title = "TextureObject Filter Comparison - POINT vs LINEAR"
+    window = pyglet.window.Window(
+        WIDTH, HEIGHT, caption=title, vsync=False
+    )
+    return window, gl_module, pyglet
+
+
+def create_display_resources(gl, width, height):
+    """Build the shader program, quad VAO and display texture for the window.
+
+    Shaders are compiled from the module-level GLSL sources, the quad covers
+    the whole viewport, and the texture is allocated as width x height RGBA8
+    with no initial data.  Returns (shader_program, vao_id, tex_id).
+    """
+    from pyglet.graphics.shader import Shader, ShaderProgram
+
+    program = ShaderProgram(
+        Shader(VERTEX_SHADER_SOURCE, "vertex"),
+        Shader(FRAGMENT_SHADER_SOURCE, "fragment"),
+    )
+
+    # Two triangles spanning NDC [-1, 1]^2; one (x, y, s, t) tuple per vertex.
+    bottom_left, bottom_right = (-1, -1, 0, 0), (1, -1, 1, 0)
+    top_right, top_left = (1, 1, 1, 1), (-1, 1, 0, 1)
+    quad = np.array(
+        [bottom_left, bottom_right, top_right, bottom_left, top_right, top_left],
+        dtype=np.float32,
+    )
+
+    # Bind a VAO first so the buffer/attribute setup below is attached to it.
+    vao_handle = ctypes.c_uint(0)
+    gl.glGenVertexArrays(1, ctypes.byref(vao_handle))
+    gl.glBindVertexArray(vao_handle.value)
+
+    vbo_handle = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(vbo_handle))
+    gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo_handle.value)
+    quad_ptr = quad.ctypes.data_as(ctypes.c_void_p)
+    gl.glBufferData(
+        gl.GL_ARRAY_BUFFER, quad.nbytes, quad_ptr, gl.GL_STATIC_DRAW
+    )
+
+    # Interleaved layout: 4 floats (16 bytes) per vertex, position at byte 0
+    # and texcoord at byte 8.
+    stride = 4 * quad.itemsize
+    for attrib, byte_offset in ((b"position", 0), (b"texcoord", 8)):
+        loc = gl.glGetAttribLocation(program.id, attrib)
+        gl.glEnableVertexAttribArray(loc)
+        gl.glVertexAttribPointer(
+            loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(byte_offset)
+        )
+    gl.glBindVertexArray(0)
+
+    # Display texture, refilled from the PBO every frame.
+    tex_handle = ctypes.c_uint(0)
+    gl.glGenTextures(1, ctypes.byref(tex_handle))
+    target = gl.GL_TEXTURE_2D
+    gl.glBindTexture(target, tex_handle.value)
+    # NEAREST on the display texture keeps GL's own sampler from blurring the
+    # POINT/LINEAR comparison produced by the CUDA kernel.
+    for pname in (gl.GL_TEXTURE_MIN_FILTER, gl.GL_TEXTURE_MAG_FILTER):
+        gl.glTexParameteri(target, pname, gl.GL_NEAREST)
+    # Allocate width x height RGBA8 storage without uploading any pixels.
+    gl.glTexImage2D(
+        target,
+        0,  # mip level
+        gl.GL_RGBA8,
+        width,
+        height,
+        0,  # border
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,  # no initial pixel data
+    )
+    return program, vao_handle.value, tex_handle.value
+
+
+def create_pixel_buffer(gl, width, height):
+    """Allocate an uninitialised PBO for one RGBA8 frame; return (pbo_id, nbytes)."""
+    nbytes = width * height * 4  # 4 bytes per RGBA8 pixel
+    handle = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(handle))
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, handle.value)
+    gl.glBufferData(gl.GL_PIXEL_UNPACK_BUFFER, nbytes, None, gl.GL_DYNAMIC_DRAW)
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)
+    return handle.value, nbytes
+
+
+def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height):
+    """Refresh the GL texture from the PBO's contents (a GPU-to-GPU copy)."""
+    unpack = gl.GL_PIXEL_UNPACK_BUFFER
+    gl.glBindBuffer(unpack, pbo_id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    # With a PBO bound, the final argument is an offset into it, not a pointer.
+    region = (0, 0, width, height)
+    gl.glTexSubImage2D(gl.GL_TEXTURE_2D, 0, *region, gl.GL_RGBA, gl.GL_UNSIGNED_BYTE, None)
+    gl.glBindBuffer(unpack, 0)
+
+
+def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id):
+    """Render tex_id with shader_prog onto the quad stored in vao_id, then unbind."""
+    gl.glUseProgram(shader_prog.id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glBindVertexArray(vao_id)
+    gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6)  # 6 vertices = 2 triangles
+    gl.glBindVertexArray(0)
+    gl.glUseProgram(0)
+
+
+# ================================== main() ==================================
+
+
+def main():
+    """Run the interactive POINT vs LINEAR viewer until the window is closed."""
+    # --- Step 1: Set up CUDA (compile kernel, create stream) ---
+    dev, stream, kernel, config = setup_cuda(KERNEL_SOURCE)
+
+    # Texture objects need compute capability >= 3.0.  That is practically
+    # universal today, but check anyway so the failure is friendly.
+    if dev.compute_capability.major < 3:
+        print(
+            f"This example requires compute capability >= 3.0, "
+            f"got {dev.compute_capability.major}.{dev.compute_capability.minor}.",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    # --- Step 2: Open a window ---
+    window, gl, pyglet = create_window()
+
+    # --- Step 3: Create GL resources (shader, quad, display texture) ---
+    shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT)
+
+    # --- Step 4: Create the Pixel Buffer Object (PBO) ---
+    pbo_id, _nbytes = create_pixel_buffer(gl, WIDTH, HEIGHT)
+
+    # --- Step 5: Register the PBO with CUDA ---
+    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")
+
+    # --- Step 6: Allocate the source Array and upload the test pattern ---
+    #     The Array lives for the entire program, so we use a `with` block.
+    #     Inside it we create / re-create two TextureObjects whenever the
+    #     user cycles the address mode.
+    with Array.from_descriptor(
+        shape=(SRC_W, SRC_H),
+        format=ArrayFormat.UINT8,
+        num_channels=4,
+    ) as arr:
+        pattern = make_pattern(SRC_W, SRC_H)
+        # Sanity: 256 * 256 * 4 bytes = 262144.
+        assert pattern.nbytes == arr.size_bytes, (
+            f"pattern bytes ({pattern.nbytes}) != array bytes ({arr.size_bytes})"
+        )
+        arr.copy_from(pattern, stream=stream)
+        stream.sync()  # upload must finish before kernel reads
+
+        # --- Step 7: Build initial POINT + LINEAR textures (WRAP mode). ---
+        # We can't use a `with` block here because the address mode is baked
+        # into the descriptor at creation time: cycling modes means closing
+        # and recreating these objects.  We instead hold them in mutable
+        # closure state and release them in on_close().
+        tex_state = {
+            "mode_idx": 0,
+            "tex_point": None,
+            "tex_linear": None,
+        }
+
+        def rebuild_textures():
+            """Swap in textures for the current address mode, then free the old pair."""
+            # Build first: if creation raises, on_draw keeps its still-valid handles.
+            fresh = make_textures(arr, ADDRESS_MODES[tex_state["mode_idx"]])
+            stale = (tex_state["tex_point"], tex_state["tex_linear"])
+            tex_state["tex_point"], tex_state["tex_linear"] = fresh
+            # Kernels launched by earlier frames may still be sampling the old pair.
+            stream.sync()
+            for tex in stale:
+                if tex is not None:
+                    tex.close()
+
+        rebuild_textures()
+
+        # --- Step 8: View state (zoom + pan), tight initial framing. ---
+        # zoom = pixels_per_texel.  zoom=3 -> roughly 3x magnification, which
+        # makes POINT vs LINEAR obvious without any user input.
+        view = {
+            "zoom": 3.0,
+            "pan_x": SRC_W * 0.5,
+            "pan_y": SRC_H * 0.5,
+            "drag": False,
+        }
+
+        def reset_view():
+            view["zoom"] = 3.0
+            view["pan_x"] = SRC_W * 0.5
+            view["pan_y"] = SRC_H * 0.5
+
+        # --- Step 9: Render loop ---
+        start_time = time.monotonic()
+        frame_count = 0
+        fps_time = start_time
+
+        def current_mode_name():
+            return ADDRESS_MODES[tex_state["mode_idx"]].name
+
+        @window.event
+        def on_draw():
+            nonlocal frame_count, fps_time
+            window.clear()
+
+            # (a) Map the PBO so CUDA can write to it.
+            with resource.map(stream=stream) as buf:
+                # (b) Launch the split-screen sampling kernel.
+                launch(
+                    stream,
+                    config,
+                    kernel,
+                    np.uint64(tex_state["tex_point"].handle),
+                    np.uint64(tex_state["tex_linear"].handle),
+                    buf.handle,
+                    np.int32(WIDTH),
+                    np.int32(HEIGHT),
+                    np.float32(view["zoom"]),
+                    np.float32(view["pan_x"]),
+                    np.float32(view["pan_y"]),
+                    np.int32(SRC_W),
+                    np.int32(SRC_H),
+                )
+            # (c) Unmap happens automatically when the `with` block exits.
+
+            # (d) PBO -> GL texture (GPU-to-GPU).
+            copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT)
+
+            # (e) Draw the texture to the screen.
+            draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id)
+
+            frame_count += 1
+            now = time.monotonic()
+            if now - fps_time >= 1.0:
+                fps = frame_count / (now - fps_time)
+                window.set_caption(
+                    f"TextureObject Filter - POINT | LINEAR  "
+                    f"[address={current_mode_name()}, zoom={view['zoom']:.2f}x, "
+                    f"{fps:.0f} FPS]"
+                )
+                frame_count = 0
+                fps_time = now
+
+        # --- Mouse: drag to pan, scroll to zoom ------------------------------
+        @window.event
+        def on_mouse_press(x, y, button, modifiers):
+            if button == pyglet.window.mouse.LEFT:
+                view["drag"] = True
+
+        @window.event
+        def on_mouse_release(x, y, button, modifiers):
+            if button == pyglet.window.mouse.LEFT:
+                view["drag"] = False
+
+        @window.event
+        def on_mouse_drag(x, y, dx, dy, buttons, modifiers):
+            """Pan by the drag delta (1 screen pixel = 1/zoom texels) so content follows the cursor."""
+            if not (buttons & pyglet.window.mouse.LEFT):
+                return
+            # PBO row 0 is the bottom screen row, so src_y rises with pyglet's dy.
+            view["pan_x"] -= dx / view["zoom"]
+            view["pan_y"] -= dy / view["zoom"]
+
+        @window.event
+        def on_mouse_scroll(x, y, scroll_x, scroll_y):
+            # Geometric zoom; clamp to a sensible range.
+            factor = 1.1 ** scroll_y
+            new_zoom = view["zoom"] * factor
+            view["zoom"] = max(0.1, min(32.0, new_zoom))
+
+        # --- Keyboard: M cycles address mode, R resets view ------------------
+        @window.event
+        def on_key_press(symbol, modifiers):
+            key = pyglet.window.key
+            if symbol == key.M:
+                tex_state["mode_idx"] = (tex_state["mode_idx"] + 1) % len(ADDRESS_MODES)
+                rebuild_textures()
+            elif symbol == key.R:
+                reset_view()
+            elif symbol == key.ESCAPE:
+                window.close()
+
+        @window.event
+        def on_close():
+            # Release CUDA resources in reverse order of creation.
+            if tex_state["tex_linear"] is not None:
+                tex_state["tex_linear"].close()
+                tex_state["tex_linear"] = None
+            if tex_state["tex_point"] is not None:
+                tex_state["tex_point"].close()
+                tex_state["tex_point"] = None
+            resource.close()
+
+        pyglet.app.run(interval=0)
+
+
+# ======================== GPU code (CUDA + GLSL) ============================
+#
+# KERNEL_SOURCE samples the same source Array through two TextureObjects
+# (POINT vs LINEAR) and writes RGBA8 pixels into the PBO.  ReadMode.
+# NORMALIZED_FLOAT means tex2D<float4>() returns each channel in [0, 1];
+# the kernel scales by 255 and writes unsigned bytes back out.
+#
+# VERTEX_SHADER_SOURCE / FRAGMENT_SHADER_SOURCE are plain GLSL that draws
+# a texture on a fullscreen quad -- nothing CUDA-specific.
+# ============================================================================
+
+KERNEL_SOURCE = r"""
+extern "C" __global__
+void split_screen_sample(cudaTextureObject_t point_tex,
+                         cudaTextureObject_t linear_tex,
+                         unsigned char* out,
+                         int w, int h,
+                         float zoom,
+                         float pan_x, float pan_y,
+                         int src_w, int src_h) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= w || y >= h) return;
+
+    int half_w = w / 2;
+
+    // 2-pixel-wide white separator down the middle.
+    if (x == half_w || x == half_w - 1) {
+        int idx = (y * w + x) * 4;
+        out[idx + 0] = 255;
+        out[idx + 1] = 255;
+        out[idx + 2] = 255;
+        out[idx + 3] = 255;
+        return;
+    }
+
+    // Each half of the screen samples the same (src_x, src_y) so the two
+    // sides line up visually for an apples-to-apples filter comparison.
+    float local_x = (x < half_w) ? (float)x : (float)(x - half_w);
+
+    // (src_x, src_y) in source-texture pixel coordinates.  Non-normalized
+    // coords are used, so coordinate (i + 0.5, j + 0.5) selects texel (i, j).
+    float src_x = pan_x + (local_x - (float)half_w * 0.5f) / zoom;
+    float src_y = pan_y + ((float)y     - (float)h      * 0.5f) / zoom;
+
+    float4 sample;
+    if (x < half_w) {
+        sample = tex2D<float4>(point_tex,  src_x, src_y);
+    } else {
+        sample = tex2D<float4>(linear_tex, src_x, src_y);
+    }
+
+    int idx = (y * w + x) * 4;
+    out[idx + 0] = (unsigned char)(sample.x * 255.0f);
+    out[idx + 1] = (unsigned char)(sample.y * 255.0f);
+    out[idx + 2] = (unsigned char)(sample.z * 255.0f);
+    out[idx + 3] = (unsigned char)(sample.w * 255.0f);
+}
+"""
+
+VERTEX_SHADER_SOURCE = """#version 330 core
+in vec2 position;
+in vec2 texcoord;
+out vec2 v_texcoord;
+void main() {
+    gl_Position = vec4(position, 0.0, 1.0);
+    v_texcoord = texcoord;
+}
+"""
+
+FRAGMENT_SHADER_SOURCE = """#version 330 core
+in vec2 v_texcoord;
+out vec4 fragColor;
+uniform sampler2D tex;
+void main() {
+    fragColor = texture(tex, v_texcoord);
+}
+"""
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/examples/texture_sample.py b/cuda_core/examples/texture_sample.py
new file mode 100644
index 00000000000..fc5b05f086f
--- /dev/null
+++ b/cuda_core/examples/texture_sample.py
@@ -0,0 +1,220 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This example demonstrates building a 2D CUDA Array, binding it as a
+# bindless TextureObject, and sampling it from a kernel with both POINT-exact
+# and LINEAR-interpolated coordinates.
+#
+# Texture coordinate convention (non-normalized): each texel (i, j) is centered
+# at (i + 0.5, j + 0.5). So tex2D(tex, 0.5, 0.5) returns texel (0, 0) exactly,
+# while tex2D(tex, 1.0, 0.5) returns the linear blend of texels (0, 0) and (1, 0).
+# All test coordinates below are chosen with that half-pixel offset in mind.
+#
+# ################################################################################
+
+# /// script
+# dependencies = ["cuda_bindings", "cuda_core", "nvidia-cuda-nvrtc"]
+# ///
+
+import numpy as np
+
+from cuda.core import (
+    AddressMode,
+    Array,
+    ArrayFormat,
+    Device,
+    FilterMode,
+    LaunchConfig,
+    LegacyPinnedMemoryResource,
+    Program,
+    ProgramOptions,
+    ReadMode,
+    ResourceDescriptor,
+    TextureDescriptor,
+    TextureObject,
+    launch,
+)
+
+# Kernel reads N (x, y) coordinates from `coords` (interleaved float pairs) and
+# writes tex2D<float>(tex, x, y) to out[i]. Compiled as C++ so the templated
+# tex2D<float> overload resolves.
+code = r"""
+extern "C" __global__
+void sample_texture(cudaTextureObject_t tex,
+                    float *out,
+                    const float *coords,
+                    int n) {
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= n) return;
+    float x = coords[2 * i + 0];
+    float y = coords[2 * i + 1];
+    out[i] = tex2D<float>(tex, x, y);
+}
+"""
+
+
+def main():
+    """Upload a 16x16 float pattern to a CUDA Array and verify texture sampling of it."""
+    dev = Device()
+    dev.set_current()
+    stream = dev.create_stream()
+
+    # Pinned buffers are allocated and released inside _run_kernel_and_verify().
+    pinned_mr = LegacyPinnedMemoryResource()
+    try:
+        # Allocate a 2D Array: shape=(W, H), single-channel float32.
+        # Note: Array.from_descriptor takes shape=(width, height), so the host
+        # buffer fed into copy_from must be laid out as H rows of W elements
+        # (row-major), i.e. host_pattern.shape == (H, W).
+        width, height = 16, 16
+        with Array.from_descriptor(
+            shape=(width, height),
+            format=ArrayFormat.FLOAT32,
+            num_channels=1,
+        ) as arr:
+            # Plant a known pattern: pattern[y, x] = x + 100*y.
+            # Cast to float32 so the byte count matches the array's storage.
+            ys, xs = np.meshgrid(
+                np.arange(height, dtype=np.float32),
+                np.arange(width, dtype=np.float32),
+                indexing="ij",
+            )
+            pattern = (xs + 100.0 * ys).astype(np.float32)
+            assert pattern.shape == (height, width)
+            arr.copy_from(pattern, stream=stream)
+
+            # Build a linear-filtering, clamped, non-normalized texture.
+            res_desc = ResourceDescriptor.from_array(arr)
+            tex_desc = TextureDescriptor(
+                address_mode=AddressMode.CLAMP,
+                filter_mode=FilterMode.LINEAR,
+                read_mode=ReadMode.ELEMENT_TYPE,
+                normalized_coords=False,
+            )
+            with TextureObject.from_descriptor(
+                resource=res_desc, texture_descriptor=tex_desc
+            ) as tex:
+                _run_kernel_and_verify(
+                    dev, stream, tex, pattern, width, height, pinned_mr
+                )
+    finally:
+        stream.close()
+
+
+def _run_kernel_and_verify(dev, stream, tex, pattern, width, height, pinned_mr):
+    """Kernel launch + correctness check, isolated so the with-blocks in main()
+    stay readable. Owns its own pinned-buffer cleanup."""
+    coords_buf = None
+    out_buf = None
+    try:
+        # Build the test coordinate list:
+        # - Texel-center samples (x.5 coordinates) should return the exact
+        #   planted value.
+        # - Samples with an integer coordinate sit midway between texel centers
+        #   and exercise LINEAR filtering: expect the mean of the neighbors.
+        center_samples = [
+            (0.5, 0.5),  # -> pattern[0, 0] = 0
+            (3.5, 0.5),  # -> pattern[0, 3] = 3
+            (0.5, 4.5),  # -> pattern[4, 0] = 400
+            (7.5, 9.5),  # -> pattern[9, 7] = 907
+            (15.5, 15.5),  # -> pattern[15, 15] = 1515
+        ]
+        half_samples = [
+            # (1.0, 0.5): blend of texels (0, 0) and (1, 0) -> 0.5
+            (1.0, 0.5),
+            # (0.5, 1.0): blend of texels (0, 0) and (0, 1) -> 50.0
+            (0.5, 1.0),
+            # (1.0, 1.0): blend of the 2x2 block at (0..1, 0..1) -> 50.5
+            (1.0, 1.0),
+            # (4.0, 5.0): blend of the 2x2 block at (3..4, 4..5) -> 453.5
+            (4.0, 5.0),
+        ]
+        coords = np.array(center_samples + half_samples, dtype=np.float32)
+        n = coords.shape[0]
+        coords_flat = coords.reshape(-1)
+        coords_nbytes = int(coords_flat.nbytes)
+        out_nbytes = n * np.dtype(np.float32).itemsize
+
+        # Use pinned host memory for inputs and outputs. Pinned allocations are
+        # GPU-accessible (zero-copy), so the kernel can read coords directly
+        # and we can read results without a separate device->host copy.
+        coords_buf = pinned_mr.allocate(coords_nbytes)
+        out_buf = pinned_mr.allocate(out_nbytes)
+        coords_view = np.from_dlpack(coords_buf).view(dtype=np.float32)
+        out_view = np.from_dlpack(out_buf).view(dtype=np.float32)
+        coords_view[:] = coords_flat
+        out_view[:] = 0.0
+
+        # Compile the kernel as C++ (templated tex2D<float> requires this).
+        program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
+        prog = Program(code, code_type="c++", options=program_options)
+        mod = prog.compile("cubin", name_expressions=("sample_texture",))
+        kernel = mod.get_kernel("sample_texture")
+
+        block = 64
+        grid = (n + block - 1) // block
+        config = LaunchConfig(grid=grid, block=block)
+        # cudaTextureObject_t is a 64-bit handle; pass it as uint64 to be
+        # unambiguous (a bare Python int would also work since intptr_t is
+        # 8 bytes on 64-bit platforms).
+        launch(
+            stream,
+            config,
+            kernel,
+            np.uint64(tex.handle),
+            out_buf,
+            coords_buf,
+            np.int32(n),
+        )
+        stream.sync()
+        results = np.asarray(out_view)
+
+        # Verify texel-center samples (POINT-exact regardless of filter mode).
+        n_center = len(center_samples)
+        for i, (x, y) in enumerate(center_samples):
+            expected = (x - 0.5) + 100.0 * (y - 0.5)
+            got = float(results[i])
+            assert np.isclose(got, expected, atol=1e-4), (
+                f"center sample {i} at ({x}, {y}): expected {expected}, got {got}"
+            )
+
+        # Verify the between-texel samples against the analytic bilinear blend of
+        # the 4 surrounding texels. Allow a small tolerance for the 1/256
+        # fixed-point weight quantization that hardware filtering performs.
+        for j, (x, y) in enumerate(half_samples):
+            idx = n_center + j
+            # Surrounding integer texel coordinates: (xi, yi), (xi+1, yi),
+            # (xi, yi+1), (xi+1, yi+1). With x = xi + 1, y = yi + 1 (e.g.
+            # (1.0, 1.0)) the four neighbors are (0,0)..(1,1).
+            xi = int(np.floor(x - 0.5))
+            yi = int(np.floor(y - 0.5))
+            tx = (x - 0.5) - xi
+            ty = (y - 0.5) - yi
+            corners = []
+            for dy in (0, 1):
+                for dx in (0, 1):
+                    xv = min(max(xi + dx, 0), width - 1)
+                    yv = min(max(yi + dy, 0), height - 1)
+                    corners.append(pattern[yv, xv])
+            v00, v10, v01, v11 = corners
+            expected = (1 - tx) * (1 - ty) * v00 + tx * (1 - ty) * v10 + (1 - tx) * ty * v01 + tx * ty * v11
+            got = float(results[idx])
+            assert np.isclose(got, expected, atol=1e-2), (
+                f"half sample {j} at ({x}, {y}): expected {expected}, got {got}"
+            )
+
+        print("Texture sampling example completed successfully.")
+        print(f"  texel-center samples verified: {n_center}")
+        print(f"  half-integer samples verified: {len(half_samples)}")
+    finally:
+        if coords_buf is not None:
+            coords_buf.close()
+        if out_buf is not None:
+            out_buf.close()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/tests/example_tests/test_basic_examples.py b/cuda_core/tests/example_tests/test_basic_examples.py
index 31b9f86e0a1..e1666114cc9 100644
--- a/cuda_core/tests/example_tests/test_basic_examples.py
+++ b/cuda_core/tests/example_tests/test_basic_examples.py
@@ -82,6 +82,15 @@ def has_recent_memory_pool_support() -> bool:
 SYSTEM_REQUIREMENTS = {
     "memory_pool_resources.py": has_recent_memory_pool_support,
     "gl_interop_plasma.py": has_display,
+    "gl_interop_fire.py": has_display,
+    "gl_interop_image_show.py": has_display,
+    "gl_interop_lenia.py": has_display,
+    "gl_interop_mandelbrot.py": has_display,
+    "gl_interop_mipmap_lod.py": has_display,
+    "gl_interop_ocean.py": has_display,
+    "gl_interop_reaction_diffusion.py": has_display,
+    "gl_interop_sdf_volume.py": has_display,
+    "gl_interop_texture_filter.py": has_display,
     "pytorch_example.py": lambda: (
         has_compute_capability_9_or_higher() and is_x86_64()
     ),  # PyTorch only provides CUDA support for x86_64
diff --git a/cuda_core/tests/test_texture_surface.py b/cuda_core/tests/test_texture_surface.py
new file mode 100644
index 00000000000..00e67ed2398
--- /dev/null
+++ b/cuda_core/tests/test_texture_surface.py
@@ -0,0 +1,968 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import gc
+
+import pytest
+
+import cuda.core
+from cuda.core import (
+    AddressMode,
+    Array,
+    ArrayFormat,
+    Device,
+    FilterMode,
+    MipmappedArray,
+    ReadMode,
+    ResourceDescriptor,
+    SurfaceObject,
+    TextureDescriptor,
+    TextureObject,
+)
+
+
+def test_array_init_disabled():  # no init_cuda fixture is requested
+    with pytest.raises(RuntimeError, match=r"^Array cannot be instantiated directly"):
+        cuda.core._array.Array()  # the bare constructor call itself must raise
+
+
+def test_texture_object_init_disabled():  # no init_cuda fixture is requested
+    with pytest.raises(RuntimeError, match=r"^TextureObject cannot be instantiated directly"):
+        cuda.core._texture.TextureObject()  # the bare constructor call itself must raise
+
+
+def test_surface_object_init_disabled():  # no init_cuda fixture is requested
+    with pytest.raises(RuntimeError, match=r"^SurfaceObject cannot be instantiated directly"):
+        cuda.core._surface.SurfaceObject()  # the bare constructor call itself must raise
+
+
+def test_resource_descriptor_init_disabled():  # no init_cuda fixture is requested
+    with pytest.raises(RuntimeError, match=r"^ResourceDescriptor cannot be instantiated"):
+        ResourceDescriptor()  # the bare constructor call itself must raise
+
+
+def test_array_2d_create_and_properties(init_cuda):
+    """Check the properties reported by a freshly created 2D FLOAT32 array."""
+    # One FLOAT32 channel over a (32, 16) shape; surface_load_store is not passed.
+    array_2d = Array.from_descriptor(shape=(32, 16), format=ArrayFormat.FLOAT32, num_channels=1)
+    try:
+        assert array_2d.shape == (32, 16)
+        assert array_2d.format == ArrayFormat.FLOAT32
+        assert array_2d.num_channels == 1
+        assert array_2d.element_size == 4
+        assert array_2d.size_bytes == 32 * 16 * 4
+        assert array_2d.surface_load_store is False
+        assert array_2d.handle != 0
+        assert isinstance(array_2d.device, Device)
+    finally:
+        array_2d.close()
+
+
+def test_array_3d_with_surface_flag(init_cuda):
+    """A 3D array created with surface_load_store=True reports that flag back."""
+    # UINT8 with 4 channels: element_size is asserted to be 4 below.
+    volume = Array.from_descriptor(
+        shape=(8, 8, 4), format=ArrayFormat.UINT8,
+        num_channels=4, surface_load_store=True,
+    )
+    try:
+        assert volume.shape == (8, 8, 4)
+        assert volume.surface_load_store is True
+        assert volume.element_size == 4
+    finally:
+        volume.close()
+
+
+def test_array_rejects_bad_channels(init_cuda):
+    with pytest.raises(ValueError, match="num_channels"):  # num_channels=3 is the bad input
+        Array.from_descriptor(shape=(8,), format=ArrayFormat.UINT8, num_channels=3)
+
+
+def test_array_rejects_bad_rank(init_cuda):
+    """A four-dimensional shape must be refused with a "shape rank" ValueError."""
+    rank4_shape = (2, 2, 2, 2)
+    with pytest.raises(ValueError, match="shape rank"):
+        Array.from_descriptor(shape=rank4_shape, format=ArrayFormat.UINT8, num_channels=1)
+
+
+def test_array_roundtrip_copy(init_cuda):
+    """Host data copied into a 1D UINT32 array and back must come out unchanged."""
+    import array as _array
+
+    dev = Device()
+    copy_stream = dev.create_stream()
+    cu_array = Array.from_descriptor(shape=(16,), format=ArrayFormat.UINT32, num_channels=1)
+    try:
+        expected = list(range(16))
+        host_in = _array.array("I", expected)
+        host_out = _array.array("I", [0] * 16)
+        cu_array.copy_from(host_in, stream=copy_stream)
+        cu_array.copy_to(host_out, stream=copy_stream)
+        copy_stream.sync()
+        # The data must survive the round trip, and copy_from must leave its source untouched.
+        assert list(host_out) == expected
+        assert list(host_in) == expected
+    finally:
+        cu_array.close()
+        copy_stream.close()
+
+
+def test_array_copy_rejects_undersized_host_buffer(init_cuda):
+    """Both copy directions must refuse a host buffer smaller than the array."""
+    import array as _array
+
+    dev = Device()
+    copy_stream = dev.create_stream()
+    cu_array = Array.from_descriptor(shape=(16,), format=ArrayFormat.UINT32, num_channels=1)
+    expected_error = "smaller than the array extent"
+    try:
+        # The array holds 16 * 4 = 64 bytes; the host buffer has only 8 elements (32 bytes).
+        short_host = _array.array("I", [0] * 8)
+        with pytest.raises(ValueError, match=expected_error):
+            cu_array.copy_from(short_host, stream=copy_stream)
+        with pytest.raises(ValueError, match=expected_error):
+            cu_array.copy_to(short_host, stream=copy_stream)
+    finally:
+        cu_array.close()
+        copy_stream.close()
+
+
+def test_array_copy_rejects_undersized_device_buffer(init_cuda):
+    """Both copy directions must refuse a device buffer smaller than the array."""
+    dev = Device()
+    copy_stream = dev.create_stream()
+    cu_array = Array.from_descriptor(shape=(16,), format=ArrayFormat.UINT32, num_channels=1)
+    expected_error = "smaller than the array extent"
+    # The array occupies 64 bytes; the device buffer is only 32 bytes.
+    short_buf = dev.memory_resource.allocate(32, stream=dev.default_stream)
+    try:
+        with pytest.raises(ValueError, match=expected_error):
+            cu_array.copy_from(short_buf, stream=copy_stream)
+        with pytest.raises(ValueError, match=expected_error):
+            cu_array.copy_to(short_buf, stream=copy_stream)
+    finally:
+        short_buf.close()
+        cu_array.close()
+        copy_stream.close()
+
+
+def test_texture_object_create(init_cuda):
+    """A TextureObject built from an Array exposes its handle and source descriptors."""
+    backing = Array.from_descriptor(shape=(32, 16), format=ArrayFormat.FLOAT32, num_channels=1)
+    try:
+        resource = ResourceDescriptor.from_array(backing)
+        sampling = TextureDescriptor(
+            address_mode=AddressMode.CLAMP,
+            filter_mode=FilterMode.LINEAR,
+            read_mode=ReadMode.ELEMENT_TYPE,
+            normalized_coords=True,
+        )
+        tex_obj = TextureObject.from_descriptor(resource=resource, texture_descriptor=sampling)
+        try:
+            assert tex_obj.handle != 0
+            # The very same descriptor objects must be handed back (identity, not equality).
+            assert tex_obj.resource is resource
+            assert tex_obj.texture_descriptor is sampling
+        finally:
+            tex_obj.close()
+    finally:
+        backing.close()
+
+
+def test_surface_object_create(init_cuda):
+    """SurfaceObject.from_array yields a non-zero handle and a ResourceDescriptor."""
+    # The backing array is created with surface_load_store=True.
+    backing = Array.from_descriptor(
+        shape=(8, 8), format=ArrayFormat.UINT8,
+        num_channels=4, surface_load_store=True,
+    )
+    try:
+        surface = SurfaceObject.from_array(backing)
+        try:
+            assert surface.handle != 0
+            assert isinstance(surface.resource, ResourceDescriptor)
+        finally:
+            surface.close()
+    finally:
+        backing.close()
+
+
+def test_surface_requires_ldst_flag(init_cuda):
+    """An array created without surface_load_store cannot back a SurfaceObject."""
+    # surface_load_store is not passed to from_descriptor here.
+    plain_array = Array.from_descriptor(shape=(8, 8), format=ArrayFormat.UINT8, num_channels=4)
+    try:
+        with pytest.raises(ValueError, match="surface_load_store=True"):
+            SurfaceObject.from_array(plain_array)
+    finally:
+        plain_array.close()
+
+
+def test_address_mode_normalization(init_cuda):
+    # Unit-test the private normalizer directly: a scalar must expand to a
+    # 3-tuple, and a shorter tuple must be padded by repeating its last entry.
+    from cuda.core._texture import _normalize_address_modes
+
+    wrap, clamp, mirror = AddressMode.WRAP, AddressMode.CLAMP, AddressMode.MIRROR
+
+    # Scalar input.
+    assert _normalize_address_modes(wrap) == (wrap, wrap, wrap)
+    # Two entries: the trailing CLAMP is repeated.
+    assert _normalize_address_modes((wrap, clamp)) == (wrap, clamp, clamp)
+    # Three entries: nothing left to pad.
+    full_triple = (wrap, clamp, mirror)
+    assert _normalize_address_modes(full_triple) == (wrap, clamp, mirror)
+
+    # End-to-end smoke test: a 2-entry tuple is accepted by a real texture bind.
+    volume = Array.from_descriptor(
+        shape=(8, 8, 4), format=ArrayFormat.FLOAT32, num_channels=1
+    )
+    try:
+        resource = ResourceDescriptor.from_array(volume)
+        two_modes = TextureDescriptor(
+            address_mode=(wrap, clamp)
+        )
+        tex_obj = TextureObject.from_descriptor(resource=resource, texture_descriptor=two_modes)
+        try:
+            assert tex_obj.handle != 0
+        finally:
+            tex_obj.close()
+    finally:
+        volume.close()
+
+
+# --- Linear / pitch2D resource descriptors -----------------------------------
+
+def _alloc_device_buffer(device, nbytes):
+    """Allocate ``nbytes`` via ``device.memory_resource``, passing ``device.default_stream``."""
+    return device.memory_resource.allocate(nbytes, stream=device.default_stream)
+
+
+def test_resource_descriptor_from_linear_defaults_size(init_cuda):
+    """from_linear without size_bytes: check kind, format, channels, source and repr."""
+    dev_buf = _alloc_device_buffer(Device(), 4096)
+    try:
+        linear_res = ResourceDescriptor.from_linear(
+            dev_buf, format=ArrayFormat.FLOAT32, num_channels=1
+        )  # NOTE(review): the defaulted size itself is never asserted — confirm intended
+        assert linear_res.kind == "linear"
+        assert linear_res.format == ArrayFormat.FLOAT32
+        assert linear_res.num_channels == 1
+        assert linear_res.source is dev_buf
+        # The repr is expected to mention the resource kind.
+        assert "linear" in repr(linear_res)
+    finally:
+        dev_buf.close()
+
+
+def test_resource_descriptor_from_linear_size_override(init_cuda):
+    """An explicit size_bytes smaller than the buffer is stored on the descriptor."""
+    dev_buf = _alloc_device_buffer(Device(), 4096)
+    try:
+        half_res = ResourceDescriptor.from_linear(
+            dev_buf, format=ArrayFormat.UINT32, num_channels=1, size_bytes=2048
+        )
+        assert half_res._size_bytes == 2048  # reads the private attribute directly
+    finally:
+        dev_buf.close()
+
+
+def test_resource_descriptor_from_linear_rejects_oversize(init_cuda):
+    """A size_bytes larger than the backing buffer must raise ValueError."""
+    dev_buf = _alloc_device_buffer(Device(), 1024)
+    try:
+        with pytest.raises(ValueError, match="exceeds buffer.size"):
+            ResourceDescriptor.from_linear(
+                dev_buf, format=ArrayFormat.UINT8, num_channels=1, size_bytes=2048
+            )  # 2048 bytes requested from a 1024-byte allocation
+    finally:
+        dev_buf.close()
+
+
+def test_resource_descriptor_from_linear_rejects_bad_channels(init_cuda):
+    """num_channels=3 must be rejected by from_linear with a ValueError."""
+    dev_buf = _alloc_device_buffer(Device(), 1024)
+    try:
+        with pytest.raises(ValueError, match="num_channels"):
+            ResourceDescriptor.from_linear(
+                dev_buf, format=ArrayFormat.UINT8, num_channels=3
+            )  # 3 is the channel count under test
+    finally:
+        dev_buf.close()
+
+
+def test_resource_descriptor_from_linear_rejects_non_buffer():
+    """Passing something that is not a Buffer raises TypeError mentioning "Buffer"."""
+    not_a_buffer = object()
+    with pytest.raises(TypeError, match="Buffer"):
+        ResourceDescriptor.from_linear(not_a_buffer, format=ArrayFormat.UINT8, num_channels=1)
+
+
+def test_resource_descriptor_from_linear_rejects_zero_size(init_cuda):
+    """size_bytes=0 must be rejected: the error asks for at least one element."""
+    dev_buf = _alloc_device_buffer(Device(), 1024)
+    try:
+        with pytest.raises(ValueError, match="at least one element"):
+            ResourceDescriptor.from_linear(
+                dev_buf, format=ArrayFormat.UINT32, num_channels=1, size_bytes=0
+            )  # an explicit zero-byte extent
+    finally:
+        dev_buf.close()
+
+
+def test_resource_descriptor_from_linear_rejects_non_multiple(init_cuda):
+    """A size_bytes that is not a whole number of elements must raise ValueError."""
+    dev_buf = _alloc_device_buffer(Device(), 1024)
+    try:
+        # One UINT32 channel is 4 bytes per element, and 10 is not divisible by 4.
+        with pytest.raises(ValueError, match="multiple of element size"):
+            ResourceDescriptor.from_linear(
+                dev_buf, format=ArrayFormat.UINT32, num_channels=1, size_bytes=10
+            )
+    finally:
+        dev_buf.close()
+
+
+def test_texture_object_from_linear(init_cuda):
+    """Bind a texture over a linear Buffer using a default TextureDescriptor; the
+    sampling fields are reportedly ignored by the driver for this resource kind."""
+    dev = Device()
+    # Room for 1024 FLOAT32 elements (4 bytes each).
+    dev_buf = _alloc_device_buffer(dev, 1024 * 4)
+    try:
+        lin = ResourceDescriptor.from_linear(
+            dev_buf, format=ArrayFormat.FLOAT32, num_channels=1
+        )
+        tex_obj = TextureObject.from_descriptor(resource=lin, texture_descriptor=TextureDescriptor())
+        try:
+            assert tex_obj.handle != 0
+            assert tex_obj.resource is lin
+        finally:
+            tex_obj.close()
+    finally:
+        dev_buf.close()
+
+
+def test_resource_descriptor_from_pitch2d_validates_pitch(init_cuda):
+    """A pitch_bytes smaller than one row of elements must raise ValueError."""
+    dev_buf = _alloc_device_buffer(Device(), 64 * 1024)
+    try:
+        # One UINT32 channel is 4 bytes per element, so 16 elements need 64 bytes per row.
+        with pytest.raises(ValueError, match="pitch_bytes"):
+            ResourceDescriptor.from_pitch2d(
+                dev_buf,
+                format=ArrayFormat.UINT32,
+                num_channels=1,
+                width=16,
+                height=8,
+                pitch_bytes=32,  # below the 64-byte minimum (width * element_size)
+            )
+    finally:
+        dev_buf.close()
+
+
+def test_resource_descriptor_from_pitch2d_validates_buffer_size(init_cuda):
+    """A pitch * height extent larger than the buffer must raise ValueError."""
+    dev_buf = _alloc_device_buffer(Device(), 4096)
+    try:
+        with pytest.raises(ValueError, match="exceeds buffer.size"):
+            ResourceDescriptor.from_pitch2d(
+                dev_buf,
+                format=ArrayFormat.UINT8,
+                num_channels=4,
+                width=64,
+                height=128,
+                pitch_bytes=512,  # 128 rows * 512 bytes = 65536, far above 4096
+            )
+    finally:
+        dev_buf.close()
+
+
+def test_texture_object_from_pitch2d(init_cuda):
+    """Bind a texture over pitched 2D memory sized from the queried pitch alignment."""
+    from cuda.bindings import driver
+
+    dev = Device()
+    # Ask the driver for this device's texture pitch alignment attribute.
+    status, alignment = driver.cuDeviceGetAttribute(
+        driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT,
+        dev.device_id,
+    )
+    assert int(status) == 0
+    row_pitch = max(int(alignment), 256)
+    rows = 16
+    dev_buf = _alloc_device_buffer(dev, row_pitch * rows)
+    try:
+        pitched = ResourceDescriptor.from_pitch2d(
+            dev_buf,
+            format=ArrayFormat.UINT8,
+            num_channels=4,
+            width=32,
+            height=rows,
+            pitch_bytes=row_pitch,
+        )
+        assert pitched.kind == "pitch2d"
+        assert "pitch2d" in repr(pitched)
+        tex_obj = TextureObject.from_descriptor(resource=pitched, texture_descriptor=TextureDescriptor())
+        try:
+            assert tex_obj.handle != 0
+        finally:
+            tex_obj.close()
+    finally:
+        dev_buf.close()
+
+
+def test_surface_rejects_linear_and_pitch2d(init_cuda):
+    """SurfaceObject.from_descriptor must refuse linear and pitch2d resources."""
+    dev_buf = _alloc_device_buffer(Device(), 4096)
+    try:
+        linear_res = ResourceDescriptor.from_linear(
+            dev_buf, format=ArrayFormat.UINT32, num_channels=1
+        )
+        with pytest.raises(ValueError, match="array-backed"):
+            SurfaceObject.from_descriptor(resource=linear_res)
+
+        pitched_res = ResourceDescriptor.from_pitch2d(
+            dev_buf,
+            format=ArrayFormat.UINT8,
+            num_channels=4,
+            width=8,
+            height=8,
+            pitch_bytes=64,
+        )
+        with pytest.raises(ValueError, match="array-backed"):
+            SurfaceObject.from_descriptor(resource=pitched_res)
+    finally:
+        dev_buf.close()
+
+
+# --- MipmappedArray ----------------------------------------------------------
+
+def test_mipmapped_array_init_disabled():
+    """Calling the MipmappedArray constructor directly must raise RuntimeError."""
+    expected = r"^MipmappedArray cannot be instantiated directly"
+    with pytest.raises(RuntimeError, match=expected):
+        cuda.core._mipmapped_array.MipmappedArray()
+
+
+def test_mipmapped_array_from_descriptor_2d(init_cuda):
+    """Check the properties reported by a freshly created 2D MipmappedArray."""
+    # Four levels over a (64, 32) base; surface_load_store is not passed.
+    mipmap = MipmappedArray.from_descriptor(
+        shape=(64, 32), format=ArrayFormat.FLOAT32,
+        num_channels=1, num_levels=4,
+    )
+    try:
+        assert mipmap.shape == (64, 32)
+        assert mipmap.format == ArrayFormat.FLOAT32
+        assert mipmap.num_channels == 1
+        assert mipmap.num_levels == 4
+        assert mipmap.surface_load_store is False
+        assert mipmap.handle != 0
+        assert isinstance(mipmap.device, Device)
+    finally:
+        mipmap.close()
+
+
+def test_mipmapped_array_get_level_zero_matches_shape(init_cuda):
+    """get_level(0) returns an Array with the base shape, format and channels."""
+    base_shape = (64, 32)
+    # UINT8 x 4 channels with four mip levels.
+    mipmap = MipmappedArray.from_descriptor(
+        shape=base_shape, format=ArrayFormat.UINT8,
+        num_channels=4, num_levels=4,
+    )
+    try:
+        top_level = mipmap.get_level(0)
+        try:
+            assert isinstance(top_level, Array)
+            # Level 0 is the base level: same shape (and therefore rank) as the parent.
+            assert top_level.shape == base_shape
+            assert top_level.format == ArrayFormat.UINT8
+            assert top_level.num_channels == 4
+            assert top_level.handle != 0
+        finally:
+            top_level.close()
+    finally:
+        mipmap.close()
+
+
+def test_mipmapped_array_get_level_halves_dims(init_cuda):
+    """Every level's shape is the base shape halved per level, floored at 1."""
+    base_shape = (64, 32)
+    level_count = 4
+    # Single-channel UINT8 mipmap with level_count levels.
+    mipmap = MipmappedArray.from_descriptor(
+        shape=base_shape, format=ArrayFormat.UINT8,
+        num_channels=1, num_levels=level_count,
+    )
+    try:
+        for level in range(level_count):
+            level_array = mipmap.get_level(level)
+            try:
+                # Shift each extent right by the level index, never dropping below 1.
+                expected = tuple(max(1, dim >> level) for dim in base_shape)
+                assert level_array.shape == expected, (
+                    f"level={level}: expected {expected}, got {level_array.shape}"
+                )
+            finally:
+                level_array.close()
+    finally:
+        mipmap.close()
+
+
+def test_mipmapped_array_get_level_out_of_range(init_cuda):
+    """get_level rejects an index equal to num_levels and a negative index."""
+    # Two levels, so the valid indices are 0 and 1.
+    mipmap = MipmappedArray.from_descriptor(
+        shape=(16, 16), format=ArrayFormat.UINT8,
+        num_channels=1, num_levels=2,
+    )
+    try:
+        with pytest.raises(ValueError, match="num_levels"):
+            mipmap.get_level(mipmap.num_levels)
+        with pytest.raises(ValueError, match=">= 0"):
+            mipmap.get_level(-1)
+    finally:
+        mipmap.close()
+
+
+def test_mipmapped_array_rejects_zero_levels(init_cuda):
+    """num_levels=0 must be rejected with a ValueError naming num_levels."""
+    with pytest.raises(ValueError, match="num_levels"):
+        MipmappedArray.from_descriptor(
+            shape=(8, 8), format=ArrayFormat.UINT8, num_channels=1,
+            # Zero mip levels is the invalid input under test.
+            num_levels=0,
+        )
+
+
+def test_resource_descriptor_from_mipmapped_array(init_cuda):
+    """from_mipmapped_array reports kind "mipmapped_array" and keeps its source."""
+    # Three-level, single-channel FLOAT32 mipmap.
+    mipmap = MipmappedArray.from_descriptor(
+        shape=(32, 16), format=ArrayFormat.FLOAT32,
+        num_channels=1, num_levels=3,
+    )
+    try:
+        mip_res = ResourceDescriptor.from_mipmapped_array(mipmap)
+        assert mip_res.kind == "mipmapped_array"
+        assert mip_res.source is mipmap
+    finally:
+        mipmap.close()
+
+
+def test_resource_descriptor_from_mipmapped_array_rejects_non_mipmap():  # no init_cuda fixture
+    with pytest.raises(TypeError, match="MipmappedArray"):
+        ResourceDescriptor.from_mipmapped_array(object())  # a bare object() is the bad input
+
+
+def test_texture_object_from_mipmapped_array(init_cuda):
+    """A TextureObject can be created over a 3-level MipmappedArray resource."""
+    # (32, 32) single-channel FLOAT32 base level.
+    mipmap = MipmappedArray.from_descriptor(
+        shape=(32, 32), format=ArrayFormat.FLOAT32,
+        num_channels=1, num_levels=3,
+    )
+    try:
+        mip_res = ResourceDescriptor.from_mipmapped_array(mipmap)
+        # Pass the mipmap parameters explicitly (intended to exercise the driver's mipmap path).
+        sampling = TextureDescriptor(
+            address_mode=AddressMode.CLAMP,
+            filter_mode=FilterMode.LINEAR,
+            normalized_coords=True,
+            mipmap_filter_mode=FilterMode.LINEAR,
+            mipmap_level_bias=0.0,
+            min_mipmap_level_clamp=0.0,
+            max_mipmap_level_clamp=float(mipmap.num_levels - 1),
+        )
+        tex_obj = TextureObject.from_descriptor(resource=mip_res, texture_descriptor=sampling)
+        try:
+            assert tex_obj.handle != 0
+            assert tex_obj.resource is mip_res
+        finally:
+            tex_obj.close()
+    finally:
+        mipmap.close()
+
+
+def test_surface_rejects_mipmapped_array(init_cuda):
+    """A mipmapped-array resource cannot back a SurfaceObject."""
+    # surface_load_store=True is set, yet the descriptor is still refused below.
+    mipmap = MipmappedArray.from_descriptor(
+        shape=(16, 16), format=ArrayFormat.UINT8,
+        num_channels=4, num_levels=2,
+        surface_load_store=True,
+    )
+    try:
+        mip_res = ResourceDescriptor.from_mipmapped_array(mipmap)
+        with pytest.raises(ValueError, match="array-backed"):
+            SurfaceObject.from_descriptor(resource=mip_res)
+    finally:
+        mipmap.close()
+
+
+def test_mipmapped_array_level_keeps_parent_alive(init_cuda):
+    """Dropping the local parent reference must not invalidate the level Array;
+    the level holds an internal strong ref back to the MipmappedArray.
+
+    cdef classes don't natively support weakref, so we verify the parent
+    reference by inspecting the level Array's gc referents.
+    """
+    mip = MipmappedArray.from_descriptor(
+        shape=(16, 16), format=ArrayFormat.UINT8, num_channels=1, num_levels=3
+    )
+    parent_id = id(mip)
+    lvl = mip.get_level(1)
+    try:
+        # Drop our local reference and force GC; the parent must survive because
+        # the level Array holds a strong ref via the internal _parent_ref slot.
+        del mip
+        gc.collect()
+
+        # The handle is still valid storage; the level still tracks the parent.
+        assert lvl.handle != 0
+        referents = gc.get_referents(lvl)
+        parents = [r for r in referents if isinstance(r, MipmappedArray)]
+        assert len(parents) == 1, (
+            f"level Array should reference exactly one MipmappedArray parent, got "
+            f"{parents!r}"
+        )
+        assert id(parents[0]) == parent_id, (
+            "level Array's parent ref is not the original MipmappedArray"
+        )
+    finally:
+        # Always close the level, even if an assertion above failed. Closing
+        # drops its parent ref; don't access the parent past this point,
+        # since cuMipmappedArrayDestroy may then run.
+        lvl.close()
+
+
+# --- Negative-path validation tests ------------------------------------------
+
+def test_array_from_descriptor_rejects_bad_format(init_cuda):
+    with pytest.raises(TypeError, match="format must be an ArrayFormat"):
+        Array.from_descriptor(shape=(8,), format=0, num_channels=1)  # format=0 is a plain int
+
+
+def test_array_from_descriptor_rejects_non_iterable_shape(init_cuda):
+    with pytest.raises(TypeError, match="shape must be a tuple"):  # shape=8 is a bare int
+        Array.from_descriptor(shape=8, format=ArrayFormat.UINT8, num_channels=1)
+
+
+def test_array_from_descriptor_rejects_zero_dim(init_cuda):
+    """A zero extent in the second dimension is reported as ``shape[1]``."""
+    zero_height = (8, 0)
+    with pytest.raises(ValueError, match=r"shape\[1\] must be >= 1"):
+        Array.from_descriptor(shape=zero_height, format=ArrayFormat.UINT8, num_channels=1)
+
+
+def test_array_copy_rejects_non_stream(init_cuda):
+    """copy_from and copy_to must both raise TypeError when stream is a str."""
+    bogus_stream = "not-a-stream"
+    backing = Array.from_descriptor(shape=(8,), format=ArrayFormat.UINT8, num_channels=1)
+    try:
+        import array as _array
+        host_buf = _array.array("B", [0] * 8)
+        with pytest.raises(TypeError, match="stream must be a Stream"):
+            backing.copy_from(host_buf, stream=bogus_stream)
+        with pytest.raises(TypeError, match="stream must be a Stream"):
+            backing.copy_to(host_buf, stream=bogus_stream)
+    finally:
+        backing.close()
+
+
+def test_resource_descriptor_from_pitch2d_rejects_non_buffer():
+    """A non-Buffer first argument to from_pitch2d raises TypeError."""
+    not_a_buffer = object()
+    with pytest.raises(TypeError, match="buffer must be a Buffer"):
+        ResourceDescriptor.from_pitch2d(
+            not_a_buffer,
+            format=ArrayFormat.UINT8, num_channels=1,
+            # The geometry passed alongside: 8 x 8 with pitch_bytes=64.
+            width=8, height=8, pitch_bytes=64,
+        )
+
+
+def test_resource_descriptor_from_pitch2d_rejects_bad_format(init_cuda):
+    """A plain int passed as format must raise TypeError from from_pitch2d."""
+    dev_buf = _alloc_device_buffer(Device(), 4096)
+    try:
+        with pytest.raises(TypeError, match="format must be an ArrayFormat"):
+            ResourceDescriptor.from_pitch2d(
+                dev_buf,
+                format=0,  # an int rather than an ArrayFormat member
+                num_channels=1,
+                width=8,
+                height=8,
+                pitch_bytes=64,
+            )
+    finally:
+        dev_buf.close()
+
+
+def test_resource_descriptor_from_pitch2d_rejects_bad_channels(init_cuda):
+    """num_channels=3 must be rejected by from_pitch2d with a ValueError."""
+    dev_buf = _alloc_device_buffer(Device(), 4096)
+    try:
+        with pytest.raises(ValueError, match="num_channels"):
+            ResourceDescriptor.from_pitch2d(
+                dev_buf,
+                format=ArrayFormat.UINT8,
+                num_channels=3,  # the channel count under test
+                width=8,
+                height=8,
+                pitch_bytes=64,
+            )
+    finally:
+        dev_buf.close()
+
+
+def test_resource_descriptor_from_pitch2d_rejects_zero_dims(init_cuda):
+    """width=0 and height=0 must each raise a ValueError naming the dimension."""
+    dev_buf = _alloc_device_buffer(Device(), 4096)
+    try:
+        with pytest.raises(ValueError, match="width"):
+            ResourceDescriptor.from_pitch2d(
+                dev_buf,
+                format=ArrayFormat.UINT8,
+                num_channels=1,
+                width=0,  # invalid dimension for this call
+                height=8,
+                pitch_bytes=64,
+            )
+        with pytest.raises(ValueError, match="height"):
+            ResourceDescriptor.from_pitch2d(
+                dev_buf,
+                format=ArrayFormat.UINT8,
+                num_channels=1,
+                width=8,
+                height=0,  # invalid dimension for this call
+                pitch_bytes=64,
+            )
+    finally:
+        dev_buf.close()
+
+
+def test_mipmapped_array_rejects_bad_format(init_cuda):
+    """A plain int passed as format must raise TypeError."""
+    bad_kwargs = {"shape": (8, 8), "format": 0, "num_channels": 1, "num_levels": 2}
+    with pytest.raises(TypeError, match="format must be an ArrayFormat"):
+        MipmappedArray.from_descriptor(**bad_kwargs)
+
+
+def test_mipmapped_array_rejects_bad_channels(init_cuda):
+    """num_channels=3 must be rejected with a ValueError naming num_channels."""
+    bad_kwargs = {"shape": (8, 8), "format": ArrayFormat.UINT8, "num_channels": 3, "num_levels": 2}
+    with pytest.raises(ValueError, match="num_channels"):
+        MipmappedArray.from_descriptor(**bad_kwargs)
+
+
+def test_mipmapped_array_rejects_zero_dim(init_cuda):
+    """A zero extent in the first dimension is reported as ``shape[0]``."""
+    bad_kwargs = {"shape": (0, 8), "format": ArrayFormat.UINT8, "num_channels": 1, "num_levels": 1}
+    with pytest.raises(ValueError, match=r"shape\[0\] must be >= 1"):
+        MipmappedArray.from_descriptor(**bad_kwargs)
+
+
+def test_texture_object_rejects_non_resource_descriptor(init_cuda):
+    """A bare object() passed as resource must raise TypeError."""
+    bad_kwargs = {"resource": object(), "texture_descriptor": TextureDescriptor()}
+    with pytest.raises(TypeError, match="resource must be a ResourceDescriptor"):
+        TextureObject.from_descriptor(**bad_kwargs)
+
+
+def test_texture_object_rejects_non_texture_descriptor(init_cuda):
+    """A str passed as texture_descriptor must raise TypeError."""
+    expected = "texture_descriptor must be a TextureDescriptor"
+    backing = Array.from_descriptor(shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1)
+    try:
+        resource = ResourceDescriptor.from_array(backing)
+        not_a_descriptor = "nope"
+        # The resource comes from a real Array; only the descriptor argument is wrong.
+        with pytest.raises(TypeError, match=expected):
+            TextureObject.from_descriptor(resource=resource, texture_descriptor=not_a_descriptor)
+    finally:
+        backing.close()
+
+
+def test_texture_object_rejects_bad_filter_mode(init_cuda):
+    """filter_mode=0 (an int, not a FilterMode) is rejected when the texture is created."""
+    # TextureDescriptor is built outside pytest.raises; from_descriptor must raise.
+    backing = Array.from_descriptor(shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1)
+    try:
+        resource = ResourceDescriptor.from_array(backing)
+        bad_desc = TextureDescriptor(filter_mode=0)  # plain int instead of a FilterMode
+        with pytest.raises(TypeError, match="filter_mode must be a FilterMode"):
+            TextureObject.from_descriptor(resource=resource, texture_descriptor=bad_desc)
+    finally:
+        backing.close()
+
+
+def test_texture_object_rejects_bad_read_mode(init_cuda):
+    """read_mode=0 (an int, not a ReadMode) is rejected when the texture is created."""
+    # TextureDescriptor is built outside pytest.raises; from_descriptor must raise.
+    backing = Array.from_descriptor(shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1)
+    try:
+        resource = ResourceDescriptor.from_array(backing)
+        bad_desc = TextureDescriptor(read_mode=0)  # plain int instead of a ReadMode
+        with pytest.raises(TypeError, match="read_mode must be a ReadMode"):
+            TextureObject.from_descriptor(resource=resource, texture_descriptor=bad_desc)
+    finally:
+        backing.close()
+
+
+def test_texture_object_rejects_bad_mipmap_filter_mode(init_cuda):
+    """mipmap_filter_mode=0 (an int) is rejected when the texture is created."""
+    # TextureDescriptor is built outside pytest.raises; from_descriptor must raise.
+    backing = Array.from_descriptor(shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1)
+    try:
+        resource = ResourceDescriptor.from_array(backing)
+        bad_desc = TextureDescriptor(mipmap_filter_mode=0)  # plain int instead of a FilterMode
+        expected = "mipmap_filter_mode must be a FilterMode"
+        # The backing resource is an ordinary (non-mipmapped) Array.
+        with pytest.raises(TypeError, match=expected):
+            TextureObject.from_descriptor(resource=resource, texture_descriptor=bad_desc)
+    finally:
+        backing.close()
+
+
+def test_texture_object_rejects_negative_anisotropy(init_cuda):
+    """max_anisotropy=-1 is rejected with a ValueError when the texture is created."""
+    # TextureDescriptor is built outside pytest.raises; from_descriptor must raise.
+    backing = Array.from_descriptor(shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1)
+    try:
+        resource = ResourceDescriptor.from_array(backing)
+        bad_desc = TextureDescriptor(max_anisotropy=-1)
+        with pytest.raises(ValueError, match="max_anisotropy"):
+            TextureObject.from_descriptor(resource=resource, texture_descriptor=bad_desc)
+    finally:
+        backing.close()
+
+
+def test_texture_object_rejects_bad_border_color_length(init_cuda):
+    """A 2-entry border_color is rejected with a ValueError when the texture is created."""
+    # TextureDescriptor is built outside pytest.raises; from_descriptor must raise.
+    backing = Array.from_descriptor(shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1)
+    try:
+        resource = ResourceDescriptor.from_array(backing)
+        bad_desc = TextureDescriptor(border_color=(0.0, 0.0))  # two entries where four are required
+        with pytest.raises(ValueError, match="border_color must have 4"):
+            TextureObject.from_descriptor(resource=resource, texture_descriptor=bad_desc)
+    finally:
+        backing.close()
+
+
+def test_address_mode_rejects_non_addressmode_scalar(init_cuda):
+    """address_mode=42 (neither an AddressMode nor an iterable) raises TypeError."""
+    # TextureDescriptor is built outside pytest.raises; from_descriptor must raise.
+    backing = Array.from_descriptor(shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1)
+    try:
+        resource = ResourceDescriptor.from_array(backing)
+        bad_desc = TextureDescriptor(address_mode=42)  # a bare int
+        with pytest.raises(TypeError, match="address_mode"):
+            TextureObject.from_descriptor(resource=resource, texture_descriptor=bad_desc)
+    finally:
+        backing.close()
+
+
+def test_address_mode_rejects_empty_tuple(init_cuda):
+    """An empty address_mode tuple raises ValueError when the texture is created."""
+    # TextureDescriptor is built outside pytest.raises; from_descriptor must raise.
+    backing = Array.from_descriptor(shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1)
+    try:
+        resource = ResourceDescriptor.from_array(backing)
+        bad_desc = TextureDescriptor(address_mode=())
+        with pytest.raises(ValueError, match="address_mode tuple must have 1-3"):
+            TextureObject.from_descriptor(resource=resource, texture_descriptor=bad_desc)
+    finally:
+        backing.close()
+
+
+def test_address_mode_rejects_too_long_tuple(init_cuda):
+    """A 4-entry address_mode tuple raises ValueError when the texture is created."""
+    # TextureDescriptor is built outside pytest.raises; from_descriptor must raise.
+    backing = Array.from_descriptor(shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1)
+    try:
+        resource = ResourceDescriptor.from_array(backing)
+        four_modes = (AddressMode.WRAP,) * 4
+        bad_desc = TextureDescriptor(
+            address_mode=four_modes,  # one entry more than the 1-3 named in the error
+        )
+        # Every entry is a valid AddressMode; only the length is wrong.
+        with pytest.raises(ValueError, match="address_mode tuple must have 1-3"):
+            TextureObject.from_descriptor(resource=resource, texture_descriptor=bad_desc)
+    finally:
+        backing.close()
+
+
+def test_address_mode_rejects_non_addressmode_entry(init_cuda):
+    """A str at index 1 of address_mode raises a TypeError naming ``address_mode[1]``."""
+    # TextureDescriptor is built outside pytest.raises; from_descriptor must raise.
+    backing = Array.from_descriptor(shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1)
+    try:
+        resource = ResourceDescriptor.from_array(backing)
+        bad_desc = TextureDescriptor(address_mode=(AddressMode.WRAP, "bad", AddressMode.CLAMP))
+        with pytest.raises(TypeError, match=r"address_mode\[1\]"):
+            TextureObject.from_descriptor(resource=resource, texture_descriptor=bad_desc)
+    finally:
+        backing.close()
+
+
+def test_texture_object_keeps_backing_array_alive(init_cuda):
+    """Dropping the local references to the backing Array and the
+    ResourceDescriptor must NOT invalidate an existing TextureObject. The
+    TextureObject holds a strong ref through its _source_ref slot."""
+    arr = Array.from_descriptor(shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1)
+    res = ResourceDescriptor.from_array(arr)
+    tex = TextureObject.from_descriptor(resource=res, texture_descriptor=TextureDescriptor())
+    try:
+        # Verify the keepalive chain via gc referents: TextureObject -> _source_ref
+        # -> ResourceDescriptor -> _source -> Array. We can only walk one level
+        # at a time, so check tex's referents include the ResourceDescriptor.
+        arr_id = id(arr)
+        res_id = id(res)
+        del arr, res
+        gc.collect()
+
+        referents = gc.get_referents(tex)
+        res_refs = [r for r in referents if id(r) == res_id]
+        assert len(res_refs) == 1, (
+            f"TextureObject should still reference the ResourceDescriptor; "
+            f"got referents {referents!r}"
+        )
+        res_back = res_refs[0]
+        arr_refs = [r for r in gc.get_referents(res_back) if id(r) == arr_id]
+        assert len(arr_refs) == 1, "ResourceDescriptor should still reference its Array"
+
+        # tex.handle should still be valid (non-zero).
+        assert tex.handle != 0
+    finally:
+        # Close the texture even if an assertion above failed, matching the
+        # try/finally cleanup used by the other tests in this module.
+        tex.close()
+
+
+def test_surface_object_keeps_backing_array_alive(init_cuda):
+    """Dropping the caller's Array reference must not invalidate a SurfaceObject."""
+    arr = Array.from_descriptor(
+        shape=(8, 8), format=ArrayFormat.UINT8, num_channels=4, surface_load_store=True
+    )
+    surf = SurfaceObject.from_array(arr)
+    try:  # guarantees surf.close() below even when an assertion fails
+        arr_id = id(arr)
+        del arr
+        gc.collect()
+
+        # The surface keeps the ResourceDescriptor alive, which keeps the Array
+        # alive. We verify the chain end-to-end the same way as the texture case.
+        referents = gc.get_referents(surf)
+        res_objs = [r for r in referents if isinstance(r, ResourceDescriptor)]
+        assert len(res_objs) == 1
+        arr_refs = [r for r in gc.get_referents(res_objs[0]) if id(r) == arr_id]
+        assert len(arr_refs) == 1, (
+            "SurfaceObject should still reference its backing Array via the ResourceDescriptor"
+        )
+        assert surf.handle != 0
+    finally:
+        surf.close()