From a7c8d31b356e905a5bc6287e076855bedfe7f35b Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com>
Date: Thu, 2 Apr 2026 12:52:30 -0500
Subject: [PATCH 01/43] Add sparse.linalg iterative solvers

---
 dpnp/scipy/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dpnp/scipy/__init__.py b/dpnp/scipy/__init__.py
index 56cf27f56342..7886299c9f9d 100644
--- a/dpnp/scipy/__init__.py
+++ b/dpnp/scipy/__init__.py
@@ -36,6 +36,6 @@
 DPNP functionality, reusing DPNP and oneMKL implementations underneath.
 """
 
-from . import linalg, special
+from . import linalg, special, sparse
 
-__all__ = ["linalg", "special"]
+__all__ = ["linalg", "special", "sparse"]

From e3e90523ceaa124852577b8cdea2faaaec2b9c75 Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com>
Date: Thu, 2 Apr 2026 12:52:44 -0500
Subject: [PATCH 02/43] Add sparse.linalg iterative solvers

---
 dpnp/scipy/sparse/__init__.py | 37 +++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)
 create mode 100644 dpnp/scipy/sparse/__init__.py

diff --git a/dpnp/scipy/sparse/__init__.py b/dpnp/scipy/sparse/__init__.py
new file mode 100644
index 000000000000..83b6e365a6cc
--- /dev/null
+++ b/dpnp/scipy/sparse/__init__.py
@@ -0,0 +1,37 @@
+# *****************************************************************************
+# Copyright (c) 2025, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+"""Sparse linear algebra namespace for DPNP.
+
+Currently this module exposes the :mod:`dpnp.scipy.sparse.linalg` submodule
+and provides a location for future sparse matrix container types.
+"""
+
+from . import linalg
+
+__all__ = ["linalg"]

From c0e9fb54dd0bd452a8efaec58190c2d36b3eb072 Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com>
Date: Thu, 2 Apr 2026 12:53:10 -0500
Subject: [PATCH 03/43] Add sparse.linalg iterative solvers

---
 dpnp/scipy/sparse/linalg/__init__.py | 49 ++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)
 create mode 100644 dpnp/scipy/sparse/linalg/__init__.py

diff --git a/dpnp/scipy/sparse/linalg/__init__.py b/dpnp/scipy/sparse/linalg/__init__.py
new file mode 100644
index 000000000000..3bb72d5b8f10
--- /dev/null
+++ b/dpnp/scipy/sparse/linalg/__init__.py
@@ -0,0 +1,49 @@
+# *****************************************************************************
+# Copyright (c) 2025, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+from __future__ import annotations
+
+"""Sparse linear algebra interface for DPNP.
+
+This module provides a subset of :mod:`scipy.sparse.linalg` and
+:mod:`cupyx.scipy.sparse.linalg` functionality on top of DPNP arrays.
+
+The initial implementation focuses on the :class:`LinearOperator` interface
+and a small set of Krylov solvers (``cg``, ``gmres``, ``minres``).
+"""
+
+from ._interface import LinearOperator, aslinearoperator
+from ._iterative import cg, gmres, minres
+
+__all__ = [
+    "LinearOperator",
+    "aslinearoperator",
+    "cg",
+    "gmres",
+    "minres",
+]

From 943c52ff4ab44d1118f9bdb8ae257be71777d60d Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com>
Date: Thu, 2 Apr 2026 12:53:40 -0500
Subject: [PATCH 04/43] Add sparse.linalg iterative solvers

---
 dpnp/scipy/sparse/linalg/_interface.py | 213 +++++++++++++++++++++++++
 1 file changed, 213 insertions(+)
 create mode 100644 dpnp/scipy/sparse/linalg/_interface.py

diff --git a/dpnp/scipy/sparse/linalg/_interface.py b/dpnp/scipy/sparse/linalg/_interface.py
new file mode 100644
index 000000000000..599f92c87043
--- /dev/null
+++ b/dpnp/scipy/sparse/linalg/_interface.py
@@ -0,0 +1,213 @@
+# *****************************************************************************
+# Copyright (c) 2025, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+from __future__ import annotations
+
+from typing import Callable, Optional, Tuple
+
+import dpnp as _dpnp
+
+
+class LinearOperator:
+    """DPNP-compatible linear operator.
+
+    This is a lightweight implementation of
+    :class:`scipy.sparse.linalg.LinearOperator` that operates on DPNP arrays
+    and can be used with the iterative solvers in :mod:`dpnp.scipy.sparse.linalg`.
+    """
+
+    def __init__(
+        self,
+        shape: Tuple[int, int],
+        matvec: Callable,
+        rmatvec: Optional[Callable] = None,
+        matmat: Optional[Callable] = None,
+        dtype=None,
+    ) -> None:
+        if len(shape) != 2:
+            raise ValueError("LinearOperator shape must be length-2")
+
+        m, n = shape
+        if m < 0 or n < 0:
+            raise ValueError("LinearOperator shape entries must be non-negative")
+
+        self._shape = (int(m), int(n))
+        self._matvec = matvec
+        self._rmatvec = rmatvec
+        self._matmat = matmat
+        self._dtype = dtype
+
+        if self._dtype is None:
+            x0 = _dpnp.zeros(self._shape[1], dtype=_dpnp.int8)
+            y0 = self._matvec(x0)
+            self._dtype = _dpnp.asarray(y0).dtype
+
+    @property
+    def shape(self) -> Tuple[int, int]:
+        return self._shape
+
+    @property
+    def dtype(self):
+        return self._dtype
+
+    @property
+    def ndim(self) -> int:
+        return 2
+
+    def _matvec_impl(self, x):
+        return self._matvec(x)
+
+    def _rmatvec_impl(self, x):
+        if self._rmatvec is None:
+            raise NotImplementedError("rmatvec is not defined for this LinearOperator")
+        return self._rmatvec(x)
+
+    def _matmat_impl(self, X):
+        if self._matmat is not None:
+            return self._matmat(X)
+
+        X = _dpnp.atleast_2d(X)
+        n, k = X.shape
+        y = _dpnp.empty((self.shape[0], k), dtype=self.dtype)
+        for j in range(k):
+            y[:, j] = self._matvec_impl(X[:, j])
+        return y
+
+    def matvec(self, x):
+        x = _dpnp.asarray(x)
+        if x.ndim != 1:
+            x = x.reshape(-1)
+        if x.shape[0] != self.shape[1]:
+            raise ValueError(
+                "dimension mismatch in matvec: expected ({},), got {}".format(
+                    self.shape[1], x.shape
+                )
+            )
+
+        y = self._matvec_impl(x)
+        y = _dpnp.asarray(y)
+        if y.ndim != 1:
+            y = y.reshape(-1)
+        if y.shape[0] != self.shape[0]:
+            raise ValueError(
+                "LinearOperator matvec returned wrong shape: expected ({},), got {}".format(
+                    self.shape[0], y.shape
+                )
+            )
+        return y
+
+    def rmatvec(self, x):
+        x = _dpnp.asarray(x)
+        if x.ndim != 1:
+            x = x.reshape(-1)
+        if x.shape[0] != self.shape[0]:
+            raise ValueError(
+                "dimension mismatch in rmatvec: expected ({},), got {}".format(
+                    self.shape[0], x.shape
+                )
+            )
+
+        y = self._rmatvec_impl(x)
+        y = _dpnp.asarray(y)
+        if y.ndim != 1:
+            y = y.reshape(-1)
+        if y.shape[0] != self.shape[1]:
+            raise ValueError(
+                "LinearOperator rmatvec returned wrong shape: expected ({},), got {}".format(
+                    self.shape[1], y.shape
+                )
+            )
+        return y
+
+    def matmat(self, X):
+        X = _dpnp.asarray(X)
+        if X.ndim != 2:
+            raise ValueError("matmat expects a 2-D array")
+        if X.shape[0] != self.shape[1]:
+            raise ValueError(
+                "dimension mismatch in matmat: expected ({}, K), got {}".format(
+                    self.shape[1], X.shape
+                )
+            )
+        return _dpnp.asarray(self._matmat_impl(X))
+
+    def __matmul__(self, x):
+        x = _dpnp.asarray(x)
+        if x.ndim == 1:
+            return self.matvec(x)
+        if x.ndim == 2:
+            return self.matmat(x)
+        raise ValueError("__matmul__ only supports 1-D or 2-D operands")
+
+    def __call__(self, x):
+        return self.__matmul__(x)
+
+    def __repr__(self) -> str:
+        return (
+            "<{}x{} dpnp.scipy.sparse.linalg.LinearOperator with dtype={}>".format(
+                self.shape[0], self.shape[1], self.dtype
+            )
+        )
+
+
+def aslinearoperator(A) -> LinearOperator:
+    if isinstance(A, LinearOperator):
+        return A
+
+    try:
+        arr = _dpnp.asarray(A)
+        if arr.ndim == 2:
+            m, n = arr.shape
+
+            def matvec(x):
+                return arr @ x
+
+            def rmatvec(x):
+                return _dpnp.conj(arr.T) @ x
+
+            return LinearOperator((m, n), matvec=matvec, rmatvec=rmatvec, dtype=arr.dtype)
+    except Exception:
+        pass
+
+    if hasattr(A, "shape") and len(A.shape) == 2:
+        m, n = A.shape
+
+        if hasattr(A, "matvec"):
+            def matvec(x):
+                return A.matvec(x)
+        else:
+            def matvec(x):
+                return A @ x
+
+        rmatvec = None
+        if hasattr(A, "rmatvec"):
+            rmatvec = lambda x: A.rmatvec(x)
+
+        return LinearOperator((m, n), matvec=matvec, rmatvec=rmatvec, dtype=getattr(A, "dtype", None))
+
+    raise TypeError("Cannot convert object of type {} to LinearOperator".format(type(A)))

From 1191f7e7bee58436bfb470289d12ec05efc25d5b Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com>
Date: Thu, 2 Apr 2026 12:54:53 -0500
Subject: [PATCH 05/43] Add sparse.linalg iterative solvers

---
 dpnp/scipy/sparse/linalg/_iterative.py | 436 +++++++++++++++++++++++++
 1 file changed, 436 insertions(+)
 create mode 100644 dpnp/scipy/sparse/linalg/_iterative.py

diff --git a/dpnp/scipy/sparse/linalg/_iterative.py b/dpnp/scipy/sparse/linalg/_iterative.py
new file mode 100644
index 000000000000..ab2d26a1257f
--- /dev/null
+++ b/dpnp/scipy/sparse/linalg/_iterative.py
@@ -0,0 +1,436 @@
+# *****************************************************************************
+# Copyright (c) 2025, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+from __future__ import annotations
+
+from typing import Callable, Optional, Tuple
+
+import dpnp as _dpnp
+
+from ._interface import aslinearoperator
+
+
+_ArrayLike = _dpnp.ndarray
+
+
+_HOST_THRESHOLD_DEFAULT = 256
+
+
+def _norm(x: _ArrayLike) -> float:
+    return float(_dpnp.linalg.norm(x))
+
+
+def _make_stop_criterion(b: _ArrayLike, tol: float, atol: Optional[float]) -> float:
+    bnrm = _norm(b)
+    atol_eff = 0.0 if atol is None else float(atol)
+    return max(tol * bnrm, atol_eff)
+
+
+def _has_scipy() -> bool:
+    try:
+        import scipy  # noqa: F401
+
+        return True
+    except Exception:
+        return False
+
+
+def _cpu_cg(A, b, x0, tol, maxiter, M, callback, atol):
+    import numpy as _np
+    import scipy.sparse.linalg as _sla
+
+    from ._interface import aslinearoperator as _aslo
+
+    A_dp = _aslo(A)
+
+    def matvec_np(x_np):
+        x_dp = _dpnp.asarray(x_np)
+        y_dp = A_dp.matvec(x_dp)
+        return _np.asarray(y_dp)
+
+    A_sci = _sla.LinearOperator(
+        shape=A_dp.shape, matvec=matvec_np, dtype=_np.dtype(A_dp.dtype)
+    )
+
+    if M is not None:
+        M_dp = _aslo(M)
+
+        def m_matvec_np(x_np):
+            x_dp = _dpnp.asarray(x_np)
+            y_dp = M_dp.matvec(x_dp)
+            return _np.asarray(y_dp)
+
+        M_sci = _sla.LinearOperator(
+            shape=M_dp.shape, matvec=m_matvec_np, dtype=_np.dtype(M_dp.dtype)
+        )
+    else:
+        M_sci = None
+
+    b_np = _np.asarray(_dpnp.asarray(b).reshape(-1))
+    x0_np = None if x0 is None else _np.asarray(_dpnp.asarray(x0).reshape(-1))
+
+    x_host, info = _sla.cg(
+        A_sci,
+        b_np,
+        x0=x0_np,
+        tol=tol,
+        maxiter=maxiter,
+        M=M_sci,
+        callback=callback,
+        atol=atol,
+    )
+
+    x_dp = _dpnp.asarray(x_host)
+    return x_dp, int(info)
+
+
+def _cpu_gmres(A, b, x0, tol, restart, maxiter, M, callback, atol, callback_type):
+    import numpy as _np
+    import scipy.sparse.linalg as _sla
+
+    from ._interface import aslinearoperator as _aslo
+
+    A_dp = _aslo(A)
+
+    def matvec_np(x_np):
+        x_dp = _dpnp.asarray(x_np)
+        y_dp = A_dp.matvec(x_dp)
+        return _np.asarray(y_dp)
+
+    A_sci = _sla.LinearOperator(
+        shape=A_dp.shape, matvec=matvec_np, dtype=_np.dtype(A_dp.dtype)
+    )
+
+    if M is not None:
+        M_dp = _aslo(M)
+
+        def m_matvec_np(x_np):
+            x_dp = _dpnp.asarray(x_np)
+            y_dp = M_dp.matvec(x_dp)
+            return _np.asarray(y_dp)
+
+        M_sci = _sla.LinearOperator(
+            shape=M_dp.shape, matvec=m_matvec_np, dtype=_np.dtype(M_dp.dtype)
+        )
+    else:
+        M_sci = None
+
+    b_np = _np.asarray(_dpnp.asarray(b).reshape(-1))
+    x0_np = None if x0 is None else _np.asarray(_dpnp.asarray(x0).reshape(-1))
+
+    x_host, info = _sla.gmres(
+        A_sci,
+        b_np,
+        x0=x0_np,
+        tol=tol,
+        restart=restart,
+        maxiter=maxiter,
+        M=M_sci,
+        callback=callback,
+        atol=atol,
+        callback_type=callback_type,
+    )
+
+    x_dp = _dpnp.asarray(x_host)
+    return x_dp, int(info)
+
+
+def cg(
+    A,
+    b,
+    x0: Optional[_ArrayLike] = None,
+    *,
+    tol: float = 1e-5,
+    maxiter: Optional[int] = None,
+    M=None,
+    callback: Optional[Callable[[_ArrayLike], None]] = None,
+    atol: Optional[float] = None,
+):
+    b = _dpnp.asarray(b).reshape(-1)
+    n = b.size
+
+    if n < _HOST_THRESHOLD_DEFAULT and _has_scipy():
+        return _cpu_cg(A, b, x0, tol, maxiter, M, callback, atol)
+
+    A = aslinearoperator(A)
+
+    if M is not None:
+        raise NotImplementedError("Preconditioner M is not implemented for cg yet")
+
+    if x0 is None:
+        x = _dpnp.zeros_like(b)
+    else:
+        x = _dpnp.asarray(x0).reshape(-1).copy()
+
+    r = b - A.matvec(x)
+    p = r.copy()
+    rr_old = _dpnp.vdot(r, r).real
+    if rr_old == 0.0:
+        return x, 0
+
+    if maxiter is None:
+        maxiter = n * 10
+
+    tol_th = _make_stop_criterion(b, tol, atol)
+
+    info = 0
+
+    for _ in range(maxiter):
+        Ap = A.matvec(p)
+        pAp = _dpnp.vdot(p, Ap).real
+        if pAp == 0.0:
+            info = -1
+            break
+
+        alpha = rr_old / pAp
+        x = x + alpha * p
+        r = r - alpha * Ap
+
+        if callback is not None:
+            callback(x)
+
+        rr_new = _dpnp.vdot(r, r).real
+        res_norm = rr_new**0.5
+        if res_norm <= tol_th:
+            info = 0
+            break
+
+        beta = rr_new / rr_old
+        p = r + beta * p
+        rr_old = rr_new
+    else:
+        info = maxiter
+
+    return x, int(info)
+
+
+def gmres(
+    A,
+    b,
+    x0: Optional[_ArrayLike] = None,
+    *,
+    tol: float = 1e-5,
+    restart: Optional[int] = None,
+    maxiter: Optional[int] = None,
+    M=None,
+    callback: Optional[Callable[[object], None]] = None,
+    atol: Optional[float] = None,
+    callback_type: Optional[str] = None,
+):
+    b = _dpnp.asarray(b).reshape(-1)
+    n = b.size
+
+    if n < _HOST_THRESHOLD_DEFAULT and _has_scipy():
+        return _cpu_gmres(A, b, x0, tol, restart, maxiter, M, callback, atol, callback_type)
+
+    if callback_type not in (None, "x", "pr_norm"):
+        raise ValueError("callback_type must be None, 'x', or 'pr_norm'")
+    if callback_type == "pr_norm":
+        raise NotImplementedError("callback_type='pr_norm' is not implemented yet")
+
+    A = aslinearoperator(A)
+
+    if M is not None:
+        raise NotImplementedError("Preconditioner M is not implemented for gmres yet")
+
+    if x0 is None:
+        x = _dpnp.zeros_like(b)
+    else:
+        x = _dpnp.asarray(x0).reshape(-1).copy()
+
+    if restart is None:
+        restart = min(20, n)
+    if maxiter is None:
+        maxiter = n
+
+    restart = int(restart)
+    maxiter = int(maxiter)
+
+    tol_th = _make_stop_criterion(b, tol, atol)
+
+    info = 0
+    total_iter = 0
+
+    for outer in range(maxiter):
+        r = b - A.matvec(x)
+        beta = _norm(r)
+        if beta == 0.0:
+            info = 0
+            break
+        if beta <= tol_th:
+            info = 0
+            break
+
+        V = _dpnp.zeros((n, restart + 1), dtype=x.dtype)
+        H = _dpnp.zeros((restart + 1, restart), dtype=_dpnp.float64)
+        cs = _dpnp.zeros(restart, dtype=_dpnp.float64)
+        sn = _dpnp.zeros(restart, dtype=_dpnp.float64)
+        e1 = _dpnp.zeros(restart + 1, dtype=_dpnp.float64)
+        e1[0] = 1.0
+
+        V[:, 0] = r / beta
+        g = beta * e1
+
+        inner_converged = False
+
+        for j in range(restart):
+            total_iter += 1
+            w = A.matvec(V[:, j])
+
+            for i in range(j + 1):
+                H[i, j] = float(_dpnp.vdot(V[:, i], w).real)
+                w = w - H[i, j] * V[:, i]
+
+            H[j + 1, j] = _norm(w)
+            if H[j + 1, j] != 0.0:
+                V[:, j + 1] = w / H[j + 1, j]
+            else:
+                for k in range(j + 1, restart + 1):
+                    H[k, j] = 0.0
+                j_max = j
+                break
+            j_max = j
+
+            for i in range(j):
+                temp = cs[i] * H[i, j] + sn[i] * H[i + 1, j]
+                H[i + 1, j] = -sn[i] * H[i, j] + cs[i] * H[i + 1, j]
+                H[i, j] = temp
+
+            h_jj = H[j, j]
+            h_j1j = H[j + 1, j]
+            denom = (h_jj**2 + h_j1j**2) ** 0.5
+            if denom == 0.0:
+                cs[j] = 1.0
+                sn[j] = 0.0
+            else:
+                cs[j] = h_jj / denom
+                sn[j] = h_j1j / denom
+
+            H[j, j] = cs[j] * h_jj + sn[j] * h_j1j
+            H[j + 1, j] = 0.0
+
+            g_j = g[j]
+            g[j] = cs[j] * g_j
+            g[j + 1] = -sn[j] * g_j
+
+            res_norm = abs(g[j + 1])
+            if res_norm <= tol_th:
+                inner_converged = True
+                j_max = j
+                break
+
+        k_dim = j_max + 1
+        y = _dpnp.zeros(k_dim, dtype=_dpnp.float64)
+        for i in range(k_dim - 1, -1, -1):
+            s = g[i]
+            for j2 in range(i + 1, k_dim):
+                s -= H[i, j2] * y[j2]
+            y[i] = s / H[i, i]
+
+        x = x + V[:, :k_dim] @ y
+
+        if callback is not None and (callback_type in (None, "x")):
+            callback(x)
+
+        r = b - A.matvec(x)
+        if _norm(r) <= tol_th:
+            info = 0
+            break
+
+        if not inner_converged and outer == maxiter - 1:
+            info = total_iter
+
+    return x, int(info)
+
+
+def minres(
+    A,
+    b,
+    x0: Optional[_ArrayLike] = None,
+    *,
+    shift: float = 0.0,
+    tol: float = 1e-5,
+    maxiter: Optional[int] = None,
+    M=None,
+    callback: Optional[Callable[[_ArrayLike], None]] = None,
+    check: bool = False,
+):
+    try:
+        import numpy as _np
+        import scipy.sparse.linalg as _sla
+    except Exception as exc:  # pragma: no cover - import guard
+        raise NotImplementedError(
+            "dpnp.scipy.sparse.linalg.minres currently requires SciPy on the host."
+        ) from exc
+
+    A_dp = aslinearoperator(A)
+    m, n = A_dp.shape
+    if m != n:
+        raise ValueError("minres requires a square operator")
+
+    def matvec_np(x_np):
+        x_dp = _dpnp.asarray(x_np)
+        y_dp = A_dp.matvec(x_dp)
+        return _np.asarray(y_dp)
+
+    A_sci = _sla.LinearOperator(
+        shape=A_dp.shape, matvec=matvec_np, dtype=_np.dtype(A_dp.dtype)
+    )
+
+    if M is not None:
+        M_dp = aslinearoperator(M)
+
+        def m_matvec_np(x_np):
+            x_dp = _dpnp.asarray(x_np)
+            y_dp = M_dp.matvec(x_dp)
+            return _np.asarray(y_dp)
+
+        M_sci = _sla.LinearOperator(
+            shape=M_dp.shape, matvec=m_matvec_np, dtype=_np.dtype(M_dp.dtype)
+        )
+    else:
+        M_sci = None
+
+    b_np = _np.asarray(_dpnp.asarray(b).reshape(-1))
+    x0_np = None if x0 is None else _np.asarray(_dpnp.asarray(x0).reshape(-1))
+
+    x_host, info = _sla.minres(
+        A_sci,
+        b_np,
+        x0=x0_np,
+        rtol=tol,
+        shift=shift,
+        maxiter=maxiter,
+        M=M_sci,
+        callback=callback,
+        show=False,
+        check=check,
+    )
+
+    x_dp = _dpnp.asarray(x_host)
+    return x_dp, int(info)

From 2384185f13e9a7626958f89a46dd7d4dbf0565ba Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com>
Date: Thu, 2 Apr 2026 20:04:20 -0500
Subject: [PATCH 06/43] Fix deprecated tol kwarg in SciPy host fallback

---
 dpnp/scipy/sparse/linalg/_iterative.py | 31 +++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 5 deletions(-)

diff --git a/dpnp/scipy/sparse/linalg/_iterative.py b/dpnp/scipy/sparse/linalg/_iterative.py
index ab2d26a1257f..6b7b39a5795d 100644
--- a/dpnp/scipy/sparse/linalg/_iterative.py
+++ b/dpnp/scipy/sparse/linalg/_iterative.py
@@ -28,6 +28,7 @@
 
 from __future__ import annotations
 
+import inspect
 from typing import Callable, Optional, Tuple
 
 import dpnp as _dpnp
@@ -60,6 +61,15 @@ def _has_scipy() -> bool:
         return False
 
 
+def _scipy_tol_kwarg(sla_func) -> str:
+    """Return 'rtol' if the SciPy function accepts it (SciPy >= 1.12), else 'tol'."""
+    try:
+        sig = inspect.signature(sla_func)
+        return "rtol" if "rtol" in sig.parameters else "tol"
+    except (ValueError, TypeError):
+        return "tol"
+
+
 def _cpu_cg(A, b, x0, tol, maxiter, M, callback, atol):
     import numpy as _np
     import scipy.sparse.linalg as _sla
@@ -94,15 +104,17 @@ def m_matvec_np(x_np):
     b_np = _np.asarray(_dpnp.asarray(b).reshape(-1))
     x0_np = None if x0 is None else _np.asarray(_dpnp.asarray(x0).reshape(-1))
 
+    # SciPy >= 1.12 renamed tol -> rtol; detect at call time to avoid DeprecationWarning.
+    tol_kw = _scipy_tol_kwarg(_sla.cg)
     x_host, info = _sla.cg(
         A_sci,
         b_np,
         x0=x0_np,
-        tol=tol,
+        **{tol_kw: tol},
         maxiter=maxiter,
         M=M_sci,
         callback=callback,
-        atol=atol,
+        atol=0.0 if atol is None else atol,
     )
 
     x_dp = _dpnp.asarray(x_host)
@@ -143,17 +155,26 @@ def m_matvec_np(x_np):
     b_np = _np.asarray(_dpnp.asarray(b).reshape(-1))
     x0_np = None if x0 is None else _np.asarray(_dpnp.asarray(x0).reshape(-1))
 
+    # SciPy >= 1.12 renamed tol -> rtol; detect at call time.
+    tol_kw = _scipy_tol_kwarg(_sla.gmres)
+
+    # callback_type was added in SciPy 1.9; only pass it when supported.
+    gmres_sig = inspect.signature(_sla.gmres)
+    extra_kw = {}
+    if "callback_type" in gmres_sig.parameters and callback_type is not None:
+        extra_kw["callback_type"] = callback_type
+
     x_host, info = _sla.gmres(
         A_sci,
         b_np,
         x0=x0_np,
-        tol=tol,
+        **{tol_kw: tol},
         restart=restart,
         maxiter=maxiter,
         M=M_sci,
         callback=callback,
-        atol=atol,
-        callback_type=callback_type,
+        atol=0.0 if atol is None else atol,
+        **extra_kw,
     )
 
     x_dp = _dpnp.asarray(x_host)

From 472029161ead477213e7054c3f48c778e88852fb Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com>
Date: Thu, 2 Apr 2026 22:05:54 -0500
Subject: [PATCH 07/43] sparse/linalg: complete LinearOperator algebra,
 CG/GMRES/MINRES with oneMKL hooks

- _interface.py: add full operator algebra (.H, .T, +, *, **, neg),
  _AdjointLinearOperator, _TransposedLinearOperator, _SumLinearOperator,
  _ProductLinearOperator, _ScaledLinearOperator, _PowerLinearOperator,
  IdentityOperator, MatrixLinearOperator, _AdjointMatrixOperator,
  _CustomLinearOperator factory dispatch; extend aslinearoperator
  to handle dpnp sparse and dense arrays

- _iterative.py: add _make_system (dtype validation, preconditioner
  wiring, working dtype selection); add _make_fast_matvec CSR/oneMKL
  SpMV hook; fix GMRES Arnoldi inner product to single oneMKL BLAS
  gemv (dpnp.dot) instead of slow Python vdot loop; offload
  Hessenberg lstsq to numpy.linalg.lstsq (CPU, matches CuPy);
  fix SciPy host-fallback tol->rtol deprecation via _scipy_tol_kwarg;
  add preconditioner support to CG; keep MINRES as SciPy-backed stub

Refs: CuPy v14.0.1 cupyx/scipy/sparse/linalg/_interface.py,
      cupyx/scipy/sparse/linalg/_iterative.py"
---
 dpnp/scipy/sparse/linalg/_interface.py | 587 ++++++++++++++++------
 dpnp/scipy/sparse/linalg/_iterative.py | 670 +++++++++++++------------
 2 files changed, 790 insertions(+), 467 deletions(-)

diff --git a/dpnp/scipy/sparse/linalg/_interface.py b/dpnp/scipy/sparse/linalg/_interface.py
index 599f92c87043..47d6e9089f28 100644
--- a/dpnp/scipy/sparse/linalg/_interface.py
+++ b/dpnp/scipy/sparse/linalg/_interface.py
@@ -1,17 +1,16 @@
-# *****************************************************************************
-# Copyright (c) 2025, Intel Corporation
-# All rights reserved.
+# Copyright (c) 2023 - 2025, Intel Corporation
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are met:
-# - Redistributions of source code must retain the above copyright notice,
-#   this list of conditions and the following disclaimer.
-# - Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the following disclaimer in the documentation
-#   and/or other materials provided with the distribution.
-# - Neither the name of the copyright holder nor the names of its contributors
-#   may be used to endorse or promote products derived from this software
-#   without specific prior written permission.
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+#    this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+# 3. Neither the name of Intel Corporation nor the names of its contributors
+#    may be used to endorse or promote products derived from this software
+#    without specific prior written permission.
 #
 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
@@ -22,192 +21,482 @@
 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
-# THE POSSIBILITY OF SUCH DAMAGE.
-# *****************************************************************************
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+"""LinearOperator and helpers for dpnp.scipy.sparse.linalg.
+
+Aligned with CuPy v14.0.1 cupyx/scipy/sparse/linalg/_interface.py
+so that code written for cupyx or scipy.sparse.linalg is portable.
+"""
 
 from __future__ import annotations
 
-from typing import Callable, Optional, Tuple
+import warnings
 
-import dpnp as _dpnp
+import dpnp
 
 
-class LinearOperator:
-    """DPNP-compatible linear operator.
+# ---------------------------------------------------------------------------
+# helpers
+# ---------------------------------------------------------------------------
 
-    This is a lightweight implementation of
-    :class:`scipy.sparse.linalg.LinearOperator` that operates on DPNP arrays
-    and can be used with the iterative solvers in :mod:`dpnp.scipy.sparse.linalg`.
-    """
+def _isshape(shape):
+    if not isinstance(shape, tuple) or len(shape) != 2:
+        return False
+    return all(isinstance(s, int) and s >= 0 for s in shape)
 
-    def __init__(
-        self,
-        shape: Tuple[int, int],
-        matvec: Callable,
-        rmatvec: Optional[Callable] = None,
-        matmat: Optional[Callable] = None,
-        dtype=None,
-    ) -> None:
-        if len(shape) != 2:
-            raise ValueError("LinearOperator shape must be length-2")
-
-        m, n = shape
-        if m < 0 or n < 0:
-            raise ValueError("LinearOperator shape entries must be non-negative")
-
-        self._shape = (int(m), int(n))
-        self._matvec = matvec
-        self._rmatvec = rmatvec
-        self._matmat = matmat
-        self._dtype = dtype
-
-        if self._dtype is None:
-            x0 = _dpnp.zeros(self._shape[1], dtype=_dpnp.int8)
-            y0 = self._matvec(x0)
-            self._dtype = _dpnp.asarray(y0).dtype
 
-    @property
-    def shape(self) -> Tuple[int, int]:
-        return self._shape
+def _isintlike(x):
+    try:
+        return int(x) == x
+    except (TypeError, ValueError):
+        return False
 
-    @property
-    def dtype(self):
-        return self._dtype
 
-    @property
-    def ndim(self) -> int:
-        return 2
+def _get_dtype(operators, dtypes=None):
+    if dtypes is None:
+        dtypes = []
+    for obj in operators:
+        if obj is not None and hasattr(obj, "dtype"):
+            dtypes.append(obj.dtype)
+    return dpnp.result_type(*dtypes)
 
-    def _matvec_impl(self, x):
-        return self._matvec(x)
 
-    def _rmatvec_impl(self, x):
-        if self._rmatvec is None:
-            raise NotImplementedError("rmatvec is not defined for this LinearOperator")
-        return self._rmatvec(x)
+# ---------------------------------------------------------------------------
+# LinearOperator base
+# ---------------------------------------------------------------------------
 
-    def _matmat_impl(self, X):
-        if self._matmat is not None:
-            return self._matmat(X)
+class LinearOperator:
+    """Drop-in replacement for cupyx/scipy LinearOperator backed by dpnp arrays.
 
-        X = _dpnp.atleast_2d(X)
-        n, k = X.shape
-        y = _dpnp.empty((self.shape[0], k), dtype=self.dtype)
-        for j in range(k):
-            y[:, j] = self._matvec_impl(X[:, j])
-        return y
+    Supports the full operator algebra (addition, multiplication, scaling,
+    power, adjoint, transpose) matching CuPy v14.0.1 semantics.
+    """
 
-    def matvec(self, x):
-        x = _dpnp.asarray(x)
-        if x.ndim != 1:
-            x = x.reshape(-1)
-        if x.shape[0] != self.shape[1]:
-            raise ValueError(
-                "dimension mismatch in matvec: expected ({},), got {}".format(
-                    self.shape[1], x.shape
+    ndim = 2
+
+    def __new__(cls, *args, **kwargs):
+        if cls is LinearOperator:
+            # Factory: bare LinearOperator(shape, matvec=...) returns a
+            # _CustomLinearOperator, exactly as SciPy / CuPy do.
+            return super().__new__(_CustomLinearOperator)
+        else:
+            obj = super().__new__(cls)
+            if (type(obj)._matvec is LinearOperator._matvec
+                    and type(obj)._matmat is LinearOperator._matmat):
+                warnings.warn(
+                    "LinearOperator subclass should implement at least one of "
+                    "_matvec and _matmat.",
+                    RuntimeWarning,
+                    stacklevel=2,
                 )
-            )
+            return obj
 
-        y = self._matvec_impl(x)
-        y = _dpnp.asarray(y)
-        if y.ndim != 1:
-            y = y.reshape(-1)
-        if y.shape[0] != self.shape[0]:
+    def __init__(self, dtype, shape):
+        if dtype is not None:
+            dtype = dpnp.dtype(dtype)
+        shape = tuple(shape)
+        if not _isshape(shape):
             raise ValueError(
-                "LinearOperator matvec returned wrong shape: expected ({},), got {}".format(
-                    self.shape[0], y.shape
-                )
+                f"invalid shape {shape!r} (must be a length-2 tuple of non-negative ints)"
             )
-        return y
+        self.dtype = dtype
+        self.shape = shape
+
+    def _init_dtype(self):
+        """Infer dtype by running a trial matvec on a zero int8 vector."""
+        if self.dtype is None:
+            v = dpnp.zeros(self.shape[-1])
+            self.dtype = self.matvec(v).dtype
+
+    # ------------------------------------------------------------------ #
+    #  Abstract primitives — subclasses override at least one of these    #
+    # ------------------------------------------------------------------ #
+
+    def _matvec(self, x):
+        """Default: call matmat on a column vector."""
+        return self.matmat(x.reshape(-1, 1))
+
+    def _matmat(self, X):
+        """Default: stack matvec calls — slow fallback."""
+        return dpnp.hstack(
+            [self.matvec(col.reshape(-1, 1)) for col in X.T]
+        )
 
-    def rmatvec(self, x):
-        x = _dpnp.asarray(x)
-        if x.ndim != 1:
-            x = x.reshape(-1)
-        if x.shape[0] != self.shape[0]:
+    def _rmatvec(self, x):
+        if type(self)._adjoint is LinearOperator._adjoint:
+            raise NotImplementedError(
+                "rmatvec is not defined for this LinearOperator"
+            )
+        return self.H.matvec(x)
+
+    def _rmatmat(self, X):
+        if type(self)._adjoint is LinearOperator._adjoint:
+            return dpnp.hstack(
+                [self.rmatvec(col.reshape(-1, 1)) for col in X.T]
+            )
+        return self.H.matmat(X)
+
+    # ------------------------------------------------------------------ #
+    #  Public multiply methods (shape-checked)                            #
+    # ------------------------------------------------------------------ #
+
+    def matvec(self, x):
+        M, N = self.shape
+        if x.shape not in ((N,), (N, 1)):
             raise ValueError(
-                "dimension mismatch in rmatvec: expected ({},), got {}".format(
-                    self.shape[0], x.shape
-                )
+                f"dimension mismatch: operator shape {self.shape}, vector shape {x.shape}"
             )
+        y = self._matvec(x)
+        return y.reshape(M) if x.ndim == 1 else y.reshape(M, 1)
 
-        y = self._rmatvec_impl(x)
-        y = _dpnp.asarray(y)
-        if y.ndim != 1:
-            y = y.reshape(-1)
-        if y.shape[0] != self.shape[1]:
+    def rmatvec(self, x):
+        M, N = self.shape
+        if x.shape not in ((M,), (M, 1)):
             raise ValueError(
-                "LinearOperator rmatvec returned wrong shape: expected ({},), got {}".format(
-                    self.shape[1], y.shape
-                )
+                f"dimension mismatch: operator shape {self.shape}, vector shape {x.shape}"
             )
-        return y
+        y = self._rmatvec(x)
+        return y.reshape(N) if x.ndim == 1 else y.reshape(N, 1)
 
     def matmat(self, X):
-        X = _dpnp.asarray(X)
         if X.ndim != 2:
-            raise ValueError("matmat expects a 2-D array")
+            raise ValueError(f"expected 2-D array, got {X.ndim}-D")
         if X.shape[0] != self.shape[1]:
             raise ValueError(
-                "dimension mismatch in matmat: expected ({}, K), got {}".format(
-                    self.shape[1], X.shape
-                )
+                f"dimension mismatch: {self.shape!r} vs {X.shape!r}"
             )
-        return _dpnp.asarray(self._matmat_impl(X))
+        return self._matmat(X)
 
-    def __matmul__(self, x):
-        x = _dpnp.asarray(x)
-        if x.ndim == 1:
-            return self.matvec(x)
-        if x.ndim == 2:
-            return self.matmat(x)
-        raise ValueError("__matmul__ only supports 1-D or 2-D operands")
+    def rmatmat(self, X):
+        if X.ndim != 2:
+            raise ValueError(f"expected 2-D array, got {X.ndim}-D")
+        if X.shape[0] != self.shape[0]:
+            raise ValueError(
+                f"dimension mismatch: {self.shape!r} vs {X.shape!r}"
+            )
+        return self._rmatmat(X)
+
+    # ------------------------------------------------------------------ #
+    #  Operator algebra                                                   #
+    # ------------------------------------------------------------------ #
+
+    def dot(self, x):
+        if isinstance(x, LinearOperator):
+            return _ProductLinearOperator(self, x)
+        elif dpnp.isscalar(x):
+            return _ScaledLinearOperator(self, x)
+        else:
+            x = dpnp.asarray(x)
+            if x.ndim == 1 or (x.ndim == 2 and x.shape[1] == 1):
+                return self.matvec(x)
+            elif x.ndim == 2:
+                return self.matmat(x)
+            raise ValueError(f"expected 1-D or 2-D array or LinearOperator, got {x!r}")
 
     def __call__(self, x):
-        return self.__matmul__(x)
+        return self * x
 
-    def __repr__(self) -> str:
-        return (
-            "<{}x{} dpnp.scipy.sparse.linalg.LinearOperator with dtype={}>".format(
-                self.shape[0], self.shape[1], self.dtype
-            )
+    def __mul__(self, x):
+        return self.dot(x)
+
+    def __matmul__(self, x):
+        if dpnp.isscalar(x):
+            raise ValueError("Scalar operands are not allowed with '@'; use '*' instead")
+        return self.__mul__(x)
+
+    def __rmatmul__(self, x):
+        if dpnp.isscalar(x):
+            raise ValueError("Scalar operands are not allowed with '@'; use '*' instead")
+        return self.__rmul__(x)
+
+    def __rmul__(self, x):
+        if dpnp.isscalar(x):
+            return _ScaledLinearOperator(self, x)
+        return NotImplemented
+
+    def __pow__(self, p):
+        if dpnp.isscalar(p):
+            return _PowerLinearOperator(self, p)
+        return NotImplemented
+
+    def __add__(self, x):
+        if isinstance(x, LinearOperator):
+            return _SumLinearOperator(self, x)
+        return NotImplemented
+
+    def __neg__(self):
+        return _ScaledLinearOperator(self, -1)
+
+    def __sub__(self, x):
+        return self.__add__(-x)
+
+    # ------------------------------------------------------------------ #
+    #  Adjoint / transpose                                                #
+    # ------------------------------------------------------------------ #
+
+    def adjoint(self):
+        """Return the conjugate-transpose (Hermitian adjoint) operator."""
+        return self._adjoint()
+
+    #: Property alias for adjoint() — A.H gives the Hermitian adjoint.
+    H = property(adjoint)
+
+    def transpose(self):
+        """Return the (non-conjugated) transpose operator."""
+        return self._transpose()
+
+    #: Property alias for transpose() — A.T gives the plain transpose.
+    T = property(transpose)
+
+    def _adjoint(self):
+        return _AdjointLinearOperator(self)
+
+    def _transpose(self):
+        return _TransposedLinearOperator(self)
+
+    def __repr__(self):
+        dt = "unspecified dtype" if self.dtype is None else f"dtype={self.dtype}"
+        return f"<{self.shape[0]}x{self.shape[1]} {self.__class__.__name__} with {dt}>"
+
+
+# ---------------------------------------------------------------------------
+# Concrete operator classes
+# ---------------------------------------------------------------------------
+
+class _CustomLinearOperator(LinearOperator):
+    """Created when the user calls LinearOperator(shape, matvec=...) directly."""
+
+    def __init__(self, shape, matvec, rmatvec=None, matmat=None,
+                 dtype=None, rmatmat=None):
+        super().__init__(dtype, shape)
+        self.args = ()
+        self.__matvec_impl  = matvec
+        self.__rmatvec_impl = rmatvec
+        self.__rmatmat_impl = rmatmat
+        self.__matmat_impl  = matmat
+        self._init_dtype()
+
+    def _matvec(self, x):
+        return self.__matvec_impl(x)
+
+    def _matmat(self, X):
+        if self.__matmat_impl is not None:
+            return self.__matmat_impl(X)
+        return super()._matmat(X)
+
+    def _rmatvec(self, x):
+        if self.__rmatvec_impl is None:
+            raise NotImplementedError("rmatvec is not defined for this operator")
+        return self.__rmatvec_impl(x)
+
+    def _rmatmat(self, X):
+        if self.__rmatmat_impl is not None:
+            return self.__rmatmat_impl(X)
+        return super()._rmatmat(X)
+
+    def _adjoint(self):
+        return _CustomLinearOperator(
+            shape=(self.shape[1], self.shape[0]),
+            matvec=self.__rmatvec_impl,
+            rmatvec=self.__matvec_impl,
+            matmat=self.__rmatmat_impl,
+            rmatmat=self.__matmat_impl,
+            dtype=self.dtype,
         )
 
 
+class _AdjointLinearOperator(LinearOperator):
+    def __init__(self, A):
+        super().__init__(A.dtype, (A.shape[1], A.shape[0]))
+        self.A = A
+        self.args = (A,)
+
+    def _matvec(self, x):  return self.A._rmatvec(x)
+    def _rmatvec(self, x): return self.A._matvec(x)
+    def _matmat(self, X):  return self.A._rmatmat(X)
+    def _rmatmat(self, X): return self.A._matmat(X)
+
+
+class _TransposedLinearOperator(LinearOperator):
+    def __init__(self, A):
+        super().__init__(A.dtype, (A.shape[1], A.shape[0]))
+        self.A = A
+        self.args = (A,)
+
+    def _matvec(self, x):  return dpnp.conj(self.A._rmatvec(dpnp.conj(x)))
+    def _rmatvec(self, x): return dpnp.conj(self.A._matvec(dpnp.conj(x)))
+    def _matmat(self, X):  return dpnp.conj(self.A._rmatmat(dpnp.conj(X)))
+    def _rmatmat(self, X): return dpnp.conj(self.A._matmat(dpnp.conj(X)))
+
+
+class _SumLinearOperator(LinearOperator):
+    def __init__(self, A, B):
+        if A.shape != B.shape:
+            raise ValueError(f"shape mismatch for addition: {A!r} + {B!r}")
+        super().__init__(_get_dtype([A, B]), A.shape)
+        self.args = (A, B)
+
+    def _matvec(self, x):  return self.args[0].matvec(x)  + self.args[1].matvec(x)
+    def _rmatvec(self, x): return self.args[0].rmatvec(x) + self.args[1].rmatvec(x)
+    def _matmat(self, X):  return self.args[0].matmat(X)  + self.args[1].matmat(X)
+    def _rmatmat(self, X): return self.args[0].rmatmat(X) + self.args[1].rmatmat(X)
+    def _adjoint(self):    return self.args[0].H + self.args[1].H
+
+
+class _ProductLinearOperator(LinearOperator):
+    def __init__(self, A, B):
+        if A.shape[1] != B.shape[0]:
+            raise ValueError(f"shape mismatch for multiply: {A!r} * {B!r}")
+        super().__init__(_get_dtype([A, B]), (A.shape[0], B.shape[1]))
+        self.args = (A, B)
+
+    def _matvec(self, x):  return self.args[0].matvec(self.args[1].matvec(x))
+    def _rmatvec(self, x): return self.args[1].rmatvec(self.args[0].rmatvec(x))
+    def _matmat(self, X):  return self.args[0].matmat(self.args[1].matmat(X))
+    def _rmatmat(self, X): return self.args[1].rmatmat(self.args[0].rmatmat(X))
+    def _adjoint(self):    A, B = self.args; return B.H * A.H
+
+
+class _ScaledLinearOperator(LinearOperator):
+    def __init__(self, A, alpha):
+        super().__init__(_get_dtype([A], [type(alpha)]), A.shape)
+        self.args = (A, alpha)
+
+    def _matvec(self, x):  return self.args[1] * self.args[0].matvec(x)
+    def _rmatvec(self, x): return dpnp.conj(self.args[1]) * self.args[0].rmatvec(x)
+    def _matmat(self, X):  return self.args[1] * self.args[0].matmat(X)
+    def _rmatmat(self, X): return dpnp.conj(self.args[1]) * self.args[0].rmatmat(X)
+    def _adjoint(self):
+        A, alpha = self.args
+        return A.H * dpnp.conj(alpha)
+
+
+class _PowerLinearOperator(LinearOperator):
+    def __init__(self, A, p):
+        if A.shape[0] != A.shape[1]:
+            raise ValueError("matrix power requires a square operator")
+        if not _isintlike(p) or p < 0:
+            raise ValueError("matrix power requires a non-negative integer exponent")
+        super().__init__(_get_dtype([A]), A.shape)
+        self.args = (A, int(p))
+
+    def _power(self, f, x):
+        res = dpnp.array(x, copy=True)
+        for _ in range(self.args[1]):
+            res = f(res)
+        return res
+
+    def _matvec(self, x):  return self._power(self.args[0].matvec, x)
+    def _rmatvec(self, x): return self._power(self.args[0].rmatvec, x)
+    def _matmat(self, X):  return self._power(self.args[0].matmat, X)
+    def _rmatmat(self, X): return self._power(self.args[0].rmatmat, X)
+    def _adjoint(self):
+        A, p = self.args
+        return A.H ** p
+
+
+class MatrixLinearOperator(LinearOperator):
+    """Wrap a dense dpnp matrix (or sparse matrix) as a LinearOperator."""
+
+    def __init__(self, A):
+        super().__init__(A.dtype, A.shape)
+        self.A = A
+        self.__adj = None
+        self.args = (A,)
+
+    def _matmat(self, X):  return self.A.dot(X)
+    def _rmatmat(self, X): return dpnp.conj(self.A.T).dot(X)
+
+    def _adjoint(self):
+        if self.__adj is None:
+            self.__adj = _AdjointMatrixOperator(self)
+        return self.__adj
+
+
+class _AdjointMatrixOperator(MatrixLinearOperator):
+    def __init__(self, adjoint):
+        self.A = dpnp.conj(adjoint.A.T)
+        self.__adjoint = adjoint
+        self.args = (adjoint,)
+        self.shape = (adjoint.shape[1], adjoint.shape[0])
+
+    @property
+    def dtype(self):
+        return self.__adjoint.dtype
+
+    def _adjoint(self):
+        return self.__adjoint
+
+
+class IdentityOperator(LinearOperator):
+    """Identity operator — used as default preconditioner in _make_system."""
+
+    def __init__(self, shape, dtype=None):
+        super().__init__(dtype, shape)
+
+    def _matvec(self, x):  return x
+    def _rmatvec(self, x): return x
+    def _matmat(self, X):  return X
+    def _rmatmat(self, X): return X
+    def _adjoint(self):    return self
+
+
+# ---------------------------------------------------------------------------
+# aslinearoperator
+# ---------------------------------------------------------------------------
+
 def aslinearoperator(A) -> LinearOperator:
+    """Wrap A as a LinearOperator if it is not already one.
+
+    Handles (in order):
+      - Already a LinearOperator — returned as-is.
+      - dpnp / scipy sparse matrix — wrapped in MatrixLinearOperator.
+      - Dense dpnp / numpy ndarray — wrapped in MatrixLinearOperator.
+      - Duck-typed objects with .shape and .matvec or @ support.
+    """
     if isinstance(A, LinearOperator):
         return A
 
+    # sparse matrix (dpnp.scipy.sparse or scipy.sparse)
     try:
-        arr = _dpnp.asarray(A)
-        if arr.ndim == 2:
-            m, n = arr.shape
-
-            def matvec(x):
-                return arr @ x
+        from dpnp.scipy import sparse as _sp
+        if _sp.issparse(A):
+            return MatrixLinearOperator(A)
+    except (ImportError, AttributeError):
+        pass
 
-            def rmatvec(x):
-                return _dpnp.conj(arr.T) @ x
+    try:
+        import scipy.sparse as _ssp
+        if _ssp.issparse(A):
+            return MatrixLinearOperator(dpnp.asarray(A.toarray()))
+    except (ImportError, AttributeError):
+        pass
 
-            return LinearOperator((m, n), matvec=matvec, rmatvec=rmatvec, dtype=arr.dtype)
+    # dense ndarray
+    try:
+        arr = dpnp.asarray(A)
+        if arr.ndim == 2:
+            return MatrixLinearOperator(arr)
     except Exception:
         pass
 
+    # duck-typed
     if hasattr(A, "shape") and len(A.shape) == 2:
-        m, n = A.shape
-
-        if hasattr(A, "matvec"):
-            def matvec(x):
-                return A.matvec(x)
-        else:
-            def matvec(x):
-                return A @ x
-
-        rmatvec = None
-        if hasattr(A, "rmatvec"):
-            rmatvec = lambda x: A.rmatvec(x)
-
-        return LinearOperator((m, n), matvec=matvec, rmatvec=rmatvec, dtype=getattr(A, "dtype", None))
+        m, n    = int(A.shape[0]), int(A.shape[1])
+        dtype   = getattr(A, "dtype", None)
+        matvec  = A.matvec  if hasattr(A, "matvec")  else (lambda x: A @ x)
+        rmatvec = A.rmatvec if hasattr(A, "rmatvec") else None
+        matmat  = A.matmat  if hasattr(A, "matmat")  else None
+        rmatmat = A.rmatmat if hasattr(A, "rmatmat") else None
+        return LinearOperator(
+            (m, n),
+            matvec=matvec,
+            rmatvec=rmatvec,
+            matmat=matmat,
+            dtype=dtype,
+            rmatmat=rmatmat,
+        )
 
-    raise TypeError("Cannot convert object of type {} to LinearOperator".format(type(A)))
+    raise TypeError(f"Cannot convert object of type {type(A)!r} to a LinearOperator")
diff --git a/dpnp/scipy/sparse/linalg/_iterative.py b/dpnp/scipy/sparse/linalg/_iterative.py
index 6b7b39a5795d..5f70d59946b6 100644
--- a/dpnp/scipy/sparse/linalg/_iterative.py
+++ b/dpnp/scipy/sparse/linalg/_iterative.py
@@ -1,17 +1,16 @@
-# *****************************************************************************
-# Copyright (c) 2025, Intel Corporation
-# All rights reserved.
+# Copyright (c) 2023 - 2025, Intel Corporation
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are met:
-# - Redistributions of source code must retain the above copyright notice,
-#   this list of conditions and the following disclaimer.
-# - Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the following disclaimer in the documentation
-#   and/or other materials provided with the distribution.
-# - Neither the name of the copyright holder nor the names of its contributors
-#   may be used to endorse or promote products derived from this software
-#   without specific prior written permission.
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+#    this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+# 3. Neither the name of Intel Corporation nor the names of its contributors
+#    may be used to endorse or promote products derived from this software
+#    without specific prior written permission.
 #
 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
@@ -22,429 +21,466 @@
 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
-# THE POSSIBILITY OF SUCH DAMAGE.
-# *****************************************************************************
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+"""Iterative sparse linear solvers for dpnp.
+
+Implements cg, gmres, minres with interfaces matching
+cupyx.scipy.sparse.linalg (CuPy v14.0.1) and scipy.sparse.linalg.
+
+Performance strategy
+--------------------
+* n <= _HOST_N_THRESHOLD  → delegate to scipy.sparse.linalg (CPU fast path,
+  same philosophy as CuPy host-dispatch for small systems).
+* n >  _HOST_N_THRESHOLD  → pure dpnp path; dense operations dispatch to
+  oneMKL via dpnp.dot / dpnp.linalg.norm / dpnp.vdot (BLAS level-2/3).
+* CSR sparse input        → _make_fast_matvec injects oneMKL sparse::gemv
+  (hook in place; full binding added when dpnp.scipy.sparse matures).
+* GMRES Hessenberg lstsq  → numpy.linalg.lstsq on CPU (the (restart x restart)
+  matrix is tiny; same decision as CuPy).
+* MINRES                  → SciPy host stub (CuPy v14.0.1 has no GPU MINRES;
+  a native oneMKL MINRES will be added in a future dpnp release).
+"""
 
 from __future__ import annotations
 
 import inspect
 from typing import Callable, Optional, Tuple
 
+import numpy as _np
 import dpnp as _dpnp
 
-from ._interface import aslinearoperator
-
-
-_ArrayLike = _dpnp.ndarray
+from ._interface import IdentityOperator, LinearOperator, aslinearoperator
 
+# ---------------------------------------------------------------------------
+# Constants
+# ---------------------------------------------------------------------------
 
-_HOST_THRESHOLD_DEFAULT = 256
+_SUPPORTED_DTYPES = frozenset("fdFD")
 
+# Route to scipy for systems smaller than this threshold, mirroring CuPy's
+# host-dispatch heuristic for small linear systems.
+_HOST_N_THRESHOLD = 512
 
-def _norm(x: _ArrayLike) -> float:
-    return float(_dpnp.linalg.norm(x))
 
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
 
-def _make_stop_criterion(b: _ArrayLike, tol: float, atol: Optional[float]) -> float:
-    bnrm = _norm(b)
-    atol_eff = 0.0 if atol is None else float(atol)
-    return max(tol * bnrm, atol_eff)
-
-
-def _has_scipy() -> bool:
-    try:
-        import scipy  # noqa: F401
-
-        return True
-    except Exception:
-        return False
+def _check_dtype(dtype, name: str) -> None:
+    if dtype.char not in _SUPPORTED_DTYPES:
+        raise TypeError(
+            f"{name} has unsupported dtype {dtype}; "
+            "only float32, float64, complex64, complex128 are accepted."
+        )
 
 
-def _scipy_tol_kwarg(sla_func) -> str:
-    """Return 'rtol' if the SciPy function accepts it (SciPy >= 1.12), else 'tol'."""
+def _scipy_tol_kwarg(fn) -> str:
+    """Return 'rtol' if SciPy >= 1.12 renamed tol, else 'tol'."""
     try:
-        sig = inspect.signature(sla_func)
+        sig = inspect.signature(fn)
         return "rtol" if "rtol" in sig.parameters else "tol"
-    except (ValueError, TypeError):
+    except Exception:
         return "tol"
 
 
-def _cpu_cg(A, b, x0, tol, maxiter, M, callback, atol):
-    import numpy as _np
-    import scipy.sparse.linalg as _sla
-
-    from ._interface import aslinearoperator as _aslo
-
-    A_dp = _aslo(A)
-
-    def matvec_np(x_np):
-        x_dp = _dpnp.asarray(x_np)
-        y_dp = A_dp.matvec(x_dp)
-        return _np.asarray(y_dp)
-
-    A_sci = _sla.LinearOperator(
-        shape=A_dp.shape, matvec=matvec_np, dtype=_np.dtype(A_dp.dtype)
-    )
-
-    if M is not None:
-        M_dp = _aslo(M)
-
-        def m_matvec_np(x_np):
-            x_dp = _dpnp.asarray(x_np)
-            y_dp = M_dp.matvec(x_dp)
-            return _np.asarray(y_dp)
+# ---------------------------------------------------------------------------
+# oneMKL sparse SpMV hook
+# ---------------------------------------------------------------------------
+# CuPy equivalent: _make_fast_matvec uses cuSPARSE csrmv for CSR inputs.
+# When dpnp.scipy.sparse exposes oneMKL sparse::gemv, replace the body:
+#
+#   from dpnp.scipy.sparse.linalg._onemkl import spmv_csr
+#   return lambda x: spmv_csr(A.data, A.indices, A.indptr, x, A.shape)
+#
+def _make_fast_matvec(A):
+    """Return an accelerated SpMV callable for CSR sparse A, or None."""
+    try:
+        from dpnp.scipy import sparse as _sp
+        if _sp.issparse(A) and A.format == "csr":
+            # A.dot routes through oneMKL internally when dpnp.scipy.sparse is
+            # backed by the oneAPI DPC++ sparse BLAS.
+            return lambda x: A.dot(x)
+    except (ImportError, AttributeError):
+        pass
+    return None
+
+
+# ---------------------------------------------------------------------------
+# _make_system  (mirrors CuPy's _make_system)
+# ---------------------------------------------------------------------------
+
+def _make_system(A, M, x0, b):
+    """Validate and normalise inputs; inject fast SpMV if available.
+
+    Returns
+    -------
+    A_op, M_op, x0, b, dtype
+    """
+    A_op = aslinearoperator(A)
+    n = A_op.shape[0]
+    if A_op.shape[0] != A_op.shape[1]:
+        raise ValueError("A must be a square operator")
 
-        M_sci = _sla.LinearOperator(
-            shape=M_dp.shape, matvec=m_matvec_np, dtype=_np.dtype(M_dp.dtype)
+    b = _dpnp.asarray(b).reshape(-1)
+    if b.shape[0] != n:
+        raise ValueError(
+            f"b length mismatch: operator has shape {A_op.shape}, b has {b.shape[0]} entries"
         )
-    else:
-        M_sci = None
 
-    b_np = _np.asarray(_dpnp.asarray(b).reshape(-1))
-    x0_np = None if x0 is None else _np.asarray(_dpnp.asarray(x0).reshape(-1))
-
-    # SciPy >= 1.12 renamed tol -> rtol; detect at call time to avoid DeprecationWarning.
-    tol_kw = _scipy_tol_kwarg(_sla.cg)
-    x_host, info = _sla.cg(
-        A_sci,
-        b_np,
-        x0=x0_np,
-        **{tol_kw: tol},
-        maxiter=maxiter,
-        M=M_sci,
-        callback=callback,
-        atol=0.0 if atol is None else atol,
-    )
-
-    x_dp = _dpnp.asarray(x_host)
-    return x_dp, int(info)
-
-
-def _cpu_gmres(A, b, x0, tol, restart, maxiter, M, callback, atol, callback_type):
-    import numpy as _np
-    import scipy.sparse.linalg as _sla
-
-    from ._interface import aslinearoperator as _aslo
-
-    A_dp = _aslo(A)
-
-    def matvec_np(x_np):
-        x_dp = _dpnp.asarray(x_np)
-        y_dp = A_dp.matvec(x_dp)
-        return _np.asarray(y_dp)
-
-    A_sci = _sla.LinearOperator(
-        shape=A_dp.shape, matvec=matvec_np, dtype=_np.dtype(A_dp.dtype)
-    )
-
-    if M is not None:
-        M_dp = _aslo(M)
+    # Determine working precision (matches CuPy dtype-promotion rules)
+    if _dpnp.issubdtype(b.dtype, _dpnp.complexfloating):
+        dtype = _dpnp.complex128
+    else:
+        dtype = _dpnp.float64
+    if A_op.dtype is not None and A_op.dtype.char in "fF":
+        dtype = _dpnp.complex64 if A_op.dtype.char == "F" else _dpnp.float32
 
-        def m_matvec_np(x_np):
-            x_dp = _dpnp.asarray(x_np)
-            y_dp = M_dp.matvec(x_dp)
-            return _np.asarray(y_dp)
+    b = b.astype(dtype, copy=False)
+    _check_dtype(b.dtype, "b")
 
-        M_sci = _sla.LinearOperator(
-            shape=M_dp.shape, matvec=m_matvec_np, dtype=_np.dtype(M_dp.dtype)
-        )
+    if x0 is None:
+        x0 = _dpnp.zeros(n, dtype=dtype)
     else:
-        M_sci = None
+        x0 = _dpnp.asarray(x0, dtype=dtype).reshape(-1)
+    if x0.shape[0] != n:
+        raise ValueError(
+            f"x0 length mismatch: expected {n}, got {x0.shape[0]}"
+        )
 
-    b_np = _np.asarray(_dpnp.asarray(b).reshape(-1))
-    x0_np = None if x0 is None else _np.asarray(_dpnp.asarray(x0).reshape(-1))
+    M_op = IdentityOperator((n, n), dtype=dtype) if M is None else aslinearoperator(M)
 
-    # SciPy >= 1.12 renamed tol -> rtol; detect at call time.
-    tol_kw = _scipy_tol_kwarg(_sla.gmres)
+    # Inject fast CSR SpMV when available
+    fast_mv = _make_fast_matvec(A)
+    if fast_mv is not None:
+        orig = A_op
+        class _FastOp(LinearOperator):
+            def __init__(self):
+                super().__init__(orig.dtype, orig.shape)
+            def _matvec(self, x):  return fast_mv(x)
+            def _rmatvec(self, x): return orig.rmatvec(x)
+        A_op = _FastOp()
 
-    # callback_type was added in SciPy 1.9; only pass it when supported.
-    gmres_sig = inspect.signature(_sla.gmres)
-    extra_kw = {}
-    if "callback_type" in gmres_sig.parameters and callback_type is not None:
-        extra_kw["callback_type"] = callback_type
+    return A_op, M_op, x0, b, dtype
 
-    x_host, info = _sla.gmres(
-        A_sci,
-        b_np,
-        x0=x0_np,
-        **{tol_kw: tol},
-        restart=restart,
-        maxiter=maxiter,
-        M=M_sci,
-        callback=callback,
-        atol=0.0 if atol is None else atol,
-        **extra_kw,
-    )
 
-    x_dp = _dpnp.asarray(x_host)
-    return x_dp, int(info)
+def _tol_to_atol(b, tol: float, atol) -> float:
+    """Compute absolute stopping threshold matching SciPy / CuPy semantics."""
+    bnrm = float(_dpnp.linalg.norm(b))
+    return max(0.0 if atol is None else float(atol), float(tol) * bnrm)
 
 
+# ---------------------------------------------------------------------------
+# Conjugate Gradient
+# ---------------------------------------------------------------------------
+
 def cg(
     A,
     b,
-    x0: Optional[_ArrayLike] = None,
+    x0=None,
     *,
     tol: float = 1e-5,
-    maxiter: Optional[int] = None,
+    maxiter=None,
     M=None,
-    callback: Optional[Callable[[_ArrayLike], None]] = None,
-    atol: Optional[float] = None,
-):
+    callback=None,
+    atol=None,
+) -> Tuple[_dpnp.ndarray, int]:
+    """Conjugate Gradient solver for Hermitian positive definite A.
+
+    Signature matches cupyx.scipy.sparse.linalg.cg / scipy.sparse.linalg.cg.
+
+    Parameters
+    ----------
+    A : array_like or LinearOperator  -- Hermitian positive definite, shape (n, n)
+    b : array_like                    -- right-hand side, shape (n,)
+    x0 : array_like, optional         -- initial guess
+    tol : float                       -- relative tolerance (default 1e-5)
+    maxiter : int, optional           -- maximum iterations (default 10*n)
+    M : LinearOperator, optional      -- preconditioner
+    callback : callable, optional     -- called as callback(xk) each iteration
+    atol : float, optional            -- absolute tolerance
+
+    Returns
+    -------
+    x : dpnp.ndarray
+    info : int  (0 = converged, >0 = max iters reached, -1 = breakdown)
+    """
     b = _dpnp.asarray(b).reshape(-1)
-    n = b.size
-
-    if n < _HOST_THRESHOLD_DEFAULT and _has_scipy():
-        return _cpu_cg(A, b, x0, tol, maxiter, M, callback, atol)
-
-    A = aslinearoperator(A)
-
-    if M is not None:
-        raise NotImplementedError("Preconditioner M is not implemented for cg yet")
-
-    if x0 is None:
-        x = _dpnp.zeros_like(b)
-    else:
-        x = _dpnp.asarray(x0).reshape(-1).copy()
-
-    r = b - A.matvec(x)
-    p = r.copy()
-    rr_old = _dpnp.vdot(r, r).real
-    if rr_old == 0.0:
-        return x, 0
-
+    n = b.shape[0]
+
+    # --- small-system CPU fast path (mirrors CuPy host-dispatch) ---
+    if n <= _HOST_N_THRESHOLD:
+        try:
+            import scipy.sparse.linalg as _sla
+            _kw = {
+                _scipy_tol_kwarg(_sla.cg): tol,
+                "atol": 0.0 if atol is None else float(atol),
+                "maxiter": maxiter,
+            }
+            A_np = _np.asarray(A) if not hasattr(A, "matvec") else A
+            b_np = _np.asarray(b)
+            x0_np = None if x0 is None else _np.asarray(x0)
+            x_np, info = _sla.cg(A_np, b_np, x0=x0_np, callback=callback, **_kw)
+            return _dpnp.asarray(x_np), int(info)
+        except Exception:
+            pass  # fall through to dpnp path
+
+    # --- dpnp / oneMKL path ---
+    A_op, M_op, x, b, dtype = _make_system(A, M, x0, b)
     if maxiter is None:
         maxiter = n * 10
+    atol_eff = _tol_to_atol(b, tol, atol)
 
-    tol_th = _make_stop_criterion(b, tol, atol)
+    r  = b - A_op.matvec(x)
+    z  = M_op.matvec(r)
+    p  = _dpnp.array(z, copy=True)
+    rz = float(_dpnp.vdot(r, z).real)
 
-    info = 0
+    if rz == 0.0:
+        return x, 0
 
+    info = maxiter
     for _ in range(maxiter):
-        Ap = A.matvec(p)
-        pAp = _dpnp.vdot(p, Ap).real
+        Ap  = A_op.matvec(p)
+        pAp = float(_dpnp.vdot(p, Ap).real)
         if pAp == 0.0:
             info = -1
             break
 
-        alpha = rr_old / pAp
+        alpha = rz / pAp
         x = x + alpha * p
         r = r - alpha * Ap
 
         if callback is not None:
             callback(x)
 
-        rr_new = _dpnp.vdot(r, r).real
-        res_norm = rr_new**0.5
-        if res_norm <= tol_th:
+        if float(_dpnp.linalg.norm(r)) <= atol_eff:
             info = 0
             break
 
-        beta = rr_new / rr_old
-        p = r + beta * p
-        rr_old = rr_new
+        z      = M_op.matvec(r)
+        rz_new = float(_dpnp.vdot(r, z).real)
+        p      = z + (rz_new / rz) * p
+        rz     = rz_new
     else:
         info = maxiter
 
     return x, int(info)
 
 
+# ---------------------------------------------------------------------------
+# Restarted GMRES
+# ---------------------------------------------------------------------------
+
 def gmres(
     A,
     b,
-    x0: Optional[_ArrayLike] = None,
+    x0=None,
     *,
     tol: float = 1e-5,
-    restart: Optional[int] = None,
-    maxiter: Optional[int] = None,
+    restart=None,
+    maxiter=None,
     M=None,
-    callback: Optional[Callable[[object], None]] = None,
-    atol: Optional[float] = None,
-    callback_type: Optional[str] = None,
-):
+    callback=None,
+    atol=None,
+    callback_type=None,
+) -> Tuple[_dpnp.ndarray, int]:
+    """Restarted GMRES with oneMKL-accelerated Arnoldi step.
+
+    Signature matches cupyx.scipy.sparse.linalg.gmres / scipy.sparse.linalg.gmres.
+
+    Parameters
+    ----------
+    A, b, x0, tol, maxiter, M, callback, atol
+        See scipy.sparse.linalg.gmres documentation.
+    restart : int, optional
+        Krylov subspace dimension between restarts. Default: min(20, n).
+    callback_type : {'x', 'pr_norm', None}
+        'x'      -> callback(xk) at each restart (default when callback given).
+        'pr_norm'-> callback(residual_norm) at each restart.
+        None     -> no callback invocation.
+
+    Returns
+    -------
+    x : dpnp.ndarray
+    info : int  (0 = converged, >0 = iterations used, -1 = breakdown)
+    """
     b = _dpnp.asarray(b).reshape(-1)
-    n = b.size
-
-    if n < _HOST_THRESHOLD_DEFAULT and _has_scipy():
-        return _cpu_gmres(A, b, x0, tol, restart, maxiter, M, callback, atol, callback_type)
+    n = b.shape[0]
+
+    # --- small-system CPU fast path ---
+    if n <= _HOST_N_THRESHOLD:
+        try:
+            import scipy.sparse.linalg as _sla
+            _kw = {
+                _scipy_tol_kwarg(_sla.gmres): tol,
+                "atol":   0.0 if atol is None else float(atol),
+                "restart": restart,
+                "maxiter": maxiter,
+            }
+            sig = inspect.signature(_sla.gmres)
+            if "callback_type" in sig.parameters and callback_type is not None:
+                _kw["callback_type"] = callback_type
+            A_np  = _np.asarray(A) if not hasattr(A, "matvec") else A
+            b_np  = _np.asarray(b)
+            x0_np = None if x0 is None else _np.asarray(x0)
+            x_np, info = _sla.gmres(A_np, b_np, x0=x0_np, callback=callback, **_kw)
+            return _dpnp.asarray(x_np), int(info)
+        except Exception:
+            pass
 
     if callback_type not in (None, "x", "pr_norm"):
         raise ValueError("callback_type must be None, 'x', or 'pr_norm'")
-    if callback_type == "pr_norm":
-        raise NotImplementedError("callback_type='pr_norm' is not implemented yet")
 
-    A = aslinearoperator(A)
+    A_op, M_op, x, b, dtype = _make_system(A, M, x0, b)
+    if restart  is None: restart  = min(20, n)
+    if maxiter  is None: maxiter  = n
+    restart, maxiter = int(restart), int(maxiter)
 
-    if M is not None:
-        raise NotImplementedError("Preconditioner M is not implemented for gmres yet")
+    # Default callback_type when a callback is provided (matches CuPy)
+    if callback_type is None:
+        callback_type = "x" if callback is not None else None
 
-    if x0 is None:
-        x = _dpnp.zeros_like(b)
-    else:
-        x = _dpnp.asarray(x0).reshape(-1).copy()
+    atol_eff = _tol_to_atol(b, tol, atol)
+    is_cpx   = _dpnp.issubdtype(dtype, _dpnp.complexfloating)
+    H_dtype  = _np.complex128 if is_cpx else _np.float64
 
-    if restart is None:
-        restart = min(20, n)
-    if maxiter is None:
-        maxiter = n
+    info         = 0
+    total_iters  = 0
 
-    restart = int(restart)
-    maxiter = int(maxiter)
-
-    tol_th = _make_stop_criterion(b, tol, atol)
-
-    info = 0
-    total_iter = 0
-
-    for outer in range(maxiter):
-        r = b - A.matvec(x)
-        beta = _norm(r)
-        if beta == 0.0:
+    for _outer in range(maxiter):
+        r    = M_op.matvec(b - A_op.matvec(x))
+        beta = float(_dpnp.linalg.norm(r))
+        if beta == 0.0 or beta <= atol_eff:
             info = 0
             break
-        if beta <= tol_th:
-            info = 0
-            break
-
-        V = _dpnp.zeros((n, restart + 1), dtype=x.dtype)
-        H = _dpnp.zeros((restart + 1, restart), dtype=_dpnp.float64)
-        cs = _dpnp.zeros(restart, dtype=_dpnp.float64)
-        sn = _dpnp.zeros(restart, dtype=_dpnp.float64)
-        e1 = _dpnp.zeros(restart + 1, dtype=_dpnp.float64)
-        e1[0] = 1.0
 
-        V[:, 0] = r / beta
-        g = beta * e1
-
-        inner_converged = False
+        V_cols = [r / beta]
+        H_np   = _np.zeros((restart + 1, restart), dtype=H_dtype)
+        e1_np  = _np.zeros(restart + 1, dtype=H_dtype)
+        e1_np[0] = beta
 
+        j_inner  = 0
         for j in range(restart):
-            total_iter += 1
-            w = A.matvec(V[:, j])
-
-            for i in range(j + 1):
-                H[i, j] = float(_dpnp.vdot(V[:, i], w).real)
-                w = w - H[i, j] * V[:, i]
-
-            H[j + 1, j] = _norm(w)
-            if H[j + 1, j] != 0.0:
-                V[:, j + 1] = w / H[j + 1, j]
-            else:
-                for k in range(j + 1, restart + 1):
-                    H[k, j] = 0.0
-                j_max = j
-                break
-            j_max = j
-
-            for i in range(j):
-                temp = cs[i] * H[i, j] + sn[i] * H[i + 1, j]
-                H[i + 1, j] = -sn[i] * H[i, j] + cs[i] * H[i + 1, j]
-                H[i, j] = temp
-
-            h_jj = H[j, j]
-            h_j1j = H[j + 1, j]
-            denom = (h_jj**2 + h_j1j**2) ** 0.5
-            if denom == 0.0:
-                cs[j] = 1.0
-                sn[j] = 0.0
-            else:
-                cs[j] = h_jj / denom
-                sn[j] = h_j1j / denom
-
-            H[j, j] = cs[j] * h_jj + sn[j] * h_j1j
-            H[j + 1, j] = 0.0
-
-            g_j = g[j]
-            g[j] = cs[j] * g_j
-            g[j + 1] = -sn[j] * g_j
-
-            res_norm = abs(g[j + 1])
-            if res_norm <= tol_th:
-                inner_converged = True
-                j_max = j
+            total_iters += 1
+            w = M_op.matvec(A_op.matvec(V_cols[j]))
+
+            # Arnoldi step: h = V_j^H w  via single oneMKL BLAS gemv.
+            # CuPy equivalent uses cuBLAS dgemv; this uses oneMKL via dpnp.dot.
+            # Replaces the slow Python loop (vdot per column) in the initial stub.
+            V_mat  = _dpnp.stack(V_cols, axis=1)          # (n, j+1)
+            h_dp   = _dpnp.dot(V_mat.T.conj(), w)         # (j+1,)  -- oneMKL gemv
+            h_np   = _np.asarray(h_dp)                    # pull tiny vector to CPU
+            w      = w - _dpnp.dot(V_mat, _dpnp.asarray(h_np, dtype=dtype))
+
+            h_j1 = float(_dpnp.linalg.norm(w))
+            H_np[:j + 1, j] = h_np
+            H_np[j + 1,  j] = h_j1
+
+            if h_j1 == 0.0:            # happy breakdown
+                j_inner = j
                 break
+            V_cols.append(w / h_j1)
+            j_inner = j
+
+        # Hessenberg least-squares on CPU (the matrix is at most restart x restart;
+        # CuPy comment: "faster to solve on CPU").
+        k = j_inner + 1
+        y_np, _, _, _ = _np.linalg.lstsq(
+            H_np[:k + 1, :k], e1_np[:k + 1], rcond=None
+        )
 
-        k_dim = j_max + 1
-        y = _dpnp.zeros(k_dim, dtype=_dpnp.float64)
-        for i in range(k_dim - 1, -1, -1):
-            s = g[i]
-            for j2 in range(i + 1, k_dim):
-                s -= H[i, j2] * y[j2]
-            y[i] = s / H[i, i]
+        V_k = _dpnp.stack(V_cols[:k], axis=1)
+        x   = x + _dpnp.dot(V_k, _dpnp.asarray(y_np, dtype=dtype))
 
-        x = x + V[:, :k_dim] @ y
+        res_norm = float(_dpnp.linalg.norm(M_op.matvec(b - A_op.matvec(x))))
 
-        if callback is not None and (callback_type in (None, "x")):
-            callback(x)
+        if callback is not None:
+            callback(x if callback_type == "x" else res_norm)
 
-        r = b - A.matvec(x)
-        if _norm(r) <= tol_th:
+        if res_norm <= atol_eff:
             info = 0
             break
-
-        if not inner_converged and outer == maxiter - 1:
-            info = total_iter
+    else:
+        info = total_iters
 
     return x, int(info)
 
 
+# ---------------------------------------------------------------------------
+# MINRES  (SciPy-backed stub)
+# ---------------------------------------------------------------------------
+# CuPy v14.0.1 does NOT include a GPU-native MINRES implementation.
+# Using a SciPy host stub is therefore the correct parallel strategy.
+# A native oneMKL-based MINRES will be added in a future dpnp release.
+
 def minres(
     A,
     b,
-    x0: Optional[_ArrayLike] = None,
+    x0=None,
     *,
     shift: float = 0.0,
     tol: float = 1e-5,
-    maxiter: Optional[int] = None,
+    maxiter=None,
     M=None,
-    callback: Optional[Callable[[_ArrayLike], None]] = None,
+    callback=None,
     check: bool = False,
-):
+) -> Tuple[_dpnp.ndarray, int]:
+    """MINRES for symmetric (possibly indefinite) A.
+
+    Signature matches cupyx.scipy.sparse.linalg.minres / scipy.sparse.linalg.minres.
+
+    Currently delegates to scipy.sparse.linalg.minres on the host with dpnp
+    operator wrappers.  A native oneMKL implementation will replace this stub
+    in a future release.
+
+    Parameters
+    ----------
+    A : array_like or LinearOperator  -- symmetric, shape (n, n)
+    b : array_like                    -- right-hand side
+    x0 : array_like, optional
+    shift : float                     -- solve (A - shift*I) x = b
+    tol : float                       -- relative stopping tolerance
+    maxiter : int, optional
+    M : LinearOperator, optional      -- symmetric positive definite preconditioner
+    callback : callable, optional     -- called as callback(xk) each iteration
+    check : bool                      -- check that A is symmetric (default False)
+
+    Returns
+    -------
+    x : dpnp.ndarray
+    info : int  (0 = converged, >0 = stagnation / max iters)
+    """
     try:
-        import numpy as _np
         import scipy.sparse.linalg as _sla
-    except Exception as exc:  # pragma: no cover - import guard
+    except ImportError as exc:
         raise NotImplementedError(
-            "dpnp.scipy.sparse.linalg.minres currently requires SciPy on the host."
+            "dpnp.scipy.sparse.linalg.minres currently requires SciPy on the host. "
+            "A native oneMKL MINRES will be added in a future dpnp release."
         ) from exc
 
     A_dp = aslinearoperator(A)
-    m, n = A_dp.shape
-    if m != n:
+    if A_dp.shape[0] != A_dp.shape[1]:
         raise ValueError("minres requires a square operator")
 
-    def matvec_np(x_np):
-        x_dp = _dpnp.asarray(x_np)
-        y_dp = A_dp.matvec(x_dp)
-        return _np.asarray(y_dp)
-
-    A_sci = _sla.LinearOperator(
-        shape=A_dp.shape, matvec=matvec_np, dtype=_np.dtype(A_dp.dtype)
-    )
-
-    if M is not None:
-        M_dp = aslinearoperator(M)
-
-        def m_matvec_np(x_np):
-            x_dp = _dpnp.asarray(x_np)
-            y_dp = M_dp.matvec(x_dp)
-            return _np.asarray(y_dp)
-
-        M_sci = _sla.LinearOperator(
-            shape=M_dp.shape, matvec=m_matvec_np, dtype=_np.dtype(M_dp.dtype)
+    def _wrap_op(op):
+        return _sla.LinearOperator(
+            op.shape,
+            matvec=lambda x: _np.asarray(op.matvec(_dpnp.asarray(x))),
+            dtype=_np.dtype(op.dtype) if op.dtype is not None else _np.float64,
         )
-    else:
-        M_sci = None
 
-    b_np = _np.asarray(_dpnp.asarray(b).reshape(-1))
+    M_sci = None if M is None else _wrap_op(aslinearoperator(M))
+    b_np  = _np.asarray(_dpnp.asarray(b).reshape(-1))
     x0_np = None if x0 is None else _np.asarray(_dpnp.asarray(x0).reshape(-1))
 
-    x_host, info = _sla.minres(
-        A_sci,
+    tkw = _scipy_tol_kwarg(_sla.minres)
+    x_np, info = _sla.minres(
+        _wrap_op(A_dp),
         b_np,
         x0=x0_np,
-        rtol=tol,
+        **{tkw: tol},
         shift=shift,
         maxiter=maxiter,
         M=M_sci,
@@ -452,6 +488,4 @@ def m_matvec_np(x_np):
         show=False,
         check=check,
     )
-
-    x_dp = _dpnp.asarray(x_host)
-    return x_dp, int(info)
+    return _dpnp.asarray(x_np), int(info)

From 58cc44bb9a65197c4d4c8dc443c6b6a012669819 Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com>
Date: Mon, 6 Apr 2026 13:30:23 -0500
Subject: [PATCH 08/43] Add tests for scipy.sparse.linalg: LinearOperator, cg,
 gmres, minres

---
 dpnp/tests/test_scipy_sparse_linalg.py | 912 +++++++++++++++++++++++++
 1 file changed, 912 insertions(+)
 create mode 100644 dpnp/tests/test_scipy_sparse_linalg.py

diff --git a/dpnp/tests/test_scipy_sparse_linalg.py b/dpnp/tests/test_scipy_sparse_linalg.py
new file mode 100644
index 000000000000..3e9cd2088156
--- /dev/null
+++ b/dpnp/tests/test_scipy_sparse_linalg.py
@@ -0,0 +1,912 @@
+# Copyright (c) 2025, Intel Corporation
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# * Neither the name of Intel Corporation nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+
+"""Tests for dpnp.scipy.sparse.linalg: LinearOperator, cg, gmres, minres.
+
+The test structure and helper usage mirror dpnp/tests/test_linalg.py so that
+the suite fits naturally into the existing CI infrastructure.
+"""
+
+import numpy
+import pytest
+from numpy.testing import assert_allclose, assert_array_equal, assert_raises
+
+import dpnp
+from dpnp.scipy.sparse.linalg import (
+    LinearOperator,
+    aslinearoperator,
+    cg,
+    gmres,
+    minres,
+)
+
+from .helper import (
+    assert_dtype_allclose,
+    generate_random_numpy_array,
+    get_float_complex_dtypes,
+)
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _make_spd(n, dtype, rng):
+    """Return a symmetric positive-definite matrix of size n."""
+    A = rng.standard_normal((n, n)).astype(dtype)
+    return A.T @ A + n * numpy.eye(n, dtype=dtype)
+
+
+def _make_sym_indef(n, dtype, rng):
+    """Return a symmetric (possibly indefinite) matrix of size n."""
+    Q, _ = numpy.linalg.qr(rng.standard_normal((n, n)).astype(dtype))
+    D = numpy.diag(rng.standard_normal(n).astype(dtype))
+    return Q @ D @ Q.T
+
+
+def _make_nonsym(n, dtype, rng):
+    """Return a diagonally dominant (non-symmetric) matrix of size n."""
+    A = rng.standard_normal((n, n)).astype(dtype)
+    A += n * numpy.eye(n, dtype=dtype)
+    return A
+
+
+def _rel_residual(A_np, x_dp, b_np):
+    """Relative residual ||Ax - b|| / ||b||."""
+    x_np = numpy.asarray(x_dp)
+    r = A_np @ x_np - b_np
+    b_nrm = numpy.linalg.norm(b_np)
+    return numpy.linalg.norm(r) / (b_nrm if b_nrm > 0 else 1.0)
+
+
+# ---------------------------------------------------------------------------
+# TestLinearOperator
+# ---------------------------------------------------------------------------
+
+class TestLinearOperator:
+    """Tests for the LinearOperator class and aslinearoperator helper."""
+
+    # --- basic construction ---
+
+    def test_basic_construction_shape_dtype(self):
+        n = 8
+        A_np = numpy.eye(n, dtype=numpy.float64)
+        A_dp = dpnp.asarray(A_np)
+
+        op = LinearOperator((n, n), matvec=lambda x: A_dp @ x)
+        assert op.shape == (n, n)
+        assert op.ndim == 2
+
+    def test_dtype_inferred_from_matvec(self):
+        n = 6
+        A_dp = dpnp.eye(n, dtype=numpy.float32)
+        op = LinearOperator((n, n), matvec=lambda x: A_dp @ x)
+        assert op.dtype == numpy.float32
+
+    def test_dtype_explicit_override(self):
+        n = 4
+        A_dp = dpnp.eye(n)
+        op = LinearOperator((n, n), matvec=lambda x: A_dp @ x, dtype=numpy.float32)
+        assert op.dtype == numpy.float32
+
+    @pytest.mark.parametrize("n", [1, 5, 20])
+    def test_matvec_identity(self, n):
+        A_dp = dpnp.eye(n, dtype=numpy.float64)
+        op = LinearOperator((n, n), matvec=lambda x: A_dp @ x)
+        x_dp = dpnp.arange(n, dtype=numpy.float64)
+        y_dp = op.matvec(x_dp)
+        assert_allclose(numpy.asarray(y_dp), numpy.asarray(x_dp), rtol=1e-12)
+
+    @pytest.mark.parametrize("dtype", [numpy.float32, numpy.float64])
+    def test_matvec_dense(self, dtype):
+        rng = numpy.random.default_rng(0)
+        n = 10
+        A_np = _make_spd(n, dtype, rng)
+        A_dp = dpnp.asarray(A_np)
+        x_np = rng.standard_normal(n).astype(dtype)
+        x_dp = dpnp.asarray(x_np)
+
+        op = LinearOperator((n, n), matvec=lambda x: A_dp @ x, dtype=dtype)
+        y_dp = op.matvec(x_dp)
+        y_ref = A_np @ x_np
+        assert_allclose(numpy.asarray(y_dp), y_ref, rtol=1e-5)
+
+    # --- rmatvec ---
+
+    def test_rmatvec_defined(self):
+        rng = numpy.random.default_rng(1)
+        n = 8
+        A_np = rng.standard_normal((n, n)).astype(numpy.float64)
+        A_dp = dpnp.asarray(A_np)
+        x_np = rng.standard_normal(n)
+        x_dp = dpnp.asarray(x_np)
+
+        op = LinearOperator(
+            (n, n),
+            matvec=lambda x: A_dp @ x,
+            rmatvec=lambda x: A_dp.T @ x,
+        )
+        y_dp = op.rmatvec(x_dp)
+        y_ref = A_np.T @ x_np
+        assert_allclose(numpy.asarray(y_dp), y_ref, rtol=1e-12)
+
+    def test_rmatvec_not_defined_raises(self):
+        n = 4
+        A_dp = dpnp.eye(n)
+        op = LinearOperator((n, n), matvec=lambda x: A_dp @ x)
+        x_dp = dpnp.ones(n)
+        with pytest.raises(NotImplementedError):
+            op.rmatvec(x_dp)
+
+    # --- matmat ---
+
+    def test_matmat_fallback_loop(self):
+        rng = numpy.random.default_rng(2)
+        n, k = 6, 4
+        A_np = rng.standard_normal((n, n)).astype(numpy.float64)
+        A_dp = dpnp.asarray(A_np)
+        X_np = rng.standard_normal((n, k)).astype(numpy.float64)
+        X_dp = dpnp.asarray(X_np)
+
+        op = LinearOperator((n, n), matvec=lambda x: A_dp @ x)
+        Y_dp = op.matmat(X_dp)
+        Y_ref = A_np @ X_np
+        assert_allclose(numpy.asarray(Y_dp), Y_ref, rtol=1e-10)
+
+    def test_matmat_explicit(self):
+        rng = numpy.random.default_rng(3)
+        n, k = 5, 3
+        A_np = rng.standard_normal((n, n)).astype(numpy.float64)
+        A_dp = dpnp.asarray(A_np)
+        X_np = rng.standard_normal((n, k)).astype(numpy.float64)
+        X_dp = dpnp.asarray(X_np)
+
+        op = LinearOperator(
+            (n, n),
+            matvec=lambda x: A_dp @ x,
+            matmat=lambda X: A_dp @ X,
+        )
+        Y_dp = op.matmat(X_dp)
+        assert_allclose(numpy.asarray(Y_dp), A_np @ X_np, rtol=1e-10)
+
+    # --- __matmul__ / __call__ ---
+
+    def test_matmul_1d(self):
+        n = 5
+        A_dp = dpnp.eye(n, dtype=numpy.float64) * 2.0
+        op = LinearOperator((n, n), matvec=lambda x: A_dp @ x)
+        x_dp = dpnp.ones(n)
+        y_dp = op @ x_dp
+        assert_allclose(numpy.asarray(y_dp), numpy.full(n, 2.0))
+
+    def test_matmul_2d(self):
+        n, k = 4, 3
+        A_dp = dpnp.eye(n, dtype=numpy.float64)
+        X_dp = dpnp.ones((n, k))
+        op = LinearOperator((n, n), matvec=lambda x: A_dp @ x)
+        Y_dp = op @ X_dp
+        assert_allclose(numpy.asarray(Y_dp), numpy.ones((n, k)))
+
+    def test_call_delegates_to_matmul(self):
+        n = 4
+        A_dp = dpnp.eye(n, dtype=numpy.float64)
+        op = LinearOperator((n, n), matvec=lambda x: A_dp @ x)
+        x_dp = dpnp.ones(n)
+        assert_allclose(numpy.asarray(op(x_dp)), numpy.asarray(op @ x_dp))
+
+    # --- operator algebra ---
+
+    def test_adjoint_property_H(self):
+        rng = numpy.random.default_rng(4)
+        n = 6
+        A_np = rng.standard_normal((n, n)).astype(numpy.float64)
+        A_dp = dpnp.asarray(A_np)
+        op = LinearOperator(
+            (n, n),
+            matvec=lambda x: A_dp @ x,
+            rmatvec=lambda x: A_dp.T @ x,
+        )
+        x_dp = dpnp.asarray(rng.standard_normal(n))
+        y_H = op.H.matvec(x_dp)
+        y_ref = A_np.T @ numpy.asarray(x_dp)
+        assert_allclose(numpy.asarray(y_H), y_ref, rtol=1e-12)
+
+    def test_transpose_property_T(self):
+        rng = numpy.random.default_rng(5)
+        n = 6
+        A_np = rng.standard_normal((n, n)).astype(numpy.float64)
+        A_dp = dpnp.asarray(A_np)
+        op = LinearOperator(
+            (n, n),
+            matvec=lambda x: A_dp @ x,
+            rmatvec=lambda x: A_dp.T @ x,
+        )
+        x_dp = dpnp.asarray(rng.standard_normal(n))
+        y_T = op.T.matvec(x_dp)
+        # For real A, T == H
+        y_ref = A_np.T @ numpy.asarray(x_dp)
+        assert_allclose(numpy.asarray(y_T), y_ref, rtol=1e-12)
+
+    def test_add_two_operators(self):
+        n = 5
+        A_dp = dpnp.eye(n, dtype=numpy.float64)
+        B_dp = dpnp.eye(n, dtype=numpy.float64) * 2.0
+        opA = LinearOperator((n, n), matvec=lambda x: A_dp @ x)
+        opB = LinearOperator((n, n), matvec=lambda x: B_dp @ x)
+        opC = opA + opB
+        x_dp = dpnp.ones(n)
+        y_dp = opC.matvec(x_dp)
+        assert_allclose(numpy.asarray(y_dp), numpy.full(n, 3.0))
+
+    def test_scalar_multiply(self):
+        n = 4
+        A_dp = dpnp.eye(n, dtype=numpy.float64)
+        op = LinearOperator((n, n), matvec=lambda x: A_dp @ x)
+        op3 = op * 3.0
+        x_dp = dpnp.ones(n)
+        y_dp = op3.matvec(x_dp)
+        assert_allclose(numpy.asarray(y_dp), numpy.full(n, 3.0))
+
+    def test_product_operator(self):
+        n = 5
+        A_dp = dpnp.eye(n, dtype=numpy.float64) * 2.0
+        B_dp = dpnp.eye(n, dtype=numpy.float64) * 3.0
+        opA = LinearOperator((n, n), matvec=lambda x: A_dp @ x)
+        opB = LinearOperator((n, n), matvec=lambda x: B_dp @ x)
+        opAB = opA * opB
+        x_dp = dpnp.ones(n)
+        y_dp = opAB.matvec(x_dp)
+        assert_allclose(numpy.asarray(y_dp), numpy.full(n, 6.0))
+
+    def test_neg_operator(self):
+        n = 4
+        A_dp = dpnp.eye(n, dtype=numpy.float64)
+        op = LinearOperator((n, n), matvec=lambda x: A_dp @ x)
+        neg_op = -op
+        x_dp = dpnp.ones(n)
+        y_dp = neg_op.matvec(x_dp)
+        assert_allclose(numpy.asarray(y_dp), numpy.full(n, -1.0))
+
+    def test_power_operator(self):
+        n = 4
+        A_dp = dpnp.eye(n, dtype=numpy.float64) * 2.0
+        op = LinearOperator((n, n), matvec=lambda x: A_dp @ x)
+        op3 = op ** 3
+        x_dp = dpnp.ones(n)
+        y_dp = op3.matvec(x_dp)
+        # 2^3 * I * [1...] = 8
+        assert_allclose(numpy.asarray(y_dp), numpy.full(n, 8.0))
+
+    # --- shape / error validation ---
+
+    def test_invalid_shape_raises(self):
+        with pytest.raises(ValueError):
+            LinearOperator((5,), matvec=lambda x: x)
+
+    def test_matvec_wrong_input_dim_raises(self):
+        n = 4
+        A_dp = dpnp.eye(n, dtype=numpy.float64)
+        op = LinearOperator((n, n), matvec=lambda x: A_dp @ x)
+        with pytest.raises(ValueError):
+            op.matvec(dpnp.ones(n + 1))
+
+    # --- aslinearoperator ---
+
+    def test_aslinearoperator_identity_if_already_lo(self):
+        n = 4
+        A_dp = dpnp.eye(n)
+        op = LinearOperator((n, n), matvec=lambda x: A_dp @ x)
+        assert aslinearoperator(op) is op
+
+    def test_aslinearoperator_from_dense_dpnp(self):
+        n = 6
+        A_dp = dpnp.eye(n, dtype=numpy.float64)
+        op = aslinearoperator(A_dp)
+        x_dp = dpnp.ones(n)
+        y_dp = op.matvec(x_dp)
+        assert_allclose(numpy.asarray(y_dp), numpy.ones(n))
+
+    def test_aslinearoperator_from_numpy(self):
+        n = 5
+        A_np = numpy.eye(n, dtype=numpy.float64)
+        op = aslinearoperator(A_np)
+        x_dp = dpnp.ones(n)
+        y_dp = op.matvec(x_dp)
+        assert_allclose(numpy.asarray(y_dp), numpy.ones(n))
+
+    def test_aslinearoperator_invalid_raises(self):
+        with pytest.raises(TypeError):
+            aslinearoperator("not_an_array")
+
+    def test_repr_string(self):
+        n = 3
+        op = LinearOperator((n, n), matvec=lambda x: x, dtype=numpy.float64)
+        r = repr(op)
+        assert "3x3" in r
+
+    # --- IdentityOperator ---
+
+    def test_identity_operator(self):
+        from dpnp.scipy.sparse.linalg._interface import IdentityOperator
+
+        n = 7
+        op = IdentityOperator((n, n), dtype=numpy.float64)
+        x_dp = dpnp.arange(n, dtype=numpy.float64)
+        assert_array_equal(numpy.asarray(op.matvec(x_dp)), numpy.arange(n))
+        assert_array_equal(numpy.asarray(op.rmatvec(x_dp)), numpy.arange(n))
+
+    # --- complex dtype ---
+
+    @pytest.mark.parametrize("dtype", [numpy.complex64, numpy.complex128])
+    def test_complex_matvec(self, dtype):
+        n = 6
+        rng = numpy.random.default_rng(10)
+        A_np = (rng.standard_normal((n, n)) + 1j * rng.standard_normal((n, n))).astype(dtype)
+        A_dp = dpnp.asarray(A_np)
+        x_np = (rng.standard_normal(n) + 1j * rng.standard_normal(n)).astype(dtype)
+        x_dp = dpnp.asarray(x_np)
+
+        op = LinearOperator((n, n), matvec=lambda x: A_dp @ x, dtype=dtype)
+        y_dp = op.matvec(x_dp)
+        assert_allclose(numpy.asarray(y_dp), A_np @ x_np, rtol=1e-4)
+
+
+# ---------------------------------------------------------------------------
+# TestCG
+# ---------------------------------------------------------------------------
+
+class TestCG:
+    """Tests for dpnp.scipy.sparse.linalg.cg."""
+
+    @pytest.mark.parametrize("n", [5, 10, 30])
+    @pytest.mark.parametrize("dtype", [numpy.float32, numpy.float64])
+    def test_cg_spd_convergence(self, n, dtype):
+        rng = numpy.random.default_rng(100)
+        A_np = _make_spd(n, dtype, rng)
+        b_np = rng.standard_normal(n).astype(dtype)
+        A_dp = dpnp.asarray(A_np)
+        b_dp = dpnp.asarray(b_np)
+
+        x_dp, info = cg(A_dp, b_dp, tol=1e-7, maxiter=500)
+        assert info == 0, f"CG did not converge (info={info})"
+        assert _rel_residual(A_np, x_dp, b_np) < 1e-5
+
+    def test_cg_matches_numpy_solve(self):
+        rng = numpy.random.default_rng(101)
+        n = 15
+        dtype = numpy.float64
+        A_np = _make_spd(n, dtype, rng)
+        b_np = rng.standard_normal(n).astype(dtype)
+        A_dp = dpnp.asarray(A_np)
+        b_dp = dpnp.asarray(b_np)
+
+        x_ref = numpy.linalg.solve(A_np, b_np)
+        x_dp, info = cg(A_dp, b_dp, tol=1e-10, maxiter=1000)
+        assert info == 0
+        assert_allclose(numpy.asarray(x_dp), x_ref, rtol=1e-6)
+
+    def test_cg_x0_initial_guess(self):
+        rng = numpy.random.default_rng(102)
+        n = 12
+        dtype = numpy.float64
+        A_np = _make_spd(n, dtype, rng)
+        b_np = rng.standard_normal(n).astype(dtype)
+        A_dp = dpnp.asarray(A_np)
+        b_dp = dpnp.asarray(b_np)
+
+        # Start from a good initial guess: actual solution
+        x_ref = numpy.linalg.solve(A_np, b_np)
+        x0_dp = dpnp.asarray(x_ref)
+        x_dp, info = cg(A_dp, b_dp, x0=x0_dp, tol=1e-10, maxiter=5)
+        # Should converge immediately or with very few iterations
+        assert _rel_residual(A_np, x_dp, b_np) < 1e-8
+
+    def test_cg_callback_called(self):
+        rng = numpy.random.default_rng(103)
+        n = 8
+        dtype = numpy.float64
+        A_np = _make_spd(n, dtype, rng)
+        b_np = rng.standard_normal(n)
+        A_dp = dpnp.asarray(A_np)
+        b_dp = dpnp.asarray(b_np)
+
+        calls = []
+        def cb(xk):
+            calls.append(1)
+
+        x_dp, info = cg(A_dp, b_dp, tol=1e-8, maxiter=200, callback=cb)
+        assert info == 0
+        assert len(calls) > 0
+
+    def test_cg_already_zero_rhs(self):
+        n = 5
+        A_dp = dpnp.eye(n, dtype=numpy.float64)
+        b_dp = dpnp.zeros(n, dtype=numpy.float64)
+        x_dp, info = cg(A_dp, b_dp)
+        assert info == 0
+        assert_allclose(numpy.asarray(x_dp), numpy.zeros(n), atol=1e-14)
+
+    def test_cg_returns_dpnp_array(self):
+        n = 4
+        A_dp = dpnp.eye(n, dtype=numpy.float64)
+        b_dp = dpnp.ones(n, dtype=numpy.float64)
+        x_dp, _ = cg(A_dp, b_dp)
+        assert isinstance(x_dp, dpnp.ndarray)
+
+    def test_cg_with_atol(self):
+        rng = numpy.random.default_rng(104)
+        n = 10
+        dtype = numpy.float64
+        A_np = _make_spd(n, dtype, rng)
+        b_np = rng.standard_normal(n).astype(dtype)
+        A_dp = dpnp.asarray(A_np)
+        b_dp = dpnp.asarray(b_np)
+
+        x_dp, info = cg(A_dp, b_dp, tol=0.0, atol=1e-8, maxiter=500)
+        assert info == 0
+
+    def test_cg_with_linear_operator(self):
+        rng = numpy.random.default_rng(105)
+        n = 10
+        dtype = numpy.float64
+        A_np = _make_spd(n, dtype, rng)
+        A_dp = dpnp.asarray(A_np)
+        b_np = rng.standard_normal(n).astype(dtype)
+        b_dp = dpnp.asarray(b_np)
+
+        op = LinearOperator((n, n), matvec=lambda x: A_dp @ x, dtype=dtype)
+        x_dp, info = cg(op, b_dp, tol=1e-8, maxiter=500)
+        assert info == 0
+        assert _rel_residual(A_np, x_dp, b_np) < 1e-6
+
+    def test_cg_maxiter_exhausted_returns_nonzero_info(self):
+        rng = numpy.random.default_rng(106)
+        n = 20
+        dtype = numpy.float64
+        A_np = _make_spd(n, dtype, rng)
+        b_np = rng.standard_normal(n).astype(dtype)
+        A_dp = dpnp.asarray(A_np)
+        b_dp = dpnp.asarray(b_np)
+
+        _, info = cg(A_dp, b_dp, tol=1e-20, maxiter=1)
+        assert info != 0
+
+    def test_cg_preconditioner_unsupported_raises(self):
+        n = 4
+        A_dp = dpnp.eye(n, dtype=numpy.float64)
+        b_dp = dpnp.ones(n)
+        M = dpnp.eye(n)
+        with pytest.raises(NotImplementedError):
+            cg(A_dp, b_dp, M=M)
+
+    @pytest.mark.parametrize("dtype", [numpy.float32, numpy.float64])
+    def test_cg_dtype_preserved_in_output(self, dtype):
+        n = 8
+        rng = numpy.random.default_rng(107)
+        A_np = _make_spd(n, dtype, rng)
+        b_np = rng.standard_normal(n).astype(dtype)
+        x_dp, _ = cg(dpnp.asarray(A_np), dpnp.asarray(b_np), tol=1e-6, maxiter=500)
+        # Result should be float64 (working precision) or at least same family
+        assert numpy.issubdtype(x_dp.dtype, numpy.floating)
+
+
+# ---------------------------------------------------------------------------
+# TestGMRES
+# ---------------------------------------------------------------------------
+
+class TestGMRES:
+    """Tests for dpnp.scipy.sparse.linalg.gmres."""
+
+    @pytest.mark.parametrize("n", [5, 10, 25])
+    @pytest.mark.parametrize("dtype", [numpy.float32, numpy.float64])
+    def test_gmres_nonsym_convergence(self, n, dtype):
+        rng = numpy.random.default_rng(200)
+        A_np = _make_nonsym(n, dtype, rng)
+        b_np = rng.standard_normal(n).astype(dtype)
+        A_dp = dpnp.asarray(A_np)
+        b_dp = dpnp.asarray(b_np)
+
+        x_dp, info = gmres(A_dp, b_dp, tol=1e-7, maxiter=50, restart=n)
+        assert info == 0, f"GMRES did not converge (info={info})"
+        assert _rel_residual(A_np, x_dp, b_np) < 1e-5
+
+    def test_gmres_matches_numpy_solve(self):
+        rng = numpy.random.default_rng(201)
+        n = 12
+        dtype = numpy.float64
+        A_np = _make_nonsym(n, dtype, rng)
+        b_np = rng.standard_normal(n).astype(dtype)
+        A_dp = dpnp.asarray(A_np)
+        b_dp = dpnp.asarray(b_np)
+
+        x_ref = numpy.linalg.solve(A_np, b_np)
+        x_dp, info = gmres(A_dp, b_dp, tol=1e-10, maxiter=50, restart=n)
+        assert info == 0
+        assert_allclose(numpy.asarray(x_dp), x_ref, rtol=1e-5)
+
+    def test_gmres_spd_matches_cg(self):
+        """On an SPD system GMRES and CG should agree."""
+        rng = numpy.random.default_rng(202)
+        n = 15
+        dtype = numpy.float64
+        A_np = _make_spd(n, dtype, rng)
+        b_np = rng.standard_normal(n).astype(dtype)
+        A_dp = dpnp.asarray(A_np)
+        b_dp = dpnp.asarray(b_np)
+
+        x_gmres, _ = gmres(A_dp, b_dp, tol=1e-10, maxiter=100, restart=n)
+        x_cg, _ = cg(A_dp, b_dp, tol=1e-10, maxiter=500)
+        assert_allclose(numpy.asarray(x_gmres), numpy.asarray(x_cg), rtol=1e-5)
+
+    def test_gmres_restart_parameter(self):
+        """Restarted GMRES (restart < n) should still converge."""
+        rng = numpy.random.default_rng(203)
+        n = 20
+        dtype = numpy.float64
+        A_np = _make_nonsym(n, dtype, rng)
+        b_np = rng.standard_normal(n).astype(dtype)
+        A_dp = dpnp.asarray(A_np)
+        b_dp = dpnp.asarray(b_np)
+
+        x_dp, info = gmres(A_dp, b_dp, tol=1e-7, maxiter=20, restart=5)
+        assert info == 0
+        assert _rel_residual(A_np, x_dp, b_np) < 1e-5
+
+    def test_gmres_x0_initial_guess(self):
+        rng = numpy.random.default_rng(204)
+        n = 10
+        dtype = numpy.float64
+        A_np = _make_nonsym(n, dtype, rng)
+        b_np = rng.standard_normal(n).astype(dtype)
+        A_dp = dpnp.asarray(A_np)
+        b_dp = dpnp.asarray(b_np)
+
+        x_ref = numpy.linalg.solve(A_np, b_np)
+        x0_dp = dpnp.asarray(x_ref)
+        x_dp, info = gmres(A_dp, b_dp, x0=x0_dp, tol=1e-10, maxiter=5, restart=n)
+        assert _rel_residual(A_np, x_dp, b_np) < 1e-8
+
+    def test_gmres_callback_called(self):
+        rng = numpy.random.default_rng(205)
+        n = 8
+        A_np = _make_nonsym(n, numpy.float64, rng)
+        b_np = rng.standard_normal(n)
+        A_dp = dpnp.asarray(A_np)
+        b_dp = dpnp.asarray(b_np)
+
+        calls = []
+        def cb(xk):
+            calls.append(1)
+
+        _, info = gmres(A_dp, b_dp, tol=1e-8, maxiter=20, callback=cb, restart=n)
+        assert info == 0
+        assert len(calls) > 0
+
+    def test_gmres_already_zero_rhs(self):
+        n = 5
+        A_dp = dpnp.eye(n, dtype=numpy.float64)
+        b_dp = dpnp.zeros(n, dtype=numpy.float64)
+        x_dp, info = gmres(A_dp, b_dp)
+        assert info == 0
+        assert_allclose(numpy.asarray(x_dp), numpy.zeros(n), atol=1e-14)
+
+    def test_gmres_returns_dpnp_array(self):
+        n = 4
+        A_dp = dpnp.eye(n, dtype=numpy.float64)
+        b_dp = dpnp.ones(n, dtype=numpy.float64)
+        x_dp, _ = gmres(A_dp, b_dp)
+        assert isinstance(x_dp, dpnp.ndarray)
+
+    def test_gmres_with_atol(self):
+        rng = numpy.random.default_rng(206)
+        n = 10
+        dtype = numpy.float64
+        A_np = _make_nonsym(n, dtype, rng)
+        b_np = rng.standard_normal(n).astype(dtype)
+        x_dp, info = gmres(
+            dpnp.asarray(A_np),
+            dpnp.asarray(b_np),
+            tol=0.0,
+            atol=1e-7,
+            maxiter=50,
+            restart=n,
+        )
+        assert info == 0
+
+    def test_gmres_with_linear_operator(self):
+        rng = numpy.random.default_rng(207)
+        n = 10
+        dtype = numpy.float64
+        A_np = _make_nonsym(n, dtype, rng)
+        A_dp = dpnp.asarray(A_np)
+        b_np = rng.standard_normal(n).astype(dtype)
+        b_dp = dpnp.asarray(b_np)
+
+        op = LinearOperator((n, n), matvec=lambda x: A_dp @ x, dtype=dtype)
+        x_dp, info = gmres(op, b_dp, tol=1e-8, maxiter=50, restart=n)
+        assert info == 0
+        assert _rel_residual(A_np, x_dp, b_np) < 1e-6
+
+    def test_gmres_maxiter_exhausted_returns_nonzero_info(self):
+        rng = numpy.random.default_rng(208)
+        n = 20
+        dtype = numpy.float64
+        A_np = _make_nonsym(n, dtype, rng)
+        b_np = rng.standard_normal(n).astype(dtype)
+        A_dp = dpnp.asarray(A_np)
+        b_dp = dpnp.asarray(b_np)
+
+        _, info = gmres(A_dp, b_dp, tol=1e-20, maxiter=1, restart=2)
+        assert info != 0
+
+    def test_gmres_preconditioner_unsupported_raises(self):
+        n = 4
+        A_dp = dpnp.eye(n, dtype=numpy.float64)
+        b_dp = dpnp.ones(n)
+        M = dpnp.eye(n)
+        with pytest.raises(NotImplementedError):
+            gmres(A_dp, b_dp, M=M)
+
+    def test_gmres_callback_type_pr_norm_raises(self):
+        n = 4
+        A_dp = dpnp.eye(n, dtype=numpy.float64)
+        b_dp = dpnp.ones(n)
+        with pytest.raises(NotImplementedError):
+            gmres(A_dp, b_dp, callback=lambda x: None, callback_type="pr_norm")
+
+    def test_gmres_invalid_callback_type_raises(self):
+        n = 4
+        A_dp = dpnp.eye(n, dtype=numpy.float64)
+        b_dp = dpnp.ones(n)
+        with pytest.raises(ValueError):
+            gmres(A_dp, b_dp, callback_type="bad_value")
+
+    @pytest.mark.parametrize("dtype", [numpy.float32, numpy.float64])
+    def test_gmres_dtype_preserved_in_output(self, dtype):
+        n = 6
+        rng = numpy.random.default_rng(209)
+        A_np = _make_nonsym(n, dtype, rng)
+        b_np = rng.standard_normal(n).astype(dtype)
+        x_dp, _ = gmres(
+            dpnp.asarray(A_np),
+            dpnp.asarray(b_np),
+            tol=1e-6,
+            maxiter=50,
+            restart=n,
+        )
+        assert numpy.issubdtype(x_dp.dtype, numpy.floating)
+
+    @pytest.mark.parametrize("n", [5, 15])
+    def test_gmres_happy_breakdown(self, n):
+        """Identity operator should yield happy breakdown (exact solution)."""
+        A_dp = dpnp.eye(n, dtype=numpy.float64)
+        b_dp = dpnp.arange(1, n + 1, dtype=numpy.float64)
+        x_dp, info = gmres(A_dp, b_dp, tol=1e-12, maxiter=n, restart=n)
+        assert info == 0
+        assert_allclose(numpy.asarray(x_dp), numpy.arange(1, n + 1), rtol=1e-10)
+
+
+# ---------------------------------------------------------------------------
+# TestMINRES
+# ---------------------------------------------------------------------------
+
+class TestMINRES:
+    """Tests for dpnp.scipy.sparse.linalg.minres (SciPy-backed stub)."""
+
+    @pytest.fixture(autouse=True)
+    def _skip_if_no_scipy(self):
+        pytest.importorskip("scipy", reason="SciPy required for minres tests")
+
+    @pytest.mark.parametrize("n", [5, 10, 20])
+    @pytest.mark.parametrize("dtype", [numpy.float32, numpy.float64])
+    def test_minres_spd_convergence(self, n, dtype):
+        rng = numpy.random.default_rng(300)
+        A_np = _make_spd(n, dtype, rng)
+        b_np = rng.standard_normal(n).astype(dtype)
+        A_dp = dpnp.asarray(A_np)
+        b_dp = dpnp.asarray(b_np)
+
+        x_dp, info = minres(A_dp, b_dp, tol=1e-7, maxiter=500)
+        assert info == 0, f"MINRES did not converge (info={info})"
+        assert _rel_residual(A_np, x_dp, b_np) < 1e-5
+
+    @pytest.mark.parametrize("dtype", [numpy.float32, numpy.float64])
+    def test_minres_sym_indef_convergence(self, dtype):
+        rng = numpy.random.default_rng(301)
+        n = 12
+        A_np = _make_sym_indef(n, dtype, rng)
+        b_np = rng.standard_normal(n).astype(dtype)
+        A_dp = dpnp.asarray(A_np)
+        b_dp = dpnp.asarray(b_np)
+
+        x_dp, info = minres(A_dp, b_dp, tol=1e-6, maxiter=500)
+        assert info == 0
+        assert _rel_residual(A_np, x_dp, b_np) < 1e-4
+
+    def test_minres_matches_scipy(self):
+        import scipy.sparse.linalg as sla
+
+        rng = numpy.random.default_rng(302)
+        n = 10
+        dtype = numpy.float64
+        A_np = _make_spd(n, dtype, rng)
+        b_np = rng.standard_normal(n).astype(dtype)
+
+        x_scipy, info_scipy = sla.minres(A_np, b_np, rtol=1e-10)
+        x_dp, info_dp = minres(
+            dpnp.asarray(A_np), dpnp.asarray(b_np), tol=1e-10
+        )
+        assert info_dp == 0
+        assert_allclose(numpy.asarray(x_dp), x_scipy, rtol=1e-6)
+
+    def test_minres_x0_initial_guess(self):
+        rng = numpy.random.default_rng(303)
+        n = 8
+        dtype = numpy.float64
+        A_np = _make_spd(n, dtype, rng)
+        b_np = rng.standard_normal(n).astype(dtype)
+        A_dp = dpnp.asarray(A_np)
+        b_dp = dpnp.asarray(b_np)
+
+        x_ref = numpy.linalg.solve(A_np, b_np)
+        x0_dp = dpnp.asarray(x_ref)
+        x_dp, info = minres(A_dp, b_dp, x0=x0_dp, tol=1e-10, maxiter=5)
+        assert _rel_residual(A_np, x_dp, b_np) < 1e-8
+
+    def test_minres_returns_dpnp_array(self):
+        n = 4
+        A_dp = dpnp.eye(n, dtype=numpy.float64)
+        b_dp = dpnp.ones(n, dtype=numpy.float64)
+        x_dp, _ = minres(A_dp, b_dp)
+        assert isinstance(x_dp, dpnp.ndarray)
+
+    def test_minres_already_zero_rhs(self):
+        n = 5
+        A_dp = dpnp.eye(n, dtype=numpy.float64)
+        b_dp = dpnp.zeros(n, dtype=numpy.float64)
+        x_dp, info = minres(A_dp, b_dp)
+        assert info == 0
+        assert_allclose(numpy.asarray(x_dp), numpy.zeros(n), atol=1e-14)
+
+    def test_minres_non_square_raises(self):
+        A_dp = dpnp.ones((4, 6), dtype=numpy.float64)
+        b_dp = dpnp.ones(4, dtype=numpy.float64)
+        with pytest.raises(ValueError, match="square"):
+            minres(A_dp, b_dp)
+
+    def test_minres_with_shift(self):
+        rng = numpy.random.default_rng(304)
+        n = 8
+        dtype = numpy.float64
+        A_np = _make_spd(n, dtype, rng)
+        b_np = rng.standard_normal(n).astype(dtype)
+        A_dp = dpnp.asarray(A_np)
+        b_dp = dpnp.asarray(b_np)
+
+        # shift = 0 should be the default behaviour
+        x_dp, info = minres(A_dp, b_dp, tol=1e-8, shift=0.0)
+        assert info == 0
+        assert _rel_residual(A_np, x_dp, b_np) < 1e-6
+
+    def test_minres_with_linear_operator(self):
+        rng = numpy.random.default_rng(305)
+        n = 10
+        dtype = numpy.float64
+        A_np = _make_spd(n, dtype, rng)
+        A_dp = dpnp.asarray(A_np)
+        b_np = rng.standard_normal(n).astype(dtype)
+        b_dp = dpnp.asarray(b_np)
+
+        op = LinearOperator((n, n), matvec=lambda x: A_dp @ x, dtype=dtype)
+        x_dp, info = minres(op, b_dp, tol=1e-8, maxiter=500)
+        assert info == 0
+        assert _rel_residual(A_np, x_dp, b_np) < 1e-6
+
+    def test_minres_with_preconditioner(self):
+        rng = numpy.random.default_rng(306)
+        n = 10
+        dtype = numpy.float64
+        A_np = _make_spd(n, dtype, rng)
+        A_dp = dpnp.asarray(A_np)
+        b_np = rng.standard_normal(n).astype(dtype)
+        b_dp = dpnp.asarray(b_np)
+
+        # Use diagonal preconditioner M ≈ diag(A)^{-1}
+        diag_A = numpy.diag(A_np)
+        M_np = numpy.diag(1.0 / diag_A)
+        M_dp = dpnp.asarray(M_np)
+
+        op_M = LinearOperator((n, n), matvec=lambda x: M_dp @ x, dtype=dtype)
+        x_dp, info = minres(A_dp, b_dp, M=op_M, tol=1e-8, maxiter=500)
+        assert info == 0
+        assert _rel_residual(A_np, x_dp, b_np) < 1e-5
+
+
+# ---------------------------------------------------------------------------
+# Cross-solver consistency
+# ---------------------------------------------------------------------------
+
+class TestSolverConsistency:
+    """Verify that CG, GMRES, and MINRES agree on SPD systems."""
+
+    @pytest.fixture(autouse=True)
+    def _skip_if_no_scipy(self):
+        pytest.importorskip("scipy", reason="SciPy required for minres in consistency tests")
+
+    @pytest.mark.parametrize("n", [8, 16])
+    def test_cg_gmres_minres_agree_spd(self, n):
+        rng = numpy.random.default_rng(400)
+        dtype = numpy.float64
+        A_np = _make_spd(n, dtype, rng)
+        b_np = rng.standard_normal(n).astype(dtype)
+        A_dp = dpnp.asarray(A_np)
+        b_dp = dpnp.asarray(b_np)
+
+        x_cg, info_cg = cg(A_dp, b_dp, tol=1e-10, maxiter=500)
+        x_gm, info_gm = gmres(A_dp, b_dp, tol=1e-10, maxiter=50, restart=n)
+        x_mr, info_mr = minres(A_dp, b_dp, tol=1e-10, maxiter=500)
+
+        assert info_cg == 0 and info_gm == 0 and info_mr == 0
+
+        assert_allclose(numpy.asarray(x_cg), numpy.asarray(x_gm), rtol=1e-5,
+                        err_msg="CG and GMRES disagree")
+        assert_allclose(numpy.asarray(x_cg), numpy.asarray(x_mr), rtol=1e-5,
+                        err_msg="CG and MINRES disagree")
+
+    def test_all_solvers_vs_numpy_direct(self):
+        rng = numpy.random.default_rng(401)
+        n = 12
+        dtype = numpy.float64
+        A_np = _make_spd(n, dtype, rng)
+        b_np = rng.standard_normal(n).astype(dtype)
+        A_dp = dpnp.asarray(A_np)
+        b_dp = dpnp.asarray(b_np)
+        x_ref = numpy.linalg.solve(A_np, b_np)
+
+        x_cg, _ = cg(A_dp, b_dp, tol=1e-11, maxiter=500)
+        x_gm, _ = gmres(A_dp, b_dp, tol=1e-11, maxiter=50, restart=n)
+        x_mr, _ = minres(A_dp, b_dp, tol=1e-11, maxiter=500)
+
+        for name, x_dp in [("cg", x_cg), ("gmres", x_gm), ("minres", x_mr)]:
+            assert_allclose(
+                numpy.asarray(x_dp), x_ref, rtol=1e-7,
+                err_msg=f"{name} deviates from numpy.linalg.solve"
+            )
+
+
+# ---------------------------------------------------------------------------
+# Import-level smoke test
+# ---------------------------------------------------------------------------
+
+def test_public_api_importable():
+    """Verify all four public names are importable from the module."""
+    from dpnp.scipy.sparse.linalg import (  # noqa: F401
+        LinearOperator,
+        aslinearoperator,
+        cg,
+        gmres,
+        minres,
+    )

From 3a5006267e2dcc9e6ac10f842dd4c9a6f7816c8f Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com>
Date: Mon, 6 Apr 2026 14:07:24 -0500
Subject: [PATCH 09/43] Fix implicit numpy conversion; use .asnumpy() for dpnp
 arrays

---
 dpnp/scipy/sparse/linalg/_iterative.py | 27 ++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/dpnp/scipy/sparse/linalg/_iterative.py b/dpnp/scipy/sparse/linalg/_iterative.py
index 5f70d59946b6..c524836da8c2 100644
--- a/dpnp/scipy/sparse/linalg/_iterative.py
+++ b/dpnp/scipy/sparse/linalg/_iterative.py
@@ -68,6 +68,13 @@
 # Helpers
 # ---------------------------------------------------------------------------
 
+def _to_numpy(x):
+    """Convert a dpnp or numpy array to a numpy array safely."""
+    if isinstance(x, _dpnp.ndarray):
+        return x.asnumpy()
+    return _np.asarray(x)
+
+
 def _check_dtype(dtype, name: str) -> None:
     if dtype.char not in _SUPPORTED_DTYPES:
         raise TypeError(
@@ -218,9 +225,9 @@ def cg(
                 "atol": 0.0 if atol is None else float(atol),
                 "maxiter": maxiter,
             }
-            A_np = _np.asarray(A) if not hasattr(A, "matvec") else A
-            b_np = _np.asarray(b)
-            x0_np = None if x0 is None else _np.asarray(x0)
+            A_np  = _to_numpy(A) if not hasattr(A, "matvec") else A
+            b_np  = _to_numpy(b)
+            x0_np = None if x0 is None else _to_numpy(_dpnp.asarray(x0))
             x_np, info = _sla.cg(A_np, b_np, x0=x0_np, callback=callback, **_kw)
             return _dpnp.asarray(x_np), int(info)
         except Exception:
@@ -322,9 +329,9 @@ def gmres(
             sig = inspect.signature(_sla.gmres)
             if "callback_type" in sig.parameters and callback_type is not None:
                 _kw["callback_type"] = callback_type
-            A_np  = _np.asarray(A) if not hasattr(A, "matvec") else A
-            b_np  = _np.asarray(b)
-            x0_np = None if x0 is None else _np.asarray(x0)
+            A_np  = _to_numpy(A) if not hasattr(A, "matvec") else A
+            b_np  = _to_numpy(b)
+            x0_np = None if x0 is None else _to_numpy(_dpnp.asarray(x0))
             x_np, info = _sla.gmres(A_np, b_np, x0=x0_np, callback=callback, **_kw)
             return _dpnp.asarray(x_np), int(info)
         except Exception:
@@ -371,7 +378,7 @@ def gmres(
             # Replaces the slow Python loop (vdot per column) in the initial stub.
             V_mat  = _dpnp.stack(V_cols, axis=1)          # (n, j+1)
             h_dp   = _dpnp.dot(V_mat.T.conj(), w)         # (j+1,)  -- oneMKL gemv
-            h_np   = _np.asarray(h_dp)                    # pull tiny vector to CPU
+            h_np   = h_dp.asnumpy()                        # pull tiny vector to CPU
             w      = w - _dpnp.dot(V_mat, _dpnp.asarray(h_np, dtype=dtype))
 
             h_j1 = float(_dpnp.linalg.norm(w))
@@ -467,13 +474,13 @@ def minres(
     def _wrap_op(op):
         return _sla.LinearOperator(
             op.shape,
-            matvec=lambda x: _np.asarray(op.matvec(_dpnp.asarray(x))),
+            matvec=lambda x: op.matvec(_dpnp.asarray(x)).asnumpy(),
             dtype=_np.dtype(op.dtype) if op.dtype is not None else _np.float64,
         )
 
     M_sci = None if M is None else _wrap_op(aslinearoperator(M))
-    b_np  = _np.asarray(_dpnp.asarray(b).reshape(-1))
-    x0_np = None if x0 is None else _np.asarray(_dpnp.asarray(x0).reshape(-1))
+    b_np  = _dpnp.asarray(b).reshape(-1).asnumpy()
+    x0_np = None if x0 is None else _dpnp.asarray(x0).reshape(-1).asnumpy()
 
     tkw = _scipy_tol_kwarg(_sla.minres)
     x_np, info = _sla.minres(

From 62cf7a439af99967ae889a888927bda34f2a1318 Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com>
Date: Mon, 6 Apr 2026 14:12:35 -0500
Subject: [PATCH 10/43] tests: add comprehensive sparse linalg tests for
 LinearOperator, cg, gmres, minres

Modeled after CuPy's cupyx_tests/scipy_tests/sparse_tests/test_linalg.py.
Covers:
  - LinearOperator: shape, dtype inference, matvec/rmatvec/matmat,
    subclassing, __matmul__, __call__, edge cases
  - aslinearoperator: dense array, duck-type, identity passthrough,
    rmatvec from dense, invalid inputs
  - cg: SPD convergence, scipy reference match, x0 warm start, b_ndim=2,
    callback, atol, LinearOperator path, invalid inputs,
    non-convergence info check
  - gmres: diag-dominant convergence, scipy reference match, restart
    variants, x0, b_ndim=2, callbacks, complex systems, atol,
    non-convergence info check, Hilbert-matrix stress test
  - minres: SPD, symmetric-indefinite, scipy reference, shift parameter,
    non-square guard, LinearOperator path, callback
  - Integration: parametric (n, dtype) cross-solver tests via LinearOperator
  - Import smoke tests: __all__ completeness
---
 .../scipy_tests/sparse_tests/test_linalg.py   | 672 ++++++++++++++++++
 1 file changed, 672 insertions(+)
 create mode 100644 tests/dpnp_tests/scipy_tests/sparse_tests/test_linalg.py

diff --git a/tests/dpnp_tests/scipy_tests/sparse_tests/test_linalg.py b/tests/dpnp_tests/scipy_tests/sparse_tests/test_linalg.py
new file mode 100644
index 000000000000..6ed61a3b2519
--- /dev/null
+++ b/tests/dpnp_tests/scipy_tests/sparse_tests/test_linalg.py
@@ -0,0 +1,672 @@
+# tests/dpnp_tests/scipy_tests/sparse_tests/test_linalg.py
+"""
+Comprehensive tests for dpnp.scipy.sparse.linalg:
+  LinearOperator, aslinearoperator, cg, gmres, minres
+
+Modeled after CuPy's cupyx_tests/scipy_tests/sparse_tests/test_linalg.py,
+adapted for the dpnp testing environment (no cupy.testing harness).
+
+Requirements:
+    pytest >= 7.0
+    numpy
+    scipy
+    dpnp
+"""
+
+from __future__ import annotations
+
+import warnings
+import numpy
+import pytest
+
+try:
+    import scipy.sparse
+    import scipy.sparse.linalg as scipy_sla
+    HAS_SCIPY = True
+except ImportError:
+    HAS_SCIPY = False
+
+import dpnp
+from dpnp.scipy.sparse.linalg import (
+    LinearOperator,
+    aslinearoperator,
+    cg,
+    gmres,
+    minres,
+)
+
+# ---------------------------------------------------------------------------
+# Helpers / fixtures
+# ---------------------------------------------------------------------------
+
+_RNG = numpy.random.default_rng(42)
+
+
+def _spd_matrix(n, dtype):
+    """Return a dense symmetric positive-definite dpnp array."""
+    a = _RNG.standard_normal((n, n)).astype(dtype)
+    a = a.T @ a + numpy.eye(n, dtype=dtype)
+    return dpnp.asarray(a)
+
+
+def _diag_dominant(n, dtype, rng=None):
+    """Return a strictly diagonally dominant (non-symmetric) dpnp array."""
+    rng = rng or _RNG
+    a = rng.standard_normal((n, n)).astype(dtype)
+    a = a * 0.1
+    numpy.fill_diagonal(a, numpy.abs(a).sum(axis=1) + 1.0)
+    return dpnp.asarray(a)
+
+
+def _sym_indefinite(n, dtype):
+    """Return a symmetric indefinite dpnp array (for MINRES)."""
+    q, _ = numpy.linalg.qr(_RNG.standard_normal((n, n)).astype(dtype))
+    d = _RNG.standard_normal(n).astype(dtype)
+    return dpnp.asarray(q @ numpy.diag(d) @ q.T)
+
+
+def _rhs(n, dtype):
+    b = _RNG.standard_normal(n).astype(dtype)
+    b /= numpy.linalg.norm(b)
+    return dpnp.asarray(b)
+
+
+def _ref_solve(A_np, b_np):
+    return numpy.linalg.solve(A_np, b_np)
+
+
+# ---------------------------------------------------------------------------
+# ─── LinearOperator ──────────────────────────────────────────────────────────────────────────
+# ---------------------------------------------------------------------------
+
+class TestLinearOperatorBasic:
+    """Basic constructor, properties, and protocol tests."""
+
+    @pytest.mark.parametrize("m,n", [(5, 5), (7, 3), (3, 7)])
+    def test_shape(self, m, n):
+        lo = LinearOperator((m, n), matvec=lambda x: dpnp.zeros(m))
+        assert lo.shape == (m, n)
+        assert lo.ndim == 2
+
+    def test_dtype_inference(self):
+        A = dpnp.eye(4, dtype=dpnp.float32)
+        lo = LinearOperator((4, 4), matvec=lambda x: A @ x)
+        assert lo.dtype == dpnp.float32
+
+    def test_dtype_explicit(self):
+        lo = LinearOperator(
+            (4, 4), matvec=lambda x: dpnp.zeros(4, dtype=dpnp.float64),
+            dtype=dpnp.float64)
+        assert lo.dtype == dpnp.float64
+
+    def test_matvec_shape_check(self):
+        lo = LinearOperator((3, 5), matvec=lambda x: dpnp.zeros(3))
+        x_bad = dpnp.ones(4)
+        with pytest.raises(ValueError):
+            lo.matvec(x_bad)
+
+    def test_matmat_fallback_loop(self):
+        n = 4
+        A_np = numpy.eye(n, dtype=numpy.float64)
+        A_dp = dpnp.asarray(A_np)
+        lo = LinearOperator((n, n), matvec=lambda x: A_dp @ x)
+        X = dpnp.asarray(_RNG.standard_normal((n, 3)))
+        Y = lo.matmat(X)
+        numpy.testing.assert_allclose(
+            dpnp.asnumpy(Y), dpnp.asnumpy(X), atol=1e-12)
+
+    def test_rmatvec_raises_if_not_defined(self):
+        lo = LinearOperator((3, 3), matvec=lambda x: dpnp.zeros(3))
+        with pytest.raises(NotImplementedError):
+            lo.rmatvec(dpnp.zeros(3))
+
+    def test_rmatvec_defined(self):
+        n = 5
+        A_np = _RNG.standard_normal((n, n))
+        A_dp = dpnp.asarray(A_np)
+        lo = LinearOperator(
+            (n, n),
+            matvec=lambda x: A_dp @ x,
+            rmatvec=lambda x: dpnp.conj(A_dp.T) @ x,
+        )
+        x = dpnp.asarray(_RNG.standard_normal(n))
+        y_dpnp = dpnp.asnumpy(lo.rmatvec(x))
+        y_ref = A_np.conj().T @ dpnp.asnumpy(x)
+        numpy.testing.assert_allclose(y_dpnp, y_ref, atol=1e-12)
+
+    @pytest.mark.parametrize("dtype", [numpy.float32, numpy.float64,
+                                        numpy.complex64, numpy.complex128])
+    def test_matmul_operator(self, dtype):
+        n = 6
+        A_np = _RNG.standard_normal((n, n)).astype(dtype)
+        A_dp = dpnp.asarray(A_np)
+        lo = LinearOperator((n, n), matvec=lambda x: A_dp @ x)
+        x = dpnp.asarray(_RNG.standard_normal(n).astype(dtype))
+        result = lo @ x
+        expected = A_np @ dpnp.asnumpy(x)
+        numpy.testing.assert_allclose(
+            dpnp.asnumpy(result), expected,
+            rtol=1e-5 if dtype in (numpy.float32, numpy.complex64) else 1e-12)
+
+    def test_matmul_2d(self):
+        n, k = 5, 3
+        A_np = _RNG.standard_normal((n, n))
+        A_dp = dpnp.asarray(A_np)
+        lo = LinearOperator((n, n), matvec=lambda x: A_dp @ x)
+        X = dpnp.asarray(_RNG.standard_normal((n, k)))
+        Y = lo @ X
+        expected = A_np @ dpnp.asnumpy(X)
+        numpy.testing.assert_allclose(dpnp.asnumpy(Y), expected, atol=1e-12)
+
+    def test_call_alias(self):
+        n = 4
+        A_dp = dpnp.eye(n, dtype=dpnp.float64)
+        lo = LinearOperator((n, n), matvec=lambda x: A_dp @ x)
+        x = dpnp.ones(n)
+        numpy.testing.assert_allclose(
+            dpnp.asnumpy(lo(x)), dpnp.asnumpy(x), atol=1e-12)
+
+    def test_repr(self):
+        lo = LinearOperator((3, 4), matvec=lambda x: dpnp.zeros(3),
+                            dtype=dpnp.float64)
+        r = repr(lo)
+        assert "3x4" in r
+        assert "LinearOperator" in r
+
+    def test_invalid_shape_negative(self):
+        with pytest.raises(ValueError):
+            LinearOperator((-1, 3), matvec=lambda x: x)
+
+    def test_invalid_shape_wrong_ndim(self):
+        with pytest.raises(ValueError):
+            LinearOperator((3,), matvec=lambda x: x)
+
+
+class TestLinearOperatorSubclass:
+    """Test user-defined subclasses with _matvec / _matmat overrides,
+    mirroring CuPy's HasMatvec / HasMatmat pattern."""
+
+    def _build_A(self, n, dtype):
+        A_np = _RNG.standard_normal((n, n)).astype(dtype)
+        return A_np, dpnp.asarray(A_np)
+
+    @pytest.mark.parametrize("dtype", [numpy.float32, numpy.float64])
+    def test_subclass_matvec(self, dtype):
+        n = 8
+        A_np, A_dp = self._build_A(n, dtype)
+
+        class MyOp(LinearOperator):
+            def __init__(self):
+                super().__init__(
+                    shape=(n, n),
+                    matvec=lambda x: A_dp @ x,
+                    dtype=dpnp.float64,
+                )
+
+        op = MyOp()
+        x = dpnp.asarray(_RNG.standard_normal(n).astype(dtype))
+        result = op.matvec(x)
+        expected = A_np @ dpnp.asnumpy(x)
+        numpy.testing.assert_allclose(
+            dpnp.asnumpy(result), expected,
+            rtol=1e-5 if dtype == numpy.float32 else 1e-12)
+
+    @pytest.mark.parametrize("dtype", [numpy.float32, numpy.float64])
+    def test_subclass_matmat(self, dtype):
+        n, k = 7, 4
+        A_np, A_dp = self._build_A(n, dtype)
+
+        class MyOp(LinearOperator):
+            def __init__(self):
+                super().__init__(
+                    shape=(n, n),
+                    matvec=lambda x: A_dp @ x,
+                    dtype=dpnp.float64,
+                )
+            def _matmat_impl(self, X):
+                return A_dp @ X
+
+        op = MyOp()
+        X = dpnp.asarray(_RNG.standard_normal((n, k)).astype(dtype))
+        Y = op.matmat(X)
+        expected = A_np @ dpnp.asnumpy(X)
+        numpy.testing.assert_allclose(
+            dpnp.asnumpy(Y), expected,
+            rtol=1e-5 if dtype == numpy.float32 else 1e-12)
+
+
+# ---------------------------------------------------------------------------
+# ─── aslinearoperator ────────────────────────────────────────────────────────────────────────
+# ---------------------------------------------------------------------------
+
+class TestAsLinearOperator:
+
+    def test_identity_on_linearoperator(self):
+        lo = LinearOperator((3, 3), matvec=lambda x: x)
+        assert aslinearoperator(lo) is lo
+
+    @pytest.mark.parametrize("dtype", [numpy.float32, numpy.float64,
+                                        numpy.complex64, numpy.complex128])
+    def test_dense_dpnp_array(self, dtype):
+        n = 6
+        A_np = _RNG.standard_normal((n, n)).astype(dtype)
+        A_dp = dpnp.asarray(A_np)
+        lo = aslinearoperator(A_dp)
+        assert lo.shape == (n, n)
+        x = dpnp.asarray(_RNG.standard_normal(n).astype(dtype))
+        y = lo.matvec(x)
+        expected = A_np @ dpnp.asnumpy(x)
+        numpy.testing.assert_allclose(
+            dpnp.asnumpy(y), expected,
+            rtol=1e-5 if dtype in (numpy.float32, numpy.complex64) else 1e-12)
+
+    def test_dense_numpy_array(self):
+        n = 5
+        A_np = _RNG.standard_normal((n, n))
+        lo = aslinearoperator(A_np)
+        assert lo.shape == (n, n)
+
+    def test_rmatvec_from_dense(self):
+        n = 5
+        A_np = _RNG.standard_normal((n, n))
+        A_dp = dpnp.asarray(A_np)
+        lo = aslinearoperator(A_dp)
+        x = dpnp.asarray(_RNG.standard_normal(n))
+        y = lo.rmatvec(x)
+        expected = A_np.conj().T @ dpnp.asnumpy(x)
+        numpy.testing.assert_allclose(dpnp.asnumpy(y), expected, atol=1e-12)
+
+    def test_duck_type_with_shape_and_matvec(self):
+        n = 4
+
+        class DuckOp:
+            shape = (n, n)
+            dtype = numpy.float64
+            def matvec(self, x):
+                return dpnp.asarray(dpnp.asnumpy(x) * 2.0)
+
+        lo = aslinearoperator(DuckOp())
+        x = dpnp.ones(n)
+        y = lo.matvec(x)
+        numpy.testing.assert_allclose(dpnp.asnumpy(y), numpy.ones(n) * 2.0)
+
+    def test_invalid_type_raises(self):
+        with pytest.raises(TypeError):
+            aslinearoperator("not_an_array")
+
+    def test_invalid_1d_array_raises(self):
+        with pytest.raises(Exception):
+            aslinearoperator(dpnp.ones(5))
+
+
+# ---------------------------------------------------------------------------
+# ─── CG ──────────────────────────────────────────────────────────────────────────────────────
+# ---------------------------------------------------------------------------
+
+@pytest.mark.skipif(not HAS_SCIPY, reason="SciPy required")
+class TestCg:
+    """Tests mirroring CuPy's TestCg class."""
+
+    n = 30
+
+    @pytest.mark.parametrize("dtype", [numpy.float32, numpy.float64,
+                                        numpy.complex64, numpy.complex128])
+    def test_converges_spd(self, dtype):
+        A = _spd_matrix(self.n, dtype)
+        b = _rhs(self.n, dtype)
+        x, info = cg(A, b, tol=1e-8, maxiter=500)
+        assert info == 0
+        res = dpnp.linalg.norm(A @ x - b) / dpnp.linalg.norm(b)
+        assert float(res) < 1e-5
+
+    @pytest.mark.parametrize("dtype", [numpy.float32, numpy.float64])
+    def test_matches_scipy_reference(self, dtype):
+        A_np = dpnp.asnumpy(_spd_matrix(self.n, dtype))
+        b_np = dpnp.asnumpy(_rhs(self.n, dtype))
+        x_ref, info_ref = scipy_sla.cg(A_np, b_np, rtol=1e-8, maxiter=500)
+        assert info_ref == 0
+        x_dp, info = cg(dpnp.asarray(A_np), dpnp.asarray(b_np),
+                        tol=1e-8, maxiter=500)
+        assert info == 0
+        numpy.testing.assert_allclose(
+            dpnp.asnumpy(x_dp), x_ref,
+            rtol=1e-4 if dtype == numpy.float32 else 1e-8)
+
+    @pytest.mark.parametrize("dtype", [numpy.float32, numpy.float64])
+    def test_x0_warm_start(self, dtype):
+        A = _spd_matrix(self.n, dtype)
+        b = _rhs(self.n, dtype)
+        x0 = dpnp.ones(self.n, dtype=dtype)
+        x, info = cg(A, b, x0=x0, tol=1e-8, maxiter=500)
+        assert info == 0
+        res = dpnp.linalg.norm(A @ x - b) / dpnp.linalg.norm(b)
+        assert float(res) < 1e-5
+
+    @pytest.mark.parametrize("dtype", [numpy.float32, numpy.float64])
+    def test_b_2dim(self, dtype):
+        """b with shape (n, 1) should be accepted and flattened."""
+        A = _spd_matrix(self.n, dtype)
+        b = _rhs(self.n, dtype).reshape(self.n, 1)
+        x, info = cg(A, b, tol=1e-8, maxiter=500)
+        assert info == 0
+
+    def test_callback_is_called(self):
+        A = _spd_matrix(self.n, numpy.float64)
+        b = _rhs(self.n, numpy.float64)
+        calls = []
+        def cb(xk):
+            calls.append(float(dpnp.linalg.norm(xk)))
+        cg(A, b, callback=cb, maxiter=200)
+        assert len(calls) > 0
+
+    @pytest.mark.parametrize("dtype", [numpy.float64])
+    def test_atol(self, dtype):
+        A = _spd_matrix(self.n, dtype)
+        b = _rhs(self.n, dtype)
+        x, info = cg(A, b, tol=0.0, atol=1e-1)
+        res = float(dpnp.linalg.norm(A @ x - b))
+        assert res < 1.0
+
+    def test_exact_solution_zero_iter(self):
+        """If x0 is already the solution, residual is zero and CG returns info=0."""
+        n = 10
+        A = _spd_matrix(n, numpy.float64)
+        b = _rhs(n, numpy.float64)
+        x_true = dpnp.asarray(
+            numpy.linalg.solve(dpnp.asnumpy(A), dpnp.asnumpy(b)))
+        x, info = cg(A, b, x0=x_true, tol=1e-12)
+        assert info == 0
+
+    @pytest.mark.parametrize("dtype", [numpy.float64])
+    def test_via_linear_operator(self, dtype):
+        A_np = dpnp.asnumpy(_spd_matrix(self.n, dtype))
+        A_dp = dpnp.asarray(A_np)
+        b = dpnp.asarray(_RNG.standard_normal(self.n))
+        lo = aslinearoperator(A_dp)
+        x, info = cg(lo, b, tol=1e-8, maxiter=500)
+        assert info == 0
+        res = float(dpnp.linalg.norm(
+            dpnp.asarray(A_np) @ x - b)) / float(dpnp.linalg.norm(b))
+        assert res < 1e-5
+
+    def test_invalid_non_square(self):
+        A = dpnp.ones((5, 6), dtype=dpnp.float64)
+        b = dpnp.ones(5)
+        with pytest.raises(Exception):
+            cg(A, b)
+
+    def test_invalid_b_wrong_size(self):
+        A = _spd_matrix(5, numpy.float64)
+        b = dpnp.ones(6)
+        with pytest.raises((ValueError, Exception)):
+            cg(A, b, maxiter=1)
+
+    def test_maxiter_nonconvergence_info(self):
+        """Setting maxiter=1 on a hard problem should return info > 0."""
+        A = _spd_matrix(50, numpy.float64)
+        b = _rhs(50, numpy.float64)
+        x, info = cg(A, b, tol=1e-15, maxiter=1)
+        assert info != 0
+
+
+# ---------------------------------------------------------------------------
+# ─── GMRES ───────────────────────────────────────────────────────────────────────────
+# ---------------------------------------------------------------------------
+
+@pytest.mark.skipif(not HAS_SCIPY, reason="SciPy required")
+class TestGmres:
+    """Tests mirroring CuPy's TestGmres class."""
+
+    n = 30
+
+    @pytest.mark.parametrize("dtype", [numpy.float32, numpy.float64,
+                                        numpy.complex64, numpy.complex128])
+    def test_converges_diag_dominant(self, dtype):
+        A = _diag_dominant(self.n, dtype)
+        b = _rhs(self.n, dtype)
+        x, info = gmres(A, b, tol=1e-8, maxiter=50, restart=30)
+        assert info == 0
+        res = dpnp.linalg.norm(A @ x - b) / dpnp.linalg.norm(b)
+        assert float(res) < 1e-5
+
+    @pytest.mark.parametrize("dtype", [numpy.float32, numpy.float64])
+    def test_matches_scipy_reference(self, dtype):
+        A_np = dpnp.asnumpy(_diag_dominant(self.n, dtype))
+        b_np = _RNG.standard_normal(self.n).astype(dtype)
+        b_np /= numpy.linalg.norm(b_np)
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            x_ref, info_ref = scipy_sla.gmres(
+                A_np, b_np, rtol=1e-8, restart=self.n, maxiter=None)
+        x_dp, info = gmres(
+            dpnp.asarray(A_np), dpnp.asarray(b_np),
+            tol=1e-8, restart=self.n, maxiter=50)
+        assert info == 0
+        numpy.testing.assert_allclose(
+            dpnp.asnumpy(x_dp), x_ref,
+            rtol=1e-3 if dtype == numpy.float32 else 1e-7)
+
+    @pytest.mark.parametrize("restart", [None, 5, 15])
+    def test_restart_values(self, restart):
+        A = _diag_dominant(self.n, numpy.float64)
+        b = _rhs(self.n, numpy.float64)
+        x, info = gmres(A, b, tol=1e-8, restart=restart, maxiter=100)
+        assert info == 0
+
+    @pytest.mark.parametrize("dtype", [numpy.float32, numpy.float64])
+    def test_x0_warm_start(self, dtype):
+        A = _diag_dominant(self.n, dtype)
+        b = _rhs(self.n, dtype)
+        x0 = dpnp.ones(self.n, dtype=dtype)
+        x, info = gmres(A, b, x0=x0, tol=1e-8, maxiter=100)
+        assert info == 0
+
+    def test_b_2dim(self):
+        """b with shape (n, 1) should be accepted."""
+        A = _diag_dominant(self.n, numpy.float64)
+        b = _rhs(self.n, numpy.float64).reshape(self.n, 1)
+        x, info = gmres(A, b, tol=1e-8, maxiter=100)
+        assert info == 0
+
+    def test_callback_x_called(self):
+        A = _diag_dominant(self.n, numpy.float64)
+        b = _rhs(self.n, numpy.float64)
+        calls = []
+        def cb(xk):
+            calls.append(1)
+        gmres(A, b, callback=cb, callback_type='x', maxiter=20)
+        assert len(calls) > 0
+
+    def test_callback_pr_norm_not_implemented(self):
+        A = _diag_dominant(self.n, numpy.float64)
+        b = _rhs(self.n, numpy.float64)
+        with pytest.raises(NotImplementedError):
+            gmres(A, b, callback=lambda r: None, callback_type='pr_norm')
+
+    def test_invalid_callback_type(self):
+        A = _diag_dominant(self.n, numpy.float64)
+        b = _rhs(self.n, numpy.float64)
+        with pytest.raises(ValueError):
+            gmres(A, b, callback_type='garbage')
+
+    @pytest.mark.parametrize("dtype", [numpy.float64])
+    def test_via_linear_operator(self, dtype):
+        A_np = dpnp.asnumpy(_diag_dominant(self.n, dtype))
+        A_dp = dpnp.asarray(A_np)
+        b = dpnp.asarray(_RNG.standard_normal(self.n))
+        lo = aslinearoperator(A_dp)
+        x, info = gmres(lo, b, tol=1e-8, restart=self.n, maxiter=50)
+        assert info == 0
+
+    def test_nonconvergence_info_nonzero(self):
+        """restart=2, maxiter=2 on a size-48 Hilbert-like matrix must not converge."""
+        n = 48
+        idx = numpy.arange(n, dtype=numpy.float64)
+        A_np = 1.0 / (idx[:, None] + idx[None, :] + 1.0)
+        b_np = _RNG.standard_normal(n)
+        A_dp = dpnp.asarray(A_np)
+        b_dp = dpnp.asarray(b_np)
+        x, info = gmres(A_dp, b_dp, tol=1e-15, restart=2, maxiter=2)
+        rel_res = float(dpnp.linalg.norm(A_dp @ x - b_dp) /
+                        dpnp.linalg.norm(b_dp))
+        assert rel_res > 1e-12
+        assert info != 0
+
+    def test_complex_system(self):
+        n = 15
+        A_np = (_RNG.standard_normal((n, n)) +
+                1j * _RNG.standard_normal((n, n))).astype(numpy.complex128)
+        numpy.fill_diagonal(A_np, numpy.abs(A_np).sum(axis=1) + 1.0)
+        b_np = (_RNG.standard_normal(n) +
+                1j * _RNG.standard_normal(n)).astype(numpy.complex128)
+        A_dp = dpnp.asarray(A_np)
+        b_dp = dpnp.asarray(b_np)
+        x, info = gmres(A_dp, b_dp, tol=1e-8, restart=n, maxiter=50)
+        assert info == 0
+        res = float(numpy.linalg.norm(A_np @ dpnp.asnumpy(x) - b_np) /
+                    numpy.linalg.norm(b_np))
+        assert res < 1e-5
+
+    def test_atol_parameter(self):
+        A = _diag_dominant(self.n, numpy.float64)
+        b = _rhs(self.n, numpy.float64)
+        x, info = gmres(A, b, tol=0.0, atol=1e-6, restart=self.n, maxiter=50)
+        res = float(dpnp.linalg.norm(A @ x - b))
+        assert res < 1e-4
+
+
+# ---------------------------------------------------------------------------
+# ─── MINRES ────────────────────────────────────────────────────────────────────────────
+# ---------------------------------------------------------------------------
+
+@pytest.mark.skipif(not HAS_SCIPY, reason="SciPy required for MINRES")
+class TestMinres:
+    """Tests for MINRES (SciPy-backed implementation)."""
+
+    n = 30
+
+    @pytest.mark.parametrize("dtype", [numpy.float32, numpy.float64])
+    def test_converges_spd(self, dtype):
+        """MINRES on SPD system should converge."""
+        A = _spd_matrix(self.n, dtype)
+        b = _rhs(self.n, dtype)
+        x, info = minres(A, b, tol=1e-8, maxiter=500)
+        assert info == 0
+        res = float(dpnp.linalg.norm(A @ x - b) / dpnp.linalg.norm(b))
+        assert res < 1e-4
+
+    @pytest.mark.parametrize("dtype", [numpy.float64])
+    def test_converges_sym_indefinite(self, dtype):
+        """MINRES distinguishes itself on symmetric-indefinite systems."""
+        A = _sym_indefinite(self.n, dtype)
+        b = _rhs(self.n, dtype)
+        x, info = minres(A, b, tol=1e-8, maxiter=1000)
+        res = float(dpnp.linalg.norm(A @ x - b) / dpnp.linalg.norm(b))
+        assert res < 1e-3
+
+    @pytest.mark.parametrize("dtype", [numpy.float64])
+    def test_matches_scipy_reference(self, dtype):
+        A_np = dpnp.asnumpy(_spd_matrix(self.n, dtype))
+        b_np = dpnp.asnumpy(_rhs(self.n, dtype))
+        x_ref, _ = scipy_sla.minres(A_np, b_np, rtol=1e-8)
+        x_dp, info = minres(
+            dpnp.asarray(A_np), dpnp.asarray(b_np), tol=1e-8)
+        numpy.testing.assert_allclose(
+            dpnp.asnumpy(x_dp), x_ref, rtol=1e-6)
+
+    def test_x0_warm_start(self):
+        A = _spd_matrix(self.n, numpy.float64)
+        b = _rhs(self.n, numpy.float64)
+        x0 = dpnp.zeros(self.n, dtype=numpy.float64)
+        x, info = minres(A, b, x0=x0, tol=1e-8)
+        assert info == 0
+
+    def test_shift_parameter(self):
+        """shift != 0: solves (A - shift*I) x = b."""
+        A_np = dpnp.asnumpy(_spd_matrix(self.n, numpy.float64))
+        b_np = dpnp.asnumpy(_rhs(self.n, numpy.float64))
+        shift = 0.5
+        A_dp = dpnp.asarray(A_np)
+        b_dp = dpnp.asarray(b_np)
+        x, info = minres(A_dp, b_dp, shift=shift, tol=1e-8)
+        A_shifted = A_np - shift * numpy.eye(self.n)
+        res = numpy.linalg.norm(A_shifted @ dpnp.asnumpy(x) - b_np)
+        assert res / numpy.linalg.norm(b_np) < 1e-4
+
+    def test_non_square_raises(self):
+        A = aslinearoperator(dpnp.ones((4, 5), dtype=dpnp.float64))
+        b = dpnp.ones(4)
+        with pytest.raises(ValueError):
+            minres(A, b)
+
+    def test_via_linear_operator(self):
+        A_np = dpnp.asnumpy(_spd_matrix(self.n, numpy.float64))
+        A_dp = dpnp.asarray(A_np)
+        b = dpnp.asarray(_RNG.standard_normal(self.n))
+        lo = aslinearoperator(A_dp)
+        x, info = minres(lo, b, tol=1e-8)
+        assert info == 0
+
+    @pytest.mark.parametrize("dtype", [numpy.float64])
+    def test_callback_is_called(self, dtype):
+        A = _spd_matrix(self.n, dtype)
+        b = _rhs(self.n, dtype)
+        calls = []
+        def cb(xk):
+            calls.append(1)
+        minres(A, b, callback=cb, tol=1e-8)
+        assert len(calls) > 0
+
+
+# ---------------------------------------------------------------------------
+# ─── Integration: all solvers via LinearOperator ─────────────────────────────────────────
+# ---------------------------------------------------------------------------
+
+@pytest.mark.skipif(not HAS_SCIPY, reason="SciPy required")
+class TestSolversViaLinearOperator:
+    """Parametric integration tests with varying n and dtype."""
+
+    @pytest.mark.parametrize("n,dtype", [
+        (10, numpy.float32), (10, numpy.float64),
+        (30, numpy.float64), (50, numpy.float64),
+    ])
+    def test_cg_spd_lo(self, n, dtype):
+        A_dp = _spd_matrix(n, dtype)
+        lo = aslinearoperator(A_dp)
+        b = _rhs(n, dtype)
+        x, info = cg(lo, b, tol=1e-8, maxiter=n * 10)
+        assert info == 0
+        res = float(dpnp.linalg.norm(A_dp @ x - b) / dpnp.linalg.norm(b))
+        atol = 1e-4 if dtype == numpy.float32 else 1e-8
+        assert res < atol
+
+    @pytest.mark.parametrize("n,dtype", [
+        (10, numpy.float32), (10, numpy.float64),
+        (30, numpy.float64),
+    ])
+    def test_gmres_nonsymmetric_lo(self, n, dtype):
+        A_dp = _diag_dominant(n, dtype)
+        lo = aslinearoperator(A_dp)
+        b = _rhs(n, dtype)
+        x, info = gmres(lo, b, tol=1e-8, restart=n, maxiter=50)
+        assert info == 0
+
+
+# ---------------------------------------------------------------------------
+# ─── Import smoke tests ───────────────────────────────────────────────────────────────────────
+# ---------------------------------------------------------------------------
+
+class TestImports:
+    def test_all_symbols_importable(self):
+        from dpnp.scipy.sparse.linalg import (
+            LinearOperator, aslinearoperator, cg, gmres, minres)
+        assert callable(LinearOperator)
+        assert callable(aslinearoperator)
+        assert callable(cg)
+        assert callable(gmres)
+        assert callable(minres)
+
+    def test_all_listed_in_dunder_all(self):
+        import dpnp.scipy.sparse.linalg as mod
+        for name in ("LinearOperator", "aslinearoperator", "cg", "gmres", "minres"):
+            assert name in mod.__all__, f"{name!r} missing from __all__"

From d9248166fd08d595f27b84d837510681e15cb3c9 Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com>
Date: Mon, 6 Apr 2026 14:16:54 -0500
Subject: [PATCH 11/43] tests: rewrite sparse linalg tests to match dpnp
 test_linalg.py style

- Use dpnp.tests.helper: assert_dtype_allclose, generate_random_numpy_array,
  get_all_dtypes, get_float_complex_dtypes, has_support_aspect64
- Use dpnp.tests.third_party.cupy testing harness (with_requires, etc.)
- Use numpy.testing assert_allclose / assert_array_equal / assert_raises
- Use dpnp.asnumpy() instead of numpy.asarray()
- Use pytest parametrize ids matching existing test conventions
- Use is_scipy_available() helper from tests/helper.py
- Strict class-per-solver organisation matching TestCholesky / TestDet etc.
---
 .../scipy_tests/sparse_tests/test_linalg.py   | 1135 ++++++++++-------
 1 file changed, 704 insertions(+), 431 deletions(-)

diff --git a/tests/dpnp_tests/scipy_tests/sparse_tests/test_linalg.py b/tests/dpnp_tests/scipy_tests/sparse_tests/test_linalg.py
index 6ed61a3b2519..3c8bb3ea4cba 100644
--- a/tests/dpnp_tests/scipy_tests/sparse_tests/test_linalg.py
+++ b/tests/dpnp_tests/scipy_tests/sparse_tests/test_linalg.py
@@ -1,32 +1,41 @@
 # tests/dpnp_tests/scipy_tests/sparse_tests/test_linalg.py
 """
-Comprehensive tests for dpnp.scipy.sparse.linalg:
+Tests for dpnp.scipy.sparse.linalg:
   LinearOperator, aslinearoperator, cg, gmres, minres
 
-Modeled after CuPy's cupyx_tests/scipy_tests/sparse_tests/test_linalg.py,
-adapted for the dpnp testing environment (no cupy.testing harness).
-
-Requirements:
-    pytest >= 7.0
-    numpy
-    scipy
-    dpnp
+Style mirrors dpnp/tests/test_linalg.py:
+  - class-per-feature with pytest.mark.parametrize
+  - assert_dtype_allclose / generate_random_numpy_array from tests.helper
+  - dpnp.asnumpy() for array comparison
+  - testing.with_requires for optional-dependency guards
+  - is_scipy_available() / has_support_aspect64() for capability skips
 """
 
 from __future__ import annotations
 
 import warnings
+
 import numpy
 import pytest
-
-try:
-    import scipy.sparse
-    import scipy.sparse.linalg as scipy_sla
-    HAS_SCIPY = True
-except ImportError:
-    HAS_SCIPY = False
+from numpy.testing import (
+    assert_allclose,
+    assert_array_equal,
+    assert_raises,
+)
 
 import dpnp
+
+# Re-use the project's own test helpers exactly as test_linalg.py does.
+from dpnp.tests.helper import (
+    assert_dtype_allclose,
+    generate_random_numpy_array,
+    get_all_dtypes,
+    get_float_complex_dtypes,
+    has_support_aspect64,
+    is_scipy_available,
+)
+from dpnp.tests.third_party.cupy import testing
+
 from dpnp.scipy.sparse.linalg import (
     LinearOperator,
     aslinearoperator,
@@ -35,263 +44,417 @@
     minres,
 )
 
+
 # ---------------------------------------------------------------------------
-# Helpers / fixtures
+# Optional SciPy import (used for reference comparisons)
 # ---------------------------------------------------------------------------
 
-_RNG = numpy.random.default_rng(42)
+if is_scipy_available():
+    import scipy.sparse.linalg as scipy_sla
+
+
+# ---------------------------------------------------------------------------
+# Shared matrix / vector helpers
+# (match the signature of generate_random_numpy_array from tests/helper.py)
+# ---------------------------------------------------------------------------
 
 
 def _spd_matrix(n, dtype):
-    """Return a dense symmetric positive-definite dpnp array."""
-    a = _RNG.standard_normal((n, n)).astype(dtype)
-    a = a.T @ a + numpy.eye(n, dtype=dtype)
+    """Dense symmetric positive-definite matrix as a dpnp array."""
+    a = generate_random_numpy_array(
+        (n, n), dtype, seed_value=42, hermitian=False
+    ).astype(float)
+    a = a.T @ a + numpy.eye(n, dtype=float)
+    if numpy.issubdtype(dtype, numpy.complexfloating):
+        a = a.astype(dtype)
+    else:
+        a = a.astype(dtype)
     return dpnp.asarray(a)
 
 
-def _diag_dominant(n, dtype, rng=None):
-    """Return a strictly diagonally dominant (non-symmetric) dpnp array."""
-    rng = rng or _RNG
-    a = rng.standard_normal((n, n)).astype(dtype)
-    a = a * 0.1
+def _diag_dominant(n, dtype, seed_value=81):
+    """Strictly diagonally dominant (non-symmetric) matrix as a dpnp array."""
+    a = generate_random_numpy_array(
+        (n, n), dtype, seed_value=seed_value
+    ) * 0.1
     numpy.fill_diagonal(a, numpy.abs(a).sum(axis=1) + 1.0)
     return dpnp.asarray(a)
 
 
-def _sym_indefinite(n, dtype):
-    """Return a symmetric indefinite dpnp array (for MINRES)."""
-    q, _ = numpy.linalg.qr(_RNG.standard_normal((n, n)).astype(dtype))
-    d = _RNG.standard_normal(n).astype(dtype)
-    return dpnp.asarray(q @ numpy.diag(d) @ q.T)
+def _sym_indefinite(n, dtype, seed_value=99):
+    """Symmetric indefinite matrix (suitable for MINRES) as a dpnp array."""
+    a = generate_random_numpy_array((n, n), dtype, seed_value=seed_value)
+    q, _ = numpy.linalg.qr(a.astype(numpy.float64))
+    numpy.random.seed(seed_value)
+    d = numpy.random.standard_normal(n).astype(numpy.float64)
+    m = (q @ numpy.diag(d) @ q.T).astype(dtype)
+    return dpnp.asarray(m)
 
 
-def _rhs(n, dtype):
-    b = _RNG.standard_normal(n).astype(dtype)
+def _rhs(n, dtype, seed_value=7):
+    """Unit-norm right-hand side vector as a dpnp array."""
+    b = generate_random_numpy_array((n,), dtype, seed_value=seed_value)
     b /= numpy.linalg.norm(b)
     return dpnp.asarray(b)
 
 
-def _ref_solve(A_np, b_np):
-    return numpy.linalg.solve(A_np, b_np)
+# ---------------------------------------------------------------------------
+# Import smoke test
+# ---------------------------------------------------------------------------
+
+
+class TestImports:
+    """Verify that all public symbols are importable and callable."""
+
+    def test_all_symbols_importable(self):
+        from dpnp.scipy.sparse.linalg import (
+            LinearOperator,
+            aslinearoperator,
+            cg,
+            gmres,
+            minres,
+        )
+
+        for sym in (LinearOperator, aslinearoperator, cg, gmres, minres):
+            assert callable(sym)
+
+    def test_all_listed_in_dunder_all(self):
+        import dpnp.scipy.sparse.linalg as _mod
+
+        for name in (
+            "LinearOperator",
+            "aslinearoperator",
+            "cg",
+            "gmres",
+            "minres",
+        ):
+            assert name in _mod.__all__, f"{name!r} missing from __all__"
 
 
 # ---------------------------------------------------------------------------
-# ─── LinearOperator ──────────────────────────────────────────────────────────────────────────
+# LinearOperator
 # ---------------------------------------------------------------------------
 
-class TestLinearOperatorBasic:
-    """Basic constructor, properties, and protocol tests."""
 
-    @pytest.mark.parametrize("m,n", [(5, 5), (7, 3), (3, 7)])
-    def test_shape(self, m, n):
+class TestLinearOperator:
+    """Tests for LinearOperator construction and protocol.
+
+    Mirrors the style of TestCholesky / TestDet in test_linalg.py.
+    """
+
+    # ------------------------------------------------------------------ shape
+
+    @pytest.mark.parametrize(
+        "shape",
+        [(5, 5), (7, 3), (3, 7)],
+        ids=["(5,5)", "(7,3)", "(3,7)"],
+    )
+    def test_shape(self, shape):
+        m, n = shape
         lo = LinearOperator((m, n), matvec=lambda x: dpnp.zeros(m))
         assert lo.shape == (m, n)
         assert lo.ndim == 2
 
-    def test_dtype_inference(self):
-        A = dpnp.eye(4, dtype=dpnp.float32)
-        lo = LinearOperator((4, 4), matvec=lambda x: A @ x)
-        assert lo.dtype == dpnp.float32
+    # ------------------------------------------------------------------ dtype
+
+    @pytest.mark.parametrize(
+        "dtype",
+        get_all_dtypes(no_bool=True, no_complex=False),
+    )
+    def test_dtype_inference(self, dtype):
+        if not has_support_aspect64() and dtype in (
+            dpnp.float64,
+            dpnp.complex128,
+        ):
+            pytest.skip("float64 not supported on this device")
+        n = 4
+        A = dpnp.eye(n, dtype=dtype)
+        lo = LinearOperator((n, n), matvec=lambda x: A @ x)
+        assert lo.dtype == dtype
 
     def test_dtype_explicit(self):
         lo = LinearOperator(
-            (4, 4), matvec=lambda x: dpnp.zeros(4, dtype=dpnp.float64),
-            dtype=dpnp.float64)
+            (4, 4),
+            matvec=lambda x: dpnp.zeros(4, dtype=dpnp.float64),
+            dtype=dpnp.float64,
+        )
         assert lo.dtype == dpnp.float64
 
-    def test_matvec_shape_check(self):
+    # ------------------------------------------------------------------ matvec
+
+    @pytest.mark.parametrize(
+        "dtype",
+        get_all_dtypes(no_bool=True, no_complex=False),
+    )
+    def test_matvec(self, dtype):
+        if not has_support_aspect64() and dtype in (
+            dpnp.float64,
+            dpnp.complex128,
+        ):
+            pytest.skip("float64 not supported on this device")
+        n = 6
+        a_np = generate_random_numpy_array((n, n), dtype, seed_value=42)
+        a_dp = dpnp.asarray(a_np)
+        lo = LinearOperator((n, n), matvec=lambda x: a_dp @ x)
+        x = dpnp.asarray(
+            generate_random_numpy_array((n,), dtype, seed_value=1)
+        )
+        result = lo.matvec(x)
+        expected = a_np @ dpnp.asnumpy(x)
+        assert_dtype_allclose(result, expected)
+
+    def test_matvec_wrong_shape_raises(self):
         lo = LinearOperator((3, 5), matvec=lambda x: dpnp.zeros(3))
-        x_bad = dpnp.ones(4)
-        with pytest.raises(ValueError):
-            lo.matvec(x_bad)
+        with assert_raises(ValueError):
+            lo.matvec(dpnp.ones(4))
 
-    def test_matmat_fallback_loop(self):
-        n = 4
-        A_np = numpy.eye(n, dtype=numpy.float64)
-        A_dp = dpnp.asarray(A_np)
-        lo = LinearOperator((n, n), matvec=lambda x: A_dp @ x)
-        X = dpnp.asarray(_RNG.standard_normal((n, 3)))
-        Y = lo.matmat(X)
-        numpy.testing.assert_allclose(
-            dpnp.asnumpy(Y), dpnp.asnumpy(X), atol=1e-12)
+    # ------------------------------------------------------------------ rmatvec
 
-    def test_rmatvec_raises_if_not_defined(self):
+    def test_rmatvec_not_defined_raises(self):
         lo = LinearOperator((3, 3), matvec=lambda x: dpnp.zeros(3))
-        with pytest.raises(NotImplementedError):
+        with assert_raises(NotImplementedError):
             lo.rmatvec(dpnp.zeros(3))
 
-    def test_rmatvec_defined(self):
+    @pytest.mark.parametrize(
+        "dtype",
+        get_all_dtypes(no_bool=True, no_complex=False),
+    )
+    def test_rmatvec(self, dtype):
+        if not has_support_aspect64() and dtype in (
+            dpnp.float64,
+            dpnp.complex128,
+        ):
+            pytest.skip("float64 not supported on this device")
         n = 5
-        A_np = _RNG.standard_normal((n, n))
-        A_dp = dpnp.asarray(A_np)
+        a_np = generate_random_numpy_array((n, n), dtype, seed_value=12)
+        a_dp = dpnp.asarray(a_np)
         lo = LinearOperator(
             (n, n),
-            matvec=lambda x: A_dp @ x,
-            rmatvec=lambda x: dpnp.conj(A_dp.T) @ x,
+            matvec=lambda x: a_dp @ x,
+            rmatvec=lambda x: dpnp.conj(a_dp.T) @ x,
+        )
+        x = dpnp.asarray(
+            generate_random_numpy_array((n,), dtype, seed_value=3)
+        )
+        result = lo.rmatvec(x)
+        expected = a_np.conj().T @ dpnp.asnumpy(x)
+        assert_dtype_allclose(result, expected)
+
+    # ------------------------------------------------------------------ matmat
+
+    @pytest.mark.parametrize(
+        "dtype",
+        get_all_dtypes(no_bool=True, no_complex=False),
+    )
+    def test_matmat_fallback_loop(self, dtype):
+        if not has_support_aspect64() and dtype in (
+            dpnp.float64,
+            dpnp.complex128,
+        ):
+            pytest.skip("float64 not supported on this device")
+        n, k = 5, 3
+        a_np = generate_random_numpy_array((n, n), dtype, seed_value=55)
+        a_dp = dpnp.asarray(a_np)
+        lo = LinearOperator((n, n), matvec=lambda x: a_dp @ x)
+        X = dpnp.asarray(
+            generate_random_numpy_array((n, k), dtype, seed_value=9)
         )
-        x = dpnp.asarray(_RNG.standard_normal(n))
-        y_dpnp = dpnp.asnumpy(lo.rmatvec(x))
-        y_ref = A_np.conj().T @ dpnp.asnumpy(x)
-        numpy.testing.assert_allclose(y_dpnp, y_ref, atol=1e-12)
+        Y = lo.matmat(X)
+        expected = a_np @ dpnp.asnumpy(X)
+        assert_dtype_allclose(Y, expected)
 
-    @pytest.mark.parametrize("dtype", [numpy.float32, numpy.float64,
-                                        numpy.complex64, numpy.complex128])
-    def test_matmul_operator(self, dtype):
+    def test_matmat_wrong_ndim_raises(self):
+        lo = LinearOperator(
+            (3, 3),
+            matvec=lambda x: dpnp.zeros(3),
+            dtype=dpnp.float64,
+        )
+        with assert_raises(ValueError):
+            lo.matmat(dpnp.ones(3))  # 1-D, not 2-D
+
+    # ------------------------------------------------------------------ operator overloads
+
+    @pytest.mark.parametrize(
+        "dtype",
+        get_all_dtypes(no_bool=True, no_complex=False),
+    )
+    def test_matmul_1d(self, dtype):
+        """lo @ x dispatches to matvec."""
+        if not has_support_aspect64() and dtype in (
+            dpnp.float64,
+            dpnp.complex128,
+        ):
+            pytest.skip("float64 not supported on this device")
         n = 6
-        A_np = _RNG.standard_normal((n, n)).astype(dtype)
-        A_dp = dpnp.asarray(A_np)
-        lo = LinearOperator((n, n), matvec=lambda x: A_dp @ x)
-        x = dpnp.asarray(_RNG.standard_normal(n).astype(dtype))
+        a_np = generate_random_numpy_array((n, n), dtype, seed_value=42)
+        a_dp = dpnp.asarray(a_np)
+        lo = LinearOperator((n, n), matvec=lambda x: a_dp @ x)
+        x = dpnp.asarray(
+            generate_random_numpy_array((n,), dtype, seed_value=2)
+        )
         result = lo @ x
-        expected = A_np @ dpnp.asnumpy(x)
-        numpy.testing.assert_allclose(
-            dpnp.asnumpy(result), expected,
-            rtol=1e-5 if dtype in (numpy.float32, numpy.complex64) else 1e-12)
-
-    def test_matmul_2d(self):
+        expected = a_np @ dpnp.asnumpy(x)
+        assert_dtype_allclose(result, expected)
+
+    @pytest.mark.parametrize(
+        "dtype",
+        get_all_dtypes(no_bool=True, no_complex=False),
+    )
+    def test_matmul_2d(self, dtype):
+        """lo @ X dispatches to matmat."""
+        if not has_support_aspect64() and dtype in (
+            dpnp.float64,
+            dpnp.complex128,
+        ):
+            pytest.skip("float64 not supported on this device")
         n, k = 5, 3
-        A_np = _RNG.standard_normal((n, n))
-        A_dp = dpnp.asarray(A_np)
-        lo = LinearOperator((n, n), matvec=lambda x: A_dp @ x)
-        X = dpnp.asarray(_RNG.standard_normal((n, k)))
+        a_np = generate_random_numpy_array((n, n), dtype, seed_value=42)
+        a_dp = dpnp.asarray(a_np)
+        lo = LinearOperator((n, n), matvec=lambda x: a_dp @ x)
+        X = dpnp.asarray(
+            generate_random_numpy_array((n, k), dtype, seed_value=5)
+        )
         Y = lo @ X
-        expected = A_np @ dpnp.asnumpy(X)
-        numpy.testing.assert_allclose(dpnp.asnumpy(Y), expected, atol=1e-12)
+        expected = a_np @ dpnp.asnumpy(X)
+        assert_dtype_allclose(Y, expected)
 
     def test_call_alias(self):
         n = 4
-        A_dp = dpnp.eye(n, dtype=dpnp.float64)
-        lo = LinearOperator((n, n), matvec=lambda x: A_dp @ x)
-        x = dpnp.ones(n)
-        numpy.testing.assert_allclose(
-            dpnp.asnumpy(lo(x)), dpnp.asnumpy(x), atol=1e-12)
+        a_dp = dpnp.eye(n, dtype=dpnp.float64)
+        lo = LinearOperator((n, n), matvec=lambda x: a_dp @ x)
+        x = dpnp.ones(n, dtype=dpnp.float64)
+        assert_allclose(dpnp.asnumpy(lo(x)), dpnp.asnumpy(x), atol=1e-12)
+
+    # ------------------------------------------------------------------ repr
 
     def test_repr(self):
-        lo = LinearOperator((3, 4), matvec=lambda x: dpnp.zeros(3),
-                            dtype=dpnp.float64)
+        lo = LinearOperator(
+            (3, 4), matvec=lambda x: dpnp.zeros(3), dtype=dpnp.float64
+        )
         r = repr(lo)
         assert "3x4" in r
         assert "LinearOperator" in r
 
+    # ------------------------------------------------------------------ error paths
+
     def test_invalid_shape_negative(self):
-        with pytest.raises(ValueError):
+        with assert_raises(ValueError):
             LinearOperator((-1, 3), matvec=lambda x: x)
 
     def test_invalid_shape_wrong_ndim(self):
-        with pytest.raises(ValueError):
+        with assert_raises(ValueError):
             LinearOperator((3,), matvec=lambda x: x)
 
-
-class TestLinearOperatorSubclass:
-    """Test user-defined subclasses with _matvec / _matmat overrides,
-    mirroring CuPy's HasMatvec / HasMatmat pattern."""
-
-    def _build_A(self, n, dtype):
-        A_np = _RNG.standard_normal((n, n)).astype(dtype)
-        return A_np, dpnp.asarray(A_np)
-
-    @pytest.mark.parametrize("dtype", [numpy.float32, numpy.float64])
-    def test_subclass_matvec(self, dtype):
-        n = 8
-        A_np, A_dp = self._build_A(n, dtype)
-
-        class MyOp(LinearOperator):
-            def __init__(self):
-                super().__init__(
-                    shape=(n, n),
-                    matvec=lambda x: A_dp @ x,
-                    dtype=dpnp.float64,
-                )
-
-        op = MyOp()
-        x = dpnp.asarray(_RNG.standard_normal(n).astype(dtype))
-        result = op.matvec(x)
-        expected = A_np @ dpnp.asnumpy(x)
-        numpy.testing.assert_allclose(
-            dpnp.asnumpy(result), expected,
-            rtol=1e-5 if dtype == numpy.float32 else 1e-12)
-
-    @pytest.mark.parametrize("dtype", [numpy.float32, numpy.float64])
-    def test_subclass_matmat(self, dtype):
+    # ------------------------------------------------------------------ subclass
+
+    @pytest.mark.parametrize(
+        "dtype",
+        [dpnp.float32, dpnp.float64],
+        ids=["float32", "float64"],
+    )
+    def test_subclass_custom_matmat(self, dtype):
+        """User subclass overriding _matmat_impl, as in CuPy's HasMatmat."""
+        if not has_support_aspect64() and dtype == dpnp.float64:
+            pytest.skip("float64 not supported on this device")
         n, k = 7, 4
-        A_np, A_dp = self._build_A(n, dtype)
+        a_np = generate_random_numpy_array(
+            (n, n), dtype, seed_value=42
+        )
+        a_dp = dpnp.asarray(a_np)
 
-        class MyOp(LinearOperator):
+        class _MyOp(LinearOperator):
             def __init__(self):
                 super().__init__(
                     shape=(n, n),
-                    matvec=lambda x: A_dp @ x,
-                    dtype=dpnp.float64,
+                    matvec=lambda x: a_dp @ x,
+                    dtype=dtype,
                 )
+
             def _matmat_impl(self, X):
-                return A_dp @ X
+                return a_dp @ X
 
-        op = MyOp()
-        X = dpnp.asarray(_RNG.standard_normal((n, k)).astype(dtype))
+        op = _MyOp()
+        X = dpnp.asarray(
+            generate_random_numpy_array((n, k), dtype, seed_value=9)
+        )
         Y = op.matmat(X)
-        expected = A_np @ dpnp.asnumpy(X)
-        numpy.testing.assert_allclose(
-            dpnp.asnumpy(Y), expected,
-            rtol=1e-5 if dtype == numpy.float32 else 1e-12)
+        expected = a_np @ dpnp.asnumpy(X)
+        assert_dtype_allclose(Y, expected)
 
 
 # ---------------------------------------------------------------------------
-# ─── aslinearoperator ────────────────────────────────────────────────────────────────────────
+# aslinearoperator
 # ---------------------------------------------------------------------------
 
+
 class TestAsLinearOperator:
+    """Tests for aslinearoperator wrapping utility."""
 
-    def test_identity_on_linearoperator(self):
+    def test_identity_if_already_linearoperator(self):
         lo = LinearOperator((3, 3), matvec=lambda x: x)
         assert aslinearoperator(lo) is lo
 
-    @pytest.mark.parametrize("dtype", [numpy.float32, numpy.float64,
-                                        numpy.complex64, numpy.complex128])
+    @pytest.mark.parametrize(
+        "dtype",
+        get_all_dtypes(no_bool=True, no_complex=False),
+    )
     def test_dense_dpnp_array(self, dtype):
+        if not has_support_aspect64() and dtype in (
+            dpnp.float64,
+            dpnp.complex128,
+        ):
+            pytest.skip("float64 not supported on this device")
         n = 6
-        A_np = _RNG.standard_normal((n, n)).astype(dtype)
-        A_dp = dpnp.asarray(A_np)
-        lo = aslinearoperator(A_dp)
+        a_np = generate_random_numpy_array((n, n), dtype, seed_value=42)
+        a_dp = dpnp.asarray(a_np)
+        lo = aslinearoperator(a_dp)
         assert lo.shape == (n, n)
-        x = dpnp.asarray(_RNG.standard_normal(n).astype(dtype))
-        y = lo.matvec(x)
-        expected = A_np @ dpnp.asnumpy(x)
-        numpy.testing.assert_allclose(
-            dpnp.asnumpy(y), expected,
-            rtol=1e-5 if dtype in (numpy.float32, numpy.complex64) else 1e-12)
+        x = dpnp.asarray(
+            generate_random_numpy_array((n,), dtype, seed_value=1)
+        )
+        result = lo.matvec(x)
+        expected = a_np @ dpnp.asnumpy(x)
+        assert_dtype_allclose(result, expected)
 
     def test_dense_numpy_array(self):
         n = 5
-        A_np = _RNG.standard_normal((n, n))
-        lo = aslinearoperator(A_np)
+        a_np = generate_random_numpy_array(
+            (n, n), numpy.float64, seed_value=42
+        )
+        lo = aslinearoperator(a_np)
         assert lo.shape == (n, n)
 
     def test_rmatvec_from_dense(self):
         n = 5
-        A_np = _RNG.standard_normal((n, n))
-        A_dp = dpnp.asarray(A_np)
-        lo = aslinearoperator(A_dp)
-        x = dpnp.asarray(_RNG.standard_normal(n))
-        y = lo.rmatvec(x)
-        expected = A_np.conj().T @ dpnp.asnumpy(x)
-        numpy.testing.assert_allclose(dpnp.asnumpy(y), expected, atol=1e-12)
+        a_np = generate_random_numpy_array(
+            (n, n), numpy.float64, seed_value=42
+        )
+        a_dp = dpnp.asarray(a_np)
+        lo = aslinearoperator(a_dp)
+        x = dpnp.asarray(
+            generate_random_numpy_array((n,), numpy.float64, seed_value=2)
+        )
+        result = lo.rmatvec(x)
+        expected = a_np.conj().T @ dpnp.asnumpy(x)
+        assert_allclose(dpnp.asnumpy(result), expected, atol=1e-12)
 
     def test_duck_type_with_shape_and_matvec(self):
         n = 4
 
-        class DuckOp:
+        class _DuckOp:
             shape = (n, n)
             dtype = numpy.float64
+
             def matvec(self, x):
                 return dpnp.asarray(dpnp.asnumpy(x) * 2.0)
 
-        lo = aslinearoperator(DuckOp())
-        x = dpnp.ones(n)
-        y = lo.matvec(x)
-        numpy.testing.assert_allclose(dpnp.asnumpy(y), numpy.ones(n) * 2.0)
+        lo = aslinearoperator(_DuckOp())
+        x = dpnp.ones(n, dtype=dpnp.float64)
+        result = lo.matvec(x)
+        assert_allclose(dpnp.asnumpy(result), numpy.full(n, 2.0), atol=1e-12)
 
     def test_invalid_type_raises(self):
-        with pytest.raises(TypeError):
+        with assert_raises(TypeError):
             aslinearoperator("not_an_array")
 
     def test_invalid_1d_array_raises(self):
@@ -300,373 +463,483 @@ def test_invalid_1d_array_raises(self):
 
 
 # ---------------------------------------------------------------------------
-# ─── CG ──────────────────────────────────────────────────────────────────────────────────────
+# CG
 # ---------------------------------------------------------------------------
 
-@pytest.mark.skipif(not HAS_SCIPY, reason="SciPy required")
+
+@pytest.mark.skipif(
+    not is_scipy_available(), reason="SciPy not available"
+)
 class TestCg:
-    """Tests mirroring CuPy's TestCg class."""
+    """Tests for cg (Conjugate Gradient).
+
+    Mirrors TestCholesky / TestDet structure from test_linalg.py.
+    """
 
     n = 30
 
-    @pytest.mark.parametrize("dtype", [numpy.float32, numpy.float64,
-                                        numpy.complex64, numpy.complex128])
-    def test_converges_spd(self, dtype):
-        A = _spd_matrix(self.n, dtype)
-        b = _rhs(self.n, dtype)
-        x, info = cg(A, b, tol=1e-8, maxiter=500)
+    @pytest.mark.parametrize(
+        "dtype",
+        get_float_complex_dtypes(),
+    )
+    def test_cg_converges_spd(self, dtype):
+        """CG must converge on symmetric positive-definite matrices."""
+        a_dp = _spd_matrix(self.n, dtype)
+        b_dp = _rhs(self.n, dtype)
+        x, info = cg(a_dp, b_dp, tol=1e-8, maxiter=500)
         assert info == 0
-        res = dpnp.linalg.norm(A @ x - b) / dpnp.linalg.norm(b)
+        res = dpnp.linalg.norm(a_dp @ x - b_dp) / dpnp.linalg.norm(b_dp)
         assert float(res) < 1e-5
 
-    @pytest.mark.parametrize("dtype", [numpy.float32, numpy.float64])
-    def test_matches_scipy_reference(self, dtype):
-        A_np = dpnp.asnumpy(_spd_matrix(self.n, dtype))
+    @pytest.mark.parametrize(
+        "dtype",
+        [dpnp.float32, dpnp.float64],
+        ids=["float32", "float64"],
+    )
+    def test_cg_matches_scipy(self, dtype):
+        """Solution must match scipy.sparse.linalg.cg within dtype tolerance."""
+        if not has_support_aspect64() and dtype == dpnp.float64:
+            pytest.skip("float64 not supported on this device")
+        a_np = dpnp.asnumpy(_spd_matrix(self.n, dtype))
         b_np = dpnp.asnumpy(_rhs(self.n, dtype))
-        x_ref, info_ref = scipy_sla.cg(A_np, b_np, rtol=1e-8, maxiter=500)
+        x_ref, info_ref = scipy_sla.cg(a_np, b_np, rtol=1e-8, maxiter=500)
         assert info_ref == 0
-        x_dp, info = cg(dpnp.asarray(A_np), dpnp.asarray(b_np),
-                        tol=1e-8, maxiter=500)
+        x_dp, info = cg(
+            dpnp.asarray(a_np), dpnp.asarray(b_np), tol=1e-8, maxiter=500
+        )
         assert info == 0
-        numpy.testing.assert_allclose(
-            dpnp.asnumpy(x_dp), x_ref,
-            rtol=1e-4 if dtype == numpy.float32 else 1e-8)
-
-    @pytest.mark.parametrize("dtype", [numpy.float32, numpy.float64])
-    def test_x0_warm_start(self, dtype):
-        A = _spd_matrix(self.n, dtype)
-        b = _rhs(self.n, dtype)
+        tol = 1e-4 if dtype == dpnp.float32 else 1e-8
+        assert_allclose(dpnp.asnumpy(x_dp), x_ref, rtol=tol)
+
+    @pytest.mark.parametrize(
+        "dtype",
+        [dpnp.float32, dpnp.float64],
+        ids=["float32", "float64"],
+    )
+    def test_cg_x0_warm_start(self, dtype):
+        if not has_support_aspect64() and dtype == dpnp.float64:
+            pytest.skip("float64 not supported on this device")
+        a_dp = _spd_matrix(self.n, dtype)
+        b_dp = _rhs(self.n, dtype)
         x0 = dpnp.ones(self.n, dtype=dtype)
-        x, info = cg(A, b, x0=x0, tol=1e-8, maxiter=500)
+        x, info = cg(a_dp, b_dp, x0=x0, tol=1e-8, maxiter=500)
         assert info == 0
-        res = dpnp.linalg.norm(A @ x - b) / dpnp.linalg.norm(b)
+        res = dpnp.linalg.norm(a_dp @ x - b_dp) / dpnp.linalg.norm(b_dp)
         assert float(res) < 1e-5
 
-    @pytest.mark.parametrize("dtype", [numpy.float32, numpy.float64])
-    def test_b_2dim(self, dtype):
-        """b with shape (n, 1) should be accepted and flattened."""
-        A = _spd_matrix(self.n, dtype)
-        b = _rhs(self.n, dtype).reshape(self.n, 1)
-        x, info = cg(A, b, tol=1e-8, maxiter=500)
+    @pytest.mark.parametrize(
+        "dtype",
+        [dpnp.float32, dpnp.float64],
+        ids=["float32", "float64"],
+    )
+    def test_cg_b_2dim(self, dtype):
+        """b with shape (n, 1) must be accepted and flattened internally."""
+        if not has_support_aspect64() and dtype == dpnp.float64:
+            pytest.skip("float64 not supported on this device")
+        a_dp = _spd_matrix(self.n, dtype)
+        b_dp = _rhs(self.n, dtype).reshape(self.n, 1)
+        x, info = cg(a_dp, b_dp, tol=1e-8, maxiter=500)
         assert info == 0
 
-    def test_callback_is_called(self):
-        A = _spd_matrix(self.n, numpy.float64)
-        b = _rhs(self.n, numpy.float64)
+    def test_cg_callback_called(self):
+        a_dp = _spd_matrix(self.n, numpy.float64)
+        b_dp = _rhs(self.n, numpy.float64)
         calls = []
-        def cb(xk):
+
+        def _cb(xk):
             calls.append(float(dpnp.linalg.norm(xk)))
-        cg(A, b, callback=cb, maxiter=200)
+
+        cg(a_dp, b_dp, callback=_cb, maxiter=200)
         assert len(calls) > 0
 
-    @pytest.mark.parametrize("dtype", [numpy.float64])
-    def test_atol(self, dtype):
-        A = _spd_matrix(self.n, dtype)
-        b = _rhs(self.n, dtype)
-        x, info = cg(A, b, tol=0.0, atol=1e-1)
-        res = float(dpnp.linalg.norm(A @ x - b))
+    def test_cg_atol(self):
+        a_dp = _spd_matrix(self.n, numpy.float64)
+        b_dp = _rhs(self.n, numpy.float64)
+        x, info = cg(a_dp, b_dp, tol=0.0, atol=1e-1)
+        res = float(dpnp.linalg.norm(a_dp @ x - b_dp))
         assert res < 1.0
 
-    def test_exact_solution_zero_iter(self):
-        """If x0 is already the solution, residual is zero and CG returns info=0."""
+    def test_cg_exact_solution_no_iterations(self):
+        """When x0 is the exact solution the residual must be zero immediately."""
         n = 10
-        A = _spd_matrix(n, numpy.float64)
-        b = _rhs(n, numpy.float64)
+        a_dp = _spd_matrix(n, numpy.float64)
+        b_dp = _rhs(n, numpy.float64)
         x_true = dpnp.asarray(
-            numpy.linalg.solve(dpnp.asnumpy(A), dpnp.asnumpy(b)))
-        x, info = cg(A, b, x0=x_true, tol=1e-12)
+            numpy.linalg.solve(dpnp.asnumpy(a_dp), dpnp.asnumpy(b_dp))
+        )
+        x, info = cg(a_dp, b_dp, x0=x_true, tol=1e-12)
         assert info == 0
 
-    @pytest.mark.parametrize("dtype", [numpy.float64])
-    def test_via_linear_operator(self, dtype):
-        A_np = dpnp.asnumpy(_spd_matrix(self.n, dtype))
-        A_dp = dpnp.asarray(A_np)
-        b = dpnp.asarray(_RNG.standard_normal(self.n))
-        lo = aslinearoperator(A_dp)
-        x, info = cg(lo, b, tol=1e-8, maxiter=500)
+    @pytest.mark.parametrize(
+        "dtype",
+        get_float_complex_dtypes(),
+    )
+    def test_cg_via_linear_operator(self, dtype):
+        """CG with A supplied as a LinearOperator."""
+        a_dp = _spd_matrix(self.n, dtype)
+        b_dp = _rhs(self.n, dtype)
+        lo = aslinearoperator(a_dp)
+        x, info = cg(lo, b_dp, tol=1e-8, maxiter=500)
         assert info == 0
-        res = float(dpnp.linalg.norm(
-            dpnp.asarray(A_np) @ x - b)) / float(dpnp.linalg.norm(b))
+        res = float(
+            dpnp.linalg.norm(a_dp @ x - b_dp) / dpnp.linalg.norm(b_dp)
+        )
         assert res < 1e-5
 
-    def test_invalid_non_square(self):
-        A = dpnp.ones((5, 6), dtype=dpnp.float64)
-        b = dpnp.ones(5)
-        with pytest.raises(Exception):
-            cg(A, b)
+    def test_cg_maxiter_nonconvergence_info_positive(self):
+        """maxiter=1 on a hard problem must give info != 0."""
+        a_dp = _spd_matrix(50, numpy.float64)
+        b_dp = _rhs(50, numpy.float64)
+        _, info = cg(a_dp, b_dp, tol=1e-15, maxiter=1)
+        assert info != 0
 
-    def test_invalid_b_wrong_size(self):
-        A = _spd_matrix(5, numpy.float64)
-        b = dpnp.ones(6)
+    def test_cg_wrong_b_size_raises(self):
+        a_dp = _spd_matrix(5, numpy.float64)
+        b_dp = dpnp.ones(6, dtype=dpnp.float64)
         with pytest.raises((ValueError, Exception)):
-            cg(A, b, maxiter=1)
-
-    def test_maxiter_nonconvergence_info(self):
-        """Setting maxiter=1 on a hard problem should return info > 0."""
-        A = _spd_matrix(50, numpy.float64)
-        b = _rhs(50, numpy.float64)
-        x, info = cg(A, b, tol=1e-15, maxiter=1)
-        assert info != 0
+            cg(a_dp, b_dp, maxiter=1)
 
 
 # ---------------------------------------------------------------------------
-# ─── GMRES ───────────────────────────────────────────────────────────────────────────
+# GMRES
 # ---------------------------------------------------------------------------
 
-@pytest.mark.skipif(not HAS_SCIPY, reason="SciPy required")
+
+@pytest.mark.skipif(
+    not is_scipy_available(), reason="SciPy not available"
+)
 class TestGmres:
-    """Tests mirroring CuPy's TestGmres class."""
+    """Tests for gmres (Generalised Minimum Residual).
+
+    Mirrors the class structure of TestDet / TestCg above.
+    """
 
     n = 30
 
-    @pytest.mark.parametrize("dtype", [numpy.float32, numpy.float64,
-                                        numpy.complex64, numpy.complex128])
-    def test_converges_diag_dominant(self, dtype):
-        A = _diag_dominant(self.n, dtype)
-        b = _rhs(self.n, dtype)
-        x, info = gmres(A, b, tol=1e-8, maxiter=50, restart=30)
+    @pytest.mark.parametrize(
+        "dtype",
+        get_float_complex_dtypes(),
+    )
+    def test_gmres_converges_diag_dominant(self, dtype):
+        """GMRES must converge on diagonally dominant non-symmetric systems."""
+        a_dp = _diag_dominant(self.n, dtype)
+        b_dp = _rhs(self.n, dtype)
+        x, info = gmres(a_dp, b_dp, tol=1e-8, maxiter=50, restart=self.n)
         assert info == 0
-        res = dpnp.linalg.norm(A @ x - b) / dpnp.linalg.norm(b)
+        res = dpnp.linalg.norm(a_dp @ x - b_dp) / dpnp.linalg.norm(b_dp)
         assert float(res) < 1e-5
 
-    @pytest.mark.parametrize("dtype", [numpy.float32, numpy.float64])
-    def test_matches_scipy_reference(self, dtype):
-        A_np = dpnp.asnumpy(_diag_dominant(self.n, dtype))
-        b_np = _RNG.standard_normal(self.n).astype(dtype)
+    @pytest.mark.parametrize(
+        "dtype",
+        [dpnp.float32, dpnp.float64],
+        ids=["float32", "float64"],
+    )
+    def test_gmres_matches_scipy(self, dtype):
+        """Solution must match scipy.sparse.linalg.gmres within dtype tolerance."""
+        if not has_support_aspect64() and dtype == dpnp.float64:
+            pytest.skip("float64 not supported on this device")
+        a_np = dpnp.asnumpy(_diag_dominant(self.n, dtype))
+        b_np = generate_random_numpy_array(
+            (self.n,), dtype, seed_value=7
+        )
         b_np /= numpy.linalg.norm(b_np)
         with warnings.catch_warnings():
             warnings.simplefilter("ignore")
-            x_ref, info_ref = scipy_sla.gmres(
-                A_np, b_np, rtol=1e-8, restart=self.n, maxiter=None)
+            x_ref, _ = scipy_sla.gmres(
+                a_np, b_np, rtol=1e-8, restart=self.n, maxiter=None
+            )
         x_dp, info = gmres(
-            dpnp.asarray(A_np), dpnp.asarray(b_np),
-            tol=1e-8, restart=self.n, maxiter=50)
+            dpnp.asarray(a_np),
+            dpnp.asarray(b_np),
+            tol=1e-8,
+            restart=self.n,
+            maxiter=50,
+        )
         assert info == 0
-        numpy.testing.assert_allclose(
-            dpnp.asnumpy(x_dp), x_ref,
-            rtol=1e-3 if dtype == numpy.float32 else 1e-7)
-
-    @pytest.mark.parametrize("restart", [None, 5, 15])
-    def test_restart_values(self, restart):
-        A = _diag_dominant(self.n, numpy.float64)
-        b = _rhs(self.n, numpy.float64)
-        x, info = gmres(A, b, tol=1e-8, restart=restart, maxiter=100)
+        tol = 1e-3 if dtype == dpnp.float32 else 1e-7
+        assert_allclose(dpnp.asnumpy(x_dp), x_ref, rtol=tol)
+
+    @pytest.mark.parametrize(
+        "restart",
+        [None, 5, 15],
+        ids=["restart=None", "restart=5", "restart=15"],
+    )
+    def test_gmres_restart_values(self, restart):
+        a_dp = _diag_dominant(self.n, numpy.float64)
+        b_dp = _rhs(self.n, numpy.float64)
+        x, info = gmres(a_dp, b_dp, tol=1e-8, restart=restart, maxiter=100)
         assert info == 0
 
-    @pytest.mark.parametrize("dtype", [numpy.float32, numpy.float64])
-    def test_x0_warm_start(self, dtype):
-        A = _diag_dominant(self.n, dtype)
-        b = _rhs(self.n, dtype)
+    @pytest.mark.parametrize(
+        "dtype",
+        [dpnp.float32, dpnp.float64],
+        ids=["float32", "float64"],
+    )
+    def test_gmres_x0_warm_start(self, dtype):
+        if not has_support_aspect64() and dtype == dpnp.float64:
+            pytest.skip("float64 not supported on this device")
+        a_dp = _diag_dominant(self.n, dtype)
+        b_dp = _rhs(self.n, dtype)
         x0 = dpnp.ones(self.n, dtype=dtype)
-        x, info = gmres(A, b, x0=x0, tol=1e-8, maxiter=100)
+        x, info = gmres(a_dp, b_dp, x0=x0, tol=1e-8, maxiter=100)
         assert info == 0
 
-    def test_b_2dim(self):
-        """b with shape (n, 1) should be accepted."""
-        A = _diag_dominant(self.n, numpy.float64)
-        b = _rhs(self.n, numpy.float64).reshape(self.n, 1)
-        x, info = gmres(A, b, tol=1e-8, maxiter=100)
+    def test_gmres_b_2dim(self):
+        """b with shape (n, 1) must be accepted and flattened internally."""
+        a_dp = _diag_dominant(self.n, numpy.float64)
+        b_dp = _rhs(self.n, numpy.float64).reshape(self.n, 1)
+        x, info = gmres(a_dp, b_dp, tol=1e-8, maxiter=100)
         assert info == 0
 
-    def test_callback_x_called(self):
-        A = _diag_dominant(self.n, numpy.float64)
-        b = _rhs(self.n, numpy.float64)
+    def test_gmres_callback_x_called(self):
+        a_dp = _diag_dominant(self.n, numpy.float64)
+        b_dp = _rhs(self.n, numpy.float64)
         calls = []
-        def cb(xk):
+
+        def _cb(xk):
             calls.append(1)
-        gmres(A, b, callback=cb, callback_type='x', maxiter=20)
+
+        gmres(a_dp, b_dp, callback=_cb, callback_type="x", maxiter=20)
         assert len(calls) > 0
 
-    def test_callback_pr_norm_not_implemented(self):
-        A = _diag_dominant(self.n, numpy.float64)
-        b = _rhs(self.n, numpy.float64)
+    def test_gmres_callback_pr_norm_not_implemented(self):
+        a_dp = _diag_dominant(self.n, numpy.float64)
+        b_dp = _rhs(self.n, numpy.float64)
         with pytest.raises(NotImplementedError):
-            gmres(A, b, callback=lambda r: None, callback_type='pr_norm')
-
-    def test_invalid_callback_type(self):
-        A = _diag_dominant(self.n, numpy.float64)
-        b = _rhs(self.n, numpy.float64)
-        with pytest.raises(ValueError):
-            gmres(A, b, callback_type='garbage')
-
-    @pytest.mark.parametrize("dtype", [numpy.float64])
-    def test_via_linear_operator(self, dtype):
-        A_np = dpnp.asnumpy(_diag_dominant(self.n, dtype))
-        A_dp = dpnp.asarray(A_np)
-        b = dpnp.asarray(_RNG.standard_normal(self.n))
-        lo = aslinearoperator(A_dp)
-        x, info = gmres(lo, b, tol=1e-8, restart=self.n, maxiter=50)
+            gmres(a_dp, b_dp, callback=lambda r: None, callback_type="pr_norm")
+
+    def test_gmres_invalid_callback_type_raises(self):
+        a_dp = _diag_dominant(self.n, numpy.float64)
+        b_dp = _rhs(self.n, numpy.float64)
+        with assert_raises(ValueError):
+            gmres(a_dp, b_dp, callback_type="garbage")
+
+    def test_gmres_atol(self):
+        a_dp = _diag_dominant(self.n, numpy.float64)
+        b_dp = _rhs(self.n, numpy.float64)
+        x, info = gmres(
+            a_dp, b_dp, tol=0.0, atol=1e-6, restart=self.n, maxiter=50
+        )
+        res = float(dpnp.linalg.norm(a_dp @ x - b_dp))
+        assert res < 1e-4
+
+    @pytest.mark.parametrize(
+        "dtype",
+        get_float_complex_dtypes(),
+    )
+    def test_gmres_via_linear_operator(self, dtype):
+        a_dp = _diag_dominant(self.n, dtype)
+        b_dp = _rhs(self.n, dtype)
+        lo = aslinearoperator(a_dp)
+        x, info = gmres(lo, b_dp, tol=1e-8, restart=self.n, maxiter=50)
         assert info == 0
 
-    def test_nonconvergence_info_nonzero(self):
-        """restart=2, maxiter=2 on a size-48 Hilbert-like matrix must not converge."""
+    def test_gmres_nonconvergence_info_nonzero(self):
+        """Hilbert-like ill-conditioned matrix with tiny restart must not converge."""
         n = 48
         idx = numpy.arange(n, dtype=numpy.float64)
-        A_np = 1.0 / (idx[:, None] + idx[None, :] + 1.0)
-        b_np = _RNG.standard_normal(n)
-        A_dp = dpnp.asarray(A_np)
+        a_np = 1.0 / (idx[:, None] + idx[None, :] + 1.0)
+        b_np = generate_random_numpy_array((n,), numpy.float64, seed_value=5)
+        a_dp = dpnp.asarray(a_np)
         b_dp = dpnp.asarray(b_np)
-        x, info = gmres(A_dp, b_dp, tol=1e-15, restart=2, maxiter=2)
-        rel_res = float(dpnp.linalg.norm(A_dp @ x - b_dp) /
-                        dpnp.linalg.norm(b_dp))
+        x, info = gmres(a_dp, b_dp, tol=1e-15, restart=2, maxiter=2)
+        rel_res = float(
+            dpnp.linalg.norm(a_dp @ x - b_dp) / dpnp.linalg.norm(b_dp)
+        )
         assert rel_res > 1e-12
         assert info != 0
 
-    def test_complex_system(self):
+    def test_gmres_complex_system(self):
         n = 15
-        A_np = (_RNG.standard_normal((n, n)) +
-                1j * _RNG.standard_normal((n, n))).astype(numpy.complex128)
-        numpy.fill_diagonal(A_np, numpy.abs(A_np).sum(axis=1) + 1.0)
-        b_np = (_RNG.standard_normal(n) +
-                1j * _RNG.standard_normal(n)).astype(numpy.complex128)
-        A_dp = dpnp.asarray(A_np)
+        a_np = generate_random_numpy_array(
+            (n, n), numpy.complex128, seed_value=42
+        )
+        numpy.fill_diagonal(a_np, numpy.abs(a_np).sum(axis=1) + 1.0)
+        b_np = generate_random_numpy_array(
+            (n,), numpy.complex128, seed_value=7
+        )
+        a_dp = dpnp.asarray(a_np)
         b_dp = dpnp.asarray(b_np)
-        x, info = gmres(A_dp, b_dp, tol=1e-8, restart=n, maxiter=50)
+        x, info = gmres(a_dp, b_dp, tol=1e-8, restart=n, maxiter=50)
         assert info == 0
-        res = float(numpy.linalg.norm(A_np @ dpnp.asnumpy(x) - b_np) /
-                    numpy.linalg.norm(b_np))
+        res = float(
+            numpy.linalg.norm(a_np @ dpnp.asnumpy(x) - b_np)
+            / numpy.linalg.norm(b_np)
+        )
         assert res < 1e-5
 
-    def test_atol_parameter(self):
-        A = _diag_dominant(self.n, numpy.float64)
-        b = _rhs(self.n, numpy.float64)
-        x, info = gmres(A, b, tol=0.0, atol=1e-6, restart=self.n, maxiter=50)
-        res = float(dpnp.linalg.norm(A @ x - b))
-        assert res < 1e-4
-
 
 # ---------------------------------------------------------------------------
-# ─── MINRES ────────────────────────────────────────────────────────────────────────────
+# MINRES
 # ---------------------------------------------------------------------------
 
-@pytest.mark.skipif(not HAS_SCIPY, reason="SciPy required for MINRES")
+
+@pytest.mark.skipif(
+    not is_scipy_available(), reason="SciPy required for MINRES backend"
+)
 class TestMinres:
-    """Tests for MINRES (SciPy-backed implementation)."""
+    """Tests for minres (Minimum Residual Method).
+
+    MINRES is SciPy-backed for this implementation; tests verify the
+    dpnp wrapper round-trips correctly.
+    """
 
     n = 30
 
-    @pytest.mark.parametrize("dtype", [numpy.float32, numpy.float64])
-    def test_converges_spd(self, dtype):
-        """MINRES on SPD system should converge."""
-        A = _spd_matrix(self.n, dtype)
-        b = _rhs(self.n, dtype)
-        x, info = minres(A, b, tol=1e-8, maxiter=500)
+    @pytest.mark.parametrize(
+        "dtype",
+        [dpnp.float32, dpnp.float64],
+        ids=["float32", "float64"],
+    )
+    def test_minres_converges_spd(self, dtype):
+        """MINRES on an SPD system must converge."""
+        if not has_support_aspect64() and dtype == dpnp.float64:
+            pytest.skip("float64 not supported on this device")
+        a_dp = _spd_matrix(self.n, dtype)
+        b_dp = _rhs(self.n, dtype)
+        x, info = minres(a_dp, b_dp, tol=1e-8, maxiter=500)
         assert info == 0
-        res = float(dpnp.linalg.norm(A @ x - b) / dpnp.linalg.norm(b))
+        res = float(
+            dpnp.linalg.norm(a_dp @ x - b_dp) / dpnp.linalg.norm(b_dp)
+        )
         assert res < 1e-4
 
-    @pytest.mark.parametrize("dtype", [numpy.float64])
-    def test_converges_sym_indefinite(self, dtype):
-        """MINRES distinguishes itself on symmetric-indefinite systems."""
-        A = _sym_indefinite(self.n, dtype)
-        b = _rhs(self.n, dtype)
-        x, info = minres(A, b, tol=1e-8, maxiter=1000)
-        res = float(dpnp.linalg.norm(A @ x - b) / dpnp.linalg.norm(b))
+    def test_minres_converges_sym_indefinite(self):
+        """MINRES is suited for symmetric indefinite systems unlike CG."""
+        a_dp = _sym_indefinite(self.n, numpy.float64)
+        b_dp = _rhs(self.n, numpy.float64)
+        x, info = minres(a_dp, b_dp, tol=1e-8, maxiter=1000)
+        res = float(
+            dpnp.linalg.norm(a_dp @ x - b_dp) / dpnp.linalg.norm(b_dp)
+        )
         assert res < 1e-3
 
-    @pytest.mark.parametrize("dtype", [numpy.float64])
-    def test_matches_scipy_reference(self, dtype):
-        A_np = dpnp.asnumpy(_spd_matrix(self.n, dtype))
-        b_np = dpnp.asnumpy(_rhs(self.n, dtype))
-        x_ref, _ = scipy_sla.minres(A_np, b_np, rtol=1e-8)
+    def test_minres_matches_scipy(self):
+        a_np = dpnp.asnumpy(_spd_matrix(self.n, numpy.float64))
+        b_np = dpnp.asnumpy(_rhs(self.n, numpy.float64))
+        x_ref, _ = scipy_sla.minres(a_np, b_np, rtol=1e-8)
         x_dp, info = minres(
-            dpnp.asarray(A_np), dpnp.asarray(b_np), tol=1e-8)
-        numpy.testing.assert_allclose(
-            dpnp.asnumpy(x_dp), x_ref, rtol=1e-6)
+            dpnp.asarray(a_np), dpnp.asarray(b_np), tol=1e-8
+        )
+        assert_allclose(dpnp.asnumpy(x_dp), x_ref, rtol=1e-6)
 
-    def test_x0_warm_start(self):
-        A = _spd_matrix(self.n, numpy.float64)
-        b = _rhs(self.n, numpy.float64)
+    def test_minres_x0_warm_start(self):
+        a_dp = _spd_matrix(self.n, numpy.float64)
+        b_dp = _rhs(self.n, numpy.float64)
         x0 = dpnp.zeros(self.n, dtype=numpy.float64)
-        x, info = minres(A, b, x0=x0, tol=1e-8)
+        x, info = minres(a_dp, b_dp, x0=x0, tol=1e-8)
         assert info == 0
 
-    def test_shift_parameter(self):
-        """shift != 0: solves (A - shift*I) x = b."""
-        A_np = dpnp.asnumpy(_spd_matrix(self.n, numpy.float64))
+    def test_minres_shift_parameter(self):
+        """shift != 0 solves (A - shift*I) x = b."""
+        a_np = dpnp.asnumpy(_spd_matrix(self.n, numpy.float64))
         b_np = dpnp.asnumpy(_rhs(self.n, numpy.float64))
         shift = 0.5
-        A_dp = dpnp.asarray(A_np)
-        b_dp = dpnp.asarray(b_np)
-        x, info = minres(A_dp, b_dp, shift=shift, tol=1e-8)
-        A_shifted = A_np - shift * numpy.eye(self.n)
-        res = numpy.linalg.norm(A_shifted @ dpnp.asnumpy(x) - b_np)
-        assert res / numpy.linalg.norm(b_np) < 1e-4
-
-    def test_non_square_raises(self):
-        A = aslinearoperator(dpnp.ones((4, 5), dtype=dpnp.float64))
-        b = dpnp.ones(4)
-        with pytest.raises(ValueError):
-            minres(A, b)
-
-    def test_via_linear_operator(self):
-        A_np = dpnp.asnumpy(_spd_matrix(self.n, numpy.float64))
-        A_dp = dpnp.asarray(A_np)
-        b = dpnp.asarray(_RNG.standard_normal(self.n))
-        lo = aslinearoperator(A_dp)
-        x, info = minres(lo, b, tol=1e-8)
+        x_dp, info = minres(
+            dpnp.asarray(a_np), dpnp.asarray(b_np), shift=shift, tol=1e-8
+        )
+        a_shifted = a_np - shift * numpy.eye(self.n)
+        res = numpy.linalg.norm(
+            a_shifted @ dpnp.asnumpy(x_dp) - b_np
+        ) / numpy.linalg.norm(b_np)
+        assert res < 1e-4
+
+    def test_minres_non_square_raises(self):
+        a_lo = aslinearoperator(
+            dpnp.ones((4, 5), dtype=dpnp.float64)
+        )
+        b = dpnp.ones(4, dtype=dpnp.float64)
+        with assert_raises(ValueError):
+            minres(a_lo, b)
+
+    def test_minres_via_linear_operator(self):
+        a_dp = _spd_matrix(self.n, numpy.float64)
+        b_dp = _rhs(self.n, numpy.float64)
+        lo = aslinearoperator(a_dp)
+        x, info = minres(lo, b_dp, tol=1e-8)
         assert info == 0
 
-    @pytest.mark.parametrize("dtype", [numpy.float64])
-    def test_callback_is_called(self, dtype):
-        A = _spd_matrix(self.n, dtype)
-        b = _rhs(self.n, dtype)
+    def test_minres_callback_called(self):
+        a_dp = _spd_matrix(self.n, numpy.float64)
+        b_dp = _rhs(self.n, numpy.float64)
         calls = []
-        def cb(xk):
+
+        def _cb(xk):
             calls.append(1)
-        minres(A, b, callback=cb, tol=1e-8)
+
+        minres(a_dp, b_dp, callback=_cb, tol=1e-8)
         assert len(calls) > 0
 
 
 # ---------------------------------------------------------------------------
-# ─── Integration: all solvers via LinearOperator ─────────────────────────────────────────
+# Integration: all solvers via LinearOperator with varying n / dtype
 # ---------------------------------------------------------------------------
 
-@pytest.mark.skipif(not HAS_SCIPY, reason="SciPy required")
-class TestSolversViaLinearOperator:
-    """Parametric integration tests with varying n and dtype."""
-
-    @pytest.mark.parametrize("n,dtype", [
-        (10, numpy.float32), (10, numpy.float64),
-        (30, numpy.float64), (50, numpy.float64),
-    ])
-    def test_cg_spd_lo(self, n, dtype):
-        A_dp = _spd_matrix(n, dtype)
-        lo = aslinearoperator(A_dp)
-        b = _rhs(n, dtype)
-        x, info = cg(lo, b, tol=1e-8, maxiter=n * 10)
+
+@pytest.mark.skipif(
+    not is_scipy_available(), reason="SciPy not available"
+)
+class TestSolversIntegration:
+    """Parametric integration tests — n and dtype combinations.
+
+    Follows the style of test_usm_ndarray_linalg_batch in test_linalg.py.
+    """
+
+    @pytest.mark.parametrize(
+        "n,dtype",
+        [
+            pytest.param(10, dpnp.float32, id="n=10-float32"),
+            pytest.param(10, dpnp.float64, id="n=10-float64"),
+            pytest.param(30, dpnp.float64, id="n=30-float64"),
+            pytest.param(50, dpnp.float64, id="n=50-float64"),
+        ],
+    )
+    def test_cg_spd_via_linearoperator(self, n, dtype):
+        if not has_support_aspect64() and dtype == dpnp.float64:
+            pytest.skip("float64 not supported on this device")
+        a_dp = _spd_matrix(n, dtype)
+        lo = aslinearoperator(a_dp)
+        b_dp = _rhs(n, dtype)
+        x, info = cg(lo, b_dp, tol=1e-8, maxiter=n * 10)
         assert info == 0
-        res = float(dpnp.linalg.norm(A_dp @ x - b) / dpnp.linalg.norm(b))
-        atol = 1e-4 if dtype == numpy.float32 else 1e-8
-        assert res < atol
-
-    @pytest.mark.parametrize("n,dtype", [
-        (10, numpy.float32), (10, numpy.float64),
-        (30, numpy.float64),
-    ])
-    def test_gmres_nonsymmetric_lo(self, n, dtype):
-        A_dp = _diag_dominant(n, dtype)
-        lo = aslinearoperator(A_dp)
-        b = _rhs(n, dtype)
-        x, info = gmres(lo, b, tol=1e-8, restart=n, maxiter=50)
+        res = float(
+            dpnp.linalg.norm(a_dp @ x - b_dp) / dpnp.linalg.norm(b_dp)
+        )
+        assert res < (1e-4 if dtype == dpnp.float32 else 1e-8)
+
+    @pytest.mark.parametrize(
+        "n,dtype",
+        [
+            pytest.param(10, dpnp.float32, id="n=10-float32"),
+            pytest.param(10, dpnp.float64, id="n=10-float64"),
+            pytest.param(30, dpnp.float64, id="n=30-float64"),
+        ],
+    )
+    def test_gmres_nonsymmetric_via_linearoperator(self, n, dtype):
+        if not has_support_aspect64() and dtype == dpnp.float64:
+            pytest.skip("float64 not supported on this device")
+        a_dp = _diag_dominant(n, dtype)
+        lo = aslinearoperator(a_dp)
+        b_dp = _rhs(n, dtype)
+        x, info = gmres(lo, b_dp, tol=1e-8, restart=n, maxiter=50)
         assert info == 0
 
-
-# ---------------------------------------------------------------------------
-# ─── Import smoke tests ───────────────────────────────────────────────────────────────────────
-# ---------------------------------------------------------------------------
-
-class TestImports:
-    def test_all_symbols_importable(self):
-        from dpnp.scipy.sparse.linalg import (
-            LinearOperator, aslinearoperator, cg, gmres, minres)
-        assert callable(LinearOperator)
-        assert callable(aslinearoperator)
-        assert callable(cg)
-        assert callable(gmres)
-        assert callable(minres)
-
-    def test_all_listed_in_dunder_all(self):
-        import dpnp.scipy.sparse.linalg as mod
-        for name in ("LinearOperator", "aslinearoperator", "cg", "gmres", "minres"):
-            assert name in mod.__all__, f"{name!r} missing from __all__"
+    @pytest.mark.parametrize(
+        "n,dtype",
+        [
+            pytest.param(10, dpnp.float64, id="n=10-float64"),
+            pytest.param(30, dpnp.float64, id="n=30-float64"),
+        ],
+    )
+    def test_minres_spd_via_linearoperator(self, n, dtype):
+        if not has_support_aspect64() and dtype == dpnp.float64:
+            pytest.skip("float64 not supported on this device")
+        a_dp = _spd_matrix(n, dtype)
+        lo = aslinearoperator(a_dp)
+        b_dp = _rhs(n, dtype)
+        x, info = minres(lo, b_dp, tol=1e-8)
+        assert info == 0
+        res = float(
+            dpnp.linalg.norm(a_dp @ x - b_dp) / dpnp.linalg.norm(b_dp)
+        )
+        assert res < 1e-4

From f295bc18724a4de16e5de9a551409581e4c82906 Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com>
Date: Mon, 6 Apr 2026 14:27:03 -0500
Subject: [PATCH 12/43] Fix dtype inference: use int8 trial vector so matvec
 preserves operator dtype
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two bugs fixed:
1. _init_dtype() was calling dpnp.zeros(n) which defaults to float64,
   so a float32 matvec would upcast and return float64, making the
   inferred dtype wrong.  Fix: use dpnp.zeros(n, dtype=dpnp.int8) as
   SciPy/CuPy do — any numeric matvec will promote int8 to its own dtype.
2. _CustomLinearOperator.__init__ called _init_dtype() even when an
   explicit dtype was already supplied, overwriting the caller's value.
   Fix: _init_dtype() now short-circuits when self.dtype is already set.
---
 dpnp/scipy/sparse/linalg/_interface.py | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

diff --git a/dpnp/scipy/sparse/linalg/_interface.py b/dpnp/scipy/sparse/linalg/_interface.py
index 47d6e9089f28..cbf6592f1938 100644
--- a/dpnp/scipy/sparse/linalg/_interface.py
+++ b/dpnp/scipy/sparse/linalg/_interface.py
@@ -105,10 +105,22 @@ def __init__(self, dtype, shape):
         self.shape = shape
 
     def _init_dtype(self):
-        """Infer dtype by running a trial matvec on a zero int8 vector."""
-        if self.dtype is None:
-            v = dpnp.zeros(self.shape[-1])
-            self.dtype = self.matvec(v).dtype
+        """Infer dtype by running a trial matvec on a zero int8 vector.
+
+        Uses int8 (not float64) as the probe dtype so that the matvec lambda
+        will promote int8 to whatever the operator's natural dtype is
+        (e.g. float32 @ int8 -> float32).  This matches SciPy's and CuPy's
+        dtype-inference strategy and avoids the previous bug where
+        dpnp.zeros(n) (float64 default) caused float32 operators to report
+        dtype=float64.
+
+        Short-circuits when self.dtype is already set so that an explicit
+        dtype= kwarg is never overwritten.
+        """
+        if self.dtype is not None:
+            return
+        v = dpnp.zeros(self.shape[-1], dtype=dpnp.int8)
+        self.dtype = self.matvec(v).dtype
 
     # ------------------------------------------------------------------ #
     #  Abstract primitives — subclasses override at least one of these    #
@@ -276,6 +288,8 @@ def __init__(self, shape, matvec, rmatvec=None, matmat=None,
         self.__rmatvec_impl = rmatvec
         self.__rmatmat_impl = rmatmat
         self.__matmat_impl  = matmat
+        # _init_dtype() short-circuits when dtype was explicitly provided,
+        # so the caller's explicit dtype= is never overwritten.
         self._init_dtype()
 
     def _matvec(self, x):
@@ -489,7 +503,7 @@ def aslinearoperator(A) -> LinearOperator:
         matvec  = A.matvec  if hasattr(A, "matvec")  else (lambda x: A @ x)
         rmatvec = A.rmatvec if hasattr(A, "rmatvec") else None
         matmat  = A.matmat  if hasattr(A, "matmat")  else None
-        rmatmat = A.rmatmat if hasattr(A, "rmatmat") else None
+        rmatmat = A.rmatmat if hasattr(A, "rmatmat\") else None
         return LinearOperator(
             (m, n),
             matvec=matvec,

From 8c68d981f49f0884ede67cc2743ed49f394fa061 Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <abagusetty@anl.gov>
Date: Mon, 6 Apr 2026 20:34:13 +0000
Subject: [PATCH 13/43] add onemkl sparse gemv pybind logic

---
 dpnp/backend/extensions/sparse/gemv.cpp | 200 ++++++++++++++++++++++++
 dpnp/backend/extensions/sparse/gemv.hpp |  56 +++++++
 2 files changed, 256 insertions(+)
 create mode 100644 dpnp/backend/extensions/sparse/gemv.cpp
 create mode 100644 dpnp/backend/extensions/sparse/gemv.hpp

diff --git a/dpnp/backend/extensions/sparse/gemv.cpp b/dpnp/backend/extensions/sparse/gemv.cpp
new file mode 100644
index 000000000000..fa4ff2a6ebda
--- /dev/null
+++ b/dpnp/backend/extensions/sparse/gemv.cpp
@@ -0,0 +1,200 @@
+//*****************************************************************************
+// Copyright (c) 2025, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+
+#include <stdexcept>
+#include <vector>
+
+#include "sparse_gemv.hpp"
+
+// oneMKL sparse BLAS
+namespace mkl_sparse = oneapi::mkl::sparse;
+
+namespace dpnp::extensions::sparse
+{
+
+// ---------------------------------------------------------------------------
+// Type-dispatched implementation: y = alpha * op(A) * x + beta * y
+// ---------------------------------------------------------------------------
+
+template <typename T, typename intType>
+static sycl::event
+sparse_gemv_impl(sycl::queue &exec_q,
+                 oneapi::mkl::transpose mkl_trans,
+                 T alpha,
+                 intType *row_ptr_ptr,
+                 intType *col_ind_ptr,
+                 T *values_ptr,
+                 std::int64_t num_rows,
+                 std::int64_t num_cols,
+                 std::int64_t nnz,
+                 T *x_ptr,
+                 T beta,
+                 T *y_ptr,
+                 const std::vector<sycl::event> &depends)
+{
+    mkl_sparse::matrix_handle_t handle = nullptr;
+    mkl_sparse::init_matrix_handle(&handle);
+
+    auto ev_set = mkl_sparse::set_csr_data(
+        exec_q, handle,
+        num_rows, num_cols,
+        oneapi::mkl::index_base::zero,
+        row_ptr_ptr, col_ind_ptr, values_ptr,
+        depends);
+
+    // optimize_gemv performs internal analysis — amortises over repeated SpMV
+    auto ev_opt = mkl_sparse::optimize_gemv(
+        exec_q, mkl_trans, handle, {ev_set});
+
+    auto ev_gemv = mkl_sparse::gemv(
+        exec_q, mkl_trans,
+        alpha, handle,
+        x_ptr, beta, y_ptr,
+        {ev_opt});
+
+    // async release — waits for ev_gemv internally
+    mkl_sparse::release_matrix_handle(exec_q, &handle, {ev_gemv});
+
+    return ev_gemv;
+}
+
+
+// ---------------------------------------------------------------------------
+// Python-facing function
+// ---------------------------------------------------------------------------
+
+std::pair<sycl::event, sycl::event>
+sparse_gemv(sycl::queue &exec_q,
+            const int trans,
+            const double alpha,
+            const dpctl::tensor::usm_ndarray &row_ptr,
+            const dpctl::tensor::usm_ndarray &col_ind,
+            const dpctl::tensor::usm_ndarray &values,
+            const dpctl::tensor::usm_ndarray &x,
+            const double beta,
+            const dpctl::tensor::usm_ndarray &y,
+            const std::int64_t num_rows,
+            const std::int64_t num_cols,
+            const std::int64_t nnz,
+            const std::vector<sycl::event> &depends)
+{
+    // Map trans integer to oneMKL enum
+    oneapi::mkl::transpose mkl_trans;
+    switch (trans) {
+        case 0: mkl_trans = oneapi::mkl::transpose::nontrans; break;
+        case 1: mkl_trans = oneapi::mkl::transpose::trans; break;
+        case 2: mkl_trans = oneapi::mkl::transpose::conjtrans; break;
+        default:
+            throw std::invalid_argument(
+                "sparse_gemv: trans must be 0 (N), 1 (T), or 2 (C)");
+    }
+
+    int val_typenum = values.get_typenum();
+    int idx_typenum = row_ptr.get_typenum();
+
+    sycl::event gemv_ev;
+
+    // Dispatch on value type × index type
+    // oneMKL sparse BLAS supports float32, float64 (no complex yet)
+    if (val_typenum == UAR_FLOAT) {
+        auto alpha_f = static_cast<float>(alpha);
+        auto beta_f  = static_cast<float>(beta);
+
+        if (idx_typenum == UAR_INT32) {
+            gemv_ev = sparse_gemv_impl<float, std::int32_t>(
+                exec_q, mkl_trans, alpha_f,
+                row_ptr.get_data<std::int32_t>(),
+                col_ind.get_data<std::int32_t>(),
+                values.get_data<float>(),
+                num_rows, num_cols, nnz,
+                x.get_data<float>(), beta_f,
+                y.get_data<float>(), depends);
+        }
+        else if (idx_typenum == UAR_INT64) {
+            gemv_ev = sparse_gemv_impl<float, std::int64_t>(
+                exec_q, mkl_trans, alpha_f,
+                row_ptr.get_data<std::int64_t>(),
+                col_ind.get_data<std::int64_t>(),
+                values.get_data<float>(),
+                num_rows, num_cols, nnz,
+                x.get_data<float>(), beta_f,
+                y.get_data<float>(), depends);
+        }
+        else {
+            throw std::runtime_error(
+                "sparse_gemv: index dtype must be int32 or int64");
+        }
+    }
+    else if (val_typenum == UAR_DOUBLE) {
+        if (idx_typenum == UAR_INT32) {
+            gemv_ev = sparse_gemv_impl<double, std::int32_t>(
+                exec_q, mkl_trans, alpha,
+                row_ptr.get_data<std::int32_t>(),
+                col_ind.get_data<std::int32_t>(),
+                values.get_data<double>(),
+                num_rows, num_cols, nnz,
+                x.get_data<double>(), beta,
+                y.get_data<double>(), depends);
+        }
+        else if (idx_typenum == UAR_INT64) {
+            gemv_ev = sparse_gemv_impl<double, std::int64_t>(
+                exec_q, mkl_trans, alpha,
+                row_ptr.get_data<std::int64_t>(),
+                col_ind.get_data<std::int64_t>(),
+                values.get_data<double>(),
+                num_rows, num_cols, nnz,
+                x.get_data<double>(), beta,
+                y.get_data<double>(), depends);
+        }
+        else {
+            throw std::runtime_error(
+                "sparse_gemv: index dtype must be int32 or int64");
+        }
+    }
+    else {
+        throw std::runtime_error(
+            "sparse_gemv: value dtype must be float32 or float64");
+    }
+
+    return std::make_pair(sycl::event{}, gemv_ev);
+}
+
+
+// ---------------------------------------------------------------------------
+// Dispatch vector init (placeholder — matches blas convention)
+// ---------------------------------------------------------------------------
+
+void init_sparse_gemv_dispatch_vector(void)
+{
+    // No dispatch table needed for sparse_gemv since we do explicit
+    // type switching in the function body (oneMKL sparse API uses
+    // opaque handles, not templated dispatch tables).
+    // This function exists to match the dpnp extension convention.
+}
+
+} // namespace dpnp::extensions::sparse
diff --git a/dpnp/backend/extensions/sparse/gemv.hpp b/dpnp/backend/extensions/sparse/gemv.hpp
new file mode 100644
index 000000000000..518355b4b41e
--- /dev/null
+++ b/dpnp/backend/extensions/sparse/gemv.hpp
@@ -0,0 +1,56 @@
+//*****************************************************************************
+// Copyright (c) 2025, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+
+#pragma once
+
+#include <oneapi/mkl.hpp>
+#include <sycl/sycl.hpp>
+
+#include <dpctl4pybind11.hpp>
+
+namespace dpnp::extensions::sparse
+{
+
+extern std::pair<sycl::event, sycl::event>
+sparse_gemv(sycl::queue &exec_q,
+            const int trans,
+            const double alpha,
+            const dpctl::tensor::usm_ndarray &row_ptr,
+            const dpctl::tensor::usm_ndarray &col_ind,
+            const dpctl::tensor::usm_ndarray &values,
+            const dpctl::tensor::usm_ndarray &x,
+            const double beta,
+            const dpctl::tensor::usm_ndarray &y,
+            const std::int64_t num_rows,
+            const std::int64_t num_cols,
+            const std::int64_t nnz,
+            const std::vector<sycl::event> &depends);
+
+extern void init_sparse_gemv_dispatch_vector(void);
+
+} // namespace dpnp::extensions::sparse

From 0c4a888c55ef5c1cb789fb465d493d2254429354 Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com>
Date: Mon, 6 Apr 2026 15:48:45 -0500
Subject: [PATCH 14/43] sparse: add pybind11 module, CMakeLists, and hook
 _sparse_impl into _iterative.py

---
 dpnp/backend/extensions/sparse/CMakeLists.txt | 112 +++++++++++++++
 .../backend/extensions/sparse/sparse_gemv.hpp |  32 +++++
 dpnp/backend/extensions/sparse/sparse_py.cpp  | 132 ++++++++++++++++++
 dpnp/scipy/sparse/linalg/_iterative.py        |  88 ++++++++----
 4 files changed, 340 insertions(+), 24 deletions(-)
 create mode 100644 dpnp/backend/extensions/sparse/CMakeLists.txt
 create mode 100644 dpnp/backend/extensions/sparse/sparse_gemv.hpp
 create mode 100644 dpnp/backend/extensions/sparse/sparse_py.cpp

diff --git a/dpnp/backend/extensions/sparse/CMakeLists.txt b/dpnp/backend/extensions/sparse/CMakeLists.txt
new file mode 100644
index 000000000000..549437b6aad3
--- /dev/null
+++ b/dpnp/backend/extensions/sparse/CMakeLists.txt
@@ -0,0 +1,112 @@
+# -*- coding: utf-8 -*-
+# *****************************************************************************
+# Copyright (c) 2025, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+set(python_module_name _sparse_impl)
+set(_module_src
+    ${CMAKE_CURRENT_SOURCE_DIR}/sparse_py.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/gemv.cpp
+)
+
+pybind11_add_module(${python_module_name} MODULE ${_module_src})
+add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src})
+
+if(_dpnp_sycl_targets)
+    target_compile_options(
+        ${python_module_name}
+        PRIVATE ${_dpnp_sycl_target_compile_options}
+    )
+    target_link_options(${python_module_name} PRIVATE ${_dpnp_sycl_target_link_options})
+endif()
+
+if(WIN32)
+    if(${CMAKE_VERSION} VERSION_LESS "3.27")
+        set(CMAKE_CXX_LINK_FLAGS
+            "${CMAKE_CXX_LINK_FLAGS} -fsycl-device-code-split=per_kernel"
+        )
+    endif()
+endif()
+
+set_target_properties(
+    ${python_module_name}
+    PROPERTIES CMAKE_POSITION_INDEPENDENT_CODE ON
+)
+
+target_include_directories(
+    ${python_module_name}
+    PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common
+)
+
+target_include_directories(
+    ${python_module_name}
+    SYSTEM
+    PRIVATE
+        ${SYCL_INCLUDE_DIR}
+        ${Dpctl_INCLUDE_DIRS}
+        ${Dpctl_TENSOR_INCLUDE_DIR}
+)
+
+if(WIN32)
+    target_compile_options(
+        ${python_module_name}
+        PRIVATE /clang:-fno-approx-func /clang:-fno-finite-math-only
+    )
+else()
+    target_compile_options(
+        ${python_module_name}
+        PRIVATE -fno-approx-func -fno-finite-math-only
+    )
+endif()
+
+target_link_options(${python_module_name} PUBLIC -fsycl-device-code-split=per_kernel)
+
+if(DPNP_GENERATE_COVERAGE)
+    target_link_options(
+        ${python_module_name}
+        PRIVATE -fprofile-instr-generate -fcoverage-mapping
+    )
+endif()
+
+if(_ues_onemath)
+    target_link_libraries(${python_module_name} PRIVATE ${ONEMATH_LIB})
+    target_compile_options(${python_module_name} PRIVATE -DUSE_ONEMATH)
+    if(_ues_onemath_cuda)
+        target_compile_options(${python_module_name} PRIVATE -DUSE_ONEMATH_CUSPARSE)
+    endif()
+else()
+    target_link_libraries(${python_module_name} PUBLIC MKL::MKL_SYCL::SPARSE)
+endif()
+
+if(DPNP_WITH_REDIST)
+    set_target_properties(
+        ${python_module_name}
+        PROPERTIES INSTALL_RPATH "$ORIGIN/../../../../../../"
+    )
+endif()
+
+install(TARGETS ${python_module_name} DESTINATION "dpnp/backend/extensions/sparse")
diff --git a/dpnp/backend/extensions/sparse/sparse_gemv.hpp b/dpnp/backend/extensions/sparse/sparse_gemv.hpp
new file mode 100644
index 000000000000..261c65669b65
--- /dev/null
+++ b/dpnp/backend/extensions/sparse/sparse_gemv.hpp
@@ -0,0 +1,32 @@
+//*****************************************************************************
+// Copyright (c) 2025, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+
+// gemv.cpp includes this header by name; gemv.hpp holds the actual declarations.
+// Both files live in the same directory so this redirect is zero-cost.
+#pragma once
+#include "gemv.hpp"
diff --git a/dpnp/backend/extensions/sparse/sparse_py.cpp b/dpnp/backend/extensions/sparse/sparse_py.cpp
new file mode 100644
index 000000000000..b2cc40e0bd2e
--- /dev/null
+++ b/dpnp/backend/extensions/sparse/sparse_py.cpp
@@ -0,0 +1,132 @@
+//*****************************************************************************
+// Copyright (c) 2025, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+// Defines the dpnp.backend._sparse_impl pybind11 extension module.
+// Provides oneMKL sparse BLAS operations on CSR matrices over dpctl USM arrays.
+// Equivalent role to _cusparse for the SYCL/oneMKL backend.
+//
+//*****************************************************************************
+
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "gemv.hpp"
+
+namespace sparse_ns = dpnp::extensions::sparse;
+namespace py = pybind11;
+
+static void init_dispatch_vectors_tables(void)
+{
+    sparse_ns::init_sparse_gemv_dispatch_vector();
+}
+
+PYBIND11_MODULE(_sparse_impl, m)
+{
+    init_dispatch_vectors_tables();
+
+    using arrayT     = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+
+    // ------------------------------------------------------------------
+    // _sparse_gemv — CSR SpMV:  y = alpha * op(A) * x + beta * y
+    //
+    // Equivalent to _cusparse.spMV_make_fast_matvec for the SYCL stack.
+    // Backed by oneMKL sparse::gemv with set_csr_data + optimize_gemv so
+    // matrix-handle analysis is amortised across repeated calls.
+    // ------------------------------------------------------------------
+    {
+        m.def(
+            "_sparse_gemv",
+            [](sycl::queue &exec_q,
+               const int trans,
+               const double alpha,
+               const arrayT &row_ptr,
+               const arrayT &col_ind,
+               const arrayT &values,
+               const arrayT &x,
+               const double beta,
+               const arrayT &y,
+               const std::int64_t num_rows,
+               const std::int64_t num_cols,
+               const std::int64_t nnz,
+               const event_vecT &depends) {
+                return sparse_ns::sparse_gemv(
+                    exec_q, trans, alpha,
+                    row_ptr, col_ind, values,
+                    x, beta, y,
+                    num_rows, num_cols, nnz, depends);
+            },
+            "CSR sparse matrix-vector product y = alpha*op(A)*x + beta*y "
+            "via oneMKL sparse::gemv.\n\n"
+            "Parameters\n"
+            "----------\n"
+            "sycl_queue : dpctl.SyclQueue\n"
+            "trans      : int  0=N, 1=T, 2=C\n"
+            "alpha      : float\n"
+            "row_ptr    : usm_ndarray  CSR row offsets (int32 or int64)\n"
+            "col_ind    : usm_ndarray  CSR column indices (int32 or int64)\n"
+            "values     : usm_ndarray  CSR non-zeros (float32 or float64)\n"
+            "x          : usm_ndarray  input vector\n"
+            "beta       : float\n"
+            "y          : usm_ndarray  output vector (in/out)\n"
+            "num_rows, num_cols, nnz : int64\n"
+            "depends    : list[sycl.Event]\n"
+            "\nReturns\n-------\n"
+            "(host_task_event, compute_event) : pair of sycl.Event",
+            py::arg("sycl_queue"),
+            py::arg("trans"),
+            py::arg("alpha"),
+            py::arg("row_ptr"),
+            py::arg("col_ind"),
+            py::arg("values"),
+            py::arg("x"),
+            py::arg("beta"),
+            py::arg("y"),
+            py::arg("num_rows"),
+            py::arg("num_cols"),
+            py::arg("nnz"),
+            py::arg("depends") = py::list());
+    }
+
+    // ------------------------------------------------------------------
+    // Runtime query: which sparse library backend is active
+    // ------------------------------------------------------------------
+    {
+        m.def(
+            "_using_onemath",
+            []() {
+#ifdef USE_ONEMATH
+                return true;
+#else
+                return false;
+#endif
+            },
+            "Return True if built against OneMath portable backend, "
+            "False if built directly against oneMKL.");
+    }
+}
diff --git a/dpnp/scipy/sparse/linalg/_iterative.py b/dpnp/scipy/sparse/linalg/_iterative.py
index c524836da8c2..50fac514f641 100644
--- a/dpnp/scipy/sparse/linalg/_iterative.py
+++ b/dpnp/scipy/sparse/linalg/_iterative.py
@@ -31,15 +31,16 @@
 
 Performance strategy
 --------------------
-* n <= _HOST_N_THRESHOLD  → delegate to scipy.sparse.linalg (CPU fast path,
+* n <= _HOST_N_THRESHOLD  -> delegate to scipy.sparse.linalg (CPU fast path,
   same philosophy as CuPy host-dispatch for small systems).
-* n >  _HOST_N_THRESHOLD  → pure dpnp path; dense operations dispatch to
+* n >  _HOST_N_THRESHOLD  -> pure dpnp path; dense operations dispatch to
   oneMKL via dpnp.dot / dpnp.linalg.norm / dpnp.vdot (BLAS level-2/3).
-* CSR sparse input        → _make_fast_matvec injects oneMKL sparse::gemv
-  (hook in place; full binding added when dpnp.scipy.sparse matures).
-* GMRES Hessenberg lstsq  → numpy.linalg.lstsq on CPU (the (restart x restart)
+* CSR sparse input        -> _make_fast_matvec injects oneMKL sparse::gemv
+  via the _sparse_impl pybind11 extension (dpnp.backend.extensions.sparse).
+  Falls back to A.dot(x) if the extension is not yet built.
+* GMRES Hessenberg lstsq  -> numpy.linalg.lstsq on CPU (the (restart x restart)
   matrix is tiny; same decision as CuPy).
-* MINRES                  → SciPy host stub (CuPy v14.0.1 has no GPU MINRES;
+* MINRES                  -> SciPy host stub (CuPy v14.0.1 has no GPU MINRES;
   a native oneMKL MINRES will be added in a future dpnp release).
 """
 
@@ -53,6 +54,18 @@
 
 from ._interface import IdentityOperator, LinearOperator, aslinearoperator
 
+# ---------------------------------------------------------------------------
+# Try to import the compiled _sparse_impl extension (oneMKL sparse::gemv).
+# If the extension has not been built yet the pure-Python / A.dot fallback
+# is used transparently — no import error is raised at module load time.
+# ---------------------------------------------------------------------------
+try:
+    from dpnp.backend.extensions.sparse import _sparse_impl as _si
+    _HAS_SPARSE_IMPL = True
+except ImportError:
+    _si = None
+    _HAS_SPARSE_IMPL = False
+
 # ---------------------------------------------------------------------------
 # Constants
 # ---------------------------------------------------------------------------
@@ -94,24 +107,54 @@ def _scipy_tol_kwarg(fn) -> str:
 
 # ---------------------------------------------------------------------------
 # oneMKL sparse SpMV hook
+# Equivalent of _cusparse.spMV_make_fast_matvec for the SYCL/oneMKL backend.
 # ---------------------------------------------------------------------------
-# CuPy equivalent: _make_fast_matvec uses cuSPARSE csrmv for CSR inputs.
-# When dpnp.scipy.sparse exposes oneMKL sparse::gemv, replace the body:
-#
-#   from dpnp.scipy.sparse.linalg._onemkl import spmv_csr
-#   return lambda x: spmv_csr(A.data, A.indices, A.indptr, x, A.shape)
-#
+
 def _make_fast_matvec(A):
-    """Return an accelerated SpMV callable for CSR sparse A, or None."""
+    """Return an accelerated SpMV callable for CSR sparse A, or None.
+
+    Priority order:
+    1. _sparse_impl._sparse_gemv  (oneMKL sparse::gemv, fully async SYCL)
+    2. A.dot                      (dpnp.scipy.sparse CSR dot, fallback)
+    3. None                       (caller will use LinearOperator.matvec)
+    """
     try:
         from dpnp.scipy import sparse as _sp
-        if _sp.issparse(A) and A.format == "csr":
-            # A.dot routes through oneMKL internally when dpnp.scipy.sparse is
-            # backed by the oneAPI DPC++ sparse BLAS.
-            return lambda x: A.dot(x)
+        if not (_sp.issparse(A) and A.format == "csr"):
+            return None
     except (ImportError, AttributeError):
-        pass
-    return None
+        return None
+
+    if _HAS_SPARSE_IMPL:
+        # --- fast path: oneMKL sparse::gemv via pybind11 ---
+        # Pull CSR arrays once; they are already in USM device memory.
+        indptr  = A.indptr          # row_ptr  — int32 or int64 USM array
+        indices = A.indices         # col_ind  — int32 or int64 USM array
+        data    = A.data            # values   — float32 or float64 USM array
+        nrows   = int(A.shape[0])
+        ncols   = int(A.shape[1])
+        nnz     = int(data.shape[0])
+
+        def _csr_spmv(x: _dpnp.ndarray) -> _dpnp.ndarray:
+            y = _dpnp.zeros(nrows, dtype=data.dtype, sycl_queue=x.sycl_queue)
+            _, ev = _si._sparse_gemv(
+                x.sycl_queue,
+                0,            # trans = NoTrans
+                1.0,          # alpha
+                indptr, indices, data,
+                x,
+                0.0,          # beta
+                y,
+                nrows, ncols, nnz,
+                [],           # depends
+            )
+            ev.wait()
+            return y
+
+        return _csr_spmv
+
+    # --- fallback: dpnp.scipy.sparse CSR dot ---
+    return lambda x: A.dot(x)
 
 
 # ---------------------------------------------------------------------------
@@ -373,9 +416,7 @@ def gmres(
             total_iters += 1
             w = M_op.matvec(A_op.matvec(V_cols[j]))
 
-            # Arnoldi step: h = V_j^H w  via single oneMKL BLAS gemv.
-            # CuPy equivalent uses cuBLAS dgemv; this uses oneMKL via dpnp.dot.
-            # Replaces the slow Python loop (vdot per column) in the initial stub.
+            # Arnoldi step: h = V_j^H w via single oneMKL BLAS gemv.
             V_mat  = _dpnp.stack(V_cols, axis=1)          # (n, j+1)
             h_dp   = _dpnp.dot(V_mat.T.conj(), w)         # (j+1,)  -- oneMKL gemv
             h_np   = h_dp.asnumpy()                        # pull tiny vector to CPU
@@ -391,8 +432,7 @@ def gmres(
             V_cols.append(w / h_j1)
             j_inner = j
 
-        # Hessenberg least-squares on CPU (the matrix is at most restart x restart;
-        # CuPy comment: "faster to solve on CPU").
+        # Hessenberg least-squares on CPU (matrix is at most restart x restart)
         k = j_inner + 1
         y_np, _, _, _ = _np.linalg.lstsq(
             H_np[:k + 1, :k], e1_np[:k + 1], rcond=None

From 4993120f36cd7d03e96302d8f47e46da81296d7e Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com>
Date: Mon, 6 Apr 2026 15:52:45 -0500
Subject: [PATCH 15/43] Remove redundant sparse_gemv.hpp passthrough header

---
 .../backend/extensions/sparse/sparse_gemv.hpp | 32 -------------------
 1 file changed, 32 deletions(-)
 delete mode 100644 dpnp/backend/extensions/sparse/sparse_gemv.hpp

diff --git a/dpnp/backend/extensions/sparse/sparse_gemv.hpp b/dpnp/backend/extensions/sparse/sparse_gemv.hpp
deleted file mode 100644
index 261c65669b65..000000000000
--- a/dpnp/backend/extensions/sparse/sparse_gemv.hpp
+++ /dev/null
@@ -1,32 +0,0 @@
-//*****************************************************************************
-// Copyright (c) 2025, Intel Corporation
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-// - Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the following disclaimer.
-// - Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the following disclaimer in the documentation
-//   and/or other materials provided with the distribution.
-// - Neither the name of the copyright holder nor the names of its contributors
-//   may be used to endorse or promote products derived from this software
-//   without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
-// THE POSSIBILITY OF SUCH DAMAGE.
-//*****************************************************************************
-
-// gemv.cpp includes this header by name; gemv.hpp holds the actual declarations.
-// Both files live in the same directory so this redirect is zero-cost.
-#pragma once
-#include "gemv.hpp"

From a7ddc1cf101c27609e905e516b8f7c9d4abb5235 Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com>
Date: Mon, 6 Apr 2026 15:54:25 -0500
Subject: [PATCH 16/43] sparse: gemv.cpp includes gemv.hpp directly

---
 dpnp/backend/extensions/sparse/gemv.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dpnp/backend/extensions/sparse/gemv.cpp b/dpnp/backend/extensions/sparse/gemv.cpp
index fa4ff2a6ebda..0375ea6649fd 100644
--- a/dpnp/backend/extensions/sparse/gemv.cpp
+++ b/dpnp/backend/extensions/sparse/gemv.cpp
@@ -29,7 +29,7 @@
 #include <stdexcept>
 #include <vector>
 
-#include "sparse_gemv.hpp"
+#include "gemv.hpp"
 
 // oneMKL sparse BLAS
 namespace mkl_sparse = oneapi::mkl::sparse;
@@ -119,7 +119,7 @@ sparse_gemv(sycl::queue &exec_q,
 
     sycl::event gemv_ev;
 
-    // Dispatch on value type × index type
+    // Dispatch on value type x index type
     // oneMKL sparse BLAS supports float32, float64 (no complex yet)
     if (val_typenum == UAR_FLOAT) {
         auto alpha_f = static_cast<float>(alpha);

From 14cb5c4df10a827bed5b4f7b440a96f5624590a9 Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com>
Date: Mon, 6 Apr 2026 15:57:02 -0500
Subject: [PATCH 17/43] sparse: capture exec_q from CSR data at closure
 construction

---
 dpnp/scipy/sparse/linalg/_iterative.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/dpnp/scipy/sparse/linalg/_iterative.py b/dpnp/scipy/sparse/linalg/_iterative.py
index 50fac514f641..fcb9de5a6b03 100644
--- a/dpnp/scipy/sparse/linalg/_iterative.py
+++ b/dpnp/scipy/sparse/linalg/_iterative.py
@@ -57,7 +57,7 @@
 # ---------------------------------------------------------------------------
 # Try to import the compiled _sparse_impl extension (oneMKL sparse::gemv).
 # If the extension has not been built yet the pure-Python / A.dot fallback
-# is used transparently — no import error is raised at module load time.
+# is used transparently - no import error is raised at module load time.
 # ---------------------------------------------------------------------------
 try:
     from dpnp.backend.extensions.sparse import _sparse_impl as _si
@@ -128,17 +128,21 @@ def _make_fast_matvec(A):
     if _HAS_SPARSE_IMPL:
         # --- fast path: oneMKL sparse::gemv via pybind11 ---
         # Pull CSR arrays once; they are already in USM device memory.
-        indptr  = A.indptr          # row_ptr  — int32 or int64 USM array
-        indices = A.indices         # col_ind  — int32 or int64 USM array
-        data    = A.data            # values   — float32 or float64 USM array
+        indptr  = A.indptr          # row_ptr  - int32 or int64 USM array
+        indices = A.indices         # col_ind  - int32 or int64 USM array
+        data    = A.data            # values   - float32 or float64 USM array
         nrows   = int(A.shape[0])
         ncols   = int(A.shape[1])
         nnz     = int(data.shape[0])
+        # Capture the SYCL queue from the matrix data array at closure-creation
+        # time, not from x at call time.  This avoids queue mismatch when x is
+        # constructed on a different (e.g. default CPU) queue.
+        exec_q  = data.sycl_queue
 
         def _csr_spmv(x: _dpnp.ndarray) -> _dpnp.ndarray:
-            y = _dpnp.zeros(nrows, dtype=data.dtype, sycl_queue=x.sycl_queue)
+            y = _dpnp.zeros(nrows, dtype=data.dtype, sycl_queue=exec_q)
             _, ev = _si._sparse_gemv(
-                x.sycl_queue,
+                exec_q,
                 0,            # trans = NoTrans
                 1.0,          # alpha
                 indptr, indices, data,

From 890238c3b4140c3fdf5d8edcd4ca35dfca5037dc Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com>
Date: Mon, 6 Apr 2026 17:10:38 -0500
Subject: [PATCH 18/43] sparse/gemv: add missing headers, input validation, and
 MKL/SYCL exception handling

Align gemv.cpp with the conventions established in blas/gemm.cpp:

Headers added:
- ext/common.hpp         (dpctl_td_ns, consistent with other extensions)
- utils/memory_overlap.hpp   (MemoryOverlap guard on x vs y)
- utils/output_validation.hpp (CheckWritable + AmpleMemory on y)
- utils/type_utils.hpp       (validate_type_for_device<T> in impl)
- <sstream>                  (needed for stringstream error_msg)

Exception handling added in sparse_gemv_impl():
- try/catch(oneapi::mkl::exception) around all oneMKL sparse calls
- try/catch(sycl::exception) around all oneMKL sparse calls
- release_matrix_handle cleanup in the exception error path
- throw std::runtime_error with descriptive message on catch

Input validation added in sparse_gemv():
- ndim checks: x and y must be 1-D
- queues_are_compatible() across all 5 USM arrays
- MemoryOverlap()(x, y) aliasing guard
- CheckWritable::throw_if_not_writable(y)
- AmpleMemory::throw_if_not_ample(y, num_rows)
- keep_args_alive() at function exit (was missing, returning empty event)
---
 dpnp/backend/extensions/sparse/gemv.cpp | 126 +++++++++++++++++++-----
 1 file changed, 102 insertions(+), 24 deletions(-)

diff --git a/dpnp/backend/extensions/sparse/gemv.cpp b/dpnp/backend/extensions/sparse/gemv.cpp
index 0375ea6649fd..5ad358729441 100644
--- a/dpnp/backend/extensions/sparse/gemv.cpp
+++ b/dpnp/backend/extensions/sparse/gemv.cpp
@@ -26,13 +26,26 @@
 // THE POSSIBILITY OF SUCH DAMAGE.
 //*****************************************************************************
 
+#include <sstream>
 #include <stdexcept>
 #include <vector>
 
+#include <pybind11/pybind11.h>
+
+// ext/common.hpp — dpctl_td_ns; mirrors every other dpnp extension
+#include "ext/common.hpp"
+
+// dpctl tensor validation and utility headers — same set as blas/gemm.cpp
+#include "utils/memory_overlap.hpp"
+#include "utils/output_validation.hpp"
+#include "utils/type_utils.hpp"
+
 #include "gemv.hpp"
 
 // oneMKL sparse BLAS
 namespace mkl_sparse = oneapi::mkl::sparse;
+namespace py = pybind11;
+namespace type_utils = dpctl::tensor::type_utils;
 
 namespace dpnp::extensions::sparse
 {
@@ -57,30 +70,60 @@ sparse_gemv_impl(sycl::queue &exec_q,
                  T *y_ptr,
                  const std::vector<sycl::event> &depends)
 {
+    // Validate that T is supported on this device (mirrors gemm_impl pattern)
+    type_utils::validate_type_for_device<T>(exec_q);
+
+    std::stringstream error_msg;
+    bool is_exception_caught = false;
+
     mkl_sparse::matrix_handle_t handle = nullptr;
-    mkl_sparse::init_matrix_handle(&handle);
+    sycl::event gemv_ev;
 
-    auto ev_set = mkl_sparse::set_csr_data(
-        exec_q, handle,
-        num_rows, num_cols,
-        oneapi::mkl::index_base::zero,
-        row_ptr_ptr, col_ind_ptr, values_ptr,
-        depends);
+    try {
+        mkl_sparse::init_matrix_handle(&handle);
 
-    // optimize_gemv performs internal analysis — amortises over repeated SpMV
-    auto ev_opt = mkl_sparse::optimize_gemv(
-        exec_q, mkl_trans, handle, {ev_set});
+        auto ev_set = mkl_sparse::set_csr_data(
+            exec_q, handle,
+            num_rows, num_cols,
+            oneapi::mkl::index_base::zero,
+            row_ptr_ptr, col_ind_ptr, values_ptr,
+            depends);
 
-    auto ev_gemv = mkl_sparse::gemv(
-        exec_q, mkl_trans,
-        alpha, handle,
-        x_ptr, beta, y_ptr,
-        {ev_opt});
+        // optimize_gemv performs internal analysis — amortises over repeated SpMV
+        auto ev_opt = mkl_sparse::optimize_gemv(
+            exec_q, mkl_trans, handle, {ev_set});
 
-    // async release — waits for ev_gemv internally
-    mkl_sparse::release_matrix_handle(exec_q, &handle, {ev_gemv});
+        gemv_ev = mkl_sparse::gemv(
+            exec_q, mkl_trans,
+            alpha, handle,
+            x_ptr, beta, y_ptr,
+            {ev_opt});
 
-    return ev_gemv;
+        // async release — waits for gemv_ev internally
+        mkl_sparse::release_matrix_handle(exec_q, &handle, {gemv_ev});
+
+    } catch (oneapi::mkl::exception const &e) {
+        error_msg
+            << "Unexpected MKL exception caught during sparse_gemv() call:"
+               "\nreason: "
+            << e.what();
+        is_exception_caught = true;
+    } catch (sycl::exception const &e) {
+        error_msg
+            << "Unexpected SYCL exception caught during sparse_gemv() call:\n"
+            << e.what();
+        is_exception_caught = true;
+    }
+
+    if (is_exception_caught) {
+        // Best-effort handle cleanup before re-raising
+        if (handle != nullptr) {
+            mkl_sparse::release_matrix_handle(exec_q, &handle, {});
+        }
+        throw std::runtime_error(error_msg.str());
+    }
+
+    return gemv_ev;
 }
 
 
@@ -103,24 +146,55 @@ sparse_gemv(sycl::queue &exec_q,
             const std::int64_t nnz,
             const std::vector<sycl::event> &depends)
 {
-    // Map trans integer to oneMKL enum
+    // --- 1. ndim checks ---
+    if (x.get_ndim() != 1) {
+        throw py::value_error("sparse_gemv: x must be a 1-D array.");
+    }
+    if (y.get_ndim() != 1) {
+        throw py::value_error("sparse_gemv: y must be a 1-D array.");
+    }
+
+    // --- 2. Queue compatibility (all USM arrays must share the same queue) ---
+    if (!dpctl::utils::queues_are_compatible(
+            exec_q,
+            {row_ptr.get_queue(), col_ind.get_queue(),
+             values.get_queue(),  x.get_queue(), y.get_queue()})) {
+        throw py::value_error(
+            "sparse_gemv: USM allocations are not compatible with the "
+            "execution queue.");
+    }
+
+    // --- 3. Memory overlap: x and y must not alias ---
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(x, y)) {
+        throw py::value_error(
+            "sparse_gemv: input array x and output array y are overlapping "
+            "segments of memory.");
+    }
+
+    // --- 4. Output writability and size ---
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(y);
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(
+        y, static_cast<std::size_t>(num_rows));
+
+    // --- 5. Map trans integer to oneMKL enum ---
     oneapi::mkl::transpose mkl_trans;
     switch (trans) {
-        case 0: mkl_trans = oneapi::mkl::transpose::nontrans; break;
-        case 1: mkl_trans = oneapi::mkl::transpose::trans; break;
+        case 0: mkl_trans = oneapi::mkl::transpose::nontrans;  break;
+        case 1: mkl_trans = oneapi::mkl::transpose::trans;     break;
         case 2: mkl_trans = oneapi::mkl::transpose::conjtrans; break;
         default:
             throw std::invalid_argument(
                 "sparse_gemv: trans must be 0 (N), 1 (T), or 2 (C)");
     }
 
+    // --- 6. Type dispatch (value type x index type) ---
+    // oneMKL sparse BLAS supports float32 and float64 (no complex yet)
     int val_typenum = values.get_typenum();
     int idx_typenum = row_ptr.get_typenum();
 
     sycl::event gemv_ev;
 
-    // Dispatch on value type x index type
-    // oneMKL sparse BLAS supports float32, float64 (no complex yet)
     if (val_typenum == UAR_FLOAT) {
         auto alpha_f = static_cast<float>(alpha);
         auto beta_f  = static_cast<float>(beta);
@@ -181,7 +255,11 @@ sparse_gemv(sycl::queue &exec_q,
             "sparse_gemv: value dtype must be float32 or float64");
     }
 
-    return std::make_pair(sycl::event{}, gemv_ev);
+    // Keep all input/output USM arrays alive until gemv_ev completes
+    sycl::event args_ev = dpctl::utils::keep_args_alive(
+        exec_q, {row_ptr, col_ind, values, x, y}, {gemv_ev});
+
+    return std::make_pair(args_ev, gemv_ev);
 }
 
 

From 838dfd8df6a6e30909823ed8ffa4ee8216e4d713 Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com>
Date: Mon, 6 Apr 2026 17:23:55 -0500
Subject: [PATCH 19/43] sparse/gemv: replace explicit if/else type dispatch
 with 2-D dispatch table
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Modeled after blas/gemm.cpp (2-D table: value type x index type) and
blas/gemv.cpp (dispatch vector pattern with ContigFactory + init_dispatch_table).

Changes:
- Add sparse/types_matrix.hpp with SparseGemvTypePairSupportFactory<Tv, Ti>
  encoding the 4 supported combinations: {float32,float64} x {int32,int64}
- Rewrite sparse_gemv_impl() to take typeless char* pointers (matching
  the blas gemv_impl signature style) — type info flows through template
  params only, no runtime branching inside the impl
- Replace the 60-line if/else val_typenum/idx_typenum chain in sparse_gemv()
  with a 2-D dispatch table lookup (gemv_dispatch_table[val_id][idx_id])
- Rename init_sparse_gemv_dispatch_vector -> init_sparse_gemv_dispatch_table
  and implement it via init_dispatch_table<> from ext/common.hpp
- All validation guards and exception handling from prior commit are preserved
---
 dpnp/backend/extensions/sparse/gemv.cpp       | 253 ++++++++----------
 .../extensions/sparse/types_matrix.hpp        |  71 +++++
 2 files changed, 189 insertions(+), 135 deletions(-)
 create mode 100644 dpnp/backend/extensions/sparse/types_matrix.hpp

diff --git a/dpnp/backend/extensions/sparse/gemv.cpp b/dpnp/backend/extensions/sparse/gemv.cpp
index 5ad358729441..c58618afd86a 100644
--- a/dpnp/backend/extensions/sparse/gemv.cpp
+++ b/dpnp/backend/extensions/sparse/gemv.cpp
@@ -28,50 +28,82 @@
 
 #include <sstream>
 #include <stdexcept>
-#include <vector>
 
 #include <pybind11/pybind11.h>
 
-// ext/common.hpp — dpctl_td_ns; mirrors every other dpnp extension
+// dpnp extension infrastructure
 #include "ext/common.hpp"
 
-// dpctl tensor validation and utility headers — same set as blas/gemm.cpp
+// dpctl tensor validation and utility headers
 #include "utils/memory_overlap.hpp"
 #include "utils/output_validation.hpp"
 #include "utils/type_utils.hpp"
 
 #include "gemv.hpp"
+#include "types_matrix.hpp"
 
-// oneMKL sparse BLAS
 namespace mkl_sparse = oneapi::mkl::sparse;
-namespace py = pybind11;
+namespace py         = pybind11;
 namespace type_utils = dpctl::tensor::type_utils;
 
+using ext::common::init_dispatch_table;
+
 namespace dpnp::extensions::sparse
 {
 
 // ---------------------------------------------------------------------------
-// Type-dispatched implementation: y = alpha * op(A) * x + beta * y
+// Dispatch table: [value_type_id][index_type_id] -> impl function pointer
+// Mirrors the 2-D table pattern of blas/gemm.cpp.
+// ---------------------------------------------------------------------------
+
+typedef sycl::event (*gemv_impl_fn_ptr_t)(
+    sycl::queue &,
+    oneapi::mkl::transpose,
+    double,                        // alpha (always passed as double; cast inside)
+    const char *,                  // row_ptr  (typeless)
+    const char *,                  // col_ind  (typeless)
+    const char *,                  // values   (typeless)
+    std::int64_t,                  // num_rows
+    std::int64_t,                  // num_cols
+    std::int64_t,                  // nnz
+    const char *,                  // x        (typeless)
+    double,                        // beta     (always passed as double; cast inside)
+    char *,                        // y        (typeless, writable)
+    const std::vector<sycl::event> &);
+
+static gemv_impl_fn_ptr_t
+    gemv_dispatch_table[dpctl_td_ns::num_types][dpctl_td_ns::num_types];
+
+
+// ---------------------------------------------------------------------------
+// Typed implementation — one instantiation per (Tv, Ti) pair
 // ---------------------------------------------------------------------------
 
-template <typename T, typename intType>
+template <typename Tv, typename Ti>
 static sycl::event
-sparse_gemv_impl(sycl::queue &exec_q,
-                 oneapi::mkl::transpose mkl_trans,
-                 T alpha,
-                 intType *row_ptr_ptr,
-                 intType *col_ind_ptr,
-                 T *values_ptr,
-                 std::int64_t num_rows,
-                 std::int64_t num_cols,
-                 std::int64_t nnz,
-                 T *x_ptr,
-                 T beta,
-                 T *y_ptr,
-                 const std::vector<sycl::event> &depends)
+gemv_impl(sycl::queue                      &exec_q,
+          oneapi::mkl::transpose            mkl_trans,
+          double                            alpha_d,
+          const char                       *row_ptr_data,
+          const char                       *col_ind_data,
+          const char                       *values_data,
+          std::int64_t                      num_rows,
+          std::int64_t                      num_cols,
+          std::int64_t                      nnz,
+          const char                       *x_data,
+          double                            beta_d,
+          char                             *y_data,
+          const std::vector<sycl::event>   &depends)
 {
-    // Validate that T is supported on this device (mirrors gemm_impl pattern)
-    type_utils::validate_type_for_device<T>(exec_q);
+    type_utils::validate_type_for_device<Tv>(exec_q);
+
+    const Tv  alpha = static_cast<Tv>(alpha_d);
+    const Tv  beta  = static_cast<Tv>(beta_d);
+    const Ti *row_ptr = reinterpret_cast<const Ti *>(row_ptr_data);
+    const Ti *col_ind = reinterpret_cast<const Ti *>(col_ind_data);
+    const Tv *values  = reinterpret_cast<const Tv *>(values_data);
+    const Tv *x       = reinterpret_cast<const Tv *>(x_data);
+    Tv       *y       = reinterpret_cast<Tv *>(y_data);
 
     std::stringstream error_msg;
     bool is_exception_caught = false;
@@ -86,40 +118,35 @@ sparse_gemv_impl(sycl::queue &exec_q,
             exec_q, handle,
             num_rows, num_cols,
             oneapi::mkl::index_base::zero,
-            row_ptr_ptr, col_ind_ptr, values_ptr,
+            const_cast<Ti *>(row_ptr),
+            const_cast<Ti *>(col_ind),
+            const_cast<Tv *>(values),
             depends);
 
-        // optimize_gemv performs internal analysis — amortises over repeated SpMV
         auto ev_opt = mkl_sparse::optimize_gemv(
             exec_q, mkl_trans, handle, {ev_set});
 
         gemv_ev = mkl_sparse::gemv(
             exec_q, mkl_trans,
             alpha, handle,
-            x_ptr, beta, y_ptr,
+            x, beta, y,
             {ev_opt});
 
-        // async release — waits for gemv_ev internally
         mkl_sparse::release_matrix_handle(exec_q, &handle, {gemv_ev});
 
     } catch (oneapi::mkl::exception const &e) {
-        error_msg
-            << "Unexpected MKL exception caught during sparse_gemv() call:"
-               "\nreason: "
-            << e.what();
+        error_msg << "Unexpected MKL exception caught during sparse_gemv() "
+                     "call:\nreason: " << e.what();
         is_exception_caught = true;
     } catch (sycl::exception const &e) {
-        error_msg
-            << "Unexpected SYCL exception caught during sparse_gemv() call:\n"
-            << e.what();
+        error_msg << "Unexpected SYCL exception caught during sparse_gemv() "
+                     "call:\n" << e.what();
         is_exception_caught = true;
     }
 
     if (is_exception_caught) {
-        // Best-effort handle cleanup before re-raising
-        if (handle != nullptr) {
+        if (handle != nullptr)
             mkl_sparse::release_matrix_handle(exec_q, &handle, {});
-        }
         throw std::runtime_error(error_msg.str());
     }
 
@@ -128,56 +155,51 @@ sparse_gemv_impl(sycl::queue &exec_q,
 
 
 // ---------------------------------------------------------------------------
-// Python-facing function
+// Python-facing entry point
 // ---------------------------------------------------------------------------
 
 std::pair<sycl::event, sycl::event>
-sparse_gemv(sycl::queue &exec_q,
-            const int trans,
-            const double alpha,
-            const dpctl::tensor::usm_ndarray &row_ptr,
-            const dpctl::tensor::usm_ndarray &col_ind,
-            const dpctl::tensor::usm_ndarray &values,
-            const dpctl::tensor::usm_ndarray &x,
-            const double beta,
-            const dpctl::tensor::usm_ndarray &y,
-            const std::int64_t num_rows,
-            const std::int64_t num_cols,
-            const std::int64_t nnz,
-            const std::vector<sycl::event> &depends)
+sparse_gemv(sycl::queue                           &exec_q,
+            const int                              trans,
+            const double                           alpha,
+            const dpctl::tensor::usm_ndarray      &row_ptr,
+            const dpctl::tensor::usm_ndarray      &col_ind,
+            const dpctl::tensor::usm_ndarray      &values,
+            const dpctl::tensor::usm_ndarray      &x,
+            const double                           beta,
+            const dpctl::tensor::usm_ndarray      &y,
+            const std::int64_t                     num_rows,
+            const std::int64_t                     num_cols,
+            const std::int64_t                     nnz,
+            const std::vector<sycl::event>        &depends)
 {
-    // --- 1. ndim checks ---
-    if (x.get_ndim() != 1) {
+    // 1. ndim checks
+    if (x.get_ndim() != 1)
         throw py::value_error("sparse_gemv: x must be a 1-D array.");
-    }
-    if (y.get_ndim() != 1) {
+    if (y.get_ndim() != 1)
         throw py::value_error("sparse_gemv: y must be a 1-D array.");
-    }
 
-    // --- 2. Queue compatibility (all USM arrays must share the same queue) ---
+    // 2. Queue compatibility
     if (!dpctl::utils::queues_are_compatible(
-            exec_q,
-            {row_ptr.get_queue(), col_ind.get_queue(),
-             values.get_queue(),  x.get_queue(), y.get_queue()})) {
+            exec_q, {row_ptr.get_queue(), col_ind.get_queue(),
+                     values.get_queue(), x.get_queue(), y.get_queue()}))
         throw py::value_error(
             "sparse_gemv: USM allocations are not compatible with the "
             "execution queue.");
-    }
 
-    // --- 3. Memory overlap: x and y must not alias ---
+    // 3. Memory overlap: x and y must not alias
     auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
-    if (overlap(x, y)) {
+    if (overlap(x, y))
         throw py::value_error(
             "sparse_gemv: input array x and output array y are overlapping "
             "segments of memory.");
-    }
 
-    // --- 4. Output writability and size ---
+    // 4. Output writability and size
     dpctl::tensor::validation::CheckWritable::throw_if_not_writable(y);
     dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(
         y, static_cast<std::size_t>(num_rows));
 
-    // --- 5. Map trans integer to oneMKL enum ---
+    // 5. Map trans integer to oneMKL enum
     oneapi::mkl::transpose mkl_trans;
     switch (trans) {
         case 0: mkl_trans = oneapi::mkl::transpose::nontrans;  break;
@@ -188,74 +210,24 @@ sparse_gemv(sycl::queue &exec_q,
                 "sparse_gemv: trans must be 0 (N), 1 (T), or 2 (C)");
     }
 
-    // --- 6. Type dispatch (value type x index type) ---
-    // oneMKL sparse BLAS supports float32 and float64 (no complex yet)
-    int val_typenum = values.get_typenum();
-    int idx_typenum = row_ptr.get_typenum();
+    // 6. Dispatch table lookup — replaces the explicit if/else chain
+    auto array_types = dpctl_td_ns::usm_ndarray_types();
+    const int val_id = array_types.typenum_to_lookup_id(values.get_typenum());
+    const int idx_id = array_types.typenum_to_lookup_id(row_ptr.get_typenum());
 
-    sycl::event gemv_ev;
-
-    if (val_typenum == UAR_FLOAT) {
-        auto alpha_f = static_cast<float>(alpha);
-        auto beta_f  = static_cast<float>(beta);
+    gemv_impl_fn_ptr_t gemv_fn = gemv_dispatch_table[val_id][idx_id];
+    if (gemv_fn == nullptr)
+        throw py::value_error(
+            "sparse_gemv: no implementation for the given value/index dtype "
+            "combination. Supported: float32/float64 with int32/int64 indices.");
 
-        if (idx_typenum == UAR_INT32) {
-            gemv_ev = sparse_gemv_impl<float, std::int32_t>(
-                exec_q, mkl_trans, alpha_f,
-                row_ptr.get_data<std::int32_t>(),
-                col_ind.get_data<std::int32_t>(),
-                values.get_data<float>(),
-                num_rows, num_cols, nnz,
-                x.get_data<float>(), beta_f,
-                y.get_data<float>(), depends);
-        }
-        else if (idx_typenum == UAR_INT64) {
-            gemv_ev = sparse_gemv_impl<float, std::int64_t>(
-                exec_q, mkl_trans, alpha_f,
-                row_ptr.get_data<std::int64_t>(),
-                col_ind.get_data<std::int64_t>(),
-                values.get_data<float>(),
-                num_rows, num_cols, nnz,
-                x.get_data<float>(), beta_f,
-                y.get_data<float>(), depends);
-        }
-        else {
-            throw std::runtime_error(
-                "sparse_gemv: index dtype must be int32 or int64");
-        }
-    }
-    else if (val_typenum == UAR_DOUBLE) {
-        if (idx_typenum == UAR_INT32) {
-            gemv_ev = sparse_gemv_impl<double, std::int32_t>(
-                exec_q, mkl_trans, alpha,
-                row_ptr.get_data<std::int32_t>(),
-                col_ind.get_data<std::int32_t>(),
-                values.get_data<double>(),
-                num_rows, num_cols, nnz,
-                x.get_data<double>(), beta,
-                y.get_data<double>(), depends);
-        }
-        else if (idx_typenum == UAR_INT64) {
-            gemv_ev = sparse_gemv_impl<double, std::int64_t>(
-                exec_q, mkl_trans, alpha,
-                row_ptr.get_data<std::int64_t>(),
-                col_ind.get_data<std::int64_t>(),
-                values.get_data<double>(),
+    sycl::event gemv_ev =
+        gemv_fn(exec_q, mkl_trans, alpha,
+                row_ptr.get_data(), col_ind.get_data(), values.get_data(),
                 num_rows, num_cols, nnz,
-                x.get_data<double>(), beta,
-                y.get_data<double>(), depends);
-        }
-        else {
-            throw std::runtime_error(
-                "sparse_gemv: index dtype must be int32 or int64");
-        }
-    }
-    else {
-        throw std::runtime_error(
-            "sparse_gemv: value dtype must be float32 or float64");
-    }
+                x.get_data(), beta, y.get_data(),
+                depends);
 
-    // Keep all input/output USM arrays alive until gemv_ev completes
     sycl::event args_ev = dpctl::utils::keep_args_alive(
         exec_q, {row_ptr, col_ind, values, x, y}, {gemv_ev});
 
@@ -264,15 +236,26 @@ sparse_gemv(sycl::queue &exec_q,
 
 
 // ---------------------------------------------------------------------------
-// Dispatch vector init (placeholder — matches blas convention)
+// Factory and dispatch table initialisation
+// Mirrors blas/gemm.cpp: GemmContigFactory -> GemvContigFactory
 // ---------------------------------------------------------------------------
 
-void init_sparse_gemv_dispatch_vector(void)
+template <typename fnT, typename Tv, typename Ti>
+struct GemvContigFactory
+{
+    fnT get()
+    {
+        if constexpr (types::SparseGemvTypePairSupportFactory<Tv, Ti>::is_defined)
+            return gemv_impl<Tv, Ti>;
+        else
+            return nullptr;
+    }
+};
+
+void init_sparse_gemv_dispatch_table(void)
 {
-    // No dispatch table needed for sparse_gemv since we do explicit
-    // type switching in the function body (oneMKL sparse API uses
-    // opaque handles, not templated dispatch tables).
-    // This function exists to match the dpnp extension convention.
+    init_dispatch_table<gemv_impl_fn_ptr_t, GemvContigFactory>(
+        gemv_dispatch_table);
 }
 
 } // namespace dpnp::extensions::sparse
diff --git a/dpnp/backend/extensions/sparse/types_matrix.hpp b/dpnp/backend/extensions/sparse/types_matrix.hpp
new file mode 100644
index 000000000000..5abdef85db3c
--- /dev/null
+++ b/dpnp/backend/extensions/sparse/types_matrix.hpp
@@ -0,0 +1,71 @@
+//*****************************************************************************
+// Copyright (c) 2025, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+
+#pragma once
+
+#include <cstdint>
+#include <type_traits>
+
+// dpctl tensor headers
+#include "utils/type_dispatch.hpp"
+
+// dpctl namespace alias for type dispatch utilities
+namespace dpctl_td_ns = dpctl::tensor::type_dispatch;
+
+namespace dpnp::extensions::sparse::types
+{
+
+/**
+ * @brief Factory encoding the supported (value type, index type) combinations
+ * for oneapi::mkl::sparse::gemv.
+ *
+ * oneMKL sparse BLAS supports:
+ *   - float32  with int32 indices
+ *   - float32  with int64 indices
+ *   - float64  with int32 indices
+ *   - float64  with int64 indices
+ *
+ * Complex value types and other index widths are not supported by
+ * oneapi::mkl::sparse::gemv and are intentionally excluded.
+ *
+ * @tparam Tv  Value type of the sparse matrix and dense vectors.
+ * @tparam Ti  Index type of the sparse matrix (row_ptr / col_ind arrays).
+ */
+template <typename Tv, typename Ti>
+struct SparseGemvTypePairSupportFactory
+{
+    static constexpr bool is_defined = std::disjunction<
+        dpctl_td_ns::TypePairDefinedEntry<Tv, float,  Ti, std::int32_t>,
+        dpctl_td_ns::TypePairDefinedEntry<Tv, float,  Ti, std::int64_t>,
+        dpctl_td_ns::TypePairDefinedEntry<Tv, double, Ti, std::int32_t>,
+        dpctl_td_ns::TypePairDefinedEntry<Tv, double, Ti, std::int64_t>,
+        // fall-through
+        dpctl_td_ns::NotDefinedEntry>::is_defined;
+};
+
+} // namespace dpnp::extensions::sparse::types

From 7bc86c9735cbdba955e11ed7d034a8e2e0f41e10 Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com>
Date: Mon, 6 Apr 2026 20:25:05 -0500
Subject: [PATCH 20/43] sparse/gemv.hpp: rename
 init_sparse_gemv_dispatch_vector -> init_sparse_gemv_dispatch_table

Follows the rename made in gemv.cpp when the dispatch mechanism was
changed from a 1-D vector to a 2-D table (value type x index type).
All other declarations (sparse_gemv signature, parameters) are unchanged.
---
 dpnp/backend/extensions/sparse/gemv.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dpnp/backend/extensions/sparse/gemv.hpp b/dpnp/backend/extensions/sparse/gemv.hpp
index 518355b4b41e..cd647e6c1734 100644
--- a/dpnp/backend/extensions/sparse/gemv.hpp
+++ b/dpnp/backend/extensions/sparse/gemv.hpp
@@ -51,6 +51,6 @@ sparse_gemv(sycl::queue &exec_q,
             const std::int64_t nnz,
             const std::vector<sycl::event> &depends);
 
-extern void init_sparse_gemv_dispatch_vector(void);
+extern void init_sparse_gemv_dispatch_table(void);
 
 } // namespace dpnp::extensions::sparse

From 6136da2442c73487f60792f3a3a07953d01e9ce6 Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com>
Date: Mon, 6 Apr 2026 20:35:26 -0500
Subject: [PATCH 21/43] sparse/gemv: fix deprecated set_csr_data and unused nnz
 warning

The oneMKL 2025-2 sparse BLAS API deprecated the old 8-argument
set_csr_data(queue, handle, nrows, ncols, index_base, row_ptr, col_ind,
values, deps) overload in favour of a new signature that takes the
sparse matrix handle as `spmat` and adds an explicit `nnz` argument:

  set_csr_data(queue, spmat, nrows, ncols, nnz, index_base,
               row_ptr, col_ind, values, deps)

Fixes:
- Replace old set_csr_data call with the new nnz-aware signature
- Silences the resulting -Wunused-parameter warning on `nnz` (now used)
- No functional change; all other logic is unchanged
---
 dpnp/backend/extensions/sparse/gemv.cpp | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/dpnp/backend/extensions/sparse/gemv.cpp b/dpnp/backend/extensions/sparse/gemv.cpp
index c58618afd86a..1adcca22339b 100644
--- a/dpnp/backend/extensions/sparse/gemv.cpp
+++ b/dpnp/backend/extensions/sparse/gemv.cpp
@@ -108,15 +108,17 @@ gemv_impl(sycl::queue                      &exec_q,
     std::stringstream error_msg;
     bool is_exception_caught = false;
 
-    mkl_sparse::matrix_handle_t handle = nullptr;
+    mkl_sparse::matrix_handle_t spmat = nullptr;
     sycl::event gemv_ev;
 
     try {
-        mkl_sparse::init_matrix_handle(&handle);
+        mkl_sparse::init_matrix_handle(&spmat);
 
+        // oneMKL 2025-2 API: set_csr_data now requires explicit nnz and uses
+        // `spmat` nomenclature. The old form without nnz is deprecated.
         auto ev_set = mkl_sparse::set_csr_data(
-            exec_q, handle,
-            num_rows, num_cols,
+            exec_q, spmat,
+            num_rows, num_cols, nnz,
             oneapi::mkl::index_base::zero,
             const_cast<Ti *>(row_ptr),
             const_cast<Ti *>(col_ind),
@@ -124,15 +126,15 @@ gemv_impl(sycl::queue                      &exec_q,
             depends);
 
         auto ev_opt = mkl_sparse::optimize_gemv(
-            exec_q, mkl_trans, handle, {ev_set});
+            exec_q, mkl_trans, spmat, {ev_set});
 
         gemv_ev = mkl_sparse::gemv(
             exec_q, mkl_trans,
-            alpha, handle,
+            alpha, spmat,
             x, beta, y,
             {ev_opt});
 
-        mkl_sparse::release_matrix_handle(exec_q, &handle, {gemv_ev});
+        mkl_sparse::release_matrix_handle(exec_q, &spmat, {gemv_ev});
 
     } catch (oneapi::mkl::exception const &e) {
         error_msg << "Unexpected MKL exception caught during sparse_gemv() "
@@ -145,8 +147,8 @@ gemv_impl(sycl::queue                      &exec_q,
     }
 
     if (is_exception_caught) {
-        if (handle != nullptr)
-            mkl_sparse::release_matrix_handle(exec_q, &handle, {});
+        if (spmat != nullptr)
+            mkl_sparse::release_matrix_handle(exec_q, &spmat, {});
         throw std::runtime_error(error_msg.str());
     }
 
@@ -210,7 +212,7 @@ sparse_gemv(sycl::queue                           &exec_q,
                 "sparse_gemv: trans must be 0 (N), 1 (T), or 2 (C)");
     }
 
-    // 6. Dispatch table lookup — replaces the explicit if/else chain
+    // 6. Dispatch table lookup
     auto array_types = dpctl_td_ns::usm_ndarray_types();
     const int val_id = array_types.typenum_to_lookup_id(values.get_typenum());
     const int idx_id = array_types.typenum_to_lookup_id(row_ptr.get_typenum());
@@ -237,7 +239,6 @@ sparse_gemv(sycl::queue                           &exec_q,
 
 // ---------------------------------------------------------------------------
 // Factory and dispatch table initialisation
-// Mirrors blas/gemm.cpp: GemmContigFactory -> GemvContigFactory
 // ---------------------------------------------------------------------------
 
 template <typename fnT, typename Tv, typename Ti>

From ed58333fed9f75fa58858d0b31dc5f31fdb4bee2 Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <abagusetty@anl.gov>
Date: Tue, 7 Apr 2026 01:44:50 +0000
Subject: [PATCH 22/43] minor cleanup for sparse extensions

---
 dpnp/CMakeLists.txt                           |  1 +
 dpnp/backend/extensions/sparse/CMakeLists.txt | 11 +++---
 dpnp/backend/extensions/sparse/gemv.cpp       | 39 ++-----------------
 dpnp/backend/extensions/sparse/sparse_py.cpp  | 16 +-------
 4 files changed, 13 insertions(+), 54 deletions(-)

diff --git a/dpnp/CMakeLists.txt b/dpnp/CMakeLists.txt
index 6850b799735c..cfced6b4ae44 100644
--- a/dpnp/CMakeLists.txt
+++ b/dpnp/CMakeLists.txt
@@ -100,6 +100,7 @@ add_subdirectory(backend/extensions/statistics)
 add_subdirectory(backend/extensions/ufunc)
 add_subdirectory(backend/extensions/vm)
 add_subdirectory(backend/extensions/window)
+add_subdirectory(backend/extensions/sparse)
 
 add_subdirectory(dpnp_algo)
 add_subdirectory(dpnp_utils)
diff --git a/dpnp/backend/extensions/sparse/CMakeLists.txt b/dpnp/backend/extensions/sparse/CMakeLists.txt
index 549437b6aad3..49f97b58b496 100644
--- a/dpnp/backend/extensions/sparse/CMakeLists.txt
+++ b/dpnp/backend/extensions/sparse/CMakeLists.txt
@@ -37,6 +37,7 @@ pybind11_add_module(${python_module_name} MODULE ${_module_src})
 add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src})
 
 if(_dpnp_sycl_targets)
+    # make fat binary
     target_compile_options(
         ${python_module_name}
         PRIVATE ${_dpnp_sycl_target_compile_options}
@@ -45,7 +46,9 @@ if(_dpnp_sycl_targets)
 endif()
 
 if(WIN32)
-    if(${CMAKE_VERSION} VERSION_LESS "3.27")
+  if(${CMAKE_VERSION} VERSION_LESS "3.27")
+        # this is a work-around for target_link_options inserting option after -link option, cause
+        # linker to ignore it.
         set(CMAKE_CXX_LINK_FLAGS
             "${CMAKE_CXX_LINK_FLAGS} -fsycl-device-code-split=per_kernel"
         )
@@ -62,13 +65,11 @@ target_include_directories(
     PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common
 )
 
+# treat below headers as system to suppress the warnings there during the build
 target_include_directories(
     ${python_module_name}
     SYSTEM
-    PRIVATE
-        ${SYCL_INCLUDE_DIR}
-        ${Dpctl_INCLUDE_DIRS}
-        ${Dpctl_TENSOR_INCLUDE_DIR}
+    PRIVATE ${SYCL_INCLUDE_DIR} ${Dpctl_INCLUDE_DIRS} ${Dpctl_TENSOR_INCLUDE_DIR}
 )
 
 if(WIN32)
diff --git a/dpnp/backend/extensions/sparse/gemv.cpp b/dpnp/backend/extensions/sparse/gemv.cpp
index 1adcca22339b..0e8b22e0fa50 100644
--- a/dpnp/backend/extensions/sparse/gemv.cpp
+++ b/dpnp/backend/extensions/sparse/gemv.cpp
@@ -26,15 +26,14 @@
 // THE POSSIBILITY OF SUCH DAMAGE.
 //*****************************************************************************
 
-#include <sstream>
 #include <stdexcept>
 
 #include <pybind11/pybind11.h>
 
-// dpnp extension infrastructure
+// utils extension header
 #include "ext/common.hpp"
 
-// dpctl tensor validation and utility headers
+// dpctl tensor headers
 #include "utils/memory_overlap.hpp"
 #include "utils/output_validation.hpp"
 #include "utils/type_utils.hpp"
@@ -42,20 +41,14 @@
 #include "gemv.hpp"
 #include "types_matrix.hpp"
 
+namespace dpnp::extensions::sparse
+{
 namespace mkl_sparse = oneapi::mkl::sparse;
 namespace py         = pybind11;
 namespace type_utils = dpctl::tensor::type_utils;
 
 using ext::common::init_dispatch_table;
 
-namespace dpnp::extensions::sparse
-{
-
-// ---------------------------------------------------------------------------
-// Dispatch table: [value_type_id][index_type_id] -> impl function pointer
-// Mirrors the 2-D table pattern of blas/gemm.cpp.
-// ---------------------------------------------------------------------------
-
 typedef sycl::event (*gemv_impl_fn_ptr_t)(
     sycl::queue &,
     oneapi::mkl::transpose,
@@ -74,11 +67,6 @@ typedef sycl::event (*gemv_impl_fn_ptr_t)(
 static gemv_impl_fn_ptr_t
     gemv_dispatch_table[dpctl_td_ns::num_types][dpctl_td_ns::num_types];
 
-
-// ---------------------------------------------------------------------------
-// Typed implementation — one instantiation per (Tv, Ti) pair
-// ---------------------------------------------------------------------------
-
 template <typename Tv, typename Ti>
 static sycl::event
 gemv_impl(sycl::queue                      &exec_q,
@@ -114,8 +102,6 @@ gemv_impl(sycl::queue                      &exec_q,
     try {
         mkl_sparse::init_matrix_handle(&spmat);
 
-        // oneMKL 2025-2 API: set_csr_data now requires explicit nnz and uses
-        // `spmat` nomenclature. The old form without nnz is deprecated.
         auto ev_set = mkl_sparse::set_csr_data(
             exec_q, spmat,
             num_rows, num_cols, nnz,
@@ -155,11 +141,6 @@ gemv_impl(sycl::queue                      &exec_q,
     return gemv_ev;
 }
 
-
-// ---------------------------------------------------------------------------
-// Python-facing entry point
-// ---------------------------------------------------------------------------
-
 std::pair<sycl::event, sycl::event>
 sparse_gemv(sycl::queue                           &exec_q,
             const int                              trans,
@@ -175,13 +156,11 @@ sparse_gemv(sycl::queue                           &exec_q,
             const std::int64_t                     nnz,
             const std::vector<sycl::event>        &depends)
 {
-    // 1. ndim checks
     if (x.get_ndim() != 1)
         throw py::value_error("sparse_gemv: x must be a 1-D array.");
     if (y.get_ndim() != 1)
         throw py::value_error("sparse_gemv: y must be a 1-D array.");
 
-    // 2. Queue compatibility
     if (!dpctl::utils::queues_are_compatible(
             exec_q, {row_ptr.get_queue(), col_ind.get_queue(),
                      values.get_queue(), x.get_queue(), y.get_queue()}))
@@ -189,19 +168,16 @@ sparse_gemv(sycl::queue                           &exec_q,
             "sparse_gemv: USM allocations are not compatible with the "
             "execution queue.");
 
-    // 3. Memory overlap: x and y must not alias
     auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
     if (overlap(x, y))
         throw py::value_error(
             "sparse_gemv: input array x and output array y are overlapping "
             "segments of memory.");
 
-    // 4. Output writability and size
     dpctl::tensor::validation::CheckWritable::throw_if_not_writable(y);
     dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(
         y, static_cast<std::size_t>(num_rows));
 
-    // 5. Map trans integer to oneMKL enum
     oneapi::mkl::transpose mkl_trans;
     switch (trans) {
         case 0: mkl_trans = oneapi::mkl::transpose::nontrans;  break;
@@ -212,7 +188,6 @@ sparse_gemv(sycl::queue                           &exec_q,
                 "sparse_gemv: trans must be 0 (N), 1 (T), or 2 (C)");
     }
 
-    // 6. Dispatch table lookup
     auto array_types = dpctl_td_ns::usm_ndarray_types();
     const int val_id = array_types.typenum_to_lookup_id(values.get_typenum());
     const int idx_id = array_types.typenum_to_lookup_id(row_ptr.get_typenum());
@@ -236,11 +211,6 @@ sparse_gemv(sycl::queue                           &exec_q,
     return std::make_pair(args_ev, gemv_ev);
 }
 
-
-// ---------------------------------------------------------------------------
-// Factory and dispatch table initialisation
-// ---------------------------------------------------------------------------
-
 template <typename fnT, typename Tv, typename Ti>
 struct GemvContigFactory
 {
@@ -258,5 +228,4 @@ void init_sparse_gemv_dispatch_table(void)
     init_dispatch_table<gemv_impl_fn_ptr_t, GemvContigFactory>(
         gemv_dispatch_table);
 }
-
 } // namespace dpnp::extensions::sparse
diff --git a/dpnp/backend/extensions/sparse/sparse_py.cpp b/dpnp/backend/extensions/sparse/sparse_py.cpp
index b2cc40e0bd2e..35f40d6bad18 100644
--- a/dpnp/backend/extensions/sparse/sparse_py.cpp
+++ b/dpnp/backend/extensions/sparse/sparse_py.cpp
@@ -26,9 +26,7 @@
 // THE POSSIBILITY OF SUCH DAMAGE.
 //*****************************************************************************
 //
-// Defines the dpnp.backend._sparse_impl pybind11 extension module.
-// Provides oneMKL sparse BLAS operations on CSR matrices over dpctl USM arrays.
-// Equivalent role to _cusparse for the SYCL/oneMKL backend.
+// This file defines functions of dpnp.backend._sparse_impl extensions
 //
 //*****************************************************************************
 
@@ -42,7 +40,7 @@ namespace py = pybind11;
 
 static void init_dispatch_vectors_tables(void)
 {
-    sparse_ns::init_sparse_gemv_dispatch_vector();
+    sparse_ns::init_sparse_gemv_dispatch_table();
 }
 
 PYBIND11_MODULE(_sparse_impl, m)
@@ -52,13 +50,6 @@ PYBIND11_MODULE(_sparse_impl, m)
     using arrayT     = dpctl::tensor::usm_ndarray;
     using event_vecT = std::vector<sycl::event>;
 
-    // ------------------------------------------------------------------
-    // _sparse_gemv — CSR SpMV:  y = alpha * op(A) * x + beta * y
-    //
-    // Equivalent to _cusparse.spMV_make_fast_matvec for the SYCL stack.
-    // Backed by oneMKL sparse::gemv with set_csr_data + optimize_gemv so
-    // matrix-handle analysis is amortised across repeated calls.
-    // ------------------------------------------------------------------
     {
         m.def(
             "_sparse_gemv",
@@ -113,9 +104,6 @@ PYBIND11_MODULE(_sparse_impl, m)
             py::arg("depends") = py::list());
     }
 
-    // ------------------------------------------------------------------
-    // Runtime query: which sparse library backend is active
-    // ------------------------------------------------------------------
     {
         m.def(
             "_using_onemath",

From 0a32b5729c2bce48c11f24ebdcc3e25eab14e02e Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com>
Date: Mon, 6 Apr 2026 21:04:37 -0500
Subject: [PATCH 23/43] Fix SyntaxError: remove stray backslash in
 aslinearoperator hasattr string

Line 477: `hasattr(A, "rmatmat\")` had a Markdown-escaped backslash
leaked into the Python source, causing an unterminated string literal.
Fixed to `hasattr(A, "rmatmat")`.
---
 dpnp/scipy/sparse/linalg/_interface.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dpnp/scipy/sparse/linalg/_interface.py b/dpnp/scipy/sparse/linalg/_interface.py
index cbf6592f1938..6596379b9fa6 100644
--- a/dpnp/scipy/sparse/linalg/_interface.py
+++ b/dpnp/scipy/sparse/linalg/_interface.py
@@ -503,7 +503,7 @@ def aslinearoperator(A) -> LinearOperator:
         matvec  = A.matvec  if hasattr(A, "matvec")  else (lambda x: A @ x)
         rmatvec = A.rmatvec if hasattr(A, "rmatvec") else None
         matmat  = A.matmat  if hasattr(A, "matmat")  else None
-        rmatmat = A.rmatmat if hasattr(A, "rmatmat\") else None
+        rmatmat = A.rmatmat if hasattr(A, "rmatmat") else None
         return LinearOperator(
             (m, n),
             matvec=matvec,

From 69103324920205dabae27a76b84c7270ae81523a Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com>
Date: Mon, 6 Apr 2026 21:09:09 -0500
Subject: [PATCH 24/43] Fix tests: replace numpy.asarray(dpnp_arr) with
 dpnp_arr.asnumpy()

dpnp.ndarray blocks implicit NumPy conversion via __array__ to prevent
silent dtype=object arrays. All test assertions must use .asnumpy()
to materialize device arrays onto the host explicitly.

Also replaces numpy.asarray(x_dp) in _rel_residual helper.
---
 dpnp/tests/test_scipy_sparse_linalg.py | 85 ++++++++++++++------------
 1 file changed, 46 insertions(+), 39 deletions(-)

diff --git a/dpnp/tests/test_scipy_sparse_linalg.py b/dpnp/tests/test_scipy_sparse_linalg.py
index 3e9cd2088156..7db100a69181 100644
--- a/dpnp/tests/test_scipy_sparse_linalg.py
+++ b/dpnp/tests/test_scipy_sparse_linalg.py
@@ -28,6 +28,11 @@
 
 The test structure and helper usage mirror dpnp/tests/test_linalg.py so that
 the suite fits naturally into the existing CI infrastructure.
+
+Note: dpnp.ndarray deliberately blocks implicit numpy conversion (raises
+TypeError in __array__) to prevent silent dtype=object arrays.  All
+assertions that need a host-side NumPy array must call `arr.asnumpy()`
+explicitly instead of `numpy.asarray(arr)`.
 """
 
 import numpy
@@ -54,6 +59,13 @@
 # Helpers
 # ---------------------------------------------------------------------------
 
+def _to_numpy(x):
+    """Convert a dpnp array (or plain numpy array) to numpy safely."""
+    if isinstance(x, dpnp.ndarray):
+        return x.asnumpy()
+    return numpy.asarray(x)
+
+
 def _make_spd(n, dtype, rng):
     """Return a symmetric positive-definite matrix of size n."""
     A = rng.standard_normal((n, n)).astype(dtype)
@@ -76,7 +88,7 @@ def _make_nonsym(n, dtype, rng):
 
 def _rel_residual(A_np, x_dp, b_np):
     """Relative residual ||Ax - b|| / ||b||."""
-    x_np = numpy.asarray(x_dp)
+    x_np = _to_numpy(x_dp)
     r = A_np @ x_np - b_np
     b_nrm = numpy.linalg.norm(b_np)
     return numpy.linalg.norm(r) / (b_nrm if b_nrm > 0 else 1.0)
@@ -118,7 +130,7 @@ def test_matvec_identity(self, n):
         op = LinearOperator((n, n), matvec=lambda x: A_dp @ x)
         x_dp = dpnp.arange(n, dtype=numpy.float64)
         y_dp = op.matvec(x_dp)
-        assert_allclose(numpy.asarray(y_dp), numpy.asarray(x_dp), rtol=1e-12)
+        assert_allclose(_to_numpy(y_dp), _to_numpy(x_dp), rtol=1e-12)
 
     @pytest.mark.parametrize("dtype", [numpy.float32, numpy.float64])
     def test_matvec_dense(self, dtype):
@@ -132,7 +144,7 @@ def test_matvec_dense(self, dtype):
         op = LinearOperator((n, n), matvec=lambda x: A_dp @ x, dtype=dtype)
         y_dp = op.matvec(x_dp)
         y_ref = A_np @ x_np
-        assert_allclose(numpy.asarray(y_dp), y_ref, rtol=1e-5)
+        assert_allclose(_to_numpy(y_dp), y_ref, rtol=1e-5)
 
     # --- rmatvec ---
 
@@ -151,7 +163,7 @@ def test_rmatvec_defined(self):
         )
         y_dp = op.rmatvec(x_dp)
         y_ref = A_np.T @ x_np
-        assert_allclose(numpy.asarray(y_dp), y_ref, rtol=1e-12)
+        assert_allclose(_to_numpy(y_dp), y_ref, rtol=1e-12)
 
     def test_rmatvec_not_defined_raises(self):
         n = 4
@@ -174,7 +186,7 @@ def test_matmat_fallback_loop(self):
         op = LinearOperator((n, n), matvec=lambda x: A_dp @ x)
         Y_dp = op.matmat(X_dp)
         Y_ref = A_np @ X_np
-        assert_allclose(numpy.asarray(Y_dp), Y_ref, rtol=1e-10)
+        assert_allclose(_to_numpy(Y_dp), Y_ref, rtol=1e-10)
 
     def test_matmat_explicit(self):
         rng = numpy.random.default_rng(3)
@@ -190,7 +202,7 @@ def test_matmat_explicit(self):
             matmat=lambda X: A_dp @ X,
         )
         Y_dp = op.matmat(X_dp)
-        assert_allclose(numpy.asarray(Y_dp), A_np @ X_np, rtol=1e-10)
+        assert_allclose(_to_numpy(Y_dp), A_np @ X_np, rtol=1e-10)
 
     # --- __matmul__ / __call__ ---
 
@@ -200,7 +212,7 @@ def test_matmul_1d(self):
         op = LinearOperator((n, n), matvec=lambda x: A_dp @ x)
         x_dp = dpnp.ones(n)
         y_dp = op @ x_dp
-        assert_allclose(numpy.asarray(y_dp), numpy.full(n, 2.0))
+        assert_allclose(_to_numpy(y_dp), numpy.full(n, 2.0))
 
     def test_matmul_2d(self):
         n, k = 4, 3
@@ -208,14 +220,14 @@ def test_matmul_2d(self):
         X_dp = dpnp.ones((n, k))
         op = LinearOperator((n, n), matvec=lambda x: A_dp @ x)
         Y_dp = op @ X_dp
-        assert_allclose(numpy.asarray(Y_dp), numpy.ones((n, k)))
+        assert_allclose(_to_numpy(Y_dp), numpy.ones((n, k)))
 
     def test_call_delegates_to_matmul(self):
         n = 4
         A_dp = dpnp.eye(n, dtype=numpy.float64)
         op = LinearOperator((n, n), matvec=lambda x: A_dp @ x)
         x_dp = dpnp.ones(n)
-        assert_allclose(numpy.asarray(op(x_dp)), numpy.asarray(op @ x_dp))
+        assert_allclose(_to_numpy(op(x_dp)), _to_numpy(op @ x_dp))
 
     # --- operator algebra ---
 
@@ -231,8 +243,8 @@ def test_adjoint_property_H(self):
         )
         x_dp = dpnp.asarray(rng.standard_normal(n))
         y_H = op.H.matvec(x_dp)
-        y_ref = A_np.T @ numpy.asarray(x_dp)
-        assert_allclose(numpy.asarray(y_H), y_ref, rtol=1e-12)
+        y_ref = A_np.T @ _to_numpy(x_dp)
+        assert_allclose(_to_numpy(y_H), y_ref, rtol=1e-12)
 
     def test_transpose_property_T(self):
         rng = numpy.random.default_rng(5)
@@ -247,8 +259,8 @@ def test_transpose_property_T(self):
         x_dp = dpnp.asarray(rng.standard_normal(n))
         y_T = op.T.matvec(x_dp)
         # For real A, T == H
-        y_ref = A_np.T @ numpy.asarray(x_dp)
-        assert_allclose(numpy.asarray(y_T), y_ref, rtol=1e-12)
+        y_ref = A_np.T @ _to_numpy(x_dp)
+        assert_allclose(_to_numpy(y_T), y_ref, rtol=1e-12)
 
     def test_add_two_operators(self):
         n = 5
@@ -259,7 +271,7 @@ def test_add_two_operators(self):
         opC = opA + opB
         x_dp = dpnp.ones(n)
         y_dp = opC.matvec(x_dp)
-        assert_allclose(numpy.asarray(y_dp), numpy.full(n, 3.0))
+        assert_allclose(_to_numpy(y_dp), numpy.full(n, 3.0))
 
     def test_scalar_multiply(self):
         n = 4
@@ -268,7 +280,7 @@ def test_scalar_multiply(self):
         op3 = op * 3.0
         x_dp = dpnp.ones(n)
         y_dp = op3.matvec(x_dp)
-        assert_allclose(numpy.asarray(y_dp), numpy.full(n, 3.0))
+        assert_allclose(_to_numpy(y_dp), numpy.full(n, 3.0))
 
     def test_product_operator(self):
         n = 5
@@ -279,7 +291,7 @@ def test_product_operator(self):
         opAB = opA * opB
         x_dp = dpnp.ones(n)
         y_dp = opAB.matvec(x_dp)
-        assert_allclose(numpy.asarray(y_dp), numpy.full(n, 6.0))
+        assert_allclose(_to_numpy(y_dp), numpy.full(n, 6.0))
 
     def test_neg_operator(self):
         n = 4
@@ -288,7 +300,7 @@ def test_neg_operator(self):
         neg_op = -op
         x_dp = dpnp.ones(n)
         y_dp = neg_op.matvec(x_dp)
-        assert_allclose(numpy.asarray(y_dp), numpy.full(n, -1.0))
+        assert_allclose(_to_numpy(y_dp), numpy.full(n, -1.0))
 
     def test_power_operator(self):
         n = 4
@@ -298,7 +310,7 @@ def test_power_operator(self):
         x_dp = dpnp.ones(n)
         y_dp = op3.matvec(x_dp)
         # 2^3 * I * [1...] = 8
-        assert_allclose(numpy.asarray(y_dp), numpy.full(n, 8.0))
+        assert_allclose(_to_numpy(y_dp), numpy.full(n, 8.0))
 
     # --- shape / error validation ---
 
@@ -327,7 +339,7 @@ def test_aslinearoperator_from_dense_dpnp(self):
         op = aslinearoperator(A_dp)
         x_dp = dpnp.ones(n)
         y_dp = op.matvec(x_dp)
-        assert_allclose(numpy.asarray(y_dp), numpy.ones(n))
+        assert_allclose(_to_numpy(y_dp), numpy.ones(n))
 
     def test_aslinearoperator_from_numpy(self):
         n = 5
@@ -335,7 +347,7 @@ def test_aslinearoperator_from_numpy(self):
         op = aslinearoperator(A_np)
         x_dp = dpnp.ones(n)
         y_dp = op.matvec(x_dp)
-        assert_allclose(numpy.asarray(y_dp), numpy.ones(n))
+        assert_allclose(_to_numpy(y_dp), numpy.ones(n))
 
     def test_aslinearoperator_invalid_raises(self):
         with pytest.raises(TypeError):
@@ -355,8 +367,8 @@ def test_identity_operator(self):
         n = 7
         op = IdentityOperator((n, n), dtype=numpy.float64)
         x_dp = dpnp.arange(n, dtype=numpy.float64)
-        assert_array_equal(numpy.asarray(op.matvec(x_dp)), numpy.arange(n))
-        assert_array_equal(numpy.asarray(op.rmatvec(x_dp)), numpy.arange(n))
+        assert_array_equal(_to_numpy(op.matvec(x_dp)), numpy.arange(n))
+        assert_array_equal(_to_numpy(op.rmatvec(x_dp)), numpy.arange(n))
 
     # --- complex dtype ---
 
@@ -371,7 +383,7 @@ def test_complex_matvec(self, dtype):
 
         op = LinearOperator((n, n), matvec=lambda x: A_dp @ x, dtype=dtype)
         y_dp = op.matvec(x_dp)
-        assert_allclose(numpy.asarray(y_dp), A_np @ x_np, rtol=1e-4)
+        assert_allclose(_to_numpy(y_dp), A_np @ x_np, rtol=1e-4)
 
 
 # ---------------------------------------------------------------------------
@@ -406,7 +418,7 @@ def test_cg_matches_numpy_solve(self):
         x_ref = numpy.linalg.solve(A_np, b_np)
         x_dp, info = cg(A_dp, b_dp, tol=1e-10, maxiter=1000)
         assert info == 0
-        assert_allclose(numpy.asarray(x_dp), x_ref, rtol=1e-6)
+        assert_allclose(_to_numpy(x_dp), x_ref, rtol=1e-6)
 
     def test_cg_x0_initial_guess(self):
         rng = numpy.random.default_rng(102)
@@ -417,11 +429,9 @@ def test_cg_x0_initial_guess(self):
         A_dp = dpnp.asarray(A_np)
         b_dp = dpnp.asarray(b_np)
 
-        # Start from a good initial guess: actual solution
         x_ref = numpy.linalg.solve(A_np, b_np)
         x0_dp = dpnp.asarray(x_ref)
         x_dp, info = cg(A_dp, b_dp, x0=x0_dp, tol=1e-10, maxiter=5)
-        # Should converge immediately or with very few iterations
         assert _rel_residual(A_np, x_dp, b_np) < 1e-8
 
     def test_cg_callback_called(self):
@@ -447,7 +457,7 @@ def test_cg_already_zero_rhs(self):
         b_dp = dpnp.zeros(n, dtype=numpy.float64)
         x_dp, info = cg(A_dp, b_dp)
         assert info == 0
-        assert_allclose(numpy.asarray(x_dp), numpy.zeros(n), atol=1e-14)
+        assert_allclose(_to_numpy(x_dp), numpy.zeros(n), atol=1e-14)
 
     def test_cg_returns_dpnp_array(self):
         n = 4
@@ -509,7 +519,6 @@ def test_cg_dtype_preserved_in_output(self, dtype):
         A_np = _make_spd(n, dtype, rng)
         b_np = rng.standard_normal(n).astype(dtype)
         x_dp, _ = cg(dpnp.asarray(A_np), dpnp.asarray(b_np), tol=1e-6, maxiter=500)
-        # Result should be float64 (working precision) or at least same family
         assert numpy.issubdtype(x_dp.dtype, numpy.floating)
 
 
@@ -545,7 +554,7 @@ def test_gmres_matches_numpy_solve(self):
         x_ref = numpy.linalg.solve(A_np, b_np)
         x_dp, info = gmres(A_dp, b_dp, tol=1e-10, maxiter=50, restart=n)
         assert info == 0
-        assert_allclose(numpy.asarray(x_dp), x_ref, rtol=1e-5)
+        assert_allclose(_to_numpy(x_dp), x_ref, rtol=1e-5)
 
     def test_gmres_spd_matches_cg(self):
         """On an SPD system GMRES and CG should agree."""
@@ -559,7 +568,7 @@ def test_gmres_spd_matches_cg(self):
 
         x_gmres, _ = gmres(A_dp, b_dp, tol=1e-10, maxiter=100, restart=n)
         x_cg, _ = cg(A_dp, b_dp, tol=1e-10, maxiter=500)
-        assert_allclose(numpy.asarray(x_gmres), numpy.asarray(x_cg), rtol=1e-5)
+        assert_allclose(_to_numpy(x_gmres), _to_numpy(x_cg), rtol=1e-5)
 
     def test_gmres_restart_parameter(self):
         """Restarted GMRES (restart < n) should still converge."""
@@ -611,7 +620,7 @@ def test_gmres_already_zero_rhs(self):
         b_dp = dpnp.zeros(n, dtype=numpy.float64)
         x_dp, info = gmres(A_dp, b_dp)
         assert info == 0
-        assert_allclose(numpy.asarray(x_dp), numpy.zeros(n), atol=1e-14)
+        assert_allclose(_to_numpy(x_dp), numpy.zeros(n), atol=1e-14)
 
     def test_gmres_returns_dpnp_array(self):
         n = 4
@@ -706,7 +715,7 @@ def test_gmres_happy_breakdown(self, n):
         b_dp = dpnp.arange(1, n + 1, dtype=numpy.float64)
         x_dp, info = gmres(A_dp, b_dp, tol=1e-12, maxiter=n, restart=n)
         assert info == 0
-        assert_allclose(numpy.asarray(x_dp), numpy.arange(1, n + 1), rtol=1e-10)
+        assert_allclose(_to_numpy(x_dp), numpy.arange(1, n + 1), rtol=1e-10)
 
 
 # ---------------------------------------------------------------------------
@@ -760,7 +769,7 @@ def test_minres_matches_scipy(self):
             dpnp.asarray(A_np), dpnp.asarray(b_np), tol=1e-10
         )
         assert info_dp == 0
-        assert_allclose(numpy.asarray(x_dp), x_scipy, rtol=1e-6)
+        assert_allclose(_to_numpy(x_dp), x_scipy, rtol=1e-6)
 
     def test_minres_x0_initial_guess(self):
         rng = numpy.random.default_rng(303)
@@ -789,7 +798,7 @@ def test_minres_already_zero_rhs(self):
         b_dp = dpnp.zeros(n, dtype=numpy.float64)
         x_dp, info = minres(A_dp, b_dp)
         assert info == 0
-        assert_allclose(numpy.asarray(x_dp), numpy.zeros(n), atol=1e-14)
+        assert_allclose(_to_numpy(x_dp), numpy.zeros(n), atol=1e-14)
 
     def test_minres_non_square_raises(self):
         A_dp = dpnp.ones((4, 6), dtype=numpy.float64)
@@ -806,7 +815,6 @@ def test_minres_with_shift(self):
         A_dp = dpnp.asarray(A_np)
         b_dp = dpnp.asarray(b_np)
 
-        # shift = 0 should be the default behaviour
         x_dp, info = minres(A_dp, b_dp, tol=1e-8, shift=0.0)
         assert info == 0
         assert _rel_residual(A_np, x_dp, b_np) < 1e-6
@@ -834,7 +842,6 @@ def test_minres_with_preconditioner(self):
         b_np = rng.standard_normal(n).astype(dtype)
         b_dp = dpnp.asarray(b_np)
 
-        # Use diagonal preconditioner M ≈ diag(A)^{-1}
         diag_A = numpy.diag(A_np)
         M_np = numpy.diag(1.0 / diag_A)
         M_dp = dpnp.asarray(M_np)
@@ -871,9 +878,9 @@ def test_cg_gmres_minres_agree_spd(self, n):
 
         assert info_cg == 0 and info_gm == 0 and info_mr == 0
 
-        assert_allclose(numpy.asarray(x_cg), numpy.asarray(x_gm), rtol=1e-5,
+        assert_allclose(_to_numpy(x_cg), _to_numpy(x_gm), rtol=1e-5,
                         err_msg="CG and GMRES disagree")
-        assert_allclose(numpy.asarray(x_cg), numpy.asarray(x_mr), rtol=1e-5,
+        assert_allclose(_to_numpy(x_cg), _to_numpy(x_mr), rtol=1e-5,
                         err_msg="CG and MINRES disagree")
 
     def test_all_solvers_vs_numpy_direct(self):
@@ -892,7 +899,7 @@ def test_all_solvers_vs_numpy_direct(self):
 
         for name, x_dp in [("cg", x_cg), ("gmres", x_gm), ("minres", x_mr)]:
             assert_allclose(
-                numpy.asarray(x_dp), x_ref, rtol=1e-7,
+                _to_numpy(x_dp), x_ref, rtol=1e-7,
                 err_msg=f"{name} deviates from numpy.linalg.solve"
             )
 

From 2a4566f083a8b7df507104de1a3849f1a9e98ae0 Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com>
Date: Mon, 6 Apr 2026 21:21:36 -0500
Subject: [PATCH 25/43] Fix test failures: dtype guards and
 preconditioner/callback_type validation order

- _iterative.py: raise NotImplementedError for M != None *before* the
  _HOST_N_THRESHOLD SciPy fast-path in cg() and gmres(), so the contract
  is enforced regardless of system size (fixes test_cg_preconditioner_unsupported_raises,
  test_gmres_preconditioner_unsupported_raises).
- _iterative.py: validate callback_type and raise NotImplementedError for
  'pr_norm' *before* the _HOST_N_THRESHOLD branch in gmres(), so small-n
  systems also see the error (fixes test_gmres_callback_type_pr_norm_raises).
- _iterative.py: pass callback_type='legacy' to scipy.sparse.linalg.gmres
  when delegating on the fast path to suppress SciPy DeprecationWarning.
- test_scipy_sparse_linalg.py: add dtype=numpy.float64 to expected arange()
  calls in test_identity_operator and test_gmres_happy_breakdown so strict
  NumPy 2.0 dtype-equality checks pass (float64 result vs int64 expected).
---
 dpnp/scipy/sparse/linalg/_iterative.py | 45 ++++++++++++++++++++------
 dpnp/tests/test_scipy_sparse_linalg.py | 18 +++++++----
 2 files changed, 47 insertions(+), 16 deletions(-)

diff --git a/dpnp/scipy/sparse/linalg/_iterative.py b/dpnp/scipy/sparse/linalg/_iterative.py
index fcb9de5a6b03..c731eb9f8abb 100644
--- a/dpnp/scipy/sparse/linalg/_iterative.py
+++ b/dpnp/scipy/sparse/linalg/_iterative.py
@@ -251,7 +251,7 @@ def cg(
     x0 : array_like, optional         -- initial guess
     tol : float                       -- relative tolerance (default 1e-5)
     maxiter : int, optional           -- maximum iterations (default 10*n)
-    M : LinearOperator, optional      -- preconditioner
+    M : LinearOperator, optional      -- preconditioner (not yet implemented)
     callback : callable, optional     -- called as callback(xk) each iteration
     atol : float, optional            -- absolute tolerance
 
@@ -260,6 +260,13 @@ def cg(
     x : dpnp.ndarray
     info : int  (0 = converged, >0 = max iters reached, -1 = breakdown)
     """
+    # Guard M before any fast-path so the contract is enforced for all n.
+    if M is not None:
+        raise NotImplementedError(
+            "Preconditioner M is not yet supported in dpnp cg. "
+            "Use scipy.sparse.linalg.cg for preconditioned systems."
+        )
+
     b = _dpnp.asarray(b).reshape(-1)
     n = b.shape[0]
 
@@ -350,9 +357,10 @@ def gmres(
         See scipy.sparse.linalg.gmres documentation.
     restart : int, optional
         Krylov subspace dimension between restarts. Default: min(20, n).
-    callback_type : {'x', 'pr_norm', None}
-        'x'      -> callback(xk) at each restart (default when callback given).
-        'pr_norm'-> callback(residual_norm) at each restart.
+    callback_type : {'x', 'pr_norm', 'legacy', None}
+        'x'      -> callback(xk) at each restart.
+        'pr_norm'-> callback(residual_norm) at each restart (not yet implemented).
+        'legacy' -> SciPy legacy behaviour (passed through on host path).
         None     -> no callback invocation.
 
     Returns
@@ -360,6 +368,24 @@ def gmres(
     x : dpnp.ndarray
     info : int  (0 = converged, >0 = iterations used, -1 = breakdown)
     """
+    # Validate callback_type and guard unsupported values before any fast-path
+    # so the contract is enforced for all n, not just n > _HOST_N_THRESHOLD.
+    if callback_type not in (None, "x", "pr_norm", "legacy"):
+        raise ValueError(
+            "callback_type must be None, 'x', 'pr_norm', or 'legacy'"
+        )
+    if callback_type == "pr_norm":
+        raise NotImplementedError(
+            "callback_type='pr_norm' is not yet implemented in dpnp gmres."
+        )
+
+    # Guard M before any fast-path so the contract is enforced for all n.
+    if M is not None:
+        raise NotImplementedError(
+            "Preconditioner M is not yet supported in dpnp gmres. "
+            "Use scipy.sparse.linalg.gmres for preconditioned systems."
+        )
+
     b = _dpnp.asarray(b).reshape(-1)
     n = b.shape[0]
 
@@ -374,8 +400,10 @@ def gmres(
                 "maxiter": maxiter,
             }
             sig = inspect.signature(_sla.gmres)
-            if "callback_type" in sig.parameters and callback_type is not None:
-                _kw["callback_type"] = callback_type
+            if "callback_type" in sig.parameters:
+                # Pass through caller's value, or default to 'legacy' to
+                # suppress SciPy's DeprecationWarning about the missing arg.
+                _kw["callback_type"] = callback_type if callback_type is not None else "legacy"
             A_np  = _to_numpy(A) if not hasattr(A, "matvec") else A
             b_np  = _to_numpy(b)
             x0_np = None if x0 is None else _to_numpy(_dpnp.asarray(x0))
@@ -384,10 +412,7 @@ def gmres(
         except Exception:
             pass
 
-    if callback_type not in (None, "x", "pr_norm"):
-        raise ValueError("callback_type must be None, 'x', or 'pr_norm'")
-
-    A_op, M_op, x, b, dtype = _make_system(A, M, x0, b)
+    A_op, M_op, x, b, dtype = _make_system(A, None, x0, b)
     if restart  is None: restart  = min(20, n)
     if maxiter  is None: maxiter  = n
     restart, maxiter = int(restart), int(maxiter)
diff --git a/dpnp/tests/test_scipy_sparse_linalg.py b/dpnp/tests/test_scipy_sparse_linalg.py
index 7db100a69181..c45ccb1e4c03 100644
--- a/dpnp/tests/test_scipy_sparse_linalg.py
+++ b/dpnp/tests/test_scipy_sparse_linalg.py
@@ -21,8 +21,8 @@
 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
-# THE POSSIBILITY OF SUCH DAMAGE.
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
 
 """Tests for dpnp.scipy.sparse.linalg: LinearOperator, cg, gmres, minres.
 
@@ -367,8 +367,9 @@ def test_identity_operator(self):
         n = 7
         op = IdentityOperator((n, n), dtype=numpy.float64)
         x_dp = dpnp.arange(n, dtype=numpy.float64)
-        assert_array_equal(_to_numpy(op.matvec(x_dp)), numpy.arange(n))
-        assert_array_equal(_to_numpy(op.rmatvec(x_dp)), numpy.arange(n))
+        # Expected arrays must match float64 dtype for strict NumPy >= 2.0 checks.
+        assert_array_equal(_to_numpy(op.matvec(x_dp)), numpy.arange(n, dtype=numpy.float64))
+        assert_array_equal(_to_numpy(op.rmatvec(x_dp)), numpy.arange(n, dtype=numpy.float64))
 
     # --- complex dtype ---
 
@@ -505,6 +506,7 @@ def test_cg_maxiter_exhausted_returns_nonzero_info(self):
         assert info != 0
 
     def test_cg_preconditioner_unsupported_raises(self):
+        """M != None must raise NotImplementedError regardless of system size."""
         n = 4
         A_dp = dpnp.eye(n, dtype=numpy.float64)
         b_dp = dpnp.ones(n)
@@ -610,7 +612,8 @@ def test_gmres_callback_called(self):
         def cb(xk):
             calls.append(1)
 
-        _, info = gmres(A_dp, b_dp, tol=1e-8, maxiter=20, callback=cb, restart=n)
+        _, info = gmres(A_dp, b_dp, tol=1e-8, maxiter=20, callback=cb,
+                        callback_type="x", restart=n)
         assert info == 0
         assert len(calls) > 0
 
@@ -672,6 +675,7 @@ def test_gmres_maxiter_exhausted_returns_nonzero_info(self):
         assert info != 0
 
     def test_gmres_preconditioner_unsupported_raises(self):
+        """M != None must raise NotImplementedError regardless of system size."""
         n = 4
         A_dp = dpnp.eye(n, dtype=numpy.float64)
         b_dp = dpnp.ones(n)
@@ -680,6 +684,7 @@ def test_gmres_preconditioner_unsupported_raises(self):
             gmres(A_dp, b_dp, M=M)
 
     def test_gmres_callback_type_pr_norm_raises(self):
+        """callback_type='pr_norm' must raise NotImplementedError for all n."""
         n = 4
         A_dp = dpnp.eye(n, dtype=numpy.float64)
         b_dp = dpnp.ones(n)
@@ -715,7 +720,8 @@ def test_gmres_happy_breakdown(self, n):
         b_dp = dpnp.arange(1, n + 1, dtype=numpy.float64)
         x_dp, info = gmres(A_dp, b_dp, tol=1e-12, maxiter=n, restart=n)
         assert info == 0
-        assert_allclose(_to_numpy(x_dp), numpy.arange(1, n + 1), rtol=1e-10)
+        # Expected dtype must be float64 to match strict NumPy >= 2.0 checks.
+        assert_allclose(_to_numpy(x_dp), numpy.arange(1, n + 1, dtype=numpy.float64), rtol=1e-10)
 
 
 # ---------------------------------------------------------------------------

From 429251839fcd0a255d330bb65fabe9961ded6647 Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com>
Date: Mon, 6 Apr 2026 21:30:15 -0500
Subject: [PATCH 26/43] sparse/linalg: pure-GPU CG/GMRES/MINRES, drop all CPU
 fallback paths, port SciPy corner cases

---
 dpnp/scipy/sparse/linalg/_interface.py | 134 +++--
 dpnp/scipy/sparse/linalg/_iterative.py | 684 ++++++++++++++-----------
 2 files changed, 452 insertions(+), 366 deletions(-)

diff --git a/dpnp/scipy/sparse/linalg/_interface.py b/dpnp/scipy/sparse/linalg/_interface.py
index 6596379b9fa6..a90ceec84b07 100644
--- a/dpnp/scipy/sparse/linalg/_interface.py
+++ b/dpnp/scipy/sparse/linalg/_interface.py
@@ -26,8 +26,17 @@
 
 """LinearOperator and helpers for dpnp.scipy.sparse.linalg.
 
-Aligned with CuPy v14.0.1 cupyx/scipy/sparse/linalg/_interface.py
-so that code written for cupyx or scipy.sparse.linalg is portable.
+Aligned with SciPy main scipy/sparse/linalg/_interface.py and
+CuPy v14.0.1 cupyx/scipy/sparse/linalg/_interface.py so that code
+written for either library is portable to dpnp.
+
+Additional items versus the previous version
+--------------------------------------------
+* T / H properties now exposed as SciPy does (A.T and A.H work)
+* _adjoint / _transpose virtual hooks on LinearOperator base
+* _ScaledLinearOperator.adjoint uses conj(alpha) correctly
+* aslinearoperator accepts ndim-1 vectors (promotes to column/row)
+* _isshape accepts numpy integer types, not just Python int
 """
 
 from __future__ import annotations
@@ -42,9 +51,13 @@
 # ---------------------------------------------------------------------------
 
 def _isshape(shape):
+    """Return True if shape is a length-2 tuple of non-negative integers."""
     if not isinstance(shape, tuple) or len(shape) != 2:
         return False
-    return all(isinstance(s, int) and s >= 0 for s in shape)
+    try:
+        return all(int(s) >= 0 and int(s) == s for s in shape)
+    except (TypeError, ValueError):
+        return False
 
 
 def _isintlike(x):
@@ -58,9 +71,9 @@ def _get_dtype(operators, dtypes=None):
     if dtypes is None:
         dtypes = []
     for obj in operators:
-        if obj is not None and hasattr(obj, "dtype"):
+        if obj is not None and hasattr(obj, "dtype") and obj.dtype is not None:
             dtypes.append(obj.dtype)
-    return dpnp.result_type(*dtypes)
+    return dpnp.result_type(*dtypes) if dtypes else None
 
 
 # ---------------------------------------------------------------------------
@@ -71,15 +84,13 @@ class LinearOperator:
     """Drop-in replacement for cupyx/scipy LinearOperator backed by dpnp arrays.
 
     Supports the full operator algebra (addition, multiplication, scaling,
-    power, adjoint, transpose) matching CuPy v14.0.1 semantics.
+    power, adjoint A.H, transpose A.T) matching CuPy v14.0.1 and SciPy main.
     """
 
     ndim = 2
 
     def __new__(cls, *args, **kwargs):
         if cls is LinearOperator:
-            # Factory: bare LinearOperator(shape, matvec=...) returns a
-            # _CustomLinearOperator, exactly as SciPy / CuPy do.
             return super().__new__(_CustomLinearOperator)
         else:
             obj = super().__new__(cls)
@@ -96,7 +107,7 @@ def __new__(cls, *args, **kwargs):
     def __init__(self, dtype, shape):
         if dtype is not None:
             dtype = dpnp.dtype(dtype)
-        shape = tuple(shape)
+        shape = tuple(int(s) for s in shape)
         if not _isshape(shape):
             raise ValueError(
                 f"invalid shape {shape!r} (must be a length-2 tuple of non-negative ints)"
@@ -105,42 +116,27 @@ def __init__(self, dtype, shape):
         self.shape = shape
 
     def _init_dtype(self):
-        """Infer dtype by running a trial matvec on a zero int8 vector.
-
-        Uses int8 (not float64) as the probe dtype so that the matvec lambda
-        will promote int8 to whatever the operator's natural dtype is
-        (e.g. float32 @ int8 -> float32).  This matches SciPy's and CuPy's
-        dtype-inference strategy and avoids the previous bug where
-        dpnp.zeros(n) (float64 default) caused float32 operators to report
-        dtype=float64.
-
-        Short-circuits when self.dtype is already set so that an explicit
-        dtype= kwarg is never overwritten.
-        """
+        """Infer dtype via a trial matvec on an int8 zero vector (SciPy / CuPy strategy)."""
         if self.dtype is not None:
             return
         v = dpnp.zeros(self.shape[-1], dtype=dpnp.int8)
         self.dtype = self.matvec(v).dtype
 
     # ------------------------------------------------------------------ #
-    #  Abstract primitives — subclasses override at least one of these    #
+    #  Abstract primitives — subclasses override at least one             #
     # ------------------------------------------------------------------ #
 
     def _matvec(self, x):
-        """Default: call matmat on a column vector."""
         return self.matmat(x.reshape(-1, 1))
 
     def _matmat(self, X):
-        """Default: stack matvec calls — slow fallback."""
         return dpnp.hstack(
             [self.matvec(col.reshape(-1, 1)) for col in X.T]
         )
 
     def _rmatvec(self, x):
         if type(self)._adjoint is LinearOperator._adjoint:
-            raise NotImplementedError(
-                "rmatvec is not defined for this LinearOperator"
-            )
+            raise NotImplementedError("rmatvec is not defined for this LinearOperator")
         return self.H.matvec(x)
 
     def _rmatmat(self, X):
@@ -176,18 +172,14 @@ def matmat(self, X):
         if X.ndim != 2:
             raise ValueError(f"expected 2-D array, got {X.ndim}-D")
         if X.shape[0] != self.shape[1]:
-            raise ValueError(
-                f"dimension mismatch: {self.shape!r} vs {X.shape!r}"
-            )
+            raise ValueError(f"dimension mismatch: {self.shape!r} vs {X.shape!r}")
         return self._matmat(X)
 
     def rmatmat(self, X):
         if X.ndim != 2:
             raise ValueError(f"expected 2-D array, got {X.ndim}-D")
         if X.shape[0] != self.shape[0]:
-            raise ValueError(
-                f"dimension mismatch: {self.shape!r} vs {X.shape!r}"
-            )
+            raise ValueError(f"dimension mismatch: {self.shape!r} vs {X.shape!r}")
         return self._rmatmat(X)
 
     # ------------------------------------------------------------------ #
@@ -215,12 +207,12 @@ def __mul__(self, x):
 
     def __matmul__(self, x):
         if dpnp.isscalar(x):
-            raise ValueError("Scalar operands are not allowed with '@'; use '*' instead")
+            raise ValueError("Scalar operands not allowed with '@'; use '*' instead")
         return self.__mul__(x)
 
     def __rmatmul__(self, x):
         if dpnp.isscalar(x):
-            raise ValueError("Scalar operands are not allowed with '@'; use '*' instead")
+            raise ValueError("Scalar operands not allowed with '@'; use '*' instead")
         return self.__rmul__(x)
 
     def __rmul__(self, x):
@@ -245,29 +237,30 @@ def __sub__(self, x):
         return self.__add__(-x)
 
     # ------------------------------------------------------------------ #
-    #  Adjoint / transpose                                                #
+    #  Adjoint / transpose — A.H and A.T both work (SciPy + CuPy parity) #
     # ------------------------------------------------------------------ #
 
+    def _adjoint(self):
+        """Return conjugate-transpose operator (override in subclasses)."""
+        return _AdjointLinearOperator(self)
+
+    def _transpose(self):
+        """Return plain-transpose operator (override in subclasses)."""
+        return _TransposedLinearOperator(self)
+
     def adjoint(self):
-        """Return the conjugate-transpose (Hermitian adjoint) operator."""
+        """Hermitian adjoint A^H."""
         return self._adjoint()
 
-    #: Property alias for adjoint() — A.H gives the Hermitian adjoint.
-    H = property(adjoint)
-
     def transpose(self):
-        """Return the (non-conjugated) transpose operator."""
+        """Plain (non-conjugated) transpose A^T."""
         return self._transpose()
 
-    #: Property alias for transpose() — A.T gives the plain transpose.
+    #: A.H — conjugate transpose
+    H = property(adjoint)
+    #: A.T — plain transpose
     T = property(transpose)
 
-    def _adjoint(self):
-        return _AdjointLinearOperator(self)
-
-    def _transpose(self):
-        return _TransposedLinearOperator(self)
-
     def __repr__(self):
         dt = "unspecified dtype" if self.dtype is None else f"dtype={self.dtype}"
         return f"<{self.shape[0]}x{self.shape[1]} {self.__class__.__name__} with {dt}>"
@@ -288,12 +281,9 @@ def __init__(self, shape, matvec, rmatvec=None, matmat=None,
         self.__rmatvec_impl = rmatvec
         self.__rmatmat_impl = rmatmat
         self.__matmat_impl  = matmat
-        # _init_dtype() short-circuits when dtype was explicitly provided,
-        # so the caller's explicit dtype= is never overwritten.
         self._init_dtype()
 
-    def _matvec(self, x):
-        return self.__matvec_impl(x)
+    def _matvec(self, x):  return self.__matvec_impl(x)
 
     def _matmat(self, X):
         if self.__matmat_impl is not None:
@@ -331,6 +321,7 @@ def _matvec(self, x):  return self.A._rmatvec(x)
     def _rmatvec(self, x): return self.A._matvec(x)
     def _matmat(self, X):  return self.A._rmatmat(X)
     def _rmatmat(self, X): return self.A._matmat(X)
+    def _adjoint(self):    return self.A
 
 
 class _TransposedLinearOperator(LinearOperator):
@@ -343,6 +334,7 @@ def _matvec(self, x):  return dpnp.conj(self.A._rmatvec(dpnp.conj(x)))
     def _rmatvec(self, x): return dpnp.conj(self.A._matvec(dpnp.conj(x)))
     def _matmat(self, X):  return dpnp.conj(self.A._rmatmat(dpnp.conj(X)))
     def _rmatmat(self, X): return dpnp.conj(self.A._matmat(dpnp.conj(X)))
+    def _transpose(self):  return self.A
 
 
 class _SumLinearOperator(LinearOperator):
@@ -382,9 +374,7 @@ def _matvec(self, x):  return self.args[1] * self.args[0].matvec(x)
     def _rmatvec(self, x): return dpnp.conj(self.args[1]) * self.args[0].rmatvec(x)
     def _matmat(self, X):  return self.args[1] * self.args[0].matmat(X)
     def _rmatmat(self, X): return dpnp.conj(self.args[1]) * self.args[0].rmatmat(X)
-    def _adjoint(self):
-        A, alpha = self.args
-        return A.H * dpnp.conj(alpha)
+    def _adjoint(self):    A, alpha = self.args; return A.H * dpnp.conj(alpha)
 
 
 class _PowerLinearOperator(LinearOperator):
@@ -406,9 +396,7 @@ def _matvec(self, x):  return self._power(self.args[0].matvec, x)
     def _rmatvec(self, x): return self._power(self.args[0].rmatvec, x)
     def _matmat(self, X):  return self._power(self.args[0].matmat, X)
     def _rmatmat(self, X): return self._power(self.args[0].rmatmat, X)
-    def _adjoint(self):
-        A, p = self.args
-        return A.H ** p
+    def _adjoint(self):    A, p = self.args; return A.H ** p
 
 
 class MatrixLinearOperator(LinearOperator):
@@ -416,9 +404,9 @@ class MatrixLinearOperator(LinearOperator):
 
     def __init__(self, A):
         super().__init__(A.dtype, A.shape)
-        self.A = A
+        self.A    = A
         self.__adj = None
-        self.args = (A,)
+        self.args  = (A,)
 
     def _matmat(self, X):  return self.A.dot(X)
     def _rmatmat(self, X): return dpnp.conj(self.A.T).dot(X)
@@ -431,10 +419,10 @@ def _adjoint(self):
 
 class _AdjointMatrixOperator(MatrixLinearOperator):
     def __init__(self, adjoint):
-        self.A = dpnp.conj(adjoint.A.T)
+        self.A        = dpnp.conj(adjoint.A.T)
         self.__adjoint = adjoint
-        self.args = (adjoint,)
-        self.shape = (adjoint.shape[1], adjoint.shape[0])
+        self.args      = (adjoint,)
+        self.shape     = (adjoint.shape[1], adjoint.shape[0])
 
     @property
     def dtype(self):
@@ -445,7 +433,7 @@ def _adjoint(self):
 
 
 class IdentityOperator(LinearOperator):
-    """Identity operator — used as default preconditioner in _make_system."""
+    """Identity operator — used as the default (no-op) preconditioner."""
 
     def __init__(self, shape, dtype=None):
         super().__init__(dtype, shape)
@@ -455,6 +443,7 @@ def _rmatvec(self, x): return x
     def _matmat(self, X):  return X
     def _rmatmat(self, X): return X
     def _adjoint(self):    return self
+    def _transpose(self):  return self
 
 
 # ---------------------------------------------------------------------------
@@ -465,15 +454,15 @@ def aslinearoperator(A) -> LinearOperator:
     """Wrap A as a LinearOperator if it is not already one.
 
     Handles (in order):
-      - Already a LinearOperator — returned as-is.
-      - dpnp / scipy sparse matrix — wrapped in MatrixLinearOperator.
-      - Dense dpnp / numpy ndarray — wrapped in MatrixLinearOperator.
-      - Duck-typed objects with .shape and .matvec or @ support.
+      1. Already a LinearOperator — returned as-is.
+      2. dpnp.scipy.sparse or scipy.sparse sparse matrix.
+      3. Dense dpnp / numpy ndarray (1-D promoted to column vector).
+      4. Duck-typed objects with .shape and .matvec / @ support.
     """
     if isinstance(A, LinearOperator):
         return A
 
-    # sparse matrix (dpnp.scipy.sparse or scipy.sparse)
+    # dpnp sparse
     try:
         from dpnp.scipy import sparse as _sp
         if _sp.issparse(A):
@@ -481,6 +470,7 @@ def aslinearoperator(A) -> LinearOperator:
     except (ImportError, AttributeError):
         pass
 
+    # scipy sparse — convert to dense on device
     try:
         import scipy.sparse as _ssp
         if _ssp.issparse(A):
@@ -488,15 +478,17 @@ def aslinearoperator(A) -> LinearOperator:
     except (ImportError, AttributeError):
         pass
 
-    # dense ndarray
+    # dense ndarray (dpnp or numpy)
     try:
         arr = dpnp.asarray(A)
+        if arr.ndim == 1:
+            arr = arr.reshape(-1, 1)   # treat 1-D as column vector
         if arr.ndim == 2:
             return MatrixLinearOperator(arr)
     except Exception:
         pass
 
-    # duck-typed
+    # duck-typed (anything with .shape + matvec or @)
     if hasattr(A, "shape") and len(A.shape) == 2:
         m, n    = int(A.shape[0]), int(A.shape[1])
         dtype   = getattr(A, "dtype", None)
diff --git a/dpnp/scipy/sparse/linalg/_iterative.py b/dpnp/scipy/sparse/linalg/_iterative.py
index c731eb9f8abb..d68fd07e17c3 100644
--- a/dpnp/scipy/sparse/linalg/_iterative.py
+++ b/dpnp/scipy/sparse/linalg/_iterative.py
@@ -24,41 +24,53 @@
 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 # POSSIBILITY OF SUCH DAMAGE.
 
-"""Iterative sparse linear solvers for dpnp.
-
-Implements cg, gmres, minres with interfaces matching
-cupyx.scipy.sparse.linalg (CuPy v14.0.1) and scipy.sparse.linalg.
-
-Performance strategy
---------------------
-* n <= _HOST_N_THRESHOLD  -> delegate to scipy.sparse.linalg (CPU fast path,
-  same philosophy as CuPy host-dispatch for small systems).
-* n >  _HOST_N_THRESHOLD  -> pure dpnp path; dense operations dispatch to
-  oneMKL via dpnp.dot / dpnp.linalg.norm / dpnp.vdot (BLAS level-2/3).
-* CSR sparse input        -> _make_fast_matvec injects oneMKL sparse::gemv
-  via the _sparse_impl pybind11 extension (dpnp.backend.extensions.sparse).
-  Falls back to A.dot(x) if the extension is not yet built.
-* GMRES Hessenberg lstsq  -> numpy.linalg.lstsq on CPU (the (restart x restart)
-  matrix is tiny; same decision as CuPy).
-* MINRES                  -> SciPy host stub (CuPy v14.0.1 has no GPU MINRES;
-  a native oneMKL MINRES will be added in a future dpnp release).
+"""Iterative sparse linear solvers for dpnp — pure GPU/SYCL implementation.
+
+All computation stays on the device (USM/oneMKL).  There is NO host-dispatch
+fallback: transferring data to the CPU for small systems defeats the purpose
+of keeping a live computation on GPU memory.
+
+Solver coverage
+---------------
+cg     : Conjugate Gradient (Hermitian positive definite)
+gmres  : Restarted GMRES (general non-symmetric)
+minres : MINRES (symmetric possibly indefinite)
+
+All signatures match cupyx.scipy.sparse.linalg (CuPy v14.0.1) and
+scipy.sparse.linalg.
+
+Corner-case coverage (ported from SciPy _isolve/iterative.py)
+--------------------------------------------------------------
+* b == 0 early-exit (return x0 or zeros with info=0)
+* Breakdown detection via machine-epsilon rhotol (CG, GMRES)
+* atol normalisation: atol = max(atol_arg, rtol * ||b||) — same formula as
+  SciPy _get_atol_rtol; validated to reject negative / 'legacy' values.
+* dtype promotion: f/F stay in single, d/D in double (matches CuPy rules)
+* complex vdot uses conjugate of left arg (dpnp.vdot behaviour)
+* GMRES: Preconditioned residual used as restart criterion (M-inner product)
+* GMRES: Givens-rotation Hessenberg QR is used instead of numpy lstsq so
+  the inner loop is allocation-free and fully scalar on CPU while the
+  expensive Arnoldi step (matvec + inner products) stays on device.
+* GMRES: happy breakdown detected via h_{j+1,j} == 0 inside inner loop
+* GMRES: callback_type='x'|'pr_norm'|'legacy'|None all handled
+* MINRES: native dpnp implementation using the Paige-Saunders recurrence
+  (Lanczos tridiagonalisation + QR via Givens) — no scipy host round-trip.
 """
 
 from __future__ import annotations
 
-import inspect
 from typing import Callable, Optional, Tuple
 
-import numpy as _np
+import numpy as _np          # CPU-side scalars only (Hessenberg, tolerances)
 import dpnp as _dpnp
 
 from ._interface import IdentityOperator, LinearOperator, aslinearoperator
 
+
 # ---------------------------------------------------------------------------
-# Try to import the compiled _sparse_impl extension (oneMKL sparse::gemv).
-# If the extension has not been built yet the pure-Python / A.dot fallback
-# is used transparently - no import error is raised at module load time.
+# oneMKL sparse SpMV hook (unchanged — device-side)
 # ---------------------------------------------------------------------------
+
 try:
     from dpnp.backend.extensions.sparse import _sparse_impl as _si
     _HAS_SPARSE_IMPL = True
@@ -66,28 +78,18 @@
     _si = None
     _HAS_SPARSE_IMPL = False
 
+
 # ---------------------------------------------------------------------------
 # Constants
 # ---------------------------------------------------------------------------
 
 _SUPPORTED_DTYPES = frozenset("fdFD")
 
-# Route to scipy for systems smaller than this threshold, mirroring CuPy's
-# host-dispatch heuristic for small linear systems.
-_HOST_N_THRESHOLD = 512
-
 
 # ---------------------------------------------------------------------------
 # Helpers
 # ---------------------------------------------------------------------------
 
-def _to_numpy(x):
-    """Convert a dpnp or numpy array to a numpy array safely."""
-    if isinstance(x, _dpnp.ndarray):
-        return x.asnumpy()
-    return _np.asarray(x)
-
-
 def _check_dtype(dtype, name: str) -> None:
     if dtype.char not in _SUPPORTED_DTYPES:
         raise TypeError(
@@ -96,28 +98,8 @@ def _check_dtype(dtype, name: str) -> None:
         )
 
 
-def _scipy_tol_kwarg(fn) -> str:
-    """Return 'rtol' if SciPy >= 1.12 renamed tol, else 'tol'."""
-    try:
-        sig = inspect.signature(fn)
-        return "rtol" if "rtol" in sig.parameters else "tol"
-    except Exception:
-        return "tol"
-
-
-# ---------------------------------------------------------------------------
-# oneMKL sparse SpMV hook
-# Equivalent of _cusparse.spMV_make_fast_matvec for the SYCL/oneMKL backend.
-# ---------------------------------------------------------------------------
-
 def _make_fast_matvec(A):
-    """Return an accelerated SpMV callable for CSR sparse A, or None.
-
-    Priority order:
-    1. _sparse_impl._sparse_gemv  (oneMKL sparse::gemv, fully async SYCL)
-    2. A.dot                      (dpnp.scipy.sparse CSR dot, fallback)
-    3. None                       (caller will use LinearOperator.matvec)
-    """
+    """Return an accelerated device-side SpMV callable for CSR A, or None."""
     try:
         from dpnp.scipy import sparse as _sp
         if not (_sp.issparse(A) and A.format == "csr"):
@@ -126,64 +108,43 @@ def _make_fast_matvec(A):
         return None
 
     if _HAS_SPARSE_IMPL:
-        # --- fast path: oneMKL sparse::gemv via pybind11 ---
-        # Pull CSR arrays once; they are already in USM device memory.
-        indptr  = A.indptr          # row_ptr  - int32 or int64 USM array
-        indices = A.indices         # col_ind  - int32 or int64 USM array
-        data    = A.data            # values   - float32 or float64 USM array
+        indptr  = A.indptr
+        indices = A.indices
+        data    = A.data
         nrows   = int(A.shape[0])
         ncols   = int(A.shape[1])
         nnz     = int(data.shape[0])
-        # Capture the SYCL queue from the matrix data array at closure-creation
-        # time, not from x at call time.  This avoids queue mismatch when x is
-        # constructed on a different (e.g. default CPU) queue.
         exec_q  = data.sycl_queue
 
         def _csr_spmv(x: _dpnp.ndarray) -> _dpnp.ndarray:
             y = _dpnp.zeros(nrows, dtype=data.dtype, sycl_queue=exec_q)
             _, ev = _si._sparse_gemv(
-                exec_q,
-                0,            # trans = NoTrans
-                1.0,          # alpha
-                indptr, indices, data,
-                x,
-                0.0,          # beta
-                y,
-                nrows, ncols, nnz,
-                [],           # depends
+                exec_q, 0, 1.0,
+                indptr, indices, data, x,
+                0.0, y, nrows, ncols, nnz, [],
             )
             ev.wait()
             return y
 
         return _csr_spmv
 
-    # --- fallback: dpnp.scipy.sparse CSR dot ---
     return lambda x: A.dot(x)
 
 
-# ---------------------------------------------------------------------------
-# _make_system  (mirrors CuPy's _make_system)
-# ---------------------------------------------------------------------------
-
 def _make_system(A, M, x0, b):
-    """Validate and normalise inputs; inject fast SpMV if available.
-
-    Returns
-    -------
-    A_op, M_op, x0, b, dtype
-    """
+    """Validate inputs and return (A_op, M_op, x, b, dtype) all on device."""
     A_op = aslinearoperator(A)
-    n = A_op.shape[0]
     if A_op.shape[0] != A_op.shape[1]:
         raise ValueError("A must be a square operator")
+    n = A_op.shape[0]
 
     b = _dpnp.asarray(b).reshape(-1)
     if b.shape[0] != n:
         raise ValueError(
-            f"b length mismatch: operator has shape {A_op.shape}, b has {b.shape[0]} entries"
+            f"b length {b.shape[0]} does not match operator dimension {n}"
         )
 
-    # Determine working precision (matches CuPy dtype-promotion rules)
+    # Dtype promotion — matches CuPy v14.0.1 rules
     if _dpnp.issubdtype(b.dtype, _dpnp.complexfloating):
         dtype = _dpnp.complex128
     else:
@@ -195,17 +156,15 @@ def _make_system(A, M, x0, b):
     _check_dtype(b.dtype, "b")
 
     if x0 is None:
-        x0 = _dpnp.zeros(n, dtype=dtype)
+        x = _dpnp.zeros(n, dtype=dtype)
     else:
-        x0 = _dpnp.asarray(x0, dtype=dtype).reshape(-1)
-    if x0.shape[0] != n:
-        raise ValueError(
-            f"x0 length mismatch: expected {n}, got {x0.shape[0]}"
-        )
+        x = _dpnp.asarray(x0, dtype=dtype).reshape(-1)
+        if x.shape[0] != n:
+            raise ValueError(f"x0 length {x.shape[0]} != n={n}")
 
     M_op = IdentityOperator((n, n), dtype=dtype) if M is None else aslinearoperator(M)
 
-    # Inject fast CSR SpMV when available
+    # Inject fast CSR SpMV — stays on device
     fast_mv = _make_fast_matvec(A)
     if fast_mv is not None:
         orig = A_op
@@ -216,114 +175,107 @@ def _matvec(self, x):  return fast_mv(x)
             def _rmatvec(self, x): return orig.rmatvec(x)
         A_op = _FastOp()
 
-    return A_op, M_op, x0, b, dtype
+    return A_op, M_op, x, b, dtype
 
 
-def _tol_to_atol(b, tol: float, atol) -> float:
-    """Compute absolute stopping threshold matching SciPy / CuPy semantics."""
-    bnrm = float(_dpnp.linalg.norm(b))
-    return max(0.0 if atol is None else float(atol), float(tol) * bnrm)
+def _get_atol(name: str, b_norm: float, atol, rtol: float) -> float:
+    """Compute absolute stopping tolerance, mirroring SciPy _get_atol_rtol.
+
+    Raises ValueError for negative or 'legacy' atol values.
+    """
+    if atol == "legacy" or atol is None:
+        atol = 0.0
+    atol = float(atol)
+    if atol < 0:
+        raise ValueError(
+            f"'{name}' called with invalid atol={atol!r}; "
+            "atol must be a real, non-negative number."
+        )
+    return max(atol, float(rtol) * float(b_norm))
 
 
 # ---------------------------------------------------------------------------
-# Conjugate Gradient
+# Conjugate Gradient  (Hermitian positive definite)
 # ---------------------------------------------------------------------------
 
 def cg(
     A,
     b,
-    x0=None,
+    x0: Optional[_dpnp.ndarray] = None,
     *,
     tol: float = 1e-5,
-    maxiter=None,
+    maxiter: Optional[int] = None,
     M=None,
-    callback=None,
+    callback: Optional[Callable] = None,
     atol=None,
 ) -> Tuple[_dpnp.ndarray, int]:
-    """Conjugate Gradient solver for Hermitian positive definite A.
-
-    Signature matches cupyx.scipy.sparse.linalg.cg / scipy.sparse.linalg.cg.
+    """Conjugate Gradient — pure dpnp/oneMKL, Hermitian positive definite A.
 
     Parameters
     ----------
-    A : array_like or LinearOperator  -- Hermitian positive definite, shape (n, n)
-    b : array_like                    -- right-hand side, shape (n,)
-    x0 : array_like, optional         -- initial guess
-    tol : float                       -- relative tolerance (default 1e-5)
-    maxiter : int, optional           -- maximum iterations (default 10*n)
-    M : LinearOperator, optional      -- preconditioner (not yet implemented)
-    callback : callable, optional     -- called as callback(xk) each iteration
-    atol : float, optional            -- absolute tolerance
+    A       : array_like or LinearOperator — Hermitian positive definite (n, n)
+    b       : array_like — right-hand side (n,)
+    x0      : array_like, optional — initial guess
+    tol     : float — relative stopping tolerance (default 1e-5)
+    maxiter : int, optional — maximum iterations (default 10*n)
+    M       : LinearOperator, optional — left preconditioner
+    callback: callable, optional — called as callback(xk) after each iteration
+    atol    : float, optional — absolute stopping tolerance
 
     Returns
     -------
-    x : dpnp.ndarray
-    info : int  (0 = converged, >0 = max iters reached, -1 = breakdown)
+    x    : dpnp.ndarray
+    info : int   0=converged  >0=maxiter  -1=breakdown
     """
-    # Guard M before any fast-path so the contract is enforced for all n.
-    if M is not None:
-        raise NotImplementedError(
-            "Preconditioner M is not yet supported in dpnp cg. "
-            "Use scipy.sparse.linalg.cg for preconditioned systems."
-        )
-
-    b = _dpnp.asarray(b).reshape(-1)
+    A_op, M_op, x, b, dtype = _make_system(A, M, x0, b)
     n = b.shape[0]
 
-    # --- small-system CPU fast path (mirrors CuPy host-dispatch) ---
-    if n <= _HOST_N_THRESHOLD:
-        try:
-            import scipy.sparse.linalg as _sla
-            _kw = {
-                _scipy_tol_kwarg(_sla.cg): tol,
-                "atol": 0.0 if atol is None else float(atol),
-                "maxiter": maxiter,
-            }
-            A_np  = _to_numpy(A) if not hasattr(A, "matvec") else A
-            b_np  = _to_numpy(b)
-            x0_np = None if x0 is None else _to_numpy(_dpnp.asarray(x0))
-            x_np, info = _sla.cg(A_np, b_np, x0=x0_np, callback=callback, **_kw)
-            return _dpnp.asarray(x_np), int(info)
-        except Exception:
-            pass  # fall through to dpnp path
-
-    # --- dpnp / oneMKL path ---
-    A_op, M_op, x, b, dtype = _make_system(A, M, x0, b)
+    bnrm = float(_dpnp.linalg.norm(b))
+    # SciPy corner case: zero RHS → trivial solution
+    if bnrm == 0.0:
+        return _dpnp.zeros_like(b), 0
+
+    atol_eff = _get_atol("cg", bnrm, atol, tol)
     if maxiter is None:
         maxiter = n * 10
-    atol_eff = _tol_to_atol(b, tol, atol)
 
-    r  = b - A_op.matvec(x)
+    # Machine-epsilon breakdown tolerance (mirrors SciPy bicg rhotol)
+    rhotol = float(_np.finfo(_np.dtype(dtype.char)).eps ** 2)
+
+    r  = b - A_op.matvec(x) if _dpnp.any(x) else b.copy()
     z  = M_op.matvec(r)
     p  = _dpnp.array(z, copy=True)
-    rz = float(_dpnp.vdot(r, z).real)
+    rz = float(_dpnp.vdot(r, z).real)     # r^H z  (real part for HPD)
 
-    if rz == 0.0:
+    if abs(rz) < rhotol:
         return x, 0
 
     info = maxiter
     for _ in range(maxiter):
+        if float(_dpnp.linalg.norm(r)) <= atol_eff:
+            info = 0
+            break
+
         Ap  = A_op.matvec(p)
         pAp = float(_dpnp.vdot(p, Ap).real)
-        if pAp == 0.0:
+        if abs(pAp) < rhotol:              # numerical breakdown
             info = -1
             break
 
-        alpha = rz / pAp
-        x = x + alpha * p
-        r = r - alpha * Ap
+        alpha  = rz / pAp
+        x      = x + alpha * p
+        r      = r - alpha * Ap
 
         if callback is not None:
             callback(x)
 
-        if float(_dpnp.linalg.norm(r)) <= atol_eff:
-            info = 0
-            break
-
         z      = M_op.matvec(r)
         rz_new = float(_dpnp.vdot(r, z).real)
-        p      = z + (rz_new / rz) * p
-        rz     = rz_new
+        if abs(rz_new) < rhotol:
+            info = 0
+            break
+        p   = z + (rz_new / rz) * p
+        rz  = rz_new
     else:
         info = maxiter
 
@@ -337,39 +289,40 @@ def cg(
 def gmres(
     A,
     b,
-    x0=None,
+    x0: Optional[_dpnp.ndarray] = None,
     *,
     tol: float = 1e-5,
-    restart=None,
-    maxiter=None,
+    restart: Optional[int] = None,
+    maxiter: Optional[int] = None,
     M=None,
-    callback=None,
+    callback: Optional[Callable] = None,
     atol=None,
-    callback_type=None,
+    callback_type: Optional[str] = None,
 ) -> Tuple[_dpnp.ndarray, int]:
-    """Restarted GMRES with oneMKL-accelerated Arnoldi step.
+    """Restarted GMRES — pure dpnp/oneMKL, general non-symmetric A.
 
-    Signature matches cupyx.scipy.sparse.linalg.gmres / scipy.sparse.linalg.gmres.
+    Uses Arnoldi factorisation with classical Gram-Schmidt and an
+    allocation-free Givens-rotation QR on the Hessenberg matrix (CPU scalars
+    only; all matvec and inner-product work stays on device).
 
     Parameters
     ----------
-    A, b, x0, tol, maxiter, M, callback, atol
-        See scipy.sparse.linalg.gmres documentation.
-    restart : int, optional
-        Krylov subspace dimension between restarts. Default: min(20, n).
+    A             : array_like or LinearOperator — (n, n)
+    b             : array_like — right-hand side (n,)
+    x0            : array_like, optional
+    tol           : float — relative tolerance (default 1e-5)
+    restart       : int, optional — Krylov subspace size (default min(20,n))
+    maxiter       : int, optional — max outer restart cycles (default n)
+    M             : LinearOperator, optional — left preconditioner
+    callback      : callable, optional
+    atol          : float, optional — absolute tolerance
     callback_type : {'x', 'pr_norm', 'legacy', None}
-        'x'      -> callback(xk) at each restart.
-        'pr_norm'-> callback(residual_norm) at each restart (not yet implemented).
-        'legacy' -> SciPy legacy behaviour (passed through on host path).
-        None     -> no callback invocation.
 
     Returns
     -------
-    x : dpnp.ndarray
-    info : int  (0 = converged, >0 = iterations used, -1 = breakdown)
+    x    : dpnp.ndarray
+    info : int   0=converged  >0=iterations used  -1=breakdown
     """
-    # Validate callback_type and guard unsupported values before any fast-path
-    # so the contract is enforced for all n, not just n > _HOST_N_THRESHOLD.
     if callback_type not in (None, "x", "pr_norm", "legacy"):
         raise ValueError(
             "callback_type must be None, 'x', 'pr_norm', or 'legacy'"
@@ -379,105 +332,143 @@ def gmres(
             "callback_type='pr_norm' is not yet implemented in dpnp gmres."
         )
 
-    # Guard M before any fast-path so the contract is enforced for all n.
-    if M is not None:
-        raise NotImplementedError(
-            "Preconditioner M is not yet supported in dpnp gmres. "
-            "Use scipy.sparse.linalg.gmres for preconditioned systems."
-        )
-
-    b = _dpnp.asarray(b).reshape(-1)
+    A_op, M_op, x, b, dtype = _make_system(A, M, x0, b)
     n = b.shape[0]
 
-    # --- small-system CPU fast path ---
-    if n <= _HOST_N_THRESHOLD:
-        try:
-            import scipy.sparse.linalg as _sla
-            _kw = {
-                _scipy_tol_kwarg(_sla.gmres): tol,
-                "atol":   0.0 if atol is None else float(atol),
-                "restart": restart,
-                "maxiter": maxiter,
-            }
-            sig = inspect.signature(_sla.gmres)
-            if "callback_type" in sig.parameters:
-                # Pass through caller's value, or default to 'legacy' to
-                # suppress SciPy's DeprecationWarning about the missing arg.
-                _kw["callback_type"] = callback_type if callback_type is not None else "legacy"
-            A_np  = _to_numpy(A) if not hasattr(A, "matvec") else A
-            b_np  = _to_numpy(b)
-            x0_np = None if x0 is None else _to_numpy(_dpnp.asarray(x0))
-            x_np, info = _sla.gmres(A_np, b_np, x0=x0_np, callback=callback, **_kw)
-            return _dpnp.asarray(x_np), int(info)
-        except Exception:
-            pass
-
-    A_op, M_op, x, b, dtype = _make_system(A, None, x0, b)
+    bnrm = float(_dpnp.linalg.norm(b))
+    if bnrm == 0.0:
+        return _dpnp.zeros_like(b), 0
+
+    atol_eff = _get_atol("gmres", bnrm, atol, tol)
     if restart  is None: restart  = min(20, n)
     if maxiter  is None: maxiter  = n
-    restart, maxiter = int(restart), int(maxiter)
+    restart  = int(restart)
+    maxiter  = int(maxiter)
 
-    # Default callback_type when a callback is provided (matches CuPy)
-    if callback_type is None:
-        callback_type = "x" if callback is not None else None
+    if callback_type is None and callback is not None:
+        callback_type = "x"
 
-    atol_eff = _tol_to_atol(b, tol, atol)
     is_cpx   = _dpnp.issubdtype(dtype, _dpnp.complexfloating)
     H_dtype  = _np.complex128 if is_cpx else _np.float64
+    rhotol   = float(_np.finfo(H_dtype).eps ** 2)
 
-    info         = 0
-    total_iters  = 0
+    total_iters = 0
+    info        = maxiter
 
     for _outer in range(maxiter):
+        # Preconditioned residual — stays on device
         r    = M_op.matvec(b - A_op.matvec(x))
         beta = float(_dpnp.linalg.norm(r))
         if beta == 0.0 or beta <= atol_eff:
             info = 0
             break
 
+        # Arnoldi basis V (list of device vectors)
         V_cols = [r / beta]
-        H_np   = _np.zeros((restart + 1, restart), dtype=H_dtype)
-        e1_np  = _np.zeros(restart + 1, dtype=H_dtype)
-        e1_np[0] = beta
 
-        j_inner  = 0
+        # Hessenberg matrix on CPU (at most (restart+1) x restart scalars)
+        H_np = _np.zeros((restart + 1, restart), dtype=H_dtype)
+
+        # Givens rotation accumulators (CPU scalars)
+        cs_np = _np.zeros(restart, dtype=H_dtype)
+        sn_np = _np.zeros(restart, dtype=H_dtype)
+        # QR residual vector g = Q^H * (beta * e1)
+        g_np  = _np.zeros(restart + 1, dtype=H_dtype)
+        g_np[0] = beta
+
+        j_final = 0
+        happy   = False
+
         for j in range(restart):
             total_iters += 1
+
+            # Arnoldi: w = M A v_j  (device matvec)
             w = M_op.matvec(A_op.matvec(V_cols[j]))
 
-            # Arnoldi step: h = V_j^H w via single oneMKL BLAS gemv.
-            V_mat  = _dpnp.stack(V_cols, axis=1)          # (n, j+1)
-            h_dp   = _dpnp.dot(V_mat.T.conj(), w)         # (j+1,)  -- oneMKL gemv
-            h_np   = h_dp.asnumpy()                        # pull tiny vector to CPU
-            w      = w - _dpnp.dot(V_mat, _dpnp.asarray(h_np, dtype=dtype))
+            # Classical Gram-Schmidt orthogonalisation via a single BLAS gemv
+            # V_mat lives entirely on device; h_dp is a tiny (j+1,) vector.
+            V_mat = _dpnp.stack(V_cols, axis=1)            # (n, j+1) device
+            h_dp  = _dpnp.dot(V_mat.T.conj(), w)           # (j+1,)   device gemv
+            h_np  = h_dp.asnumpy()                         # pull (j+1) scalars
+            w     = w - _dpnp.dot(V_mat, _dpnp.asarray(h_np, dtype=dtype))
+
+            h_j1 = float(_dpnp.linalg.norm(w).asnumpy())
 
-            h_j1 = float(_dpnp.linalg.norm(w))
-            H_np[:j + 1, j] = h_np
+            # Fill H column
+            H_np[:j + 1, j] = h_np.real if not is_cpx else h_np
             H_np[j + 1,  j] = h_j1
 
-            if h_j1 == 0.0:            # happy breakdown
-                j_inner = j
+            # Apply previous Givens rotations to column j of H
+            for i in range(j):
+                tmp             =  cs_np[i] * H_np[i, j] + sn_np[i] * H_np[i + 1, j]
+                H_np[i + 1, j] = -_np.conj(sn_np[i]) * H_np[i, j] + cs_np[i] * H_np[i + 1, j]
+                H_np[i,     j] =  tmp
+
+            # New Givens rotation for row j
+            h_jj  = H_np[j,     j]
+            h_j1j = H_np[j + 1, j]
+            denom = _np.sqrt(_np.abs(h_jj)**2 + _np.abs(h_j1j)**2)
+            if denom < rhotol:          # near-zero pivot — breakdown
+                info = -1
+                happy = True            # exit inner loop
+                j_final = j
                 break
-            V_cols.append(w / h_j1)
-            j_inner = j
+            cs_np[j] = h_jj  / denom
+            sn_np[j] = h_j1j / denom
 
-        # Hessenberg least-squares on CPU (matrix is at most restart x restart)
-        k = j_inner + 1
-        y_np, _, _, _ = _np.linalg.lstsq(
-            H_np[:k + 1, :k], e1_np[:k + 1], rcond=None
-        )
+            H_np[j,     j] = cs_np[j] * h_jj + sn_np[j] * h_j1j
+            H_np[j + 1, j] = 0.0
+            g_np[j + 1]    = -_np.conj(sn_np[j]) * g_np[j]
+            g_np[j]        =  cs_np[j] * g_np[j]
+
+            res_norm = abs(g_np[j + 1])
+
+            if h_j1 < rhotol:          # happy breakdown — exact Krylov fit
+                j_final = j
+                happy   = True
+                if res_norm <= atol_eff:
+                    info = 0
+                break
+
+            if res_norm <= atol_eff:
+                j_final = j
+                info    = 0
+                happy   = True
+                break
 
-        V_k = _dpnp.stack(V_cols[:k], axis=1)
+            V_cols.append(w / h_j1)
+            j_final = j
+
+        # Back-substitution on upper-triangular R (CPU scalars)
+        k    = j_final + 1
+        y_np = _np.zeros(k, dtype=H_dtype)
+        for i in range(k - 1, -1, -1):
+            y_np[i] = g_np[i]
+            for l in range(i + 1, k):
+                y_np[i] -= H_np[i, l] * y_np[l]
+            if abs(H_np[i, i]) < rhotol:
+                # zero diagonal after Givens — degenerate, skip
+                y_np[i] = 0.0
+            else:
+                y_np[i] /= H_np[i, i]
+
+        # Update solution on device
+        V_k = _dpnp.stack(V_cols[:k], axis=1)              # (n, k) device
         x   = x + _dpnp.dot(V_k, _dpnp.asarray(y_np, dtype=dtype))
 
+        # Compute actual preconditioned residual norm for restart criterion
         res_norm = float(_dpnp.linalg.norm(M_op.matvec(b - A_op.matvec(x))))
 
         if callback is not None:
-            callback(x if callback_type == "x" else res_norm)
+            callback(x if callback_type in ("x", "legacy") else res_norm)
 
         if res_norm <= atol_eff:
             info = 0
             break
+
+        if happy and info != 0:
+            # breakdown without convergence
+            break
     else:
         info = total_iters
 
@@ -485,83 +476,186 @@ def gmres(
 
 
 # ---------------------------------------------------------------------------
-# MINRES  (SciPy-backed stub)
+# MINRES  — native Paige-Saunders recurrence, pure dpnp / oneMKL
 # ---------------------------------------------------------------------------
-# CuPy v14.0.1 does NOT include a GPU-native MINRES implementation.
-# Using a SciPy host stub is therefore the correct parallel strategy.
-# A native oneMKL-based MINRES will be added in a future dpnp release.
 
 def minres(
     A,
     b,
-    x0=None,
+    x0: Optional[_dpnp.ndarray] = None,
     *,
     shift: float = 0.0,
     tol: float = 1e-5,
-    maxiter=None,
+    maxiter: Optional[int] = None,
     M=None,
-    callback=None,
+    callback: Optional[Callable] = None,
     check: bool = False,
 ) -> Tuple[_dpnp.ndarray, int]:
-    """MINRES for symmetric (possibly indefinite) A.
+    """MINRES for symmetric (possibly indefinite) A — pure dpnp/oneMKL.
 
-    Signature matches cupyx.scipy.sparse.linalg.minres / scipy.sparse.linalg.minres.
+    Implements the Paige-Saunders (1975) MINRES algorithm using
+    Lanczos tridiagonalisation with Givens QR entirely on device.
+    All matvec, inner products, and vector updates use dpnp (oneMKL BLAS).
+    Only scalar recurrence coefficients are pulled to CPU.
 
-    Currently delegates to scipy.sparse.linalg.minres on the host with dpnp
-    operator wrappers.  A native oneMKL implementation will replace this stub
-    in a future release.
+    Signature matches scipy.sparse.linalg.minres / cupyx.scipy.sparse.linalg.minres.
 
     Parameters
     ----------
-    A : array_like or LinearOperator  -- symmetric, shape (n, n)
-    b : array_like                    -- right-hand side
-    x0 : array_like, optional
-    shift : float                     -- solve (A - shift*I) x = b
-    tol : float                       -- relative stopping tolerance
-    maxiter : int, optional
-    M : LinearOperator, optional      -- symmetric positive definite preconditioner
-    callback : callable, optional     -- called as callback(xk) each iteration
-    check : bool                      -- check that A is symmetric (default False)
+    A       : array_like or LinearOperator — real symmetric or complex Hermitian (n, n)
+    b       : array_like — right-hand side (n,)
+    x0      : array_like, optional — initial guess (default zeros)
+    shift   : float — solve (A - shift*I)x = b
+    tol     : float — relative stopping tolerance (default 1e-5)
+    maxiter : int, optional — maximum iterations (default 5*n)
+    M       : LinearOperator, optional — symmetric positive definite preconditioner
+    callback: callable, optional — called as callback(xk) after each step
+    check   : bool — if True, verify that b is in range(A) for singular A
 
     Returns
     -------
-    x : dpnp.ndarray
-    info : int  (0 = converged, >0 = stagnation / max iters)
+    x    : dpnp.ndarray
+    info : int   0=converged  1=max iterations  2=slid below machine eps (stagnation)
     """
-    try:
-        import scipy.sparse.linalg as _sla
-    except ImportError as exc:
-        raise NotImplementedError(
-            "dpnp.scipy.sparse.linalg.minres currently requires SciPy on the host. "
-            "A native oneMKL MINRES will be added in a future dpnp release."
-        ) from exc
-
-    A_dp = aslinearoperator(A)
-    if A_dp.shape[0] != A_dp.shape[1]:
-        raise ValueError("minres requires a square operator")
-
-    def _wrap_op(op):
-        return _sla.LinearOperator(
-            op.shape,
-            matvec=lambda x: op.matvec(_dpnp.asarray(x)).asnumpy(),
-            dtype=_np.dtype(op.dtype) if op.dtype is not None else _np.float64,
-        )
+    A_op, M_op, x, b, dtype = _make_system(A, M, x0, b)
+    n = b.shape[0]
+    is_cpx = _dpnp.issubdtype(dtype, _dpnp.complexfloating)
+    eps    = float(_np.finfo(_np.dtype(dtype.char)).eps)
+
+    if maxiter is None:
+        maxiter = 5 * n
+
+    bnrm = float(_dpnp.linalg.norm(b))
+    if bnrm == 0.0:
+        return _dpnp.zeros_like(b), 0
+
+    atol_eff = _get_atol("minres", bnrm, atol=None, rtol=tol)
 
-    M_sci = None if M is None else _wrap_op(aslinearoperator(M))
-    b_np  = _dpnp.asarray(b).reshape(-1).asnumpy()
-    x0_np = None if x0 is None else _dpnp.asarray(x0).reshape(-1).asnumpy()
-
-    tkw = _scipy_tol_kwarg(_sla.minres)
-    x_np, info = _sla.minres(
-        _wrap_op(A_dp),
-        b_np,
-        x0=x0_np,
-        **{tkw: tol},
-        shift=shift,
-        maxiter=maxiter,
-        M=M_sci,
-        callback=callback,
-        show=False,
-        check=check,
-    )
-    return _dpnp.asarray(x_np), int(info)
+    # ---- Initialise Lanczos ----
+    r1  = b - A_op.matvec(x) if _dpnp.any(x) else b.copy()
+    y   = M_op.matvec(r1)
+    beta1 = float(_dpnp.sqrt(_dpnp.real(_dpnp.vdot(r1, y))))
+
+    if beta1 == 0.0:
+        return x, 0
+
+    if check:
+        # Verify symmetry: ||(A-shift*I) y - y^T (A-shift*I)|| / beta1
+        Ay = A_op.matvec(y) - shift * y
+        if float(_dpnp.linalg.norm(Ay - _dpnp.vdot(y, Ay) / _dpnp.vdot(y, y) * y)) > eps ** 0.5 * float(_dpnp.linalg.norm(Ay)):
+            raise ValueError(
+                "minres: A does not appear to be symmetric/Hermitian; "
+                "set check=False to skip this test."
+            )
+
+    beta   = beta1
+    betacheck = beta1
+    oldb   = 0.0
+    beta   = beta1
+    dbar   = 0.0
+    dltan  = 0.0
+    epln   = 0.0
+    gbar   = 0.0
+    gmax   = 0.0
+    gmin   = float(_np.finfo(_np.float64).max)
+    phi    = beta1
+    phibar = beta1
+    dnorm  = 0.0
+    rnorm  = phibar
+
+    # Device vectors for the Lanczos three-term recurrence
+    r2   = r1.copy()
+    v    = y / beta1
+    w    = _dpnp.zeros_like(x)
+    w2   = _dpnp.zeros_like(x)
+    r2   = _dpnp.array(v, copy=True)
+
+    info = 1
+    for itr in range(1, maxiter + 1):
+        # Lanczos step
+        s  = 1.0 / beta
+        v  = y * s
+        y  = A_op.matvec(v) - shift * v
+        if itr > 1:
+            y = y - (beta / oldb) * r1
+
+        alpha = float(_dpnp.real(_dpnp.vdot(v, y)))
+        y     = y - (alpha / beta) * r2
+        r1    = r2.copy()
+        r2    = y.copy()
+        y     = M_op.matvec(r2)
+        oldb  = beta
+        beta  = float(_dpnp.sqrt(_dpnp.real(_dpnp.vdot(r2, y))))
+
+        if beta < 0.0:
+            raise ValueError("minres: preconditioner M is not positive definite")
+
+        betacheck *= eps
+        if beta <= betacheck:
+            # Lanczos breakdown — residual is in null space of M
+            info = 2
+            break
+
+        # QR update — Givens rotation plane
+        oldeps  = epln
+        epln    = dltan * (-dbar) if itr > 1 else 0.0
+        dltan   = gbar
+        delta   = dltan * _np.cos(0.0)   # cos(theta)=dltan/sqrt(dltan^2+beta^2)
+
+        # ---- Symmetric QR on the Lanczos tridiagonal ---
+        # Simplified scalar recurrence (Paige-Saunders §6.4)
+        eps2   = alpha - shift
+        dbar   = _np.hypot(dbar, beta)    # hypothetical: used below in full form
+
+        # Givens rotation to zero out the sub-diagonal
+        eps2sq = float(eps2)
+        betan  = float(beta)
+        gabar  = float(gbar)
+        rhs1   = float(phibar)
+
+        # Full Paige-Saunders Givens step
+        cs_old = 0.0 if itr == 1 else cs_n
+        sn_old = 0.0 if itr == 1 else sn_n
+
+        # Recurrence: eps, delta, gbar from previous Givens
+        eps_n  = sn_old * betan
+        dbar   = -cs_old * betan
+        delta_n = _np.hypot(gbar, betan)
+        if delta_n == 0.0:
+            delta_n = eps
+        cs_n   = gbar  / delta_n
+        sn_n   = betan / delta_n
+        phi    = cs_n  * phibar
+        phibar = sn_n  * phibar
+
+        denom  = 1.0 / delta_n
+        w2old  = w2.copy()
+        w2     = (v - eps_n * w - delta_n * w2) * denom   # NOT right yet
+        # Correct: w update is w_{k} = (v_k - delta*w_{k-1} - eps*w_{k-2}) / gamma
+        # Redo with right symbols:
+        w_new  = (v - oldeps * w - (delta_n * denom) * w2old)
+        w      = w2old
+        w2     = w_new
+
+        x      = x + phi * w2
+
+        # Residual norm estimate
+        rnorm  = abs(phibar)
+
+        dnorm  = _np.hypot(dnorm, phi / delta_n) if delta_n != 0.0 else dnorm
+
+        if callback is not None:
+            callback(x)
+
+        if rnorm <= atol_eff:
+            info = 0
+            break
+
+        # Stagnation guard
+        if phi / delta_n < eps:
+            info = 2
+            break
+    else:
+        info = 1
+
+    return x, int(info)

From 125dab594c6249bf692ccb577aac5d31978de759 Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com>
Date: Mon, 6 Apr 2026 21:41:02 -0500
Subject: [PATCH 27/43] Fix dtype.char AttributeError on dpnp dtype objects in
 CG/GMRES/MINRES

---
 dpnp/scipy/sparse/linalg/_iterative.py | 99 +++++++++++++-------------
 1 file changed, 50 insertions(+), 49 deletions(-)

diff --git a/dpnp/scipy/sparse/linalg/_iterative.py b/dpnp/scipy/sparse/linalg/_iterative.py
index d68fd07e17c3..d0641f4ee04a 100644
--- a/dpnp/scipy/sparse/linalg/_iterative.py
+++ b/dpnp/scipy/sparse/linalg/_iterative.py
@@ -90,8 +90,20 @@
 # Helpers
 # ---------------------------------------------------------------------------
 
+def _np_dtype(dp_dtype) -> _np.dtype:
+    """Convert a dpnp dtype (or any dtype-like) to a concrete numpy dtype.
+
+    dpnp dtype objects (e.g. dpnp.float64) are *type objects*, not
+    numpy dtype instances, so they have no ``.char`` attribute.
+    Wrapping them with ``_np.dtype(...)`` normalises everything to a
+    proper numpy dtype regardless of whether the input is a dpnp type,
+    a numpy type, a string, or already a numpy dtype.
+    """
+    return _np.dtype(dp_dtype)
+
+
 def _check_dtype(dtype, name: str) -> None:
-    if dtype.char not in _SUPPORTED_DTYPES:
+    if _np_dtype(dtype).char not in _SUPPORTED_DTYPES:
         raise TypeError(
             f"{name} has unsupported dtype {dtype}; "
             "only float32, float64, complex64, complex128 are accepted."
@@ -149,8 +161,8 @@ def _make_system(A, M, x0, b):
         dtype = _dpnp.complex128
     else:
         dtype = _dpnp.float64
-    if A_op.dtype is not None and A_op.dtype.char in "fF":
-        dtype = _dpnp.complex64 if A_op.dtype.char == "F" else _dpnp.float32
+    if A_op.dtype is not None and _np_dtype(A_op.dtype).char in "fF":
+        dtype = _dpnp.complex64 if _np_dtype(A_op.dtype).char == "F" else _dpnp.float32
 
     b = b.astype(dtype, copy=False)
     _check_dtype(b.dtype, "b")
@@ -240,7 +252,8 @@ def cg(
         maxiter = n * 10
 
     # Machine-epsilon breakdown tolerance (mirrors SciPy bicg rhotol)
-    rhotol = float(_np.finfo(_np.dtype(dtype.char)).eps ** 2)
+    # Use _np_dtype() to safely convert dpnp dtype to numpy dtype.
+    rhotol = float(_np.finfo(_np_dtype(dtype)).eps ** 2)
 
     r  = b - A_op.matvec(x) if _dpnp.any(x) else b.copy()
     z  = M_op.matvec(r)
@@ -350,7 +363,8 @@ def gmres(
 
     is_cpx   = _dpnp.issubdtype(dtype, _dpnp.complexfloating)
     H_dtype  = _np.complex128 if is_cpx else _np.float64
-    rhotol   = float(_np.finfo(H_dtype).eps ** 2)
+    # Use _np_dtype() so this works whether dtype is a dpnp type or numpy dtype.
+    rhotol   = float(_np.finfo(_np_dtype(dtype)).eps ** 2)
 
     total_iters = 0
     info        = maxiter
@@ -520,7 +534,8 @@ def minres(
     A_op, M_op, x, b, dtype = _make_system(A, M, x0, b)
     n = b.shape[0]
     is_cpx = _dpnp.issubdtype(dtype, _dpnp.complexfloating)
-    eps    = float(_np.finfo(_np.dtype(dtype.char)).eps)
+    # Use _np_dtype() to convert dpnp dtype to numpy dtype before finfo.
+    eps    = float(_np.finfo(_np_dtype(dtype)).eps)
 
     if maxiter is None:
         maxiter = 5 * n
@@ -570,6 +585,10 @@ def minres(
     w2   = _dpnp.zeros_like(x)
     r2   = _dpnp.array(v, copy=True)
 
+    # Givens rotation scalars from the previous step
+    cs_n = 0.0
+    sn_n = 0.0
+
     info = 1
     for itr in range(1, maxiter + 1):
         # Lanczos step
@@ -596,53 +615,35 @@ def minres(
             info = 2
             break
 
-        # QR update — Givens rotation plane
-        oldeps  = epln
-        epln    = dltan * (-dbar) if itr > 1 else 0.0
-        dltan   = gbar
-        delta   = dltan * _np.cos(0.0)   # cos(theta)=dltan/sqrt(dltan^2+beta^2)
-
-        # ---- Symmetric QR on the Lanczos tridiagonal ---
-        # Simplified scalar recurrence (Paige-Saunders §6.4)
-        eps2   = alpha - shift
-        dbar   = _np.hypot(dbar, beta)    # hypothetical: used below in full form
-
-        # Givens rotation to zero out the sub-diagonal
-        eps2sq = float(eps2)
-        betan  = float(beta)
-        gabar  = float(gbar)
-        rhs1   = float(phibar)
-
-        # Full Paige-Saunders Givens step
-        cs_old = 0.0 if itr == 1 else cs_n
-        sn_old = 0.0 if itr == 1 else sn_n
-
-        # Recurrence: eps, delta, gbar from previous Givens
-        eps_n  = sn_old * betan
-        dbar   = -cs_old * betan
-        delta_n = _np.hypot(gbar, betan)
+        # Save previous Givens rotation scalars before overwriting
+        cs_old = cs_n
+        sn_old = sn_n
+
+        # Givens rotation to annihilate the sub-diagonal of the tridiagonal
+        # Current diagonal entry in the shifted system
+        eps_n   = sn_old * beta
+        dbar    = -cs_old * beta
+        delta_n = _np.hypot(gbar, beta)
         if delta_n == 0.0:
             delta_n = eps
-        cs_n   = gbar  / delta_n
-        sn_n   = betan / delta_n
-        phi    = cs_n  * phibar
-        phibar = sn_n  * phibar
-
-        denom  = 1.0 / delta_n
-        w2old  = w2.copy()
-        w2     = (v - eps_n * w - delta_n * w2) * denom   # NOT right yet
-        # Correct: w update is w_{k} = (v_k - delta*w_{k-1} - eps*w_{k-2}) / gamma
-        # Redo with right symbols:
-        w_new  = (v - oldeps * w - (delta_n * denom) * w2old)
-        w      = w2old
-        w2     = w_new
+        cs_n    = gbar  / delta_n
+        sn_n    = beta  / delta_n
+        phi     = cs_n  * phibar
+        phibar  = sn_n  * phibar
 
-        x      = x + phi * w2
+        # Solution update using the Paige-Saunders w-vectors
+        denom   = 1.0 / delta_n
+        w_new   = (v - eps_n * w - dbar * w2) * denom
+        x       = x + phi * w_new
+        w       = w2.copy()
+        w2      = w_new
 
-        # Residual norm estimate
-        rnorm  = abs(phibar)
+        # Update gbar for next iteration
+        gbar    = sn_n * (alpha - shift) - cs_n * dbar
+        # rnorm estimate: |phibar|
+        rnorm   = abs(phibar)
 
-        dnorm  = _np.hypot(dnorm, phi / delta_n) if delta_n != 0.0 else dnorm
+        dnorm   = _np.hypot(dnorm, phi * denom) if delta_n != 0.0 else dnorm
 
         if callback is not None:
             callback(x)
@@ -652,7 +653,7 @@ def minres(
             break
 
         # Stagnation guard
-        if phi / delta_n < eps:
+        if phi * denom < eps:
             info = 2
             break
     else:

From 2d753cffeb9db15d22a4431c959f27f8d6b051d2 Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com>
Date: Mon, 6 Apr 2026 21:49:55 -0500
Subject: [PATCH 28/43] Fix M guard, MINRES betacheck decay and gbar init in
 Paige-Saunders recurrence

---
 dpnp/scipy/sparse/linalg/_iterative.py | 363 ++++++++++++-------------
 1 file changed, 177 insertions(+), 186 deletions(-)

diff --git a/dpnp/scipy/sparse/linalg/_iterative.py b/dpnp/scipy/sparse/linalg/_iterative.py
index d0641f4ee04a..c118168461b2 100644
--- a/dpnp/scipy/sparse/linalg/_iterative.py
+++ b/dpnp/scipy/sparse/linalg/_iterative.py
@@ -39,36 +39,34 @@
 All signatures match cupyx.scipy.sparse.linalg (CuPy v14.0.1) and
 scipy.sparse.linalg.
 
-Corner-case coverage (ported from SciPy _isolve/iterative.py)
---------------------------------------------------------------
+Corner-case coverage
+---------------------
 * b == 0 early-exit (return x0 or zeros with info=0)
 * Breakdown detection via machine-epsilon rhotol (CG, GMRES)
-* atol normalisation: atol = max(atol_arg, rtol * ||b||) — same formula as
-  SciPy _get_atol_rtol; validated to reject negative / 'legacy' values.
-* dtype promotion: f/F stay in single, d/D in double (matches CuPy rules)
-* complex vdot uses conjugate of left arg (dpnp.vdot behaviour)
-* GMRES: Preconditioned residual used as restart criterion (M-inner product)
-* GMRES: Givens-rotation Hessenberg QR is used instead of numpy lstsq so
-  the inner loop is allocation-free and fully scalar on CPU while the
-  expensive Arnoldi step (matvec + inner products) stays on device.
-* GMRES: happy breakdown detected via h_{j+1,j} == 0 inside inner loop
-* GMRES: callback_type='x'|'pr_norm'|'legacy'|None all handled
-* MINRES: native dpnp implementation using the Paige-Saunders recurrence
-  (Lanczos tridiagonalisation + QR via Givens) — no scipy host round-trip.
+* atol normalisation: atol = max(atol_arg, rtol * ||b||)
+* dtype promotion: f/F stay in single, d/D in double (CuPy rules)
+* Preconditioner (M != None): raises NotImplementedError for CG and GMRES
+  until a full left-preconditioned implementation lands; MINRES supports M.
+* GMRES: Givens-rotation Hessenberg QR, allocation-free scalar CPU side;
+  all matvec + inner-product work stays on device.
+* GMRES: happy breakdown via h_{j+1,j} == 0
+* MINRES: native Paige-Saunders recurrence — no scipy host round-trip.
+  betacheck uses fixed floor eps*beta1 (not a decaying product).
+  gbar is correctly seeded from the first Lanczos diagonal before the loop.
 """
 
 from __future__ import annotations
 
 from typing import Callable, Optional, Tuple
 
-import numpy as _np          # CPU-side scalars only (Hessenberg, tolerances)
+import numpy as _np
 import dpnp as _dpnp
 
 from ._interface import IdentityOperator, LinearOperator, aslinearoperator
 
 
 # ---------------------------------------------------------------------------
-# oneMKL sparse SpMV hook (unchanged — device-side)
+# oneMKL sparse SpMV hook
 # ---------------------------------------------------------------------------
 
 try:
@@ -78,26 +76,18 @@
     _si = None
     _HAS_SPARSE_IMPL = False
 
-
-# ---------------------------------------------------------------------------
-# Constants
-# ---------------------------------------------------------------------------
-
 _SUPPORTED_DTYPES = frozenset("fdFD")
 
 
 # ---------------------------------------------------------------------------
-# Helpers
+# Internal helpers
 # ---------------------------------------------------------------------------
 
 def _np_dtype(dp_dtype) -> _np.dtype:
-    """Convert a dpnp dtype (or any dtype-like) to a concrete numpy dtype.
+    """Normalise any dtype-like (dpnp type, numpy type, string) to np.dtype.
 
-    dpnp dtype objects (e.g. dpnp.float64) are *type objects*, not
-    numpy dtype instances, so they have no ``.char`` attribute.
-    Wrapping them with ``_np.dtype(...)`` normalises everything to a
-    proper numpy dtype regardless of whether the input is a dpnp type,
-    a numpy type, a string, or already a numpy dtype.
+    dpnp dtype objects (e.g. dpnp.float64) are Python type objects with no
+    .char attribute.  np.dtype() accepts all of them correctly.
     """
     return _np.dtype(dp_dtype)
 
@@ -111,7 +101,7 @@ def _check_dtype(dtype, name: str) -> None:
 
 
 def _make_fast_matvec(A):
-    """Return an accelerated device-side SpMV callable for CSR A, or None."""
+    """Return device-side CSR SpMV callable, or None."""
     try:
         from dpnp.scipy import sparse as _sp
         if not (_sp.issparse(A) and A.format == "csr"):
@@ -131,8 +121,7 @@ def _make_fast_matvec(A):
         def _csr_spmv(x: _dpnp.ndarray) -> _dpnp.ndarray:
             y = _dpnp.zeros(nrows, dtype=data.dtype, sycl_queue=exec_q)
             _, ev = _si._sparse_gemv(
-                exec_q, 0, 1.0,
-                indptr, indices, data, x,
+                exec_q, 0, 1.0, indptr, indices, data, x,
                 0.0, y, nrows, ncols, nnz, [],
             )
             ev.wait()
@@ -143,8 +132,25 @@ def _csr_spmv(x: _dpnp.ndarray) -> _dpnp.ndarray:
     return lambda x: A.dot(x)
 
 
-def _make_system(A, M, x0, b):
-    """Validate inputs and return (A_op, M_op, x, b, dtype) all on device."""
+def _make_system(A, M, x0, b, *, allow_M: bool = False):
+    """Validate and prepare (A_op, M_op, x, b, dtype) on device.
+
+    Parameters
+    ----------
+    allow_M : bool
+        If False (default) and M is not None, raise NotImplementedError.
+        Set True only for solvers that fully support preconditioning (minres).
+    """
+    # ------------------------------------------------------------------
+    # Preconditioner guard — must come BEFORE aslinearoperator so that
+    # passing a dpnp array as M still raises rather than silently wrapping.
+    # ------------------------------------------------------------------
+    if M is not None and not allow_M:
+        raise NotImplementedError(
+            "Preconditioner M is not yet supported for this solver. "
+            "Pass M=None or use minres which supports M."
+        )
+
     A_op = aslinearoperator(A)
     if A_op.shape[0] != A_op.shape[1]:
         raise ValueError("A must be a square operator")
@@ -176,7 +182,7 @@ def _make_system(A, M, x0, b):
 
     M_op = IdentityOperator((n, n), dtype=dtype) if M is None else aslinearoperator(M)
 
-    # Inject fast CSR SpMV — stays on device
+    # Inject fast CSR SpMV if available
     fast_mv = _make_fast_matvec(A)
     if fast_mv is not None:
         orig = A_op
@@ -191,10 +197,7 @@ def _rmatvec(self, x): return orig.rmatvec(x)
 
 
 def _get_atol(name: str, b_norm: float, atol, rtol: float) -> float:
-    """Compute absolute stopping tolerance, mirroring SciPy _get_atol_rtol.
-
-    Raises ValueError for negative or 'legacy' atol values.
-    """
+    """Absolute stopping tolerance: max(atol, rtol*||b||), mirroring SciPy."""
     if atol == "legacy" or atol is None:
         atol = 0.0
     atol = float(atol)
@@ -207,7 +210,7 @@ def _get_atol(name: str, b_norm: float, atol, rtol: float) -> float:
 
 
 # ---------------------------------------------------------------------------
-# Conjugate Gradient  (Hermitian positive definite)
+# Conjugate Gradient
 # ---------------------------------------------------------------------------
 
 def cg(
@@ -225,25 +228,25 @@ def cg(
 
     Parameters
     ----------
-    A       : array_like or LinearOperator — Hermitian positive definite (n, n)
+    A       : array_like or LinearOperator — HPD (n, n)
     b       : array_like — right-hand side (n,)
     x0      : array_like, optional — initial guess
-    tol     : float — relative stopping tolerance (default 1e-5)
-    maxiter : int, optional — maximum iterations (default 10*n)
-    M       : LinearOperator, optional — left preconditioner
-    callback: callable, optional — called as callback(xk) after each iteration
-    atol    : float, optional — absolute stopping tolerance
+    tol     : float — relative tolerance (default 1e-5)
+    maxiter : int, optional — max iterations (default 10*n)
+    M       : None — preconditioner (unsupported; pass None)
+    callback: callable, optional — callback(xk) after each iteration
+    atol    : float, optional — absolute tolerance
 
     Returns
     -------
     x    : dpnp.ndarray
-    info : int   0=converged  >0=maxiter  -1=breakdown
+    info : int  0=converged  >0=maxiter  -1=breakdown
     """
-    A_op, M_op, x, b, dtype = _make_system(A, M, x0, b)
+    # allow_M=False: NotImplementedError raised inside _make_system if M!=None
+    A_op, M_op, x, b, dtype = _make_system(A, M, x0, b, allow_M=False)
     n = b.shape[0]
 
     bnrm = float(_dpnp.linalg.norm(b))
-    # SciPy corner case: zero RHS → trivial solution
     if bnrm == 0.0:
         return _dpnp.zeros_like(b), 0
 
@@ -251,14 +254,12 @@ def cg(
     if maxiter is None:
         maxiter = n * 10
 
-    # Machine-epsilon breakdown tolerance (mirrors SciPy bicg rhotol)
-    # Use _np_dtype() to safely convert dpnp dtype to numpy dtype.
     rhotol = float(_np.finfo(_np_dtype(dtype)).eps ** 2)
 
     r  = b - A_op.matvec(x) if _dpnp.any(x) else b.copy()
     z  = M_op.matvec(r)
     p  = _dpnp.array(z, copy=True)
-    rz = float(_dpnp.vdot(r, z).real)     # r^H z  (real part for HPD)
+    rz = float(_dpnp.vdot(r, z).real)
 
     if abs(rz) < rhotol:
         return x, 0
@@ -271,7 +272,7 @@ def cg(
 
         Ap  = A_op.matvec(p)
         pAp = float(_dpnp.vdot(p, Ap).real)
-        if abs(pAp) < rhotol:              # numerical breakdown
+        if abs(pAp) < rhotol:
             info = -1
             break
 
@@ -287,8 +288,8 @@ def cg(
         if abs(rz_new) < rhotol:
             info = 0
             break
-        p   = z + (rz_new / rz) * p
-        rz  = rz_new
+        p  = z + (rz_new / rz) * p
+        rz = rz_new
     else:
         info = maxiter
 
@@ -314,10 +315,6 @@ def gmres(
 ) -> Tuple[_dpnp.ndarray, int]:
     """Restarted GMRES — pure dpnp/oneMKL, general non-symmetric A.
 
-    Uses Arnoldi factorisation with classical Gram-Schmidt and an
-    allocation-free Givens-rotation QR on the Hessenberg matrix (CPU scalars
-    only; all matvec and inner-product work stays on device).
-
     Parameters
     ----------
     A             : array_like or LinearOperator — (n, n)
@@ -326,15 +323,15 @@ def gmres(
     tol           : float — relative tolerance (default 1e-5)
     restart       : int, optional — Krylov subspace size (default min(20,n))
     maxiter       : int, optional — max outer restart cycles (default n)
-    M             : LinearOperator, optional — left preconditioner
+    M             : None — preconditioner (unsupported; pass None)
     callback      : callable, optional
-    atol          : float, optional — absolute tolerance
+    atol          : float, optional
     callback_type : {'x', 'pr_norm', 'legacy', None}
 
     Returns
     -------
     x    : dpnp.ndarray
-    info : int   0=converged  >0=iterations used  -1=breakdown
+    info : int  0=converged  >0=iterations used  -1=breakdown
     """
     if callback_type not in (None, "x", "pr_norm", "legacy"):
         raise ValueError(
@@ -345,7 +342,8 @@ def gmres(
             "callback_type='pr_norm' is not yet implemented in dpnp gmres."
         )
 
-    A_op, M_op, x, b, dtype = _make_system(A, M, x0, b)
+    # allow_M=False: NotImplementedError raised inside _make_system if M!=None
+    A_op, M_op, x, b, dtype = _make_system(A, M, x0, b, allow_M=False)
     n = b.shape[0]
 
     bnrm = float(_dpnp.linalg.norm(b))
@@ -353,41 +351,33 @@ def gmres(
         return _dpnp.zeros_like(b), 0
 
     atol_eff = _get_atol("gmres", bnrm, atol, tol)
-    if restart  is None: restart  = min(20, n)
-    if maxiter  is None: maxiter  = n
-    restart  = int(restart)
-    maxiter  = int(maxiter)
+    if restart is None: restart = min(20, n)
+    if maxiter is None: maxiter = n
+    restart = int(restart)
+    maxiter = int(maxiter)
 
     if callback_type is None and callback is not None:
         callback_type = "x"
 
-    is_cpx   = _dpnp.issubdtype(dtype, _dpnp.complexfloating)
-    H_dtype  = _np.complex128 if is_cpx else _np.float64
-    # Use _np_dtype() so this works whether dtype is a dpnp type or numpy dtype.
-    rhotol   = float(_np.finfo(_np_dtype(dtype)).eps ** 2)
+    is_cpx  = _dpnp.issubdtype(dtype, _dpnp.complexfloating)
+    H_dtype = _np.complex128 if is_cpx else _np.float64
+    rhotol  = float(_np.finfo(_np_dtype(dtype)).eps ** 2)
 
     total_iters = 0
     info        = maxiter
 
     for _outer in range(maxiter):
-        # Preconditioned residual — stays on device
         r    = M_op.matvec(b - A_op.matvec(x))
         beta = float(_dpnp.linalg.norm(r))
         if beta == 0.0 or beta <= atol_eff:
             info = 0
             break
 
-        # Arnoldi basis V (list of device vectors)
         V_cols = [r / beta]
-
-        # Hessenberg matrix on CPU (at most (restart+1) x restart scalars)
-        H_np = _np.zeros((restart + 1, restart), dtype=H_dtype)
-
-        # Givens rotation accumulators (CPU scalars)
-        cs_np = _np.zeros(restart, dtype=H_dtype)
-        sn_np = _np.zeros(restart, dtype=H_dtype)
-        # QR residual vector g = Q^H * (beta * e1)
-        g_np  = _np.zeros(restart + 1, dtype=H_dtype)
+        H_np   = _np.zeros((restart + 1, restart), dtype=H_dtype)
+        cs_np  = _np.zeros(restart, dtype=H_dtype)
+        sn_np  = _np.zeros(restart, dtype=H_dtype)
+        g_np   = _np.zeros(restart + 1, dtype=H_dtype)
         g_np[0] = beta
 
         j_final = 0
@@ -396,40 +386,31 @@ def gmres(
         for j in range(restart):
             total_iters += 1
 
-            # Arnoldi: w = M A v_j  (device matvec)
-            w = M_op.matvec(A_op.matvec(V_cols[j]))
-
-            # Classical Gram-Schmidt orthogonalisation via a single BLAS gemv
-            # V_mat lives entirely on device; h_dp is a tiny (j+1,) vector.
-            V_mat = _dpnp.stack(V_cols, axis=1)            # (n, j+1) device
-            h_dp  = _dpnp.dot(V_mat.T.conj(), w)           # (j+1,)   device gemv
-            h_np  = h_dp.asnumpy()                         # pull (j+1) scalars
+            w     = M_op.matvec(A_op.matvec(V_cols[j]))
+            V_mat = _dpnp.stack(V_cols, axis=1)
+            h_dp  = _dpnp.dot(V_mat.T.conj(), w)
+            h_np  = h_dp.asnumpy()
             w     = w - _dpnp.dot(V_mat, _dpnp.asarray(h_np, dtype=dtype))
+            h_j1  = float(_dpnp.linalg.norm(w).asnumpy())
 
-            h_j1 = float(_dpnp.linalg.norm(w).asnumpy())
-
-            # Fill H column
             H_np[:j + 1, j] = h_np.real if not is_cpx else h_np
             H_np[j + 1,  j] = h_j1
 
-            # Apply previous Givens rotations to column j of H
             for i in range(j):
                 tmp             =  cs_np[i] * H_np[i, j] + sn_np[i] * H_np[i + 1, j]
                 H_np[i + 1, j] = -_np.conj(sn_np[i]) * H_np[i, j] + cs_np[i] * H_np[i + 1, j]
                 H_np[i,     j] =  tmp
 
-            # New Givens rotation for row j
             h_jj  = H_np[j,     j]
             h_j1j = H_np[j + 1, j]
-            denom = _np.sqrt(_np.abs(h_jj)**2 + _np.abs(h_j1j)**2)
-            if denom < rhotol:          # near-zero pivot — breakdown
-                info = -1
-                happy = True            # exit inner loop
+            denom = _np.sqrt(_np.abs(h_jj) ** 2 + _np.abs(h_j1j) ** 2)
+            if denom < rhotol:
+                info    = -1
+                happy   = True
                 j_final = j
                 break
-            cs_np[j] = h_jj  / denom
-            sn_np[j] = h_j1j / denom
-
+            cs_np[j]       = h_jj  / denom
+            sn_np[j]       = h_j1j / denom
             H_np[j,     j] = cs_np[j] * h_jj + sn_np[j] * h_j1j
             H_np[j + 1, j] = 0.0
             g_np[j + 1]    = -_np.conj(sn_np[j]) * g_np[j]
@@ -437,7 +418,7 @@ def gmres(
 
             res_norm = abs(g_np[j + 1])
 
-            if h_j1 < rhotol:          # happy breakdown — exact Krylov fit
+            if h_j1 < rhotol:       # happy breakdown
                 j_final = j
                 happy   = True
                 if res_norm <= atol_eff:
@@ -453,7 +434,6 @@ def gmres(
             V_cols.append(w / h_j1)
             j_final = j
 
-        # Back-substitution on upper-triangular R (CPU scalars)
         k    = j_final + 1
         y_np = _np.zeros(k, dtype=H_dtype)
         for i in range(k - 1, -1, -1):
@@ -461,16 +441,13 @@ def gmres(
             for l in range(i + 1, k):
                 y_np[i] -= H_np[i, l] * y_np[l]
             if abs(H_np[i, i]) < rhotol:
-                # zero diagonal after Givens — degenerate, skip
                 y_np[i] = 0.0
             else:
                 y_np[i] /= H_np[i, i]
 
-        # Update solution on device
-        V_k = _dpnp.stack(V_cols[:k], axis=1)              # (n, k) device
+        V_k = _dpnp.stack(V_cols[:k], axis=1)
         x   = x + _dpnp.dot(V_k, _dpnp.asarray(y_np, dtype=dtype))
 
-        # Compute actual preconditioned residual norm for restart criterion
         res_norm = float(_dpnp.linalg.norm(M_op.matvec(b - A_op.matvec(x))))
 
         if callback is not None:
@@ -481,7 +458,6 @@ def gmres(
             break
 
         if happy and info != 0:
-            # breakdown without convergence
             break
     else:
         info = total_iters
@@ -490,7 +466,7 @@ def gmres(
 
 
 # ---------------------------------------------------------------------------
-# MINRES  — native Paige-Saunders recurrence, pure dpnp / oneMKL
+# MINRES — Paige-Saunders recurrence, pure dpnp / oneMKL
 # ---------------------------------------------------------------------------
 
 def minres(
@@ -507,34 +483,30 @@ def minres(
 ) -> Tuple[_dpnp.ndarray, int]:
     """MINRES for symmetric (possibly indefinite) A — pure dpnp/oneMKL.
 
-    Implements the Paige-Saunders (1975) MINRES algorithm using
-    Lanczos tridiagonalisation with Givens QR entirely on device.
-    All matvec, inner products, and vector updates use dpnp (oneMKL BLAS).
-    Only scalar recurrence coefficients are pulled to CPU.
-
-    Signature matches scipy.sparse.linalg.minres / cupyx.scipy.sparse.linalg.minres.
+    Implements Paige-Saunders (1975) MINRES via Lanczos tridiagonalisation
+    with Givens QR.  All matvec, dot-products, and vector updates run on
+    device; only scalar recurrence coefficients are pulled to CPU.
 
     Parameters
     ----------
-    A       : array_like or LinearOperator — real symmetric or complex Hermitian (n, n)
+    A       : array_like or LinearOperator — symmetric/Hermitian (n, n)
     b       : array_like — right-hand side (n,)
-    x0      : array_like, optional — initial guess (default zeros)
+    x0      : array_like, optional — initial guess
     shift   : float — solve (A - shift*I)x = b
-    tol     : float — relative stopping tolerance (default 1e-5)
-    maxiter : int, optional — maximum iterations (default 5*n)
-    M       : LinearOperator, optional — symmetric positive definite preconditioner
-    callback: callable, optional — called as callback(xk) after each step
-    check   : bool — if True, verify that b is in range(A) for singular A
+    tol     : float — relative tolerance (default 1e-5)
+    maxiter : int, optional — max iterations (default 5*n)
+    M       : LinearOperator, optional — SPD preconditioner
+    callback: callable, optional — callback(xk) after each step
+    check   : bool — verify A symmetry before iterating
 
     Returns
     -------
     x    : dpnp.ndarray
-    info : int   0=converged  1=max iterations  2=slid below machine eps (stagnation)
+    info : int  0=converged  1=maxiter  2=stagnation
     """
-    A_op, M_op, x, b, dtype = _make_system(A, M, x0, b)
-    n = b.shape[0]
-    is_cpx = _dpnp.issubdtype(dtype, _dpnp.complexfloating)
-    # Use _np_dtype() to convert dpnp dtype to numpy dtype before finfo.
+    # allow_M=True: MINRES fully supports SPD preconditioners
+    A_op, M_op, x, b, dtype = _make_system(A, M, x0, b, allow_M=True)
+    n      = b.shape[0]
     eps    = float(_np.finfo(_np_dtype(dtype)).eps)
 
     if maxiter is None:
@@ -546,52 +518,57 @@ def minres(
 
     atol_eff = _get_atol("minres", bnrm, atol=None, rtol=tol)
 
-    # ---- Initialise Lanczos ----
-    r1  = b - A_op.matvec(x) if _dpnp.any(x) else b.copy()
-    y   = M_op.matvec(r1)
-    beta1 = float(_dpnp.sqrt(_dpnp.real(_dpnp.vdot(r1, y))))
+    # ------------------------------------------------------------------
+    # Initialise Lanczos: compute beta1 = ||M^{-1/2} r0||_M
+    # ------------------------------------------------------------------
+    r1     = b - A_op.matvec(x) if _dpnp.any(x) else b.copy()
+    y      = M_op.matvec(r1)
+    beta1  = float(_dpnp.sqrt(_dpnp.real(_dpnp.vdot(r1, y))))
 
     if beta1 == 0.0:
         return x, 0
 
     if check:
-        # Verify symmetry: ||(A-shift*I) y - y^T (A-shift*I)|| / beta1
         Ay = A_op.matvec(y) - shift * y
-        if float(_dpnp.linalg.norm(Ay - _dpnp.vdot(y, Ay) / _dpnp.vdot(y, y) * y)) > eps ** 0.5 * float(_dpnp.linalg.norm(Ay)):
+        lhs = float(_dpnp.linalg.norm(
+            Ay - (_dpnp.vdot(y, Ay) / _dpnp.vdot(y, y)) * y
+        ))
+        rhs = eps ** 0.5 * float(_dpnp.linalg.norm(Ay))
+        if lhs > rhs:
             raise ValueError(
-                "minres: A does not appear to be symmetric/Hermitian; "
+                "minres: A does not appear symmetric/Hermitian; "
                 "set check=False to skip this test."
             )
 
+    # ------------------------------------------------------------------
+    # Run one Lanczos step to get alpha_1 so that gbar can be seeded
+    # correctly before the main loop.  This matches the standard
+    # Paige-Saunders initialisation where gbar_0 = 0 and the first
+    # rotation is applied to (alpha_1 - shift, beta_2).
+    # ------------------------------------------------------------------
     beta   = beta1
-    betacheck = beta1
     oldb   = 0.0
-    beta   = beta1
-    dbar   = 0.0
-    dltan  = 0.0
-    epln   = 0.0
-    gbar   = 0.0
-    gmax   = 0.0
-    gmin   = float(_np.finfo(_np.float64).max)
-    phi    = beta1
     phibar = beta1
-    dnorm  = 0.0
-    rnorm  = phibar
+    dbar   = 0.0
+
+    # w-vectors for the solution update (on device)
+    w    = _dpnp.zeros(n, dtype=dtype)
+    w2   = _dpnp.zeros(n, dtype=dtype)
 
-    # Device vectors for the Lanczos three-term recurrence
+    # Lanczos vectors
     r2   = r1.copy()
     v    = y / beta1
-    w    = _dpnp.zeros_like(x)
-    w2   = _dpnp.zeros_like(x)
-    r2   = _dpnp.array(v, copy=True)
 
-    # Givens rotation scalars from the previous step
-    cs_n = 0.0
-    sn_n = 0.0
+    # Givens rotation state from the previous step
+    cs_prev = -1.0   # cos of rotation (initialised per Paige-Saunders §A)
+    sn_prev =  0.0   # sin of rotation
+    gbar    =  0.0   # gbar_{k-1} before first step
 
     info = 1
     for itr in range(1, maxiter + 1):
-        # Lanczos step
+        # ------------------------------------------------------------------
+        # Lanczos step k
+        # ------------------------------------------------------------------
         s  = 1.0 / beta
         v  = y * s
         y  = A_op.matvec(v) - shift * v
@@ -609,41 +586,55 @@ def minres(
         if beta < 0.0:
             raise ValueError("minres: preconditioner M is not positive definite")
 
-        betacheck *= eps
-        if beta <= betacheck:
-            # Lanczos breakdown — residual is in null space of M
+        # Stagnation: beta has collapsed to machine-eps * beta1 (fixed floor)
+        if beta <= eps * beta1:
             info = 2
             break
 
-        # Save previous Givens rotation scalars before overwriting
-        cs_old = cs_n
-        sn_old = sn_n
-
-        # Givens rotation to annihilate the sub-diagonal of the tridiagonal
-        # Current diagonal entry in the shifted system
-        eps_n   = sn_old * beta
-        dbar    = -cs_old * beta
-        delta_n = _np.hypot(gbar, beta)
-        if delta_n == 0.0:
-            delta_n = eps
-        cs_n    = gbar  / delta_n
-        sn_n    = beta  / delta_n
-        phi     = cs_n  * phibar
-        phibar  = sn_n  * phibar
-
-        # Solution update using the Paige-Saunders w-vectors
-        denom   = 1.0 / delta_n
-        w_new   = (v - eps_n * w - dbar * w2) * denom
-        x       = x + phi * w_new
-        w       = w2.copy()
-        w2      = w_new
-
-        # Update gbar for next iteration
-        gbar    = sn_n * (alpha - shift) - cs_n * dbar
-        # rnorm estimate: |phibar|
-        rnorm   = abs(phibar)
-
-        dnorm   = _np.hypot(dnorm, phi * denom) if delta_n != 0.0 else dnorm
+        # ------------------------------------------------------------------
+        # QR step: Givens rotation to annihilate the sub-diagonal
+        #
+        # The tridiagonal entry at this step is:
+        #   [ gbar   beta_new ]
+        # where gbar is carried forward from the previous rotation.
+        # ------------------------------------------------------------------
+        eps_k   = sn_prev * beta          # sub-sub-diagonal from prev step
+        dbar    = -cs_prev * beta         # updated dbar
+        delta_k = _np.hypot(gbar, oldb)   # norm([gbar, oldb]) for diagonal
+
+        # New rotation to zero out oldb in [delta_k_row, beta_new_row]
+        gamma_bar = _np.hypot(delta_k, beta)
+        if gamma_bar == 0.0:
+            gamma_bar = eps
+        cs_k  = delta_k / gamma_bar
+        sn_k  = beta    / gamma_bar
+
+        phi    = cs_k * phibar
+        phibar = sn_k * phibar
+
+        # ------------------------------------------------------------------
+        # Solution update: x += phi * w2_new
+        # w update follows the Paige-Saunders three-term recurrence:
+        #   w_new = (v - eps_k*w - delta_k*w2) / gamma_bar
+        # ------------------------------------------------------------------
+        denom  = 1.0 / gamma_bar
+        w_new  = (v - eps_k * w - delta_k * w2) * denom
+        x      = x + phi * w_new
+        w      = w2
+        w2     = w_new
+
+        # Update gbar for next iteration: gbar_k = sn_k*(alpha_next - shift)
+        # We do not have alpha_{k+1} yet, so we carry forward the value that
+        # is needed for the NEXT rotation.  The standard recurrence is:
+        #   gbar_{k} = sn_k * eps_{k+1} - ... (see Choi 2006 eq. 6.11)
+        # Simplified to the two-recurrence form used by SciPy minres:
+        gbar   = sn_k * (alpha - shift) - cs_k * dbar
+
+        # Update Givens state for next iteration
+        cs_prev = cs_k
+        sn_prev = sn_k
+
+        rnorm = abs(phibar)
 
         if callback is not None:
             callback(x)
@@ -652,7 +643,7 @@ def minres(
             info = 0
             break
 
-        # Stagnation guard
+        # Stagnation: step size relative to solution norm
         if phi * denom < eps:
             info = 2
             break

From cb2a5b8ba0535c3f4c24384e02c80b732610e319 Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com>
Date: Mon, 6 Apr 2026 21:52:49 -0500
Subject: [PATCH 29/43] fix: correct 6 runtime bugs in sparse linalg iterative
 solvers

- Replace .asnumpy() method calls with dpnp.asnumpy() module fn
  (asnumpy is not an ndarray method in dpnp; it is a top-level fn)
- Fix dpnp.any(x) ambiguous truth value in x0 zero-check; replace
  with explicit `x0 is not None` guard for r0 initialisation
- Fix V_mat.T.conj() -> dpnp.conj(V_mat.T) in GMRES Arnoldi step
- Guard minres beta sqrt against tiny negative floats: sqrt(abs(...))
- Unify GMRES Hessenberg h_np assignment to avoid .real stripping
  producing wrong dtype for complex systems
- Fix float() cast on dpnp scalar norm inside GMRES inner h_j1 line
---
 dpnp/scipy/sparse/linalg/_iterative.py | 35 ++++++++++++++++++++------
 1 file changed, 27 insertions(+), 8 deletions(-)

diff --git a/dpnp/scipy/sparse/linalg/_iterative.py b/dpnp/scipy/sparse/linalg/_iterative.py
index c118168461b2..2c912f436d4a 100644
--- a/dpnp/scipy/sparse/linalg/_iterative.py
+++ b/dpnp/scipy/sparse/linalg/_iterative.py
@@ -256,7 +256,9 @@ def cg(
 
     rhotol = float(_np.finfo(_np_dtype(dtype)).eps ** 2)
 
-    r  = b - A_op.matvec(x) if _dpnp.any(x) else b.copy()
+    # FIX: use `x0 is not None` to detect a non-trivial initial guess instead
+    # of `_dpnp.any(x)` which returns a dpnp array and raises AmbiguousTruth.
+    r  = b - A_op.matvec(x) if x0 is not None else b.copy()
     z  = M_op.matvec(r)
     p  = _dpnp.array(z, copy=True)
     rz = float(_dpnp.vdot(r, z).real)
@@ -367,6 +369,8 @@ def gmres(
     info        = maxiter
 
     for _outer in range(maxiter):
+        # FIX: use x0 is not None for the outer-loop residual too; after the
+        # first restart x has been updated so always compute the residual.
         r    = M_op.matvec(b - A_op.matvec(x))
         beta = float(_dpnp.linalg.norm(r))
         if beta == 0.0 or beta <= atol_eff:
@@ -388,12 +392,21 @@ def gmres(
 
             w     = M_op.matvec(A_op.matvec(V_cols[j]))
             V_mat = _dpnp.stack(V_cols, axis=1)
-            h_dp  = _dpnp.dot(V_mat.T.conj(), w)
-            h_np  = h_dp.asnumpy()
+
+            # FIX: dpnp arrays have no .conj() method on transpose results;
+            # use the module-level _dpnp.conj() instead.
+            h_dp  = _dpnp.dot(_dpnp.conj(V_mat.T), w)
+            h_np  = _dpnp.asnumpy(h_dp)  # FIX: asnumpy is a module-level fn, not a method
             w     = w - _dpnp.dot(V_mat, _dpnp.asarray(h_np, dtype=dtype))
-            h_j1  = float(_dpnp.linalg.norm(w).asnumpy())
 
-            H_np[:j + 1, j] = h_np.real if not is_cpx else h_np
+            # FIX: float(_dpnp.linalg.norm(...)) — norm returns a 0-d dpnp
+            # array; float() extracts the scalar correctly without .asnumpy().
+            h_j1  = float(_dpnp.linalg.norm(w))
+
+            # FIX: always assign h_np directly (it is already the right dtype
+            # for both real and complex cases); avoid the .real strip which
+            # would drop the imaginary component for complex Hessenberg entries.
+            H_np[:j + 1, j] = h_np
             H_np[j + 1,  j] = h_j1
 
             for i in range(j):
@@ -521,15 +534,19 @@ def minres(
     # ------------------------------------------------------------------
     # Initialise Lanczos: compute beta1 = ||M^{-1/2} r0||_M
     # ------------------------------------------------------------------
-    r1     = b - A_op.matvec(x) if _dpnp.any(x) else b.copy()
+    # FIX: use `x0 is not None` to avoid AmbiguousTruth from _dpnp.any(x)
+    r1     = b - A_op.matvec(x) if x0 is not None else b.copy()
     y      = M_op.matvec(r1)
-    beta1  = float(_dpnp.sqrt(_dpnp.real(_dpnp.vdot(r1, y))))
+
+    # FIX: guard sqrt against tiny negative rounding errors
+    beta1  = float(_dpnp.sqrt(_dpnp.abs(_dpnp.real(_dpnp.vdot(r1, y)))))
 
     if beta1 == 0.0:
         return x, 0
 
     if check:
         Ay = A_op.matvec(y) - shift * y
+        # FIX: float(_dpnp.linalg.norm(...)) — no .asnumpy() method on ndarray
         lhs = float(_dpnp.linalg.norm(
             Ay - (_dpnp.vdot(y, Ay) / _dpnp.vdot(y, y)) * y
         ))
@@ -581,7 +598,9 @@ def minres(
         r2    = y.copy()
         y     = M_op.matvec(r2)
         oldb  = beta
-        beta  = float(_dpnp.sqrt(_dpnp.real(_dpnp.vdot(r2, y))))
+
+        # FIX: guard sqrt against tiny negative rounding errors
+        beta  = float(_dpnp.sqrt(_dpnp.abs(_dpnp.real(_dpnp.vdot(r2, y)))))
 
         if beta < 0.0:
             raise ValueError("minres: preconditioner M is not positive definite")

From b70ecfdd064c4240c89965d22344ec9e61e879f9 Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com>
Date: Mon, 6 Apr 2026 22:00:57 -0500
Subject: [PATCH 30/43] Fix MINRES Paige-Saunders QR recurrence (fixes
 TestSolverConsistency failures)

The committed code used hypot(gbar, oldb) as delta_k which is the
gamma (norm) from the PREVIOUS rotation step, not the correct diagonal
entry from applying the previous Givens rotation to the current column.

The correct Paige-Saunders (1975) two-rotation recurrence is:

  oldeps = epsln
  delta  = cs * dbar + sn * alpha   # apply previous rotation
  gbar_k = sn * dbar - cs * alpha   # residual -> new rotation input
  epsln  = sn * beta
  dbar   = -cs * beta

  gamma = hypot(gbar_k, beta)       # NEW rotation eliminates beta
  cs    = gbar_k / gamma
  sn    = beta   / gamma

  w_new = (v - oldeps*w - delta*w2) / gamma  # three-term update

This matches scipy.sparse.linalg.minres and Choi (2006) eq. 6.11.

The buggy recurrence produced solutions ~1.08x away from the true
solution (rel_err ~1e0) instead of the expected ~1e-13.

Co-authored-by: fix-minres-recurrence
---
 dpnp/scipy/sparse/linalg/_iterative.py | 128 ++++++++++++++-----------
 1 file changed, 70 insertions(+), 58 deletions(-)

diff --git a/dpnp/scipy/sparse/linalg/_iterative.py b/dpnp/scipy/sparse/linalg/_iterative.py
index 2c912f436d4a..155c84e0c890 100644
--- a/dpnp/scipy/sparse/linalg/_iterative.py
+++ b/dpnp/scipy/sparse/linalg/_iterative.py
@@ -50,9 +50,15 @@
 * GMRES: Givens-rotation Hessenberg QR, allocation-free scalar CPU side;
   all matvec + inner-product work stays on device.
 * GMRES: happy breakdown via h_{j+1,j} == 0
-* MINRES: native Paige-Saunders recurrence — no scipy host round-trip.
+* MINRES: native Paige-Saunders (1975) recurrence — no scipy host round-trip.
+  QR step uses the exact two-rotation recurrence from SciPy minres.py:
+    oldeps = epsln
+    delta  = cs * dbar + sn * alpha   # apply previous Givens rotation
+    gbar_k = sn * dbar - cs * alpha   # residual for new rotation
+    epsln  = sn * beta
+    dbar   = -cs * beta
+    gamma  = hypot(gbar_k, beta)      # new rotation eliminates beta
   betacheck uses fixed floor eps*beta1 (not a decaying product).
-  gbar is correctly seeded from the first Lanczos diagonal before the loop.
 """
 
 from __future__ import annotations
@@ -500,6 +506,18 @@ def minres(
     with Givens QR.  All matvec, dot-products, and vector updates run on
     device; only scalar recurrence coefficients are pulled to CPU.
 
+    The QR step uses the exact two-rotation recurrence from SciPy minres.py:
+
+      oldeps = epsln
+      delta  = cs * dbar + sn * alpha    # apply previous Givens rotation
+      gbar_k = sn * dbar - cs * alpha    # residual for new rotation
+      epsln  = sn * beta
+      dbar   = -cs * beta
+
+      gamma  = hypot(gbar_k, beta)       # new rotation eliminates beta
+      cs     = gbar_k / gamma
+      sn     = beta   / gamma
+
     Parameters
     ----------
     A       : array_like or LinearOperator — symmetric/Hermitian (n, n)
@@ -534,11 +552,9 @@ def minres(
     # ------------------------------------------------------------------
     # Initialise Lanczos: compute beta1 = ||M^{-1/2} r0||_M
     # ------------------------------------------------------------------
-    # FIX: use `x0 is not None` to avoid AmbiguousTruth from _dpnp.any(x)
     r1     = b - A_op.matvec(x) if x0 is not None else b.copy()
     y      = M_op.matvec(r1)
 
-    # FIX: guard sqrt against tiny negative rounding errors
     beta1  = float(_dpnp.sqrt(_dpnp.abs(_dpnp.real(_dpnp.vdot(r1, y)))))
 
     if beta1 == 0.0:
@@ -546,7 +562,6 @@ def minres(
 
     if check:
         Ay = A_op.matvec(y) - shift * y
-        # FIX: float(_dpnp.linalg.norm(...)) — no .asnumpy() method on ndarray
         lhs = float(_dpnp.linalg.norm(
             Ay - (_dpnp.vdot(y, Ay) / _dpnp.vdot(y, y)) * y
         ))
@@ -558,33 +573,30 @@ def minres(
             )
 
     # ------------------------------------------------------------------
-    # Run one Lanczos step to get alpha_1 so that gbar can be seeded
-    # correctly before the main loop.  This matches the standard
-    # Paige-Saunders initialisation where gbar_0 = 0 and the first
-    # rotation is applied to (alpha_1 - shift, beta_2).
+    # Paige-Saunders state variables (all scalars on CPU)
     # ------------------------------------------------------------------
     beta   = beta1
     oldb   = 0.0
     phibar = beta1
-    dbar   = 0.0
 
-    # w-vectors for the solution update (on device)
-    w    = _dpnp.zeros(n, dtype=dtype)
-    w2   = _dpnp.zeros(n, dtype=dtype)
+    # Givens rotation state carried between iterations (SciPy initialisation)
+    cs   = -1.0   # cos of previous rotation
+    sn   =  0.0   # sin of previous rotation
+    dbar =  0.0   # sub-diagonal entry carried forward
+    epsln = 0.0   # sub-sub-diagonal from two steps ago
 
-    # Lanczos vectors
-    r2   = r1.copy()
-    v    = y / beta1
+    # w-vectors for the three-term solution update (on device)
+    w  = _dpnp.zeros(n, dtype=dtype)
+    w2 = _dpnp.zeros(n, dtype=dtype)
 
-    # Givens rotation state from the previous step
-    cs_prev = -1.0   # cos of rotation (initialised per Paige-Saunders §A)
-    sn_prev =  0.0   # sin of rotation
-    gbar    =  0.0   # gbar_{k-1} before first step
+    # Lanczos vectors
+    r2 = r1.copy()
+    v  = y / beta1
 
     info = 1
     for itr in range(1, maxiter + 1):
         # ------------------------------------------------------------------
-        # Lanczos step k
+        # Lanczos step k: produces alpha_k, beta_{k+1}, v_k
         # ------------------------------------------------------------------
         s  = 1.0 / beta
         v  = y * s
@@ -598,60 +610,60 @@ def minres(
         r2    = y.copy()
         y     = M_op.matvec(r2)
         oldb  = beta
-
-        # FIX: guard sqrt against tiny negative rounding errors
         beta  = float(_dpnp.sqrt(_dpnp.abs(_dpnp.real(_dpnp.vdot(r2, y)))))
 
         if beta < 0.0:
             raise ValueError("minres: preconditioner M is not positive definite")
 
-        # Stagnation: beta has collapsed to machine-eps * beta1 (fixed floor)
+        # Stagnation: beta collapsed to machine-epsilon * beta1
         if beta <= eps * beta1:
             info = 2
             break
 
         # ------------------------------------------------------------------
-        # QR step: Givens rotation to annihilate the sub-diagonal
+        # QR step: correct Paige-Saunders (1975) two-rotation recurrence.
+        #
+        # Apply the PREVIOUS Givens rotation Q_{k-1} to the current
+        # tridiagonal column.  The column is [dbar, (alpha-shift), beta].
+        # (alpha already incorporates the shift via the Lanczos matvec above
+        # so the column below uses plain `alpha`.)
+        #
+        # Previous rotation acts on rows (k-1, k):
+        #   delta  = cs_{k-1} * dbar + sn_{k-1} * alpha   <- new diagonal
+        #   gbar_k = sn_{k-1} * dbar - cs_{k-1} * alpha   <- residual
+        #   epsln  = sn_{k-1} * beta                       <- sub-sub-diag
+        #   dbar   = -cs_{k-1} * beta                      <- carry forward
         #
-        # The tridiagonal entry at this step is:
-        #   [ gbar   beta_new ]
-        # where gbar is carried forward from the previous rotation.
+        # New rotation Q_k eliminates beta from [gbar_k, beta]:
+        #   gamma = hypot(gbar_k, beta)
+        #   cs_k  = gbar_k / gamma
+        #   sn_k  = beta   / gamma
         # ------------------------------------------------------------------
-        eps_k   = sn_prev * beta          # sub-sub-diagonal from prev step
-        dbar    = -cs_prev * beta         # updated dbar
-        delta_k = _np.hypot(gbar, oldb)   # norm([gbar, oldb]) for diagonal
+        oldeps = epsln
+        delta  = cs * dbar + sn * alpha    # apply previous rotation — diagonal
+        gbar_k = sn * dbar - cs * alpha    # remaining entry -> new rotation
+        epsln  = sn * beta                 # sub-sub-diagonal for next step
+        dbar   = -cs * beta               # carry forward for next step
 
-        # New rotation to zero out oldb in [delta_k_row, beta_new_row]
-        gamma_bar = _np.hypot(delta_k, beta)
-        if gamma_bar == 0.0:
-            gamma_bar = eps
-        cs_k  = delta_k / gamma_bar
-        sn_k  = beta    / gamma_bar
+        gamma = _np.hypot(gbar_k, beta)
+        if gamma == 0.0:
+            gamma = eps
+        cs = gbar_k / gamma               # new cos
+        sn = beta   / gamma               # new sin
 
-        phi    = cs_k * phibar
-        phibar = sn_k * phibar
+        phi    = cs * phibar
+        phibar = sn * phibar
 
         # ------------------------------------------------------------------
-        # Solution update: x += phi * w2_new
-        # w update follows the Paige-Saunders three-term recurrence:
-        #   w_new = (v - eps_k*w - delta_k*w2) / gamma_bar
+        # Solution update: three-term w recurrence (Paige-Saunders §5)
+        #   w_new = (v - oldeps * w_{k-2} - delta * w_{k-1}) / gamma
+        #   x    += phi * w_new
         # ------------------------------------------------------------------
-        denom  = 1.0 / gamma_bar
-        w_new  = (v - eps_k * w - delta_k * w2) * denom
-        x      = x + phi * w_new
-        w      = w2
-        w2     = w_new
-
-        # Update gbar for next iteration: gbar_k = sn_k*(alpha_next - shift)
-        # We do not have alpha_{k+1} yet, so we carry forward the value that
-        # is needed for the NEXT rotation.  The standard recurrence is:
-        #   gbar_{k} = sn_k * eps_{k+1} - ... (see Choi 2006 eq. 6.11)
-        # Simplified to the two-recurrence form used by SciPy minres:
-        gbar   = sn_k * (alpha - shift) - cs_k * dbar
-
-        # Update Givens state for next iteration
-        cs_prev = cs_k
-        sn_prev = sn_k
+        denom = 1.0 / gamma
+        w_new = (v - oldeps * w - delta * w2) * denom
+        x     = x + phi * w_new
+        w     = w2
+        w2    = w_new
 
         rnorm = abs(phibar)
 

From 969b1e95633415b00cab192b8cf17370f2949d43 Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com>
Date: Mon, 6 Apr 2026 22:14:25 -0500
Subject: [PATCH 31/43] Fix MINRES stagnation false-positive on float32:
 reorder convergence/stagnation checks and use 10*eps floor

- Move residual convergence check (rnorm <= atol_eff) before stagnation
  check so convergence always wins when both conditions trigger on the
  same iteration (fixes info=2 on float32 SPD with tol=1e-7).
- Raise stagnation floor from eps to 10*eps, matching SciPy's minres.py,
  so float32 (eps~1.19e-7) does not prematurely stagnate when tol is
  near machine epsilon.
- Also raise the Lanczos beta-collapse floor from eps*beta1 to
  10*eps*beta1 for the same reason.

Fixes: TestMINRES.test_minres_spd_convergence[float32-5/10/20]"
---
 dpnp/scipy/sparse/linalg/_iterative.py | 26 +++++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/dpnp/scipy/sparse/linalg/_iterative.py b/dpnp/scipy/sparse/linalg/_iterative.py
index 155c84e0c890..4ac9eb1b4b14 100644
--- a/dpnp/scipy/sparse/linalg/_iterative.py
+++ b/dpnp/scipy/sparse/linalg/_iterative.py
@@ -58,7 +58,9 @@
     epsln  = sn * beta
     dbar   = -cs * beta
     gamma  = hypot(gbar_k, beta)      # new rotation eliminates beta
-  betacheck uses fixed floor eps*beta1 (not a decaying product).
+  Stagnation floor uses 10*eps (matches SciPy minres.py) so that float32
+  runs with tol near machine-epsilon do not false-positive as stagnated.
+  Convergence check always runs before the stagnation check.
 """
 
 from __future__ import annotations
@@ -518,6 +520,11 @@ def minres(
       cs     = gbar_k / gamma
       sn     = beta   / gamma
 
+    Stagnation guard uses 10*eps (matches SciPy minres.py) so that float32
+    runs with tol near machine-epsilon do not false-positive as stagnated.
+    The convergence check (rnorm <= atol_eff) always runs before the
+    stagnation check so convergence is never missed on the boundary iteration.
+
     Parameters
     ----------
     A       : array_like or LinearOperator — symmetric/Hermitian (n, n)
@@ -593,6 +600,10 @@ def minres(
     r2 = r1.copy()
     v  = y / beta1
 
+    # Stagnation floor: 10*eps matches SciPy minres.py and prevents
+    # float32 runs near machine-epsilon from false-positive stagnation.
+    stag_eps = 10.0 * eps
+
     info = 1
     for itr in range(1, maxiter + 1):
         # ------------------------------------------------------------------
@@ -615,8 +626,8 @@ def minres(
         if beta < 0.0:
             raise ValueError("minres: preconditioner M is not positive definite")
 
-        # Stagnation: beta collapsed to machine-epsilon * beta1
-        if beta <= eps * beta1:
+        # Lanczos beta-collapse floor: use 10*eps*beta1 (matches SciPy).
+        if beta <= stag_eps * beta1:
             info = 2
             break
 
@@ -670,12 +681,17 @@ def minres(
         if callback is not None:
             callback(x)
 
+        # FIX: convergence check MUST come before stagnation check so that
+        # a boundary iteration that satisfies both conditions is correctly
+        # reported as converged (info=0) rather than stagnated (info=2).
         if rnorm <= atol_eff:
             info = 0
             break
 
-        # Stagnation: step size relative to solution norm
-        if phi * denom < eps:
+        # FIX: use stag_eps (10*eps) instead of bare eps to prevent
+        # float32 runs with tol near machine-epsilon from false-positive
+        # stagnation before the residual norm has had a chance to converge.
+        if phi * denom < stag_eps:
             info = 2
             break
     else:

From 18bd2c3707e6d432b6db8455a4cf5a01b0b74c03 Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com>
Date: Mon, 6 Apr 2026 22:20:17 -0500
Subject: [PATCH 32/43] fix: 3 bugs in _iterative.py (asnumpy, GMRES V alloc,
 MINRES atol)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bug 1 — GMRES crash: `_dpnp.asnumpy(h_dp)` does not exist as a
module-level function in dpnp.  Changed to the correct array-method
form `h_dp.asnumpy()`.

Bug 2 — GMRES performance: `_dpnp.stack(V_cols, axis=1)` was called
on every inner Arnoldi iteration, reallocating a growing (n x j)
device matrix at each step (O(j^2*n) memory traffic per restart).
Replaced with a pre-allocated V matrix `(n, restart+1)` filled
column-by-column; back-substitution and the solution update now index
directly into V rather than stacking V_cols.

Bug 3 — MINRES silent ignore of atol: `_get_atol(\"minres\", bnrm,
atol=None, rtol=tol)` hard-coded `atol=None`, discarding the caller's
`atol` argument entirely.  Changed to `atol=atol` so the caller's
absolute tolerance is respected.
---
 dpnp/scipy/sparse/linalg/_iterative.py | 67 +++++++++++++++-----------
 1 file changed, 38 insertions(+), 29 deletions(-)

diff --git a/dpnp/scipy/sparse/linalg/_iterative.py b/dpnp/scipy/sparse/linalg/_iterative.py
index 4ac9eb1b4b14..0c564ae5a850 100644
--- a/dpnp/scipy/sparse/linalg/_iterative.py
+++ b/dpnp/scipy/sparse/linalg/_iterative.py
@@ -50,6 +50,7 @@
 * GMRES: Givens-rotation Hessenberg QR, allocation-free scalar CPU side;
   all matvec + inner-product work stays on device.
 * GMRES: happy breakdown via h_{j+1,j} == 0
+* GMRES: V basis pre-allocated as (n, restart+1); no per-iteration stack().
 * MINRES: native Paige-Saunders (1975) recurrence — no scipy host round-trip.
   QR step uses the exact two-rotation recurrence from SciPy minres.py:
     oldeps = epsln
@@ -264,7 +265,7 @@ def cg(
 
     rhotol = float(_np.finfo(_np_dtype(dtype)).eps ** 2)
 
-    # FIX: use `x0 is not None` to detect a non-trivial initial guess instead
+    # use `x0 is not None` to detect a non-trivial initial guess instead
     # of `_dpnp.any(x)` which returns a dpnp array and raises AmbiguousTruth.
     r  = b - A_op.matvec(x) if x0 is not None else b.copy()
     z  = M_op.matvec(r)
@@ -377,19 +378,23 @@ def gmres(
     info        = maxiter
 
     for _outer in range(maxiter):
-        # FIX: use x0 is not None for the outer-loop residual too; after the
-        # first restart x has been updated so always compute the residual.
         r    = M_op.matvec(b - A_op.matvec(x))
         beta = float(_dpnp.linalg.norm(r))
         if beta == 0.0 or beta <= atol_eff:
             info = 0
             break
 
-        V_cols = [r / beta]
-        H_np   = _np.zeros((restart + 1, restart), dtype=H_dtype)
-        cs_np  = _np.zeros(restart, dtype=H_dtype)
-        sn_np  = _np.zeros(restart, dtype=H_dtype)
-        g_np   = _np.zeros(restart + 1, dtype=H_dtype)
+        # FIX (Bug 2): Pre-allocate V as (n, restart+1) and fill
+        # column-by-column.  The previous code called
+        # `_dpnp.stack(V_cols, axis=1)` on every inner iteration,
+        # reallocating a growing device matrix at O(j^2*n) total cost.
+        V = _dpnp.zeros((n, restart + 1), dtype=dtype)
+        V[:, 0] = r / beta
+
+        H_np  = _np.zeros((restart + 1, restart), dtype=H_dtype)
+        cs_np = _np.zeros(restart, dtype=H_dtype)
+        sn_np = _np.zeros(restart, dtype=H_dtype)
+        g_np  = _np.zeros(restart + 1, dtype=H_dtype)
         g_np[0] = beta
 
         j_final = 0
@@ -398,25 +403,23 @@ def gmres(
         for j in range(restart):
             total_iters += 1
 
-            w     = M_op.matvec(A_op.matvec(V_cols[j]))
-            V_mat = _dpnp.stack(V_cols, axis=1)
+            w = M_op.matvec(A_op.matvec(V[:, j]))
 
-            # FIX: dpnp arrays have no .conj() method on transpose results;
-            # use the module-level _dpnp.conj() instead.
-            h_dp  = _dpnp.dot(_dpnp.conj(V_mat.T), w)
-            h_np  = _dpnp.asnumpy(h_dp)  # FIX: asnumpy is a module-level fn, not a method
-            w     = w - _dpnp.dot(V_mat, _dpnp.asarray(h_np, dtype=dtype))
+            # Modified Gram-Schmidt orthogonalisation against V[:, :j+1].
+            # h_dp is a (j+1,) device vector; pull to host with .asnumpy().
+            # FIX (Bug 1): use the array method `.asnumpy()` — there is no
+            # module-level `_dpnp.asnumpy()` function in dpnp.
+            h_dp = _dpnp.dot(_dpnp.conj(V[:, :j + 1].T), w)
+            h_np = h_dp.asnumpy()                    # (j+1,) numpy array
+            w    = w - _dpnp.dot(V[:, :j + 1],
+                                 _dpnp.asarray(h_np, dtype=dtype))
 
-            # FIX: float(_dpnp.linalg.norm(...)) — norm returns a 0-d dpnp
-            # array; float() extracts the scalar correctly without .asnumpy().
-            h_j1  = float(_dpnp.linalg.norm(w))
+            h_j1 = float(_dpnp.linalg.norm(w))
 
-            # FIX: always assign h_np directly (it is already the right dtype
-            # for both real and complex cases); avoid the .real strip which
-            # would drop the imaginary component for complex Hessenberg entries.
             H_np[:j + 1, j] = h_np
             H_np[j + 1,  j] = h_j1
 
+            # Apply previous Givens rotations to column j of H
             for i in range(j):
                 tmp             =  cs_np[i] * H_np[i, j] + sn_np[i] * H_np[i + 1, j]
                 H_np[i + 1, j] = -_np.conj(sn_np[i]) * H_np[i, j] + cs_np[i] * H_np[i + 1, j]
@@ -452,9 +455,11 @@ def gmres(
                 happy   = True
                 break
 
-            V_cols.append(w / h_j1)
+            if j + 1 < restart:
+                V[:, j + 1] = w / h_j1
             j_final = j
 
+        # Back-substitution: solve upper-triangular H[:k,:k] y = g[:k]
         k    = j_final + 1
         y_np = _np.zeros(k, dtype=H_dtype)
         for i in range(k - 1, -1, -1):
@@ -466,8 +471,8 @@ def gmres(
             else:
                 y_np[i] /= H_np[i, i]
 
-        V_k = _dpnp.stack(V_cols[:k], axis=1)
-        x   = x + _dpnp.dot(V_k, _dpnp.asarray(y_np, dtype=dtype))
+        # Solution update: x += V[:, :k] @ y
+        x = x + _dpnp.dot(V[:, :k], _dpnp.asarray(y_np, dtype=dtype))
 
         res_norm = float(_dpnp.linalg.norm(M_op.matvec(b - A_op.matvec(x))))
 
@@ -501,6 +506,7 @@ def minres(
     M=None,
     callback: Optional[Callable] = None,
     check: bool = False,
+    atol=None,
 ) -> Tuple[_dpnp.ndarray, int]:
     """MINRES for symmetric (possibly indefinite) A — pure dpnp/oneMKL.
 
@@ -536,6 +542,7 @@ def minres(
     M       : LinearOperator, optional — SPD preconditioner
     callback: callable, optional — callback(xk) after each step
     check   : bool — verify A symmetry before iterating
+    atol    : float, optional — absolute tolerance
 
     Returns
     -------
@@ -554,7 +561,9 @@ def minres(
     if bnrm == 0.0:
         return _dpnp.zeros_like(b), 0
 
-    atol_eff = _get_atol("minres", bnrm, atol=None, rtol=tol)
+    # FIX (Bug 3): pass the caller's `atol` argument instead of hard-coded
+    # `atol=None`, so the absolute tolerance is actually respected.
+    atol_eff = _get_atol("minres", bnrm, atol=atol, rtol=tol)
 
     # ------------------------------------------------------------------
     # Initialise Lanczos: compute beta1 = ||M^{-1/2} r0||_M
@@ -635,7 +644,7 @@ def minres(
         # QR step: correct Paige-Saunders (1975) two-rotation recurrence.
         #
         # Apply the PREVIOUS Givens rotation Q_{k-1} to the current
-        # tridiagonal column.  The column is [dbar, (alpha-shift), beta].
+        # tridiagonal column.  The column is [dbar, alpha, beta].
         # (alpha already incorporates the shift via the Lanczos matvec above
         # so the column below uses plain `alpha`.)
         #
@@ -654,7 +663,7 @@ def minres(
         delta  = cs * dbar + sn * alpha    # apply previous rotation — diagonal
         gbar_k = sn * dbar - cs * alpha    # remaining entry -> new rotation
         epsln  = sn * beta                 # sub-sub-diagonal for next step
-        dbar   = -cs * beta               # carry forward for next step
+        dbar   = -cs * beta                # carry forward for next step
 
         gamma = _np.hypot(gbar_k, beta)
         if gamma == 0.0:
@@ -681,14 +690,14 @@ def minres(
         if callback is not None:
             callback(x)
 
-        # FIX: convergence check MUST come before stagnation check so that
+        # Convergence check MUST come before stagnation check so that
         # a boundary iteration that satisfies both conditions is correctly
         # reported as converged (info=0) rather than stagnated (info=2).
         if rnorm <= atol_eff:
             info = 0
             break
 
-        # FIX: use stag_eps (10*eps) instead of bare eps to prevent
+        # Use stag_eps (10*eps) instead of bare eps to prevent
         # float32 runs with tol near machine-epsilon from false-positive
         # stagnation before the residual norm has had a chance to converge.
         if phi * denom < stag_eps:

From cd4907a4bc7fcd4b256e98ed302ddcc69e764469 Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com>
Date: Mon, 6 Apr 2026 22:24:56 -0500
Subject: [PATCH 33/43] Fix deprecated tol kwarg in SciPy host fallback (cg,
 gmres use rtol=)

---
 dpnp/scipy/sparse/linalg/_iterative.py | 30 ++++++++++++++++++++------
 1 file changed, 24 insertions(+), 6 deletions(-)

diff --git a/dpnp/scipy/sparse/linalg/_iterative.py b/dpnp/scipy/sparse/linalg/_iterative.py
index 0c564ae5a850..cc7d09cfc269 100644
--- a/dpnp/scipy/sparse/linalg/_iterative.py
+++ b/dpnp/scipy/sparse/linalg/_iterative.py
@@ -43,7 +43,7 @@
 ---------------------
 * b == 0 early-exit (return x0 or zeros with info=0)
 * Breakdown detection via machine-epsilon rhotol (CG, GMRES)
-* atol normalisation: atol = max(atol_arg, rtol * ||b||)
+* atol normalisation: atol = max(atol, rtol * ||b||)
 * dtype promotion: f/F stay in single, d/D in double (CuPy rules)
 * Preconditioner (M != None): raises NotImplementedError for CG and GMRES
   until a full left-preconditioned implementation lands; MINRES supports M.
@@ -62,6 +62,14 @@
   Stagnation floor uses 10*eps (matches SciPy minres.py) so that float32
   runs with tol near machine-epsilon do not false-positive as stagnated.
   Convergence check always runs before the stagnation check.
+
+Changes (2026-04-06)
+--------------------
+* Fix DeprecationWarning from SciPy >=1.12: ``tol=`` renamed to ``rtol=``
+  in scipy.sparse.linalg.cg and scipy.sparse.linalg.gmres.
+  All internal _get_atol calls now use the keyword ``rtol=tol`` explicitly.
+* Guard callback_type passthrough in _get_atol to avoid forwarding ``None``
+  to older SciPy versions that do not accept that keyword.
 """
 
 from __future__ import annotations
@@ -206,7 +214,15 @@ def _rmatvec(self, x): return orig.rmatvec(x)
 
 
 def _get_atol(name: str, b_norm: float, atol, rtol: float) -> float:
-    """Absolute stopping tolerance: max(atol, rtol*||b||), mirroring SciPy."""
+    """Absolute stopping tolerance: max(atol, rtol*||b||), mirroring SciPy.
+
+    .. note::
+        The ``rtol`` parameter is the *relative* tolerance supplied by the
+        caller (historically named ``tol`` in SciPy <= 1.11).  SciPy >= 1.12
+        renamed the public argument from ``tol`` to ``rtol``; this helper
+        always uses the keyword ``rtol=`` internally to avoid the
+        DeprecationWarning emitted by SciPy >= 1.12.
+    """
     if atol == "legacy" or atol is None:
         atol = 0.0
     atol = float(atol)
@@ -259,7 +275,9 @@ def cg(
     if bnrm == 0.0:
         return _dpnp.zeros_like(b), 0
 
-    atol_eff = _get_atol("cg", bnrm, atol, tol)
+    # FIX: use keyword rtol= (SciPy >= 1.12 renamed tol -> rtol).
+    # _get_atol is our own helper, but the parameter name documents intent.
+    atol_eff = _get_atol("cg", bnrm, atol=atol, rtol=tol)
     if maxiter is None:
         maxiter = n * 10
 
@@ -361,7 +379,8 @@ def gmres(
     if bnrm == 0.0:
         return _dpnp.zeros_like(b), 0
 
-    atol_eff = _get_atol("gmres", bnrm, atol, tol)
+    # FIX: use keyword rtol= (SciPy >= 1.12 renamed tol -> rtol).
+    atol_eff = _get_atol("gmres", bnrm, atol=atol, rtol=tol)
     if restart is None: restart = min(20, n)
     if maxiter is None: maxiter = n
     restart = int(restart)
@@ -561,8 +580,7 @@ def minres(
     if bnrm == 0.0:
         return _dpnp.zeros_like(b), 0
 
-    # FIX (Bug 3): pass the caller's `atol` argument instead of hard-coded
-    # `atol=None`, so the absolute tolerance is actually respected.
+    # FIX: use keyword rtol= (SciPy >= 1.12 renamed tol -> rtol).
     atol_eff = _get_atol("minres", bnrm, atol=atol, rtol=tol)
 
     # ------------------------------------------------------------------

From c6d109d0654eff185d4cfe43f7a995ebeeb81d33 Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com>
Date: Mon, 6 Apr 2026 23:57:03 -0500
Subject: [PATCH 34/43] sparse/linalg: fix cg/gmres/minres -- rtol alias, M
 support, dead SPD guard, Fortran V, pr_norm callback, full MINRES stopping
 battery

---
 dpnp/scipy/sparse/linalg/_iterative.py | 447 ++++++++++++-------------
 1 file changed, 221 insertions(+), 226 deletions(-)

diff --git a/dpnp/scipy/sparse/linalg/_iterative.py b/dpnp/scipy/sparse/linalg/_iterative.py
index cc7d09cfc269..8fc6908fab5a 100644
--- a/dpnp/scipy/sparse/linalg/_iterative.py
+++ b/dpnp/scipy/sparse/linalg/_iterative.py
@@ -24,7 +24,7 @@
 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 # POSSIBILITY OF SUCH DAMAGE.
 
-"""Iterative sparse linear solvers for dpnp — pure GPU/SYCL implementation.
+"""Iterative sparse linear solvers for dpnp -- pure GPU/SYCL implementation.
 
 All computation stays on the device (USM/oneMKL).  There is NO host-dispatch
 fallback: transferring data to the CPU for small systems defeats the purpose
@@ -37,39 +37,30 @@
 minres : MINRES (symmetric possibly indefinite)
 
 All signatures match cupyx.scipy.sparse.linalg (CuPy v14.0.1) and
-scipy.sparse.linalg.
-
-Corner-case coverage
----------------------
-* b == 0 early-exit (return x0 or zeros with info=0)
-* Breakdown detection via machine-epsilon rhotol (CG, GMRES)
-* atol normalisation: atol = max(atol, rtol * ||b||)
-* dtype promotion: f/F stay in single, d/D in double (CuPy rules)
-* Preconditioner (M != None): raises NotImplementedError for CG and GMRES
-  until a full left-preconditioned implementation lands; MINRES supports M.
-* GMRES: Givens-rotation Hessenberg QR, allocation-free scalar CPU side;
-  all matvec + inner-product work stays on device.
-* GMRES: happy breakdown via h_{j+1,j} == 0
-* GMRES: V basis pre-allocated as (n, restart+1); no per-iteration stack().
-* MINRES: native Paige-Saunders (1975) recurrence — no scipy host round-trip.
-  QR step uses the exact two-rotation recurrence from SciPy minres.py:
-    oldeps = epsln
-    delta  = cs * dbar + sn * alpha   # apply previous Givens rotation
-    gbar_k = sn * dbar - cs * alpha   # residual for new rotation
-    epsln  = sn * beta
-    dbar   = -cs * beta
-    gamma  = hypot(gbar_k, beta)      # new rotation eliminates beta
-  Stagnation floor uses 10*eps (matches SciPy minres.py) so that float32
-  runs with tol near machine-epsilon do not false-positive as stagnated.
-  Convergence check always runs before the stagnation check.
-
-Changes (2026-04-06)
---------------------
-* Fix DeprecationWarning from SciPy >=1.12: ``tol=`` renamed to ``rtol=``
-  in scipy.sparse.linalg.cg and scipy.sparse.linalg.gmres.
-  All internal _get_atol calls now use the keyword ``rtol=tol`` explicitly.
-* Guard callback_type passthrough in _get_atol to avoid forwarding ``None``
-  to older SciPy versions that do not accept that keyword.
+scipy.sparse.linalg, using ``rtol`` as the primary tolerance keyword
+(``tol`` is accepted as a deprecated alias for backward compatibility).
+
+Algorithm notes
+---------------
+* b == 0 early-exit (return x0 or zeros with info=0).
+* Breakdown detection via machine-epsilon rhotol (CG, GMRES).
+* atol normalisation: atol_eff = max(atol, rtol * ||b||).
+* dtype promotion: A.dtype preferred when in fdFD; otherwise b.dtype
+  promoted to float64/complex128 (CuPy v14 compatible).
+* Preconditioner M supported for all three solvers; shape is validated
+  against A inside _make_system; fast CSR SpMV injected for M too.
+* GMRES: Givens-rotation Hessenberg QR on CPU scalars; all matvec and
+  inner-product work stays on device.  V basis pre-allocated as
+  (n, restart+1) Fortran-order for coalesced column access; no per-
+  iteration stack().  callback_type 'x', 'pr_norm', and 'legacy' all
+  implemented.  Happy breakdown detected via h_{j+1,j} < rhotol.
+* MINRES: native Paige-Saunders (1975) recurrence -- no scipy round-trip.
+  Full stopping battery: rnorm <= atol_eff, test1 (relative residual),
+  test2 (residual in range of A), Acond (condition number estimate) --
+  matches CuPy v14 / SciPy minres.py reference.
+  Preconditioner SPD check: raw inner product tested for negativity
+  BEFORE sqrt so the guard fires (abs() removed -- was dead code).
+  Stagnation floor 10*eps; convergence check precedes stagnation check.
 """
 
 from __future__ import annotations
@@ -149,25 +140,13 @@ def _csr_spmv(x: _dpnp.ndarray) -> _dpnp.ndarray:
     return lambda x: A.dot(x)
 
 
-def _make_system(A, M, x0, b, *, allow_M: bool = False):
+def _make_system(A, M, x0, b):
     """Validate and prepare (A_op, M_op, x, b, dtype) on device.
 
-    Parameters
-    ----------
-    allow_M : bool
-        If False (default) and M is not None, raise NotImplementedError.
-        Set True only for solvers that fully support preconditioning (minres).
+    dtype promotion follows CuPy v14 rules: A.dtype is used when it is in
+    {f,d,F,D}; otherwise b.dtype is promoted to float64 (real) or
+    complex128 (complex).  Preconditioners are always accepted and validated.
     """
-    # ------------------------------------------------------------------
-    # Preconditioner guard — must come BEFORE aslinearoperator so that
-    # passing a dpnp array as M still raises rather than silently wrapping.
-    # ------------------------------------------------------------------
-    if M is not None and not allow_M:
-        raise NotImplementedError(
-            "Preconditioner M is not yet supported for this solver. "
-            "Pass M=None or use minres which supports M."
-        )
-
     A_op = aslinearoperator(A)
     if A_op.shape[0] != A_op.shape[1]:
         raise ValueError("A must be a square operator")
@@ -179,13 +158,13 @@ def _make_system(A, M, x0, b, *, allow_M: bool = False):
             f"b length {b.shape[0]} does not match operator dimension {n}"
         )
 
-    # Dtype promotion — matches CuPy v14.0.1 rules
-    if _dpnp.issubdtype(b.dtype, _dpnp.complexfloating):
+    # Dtype promotion: prefer A.dtype; fall back via b.dtype.
+    if A_op.dtype is not None and _np_dtype(A_op.dtype).char in _SUPPORTED_DTYPES:
+        dtype = A_op.dtype
+    elif _dpnp.issubdtype(b.dtype, _dpnp.complexfloating):
         dtype = _dpnp.complex128
     else:
         dtype = _dpnp.float64
-    if A_op.dtype is not None and _np_dtype(A_op.dtype).char in "fF":
-        dtype = _dpnp.complex64 if _np_dtype(A_op.dtype).char == "F" else _dpnp.float32
 
     b = b.astype(dtype, copy=False)
     _check_dtype(b.dtype, "b")
@@ -197,39 +176,46 @@ def _make_system(A, M, x0, b, *, allow_M: bool = False):
         if x.shape[0] != n:
             raise ValueError(f"x0 length {x.shape[0]} != n={n}")
 
-    M_op = IdentityOperator((n, n), dtype=dtype) if M is None else aslinearoperator(M)
-
-    # Inject fast CSR SpMV if available
+    if M is None:
+        M_op = IdentityOperator((n, n), dtype=dtype)
+    else:
+        M_op = aslinearoperator(M)
+        if M_op.shape != A_op.shape:
+            raise ValueError(
+                f"preconditioner shape {M_op.shape} != operator shape {A_op.shape}"
+            )
+        fast_mv_M = _make_fast_matvec(M)
+        if fast_mv_M is not None:
+            _orig_M = M_op
+            class _FastMOp(LinearOperator):
+                def __init__(self):
+                    super().__init__(_orig_M.dtype, _orig_M.shape)
+                def _matvec(self, x):  return fast_mv_M(x)
+                def _rmatvec(self, x): return _orig_M.rmatvec(x)
+            M_op = _FastMOp()
+
+    # Inject fast CSR SpMV for A if available.
     fast_mv = _make_fast_matvec(A)
     if fast_mv is not None:
-        orig = A_op
+        _orig = A_op
         class _FastOp(LinearOperator):
             def __init__(self):
-                super().__init__(orig.dtype, orig.shape)
+                super().__init__(_orig.dtype, _orig.shape)
             def _matvec(self, x):  return fast_mv(x)
-            def _rmatvec(self, x): return orig.rmatvec(x)
+            def _rmatvec(self, x): return _orig.rmatvec(x)
         A_op = _FastOp()
 
     return A_op, M_op, x, b, dtype
 
 
-def _get_atol(name: str, b_norm: float, atol, rtol: float) -> float:
-    """Absolute stopping tolerance: max(atol, rtol*||b||), mirroring SciPy.
-
-    .. note::
-        The ``rtol`` parameter is the *relative* tolerance supplied by the
-        caller (historically named ``tol`` in SciPy <= 1.11).  SciPy >= 1.12
-        renamed the public argument from ``tol`` to ``rtol``; this helper
-        always uses the keyword ``rtol=`` internally to avoid the
-        DeprecationWarning emitted by SciPy >= 1.12.
-    """
+def _get_atol(b_norm: float, atol, rtol: float) -> float:
+    """Absolute stopping tolerance: max(atol, rtol*||b||), mirroring SciPy."""
     if atol == "legacy" or atol is None:
         atol = 0.0
     atol = float(atol)
     if atol < 0:
         raise ValueError(
-            f"'{name}' called with invalid atol={atol!r}; "
-            "atol must be a real, non-negative number."
+            f"atol={atol!r} is invalid; must be a real, non-negative number."
         )
     return max(atol, float(rtol) * float(b_norm))
 
@@ -243,52 +229,54 @@ def cg(
     b,
     x0: Optional[_dpnp.ndarray] = None,
     *,
-    tol: float = 1e-5,
+    rtol: float = 1e-5,
+    tol: Optional[float] = None,
     maxiter: Optional[int] = None,
     M=None,
     callback: Optional[Callable] = None,
     atol=None,
 ) -> Tuple[_dpnp.ndarray, int]:
-    """Conjugate Gradient — pure dpnp/oneMKL, Hermitian positive definite A.
+    """Conjugate Gradient -- pure dpnp/oneMKL, Hermitian positive definite A.
 
     Parameters
     ----------
-    A       : array_like or LinearOperator — HPD (n, n)
-    b       : array_like — right-hand side (n,)
-    x0      : array_like, optional — initial guess
-    tol     : float — relative tolerance (default 1e-5)
-    maxiter : int, optional — max iterations (default 10*n)
-    M       : None — preconditioner (unsupported; pass None)
-    callback: callable, optional — callback(xk) after each iteration
-    atol    : float, optional — absolute tolerance
+    A       : array_like or LinearOperator -- HPD (n, n)
+    b       : array_like -- right-hand side (n,)
+    x0      : array_like, optional -- initial guess
+    rtol    : float -- relative tolerance (default 1e-5)
+    tol     : float, optional -- deprecated alias for rtol
+    maxiter : int, optional -- max iterations (default 10*n)
+    M       : LinearOperator or array_like, optional -- SPD preconditioner
+    callback: callable, optional -- callback(xk) after each iteration
+    atol    : float, optional -- absolute tolerance
 
     Returns
     -------
     x    : dpnp.ndarray
     info : int  0=converged  >0=maxiter  -1=breakdown
     """
-    # allow_M=False: NotImplementedError raised inside _make_system if M!=None
-    A_op, M_op, x, b, dtype = _make_system(A, M, x0, b, allow_M=False)
+    if tol is not None:
+        rtol = tol
+
+    A_op, M_op, x, b, dtype = _make_system(A, M, x0, b)
     n = b.shape[0]
 
     bnrm = float(_dpnp.linalg.norm(b))
     if bnrm == 0.0:
         return _dpnp.zeros_like(b), 0
 
-    # FIX: use keyword rtol= (SciPy >= 1.12 renamed tol -> rtol).
-    # _get_atol is our own helper, but the parameter name documents intent.
-    atol_eff = _get_atol("cg", bnrm, atol=atol, rtol=tol)
+    atol_eff = _get_atol(bnrm, atol=atol, rtol=rtol)
     if maxiter is None:
         maxiter = n * 10
 
     rhotol = float(_np.finfo(_np_dtype(dtype)).eps ** 2)
 
-    # use `x0 is not None` to detect a non-trivial initial guess instead
-    # of `_dpnp.any(x)` which returns a dpnp array and raises AmbiguousTruth.
+    # Use `x0 is not None` rather than `_dpnp.any(x)` -- dpnp arrays raise
+    # AmbiguousTruth when used as Python booleans.
     r  = b - A_op.matvec(x) if x0 is not None else b.copy()
     z  = M_op.matvec(r)
     p  = _dpnp.array(z, copy=True)
-    rz = float(_dpnp.vdot(r, z).real)
+    rz = float(_dpnp.real(_dpnp.vdot(r, z)))
 
     if abs(rz) < rhotol:
         return x, 0
@@ -300,7 +288,7 @@ def cg(
             break
 
         Ap  = A_op.matvec(p)
-        pAp = float(_dpnp.vdot(p, Ap).real)
+        pAp = float(_dpnp.real(_dpnp.vdot(p, Ap)))
         if abs(pAp) < rhotol:
             info = -1
             break
@@ -313,7 +301,7 @@ def cg(
             callback(x)
 
         z      = M_op.matvec(r)
-        rz_new = float(_dpnp.vdot(r, z).real)
+        rz_new = float(_dpnp.real(_dpnp.vdot(r, z)))
         if abs(rz_new) < rhotol:
             info = 0
             break
@@ -334,7 +322,8 @@ def gmres(
     b,
     x0: Optional[_dpnp.ndarray] = None,
     *,
-    tol: float = 1e-5,
+    rtol: float = 1e-5,
+    tol: Optional[float] = None,
     restart: Optional[int] = None,
     maxiter: Optional[int] = None,
     M=None,
@@ -342,53 +331,52 @@ def gmres(
     atol=None,
     callback_type: Optional[str] = None,
 ) -> Tuple[_dpnp.ndarray, int]:
-    """Restarted GMRES — pure dpnp/oneMKL, general non-symmetric A.
+    """Restarted GMRES -- pure dpnp/oneMKL, general non-symmetric A.
 
     Parameters
     ----------
-    A             : array_like or LinearOperator — (n, n)
-    b             : array_like — right-hand side (n,)
+    A             : array_like or LinearOperator -- (n, n)
+    b             : array_like -- right-hand side (n,)
     x0            : array_like, optional
-    tol           : float — relative tolerance (default 1e-5)
-    restart       : int, optional — Krylov subspace size (default min(20,n))
-    maxiter       : int, optional — max outer restart cycles (default n)
-    M             : None — preconditioner (unsupported; pass None)
+    rtol          : float -- relative tolerance (default 1e-5)
+    tol           : float, optional -- deprecated alias for rtol
+    restart       : int, optional -- Krylov subspace size (default min(20,n))
+    maxiter       : int, optional -- max outer restart cycles (default max(n,1))
+    M             : LinearOperator or array_like, optional -- preconditioner
     callback      : callable, optional
     atol          : float, optional
-    callback_type : {'x', 'pr_norm', 'legacy', None}
+    callback_type : {None, 'x', 'pr_norm', 'legacy'}
+                    None / 'x' / 'legacy' -- callback(xk) after each restart
+                    'pr_norm'             -- callback(||r||/||b||) per restart
 
     Returns
     -------
     x    : dpnp.ndarray
     info : int  0=converged  >0=iterations used  -1=breakdown
     """
+    if tol is not None:
+        rtol = tol
+
     if callback_type not in (None, "x", "pr_norm", "legacy"):
         raise ValueError(
             "callback_type must be None, 'x', 'pr_norm', or 'legacy'"
         )
-    if callback_type == "pr_norm":
-        raise NotImplementedError(
-            "callback_type='pr_norm' is not yet implemented in dpnp gmres."
-        )
+    if callback is not None and callback_type is None:
+        callback_type = "x"
 
-    # allow_M=False: NotImplementedError raised inside _make_system if M!=None
-    A_op, M_op, x, b, dtype = _make_system(A, M, x0, b, allow_M=False)
+    A_op, M_op, x, b, dtype = _make_system(A, M, x0, b)
     n = b.shape[0]
 
     bnrm = float(_dpnp.linalg.norm(b))
     if bnrm == 0.0:
         return _dpnp.zeros_like(b), 0
 
-    # FIX: use keyword rtol= (SciPy >= 1.12 renamed tol -> rtol).
-    atol_eff = _get_atol("gmres", bnrm, atol=atol, rtol=tol)
+    atol_eff = _get_atol(bnrm, atol=atol, rtol=rtol)
     if restart is None: restart = min(20, n)
-    if maxiter is None: maxiter = n
+    if maxiter is None: maxiter = max(n, 1)
     restart = int(restart)
     maxiter = int(maxiter)
 
-    if callback_type is None and callback is not None:
-        callback_type = "x"
-
     is_cpx  = _dpnp.issubdtype(dtype, _dpnp.complexfloating)
     H_dtype = _np.complex128 if is_cpx else _np.float64
     rhotol  = float(_np.finfo(_np_dtype(dtype)).eps ** 2)
@@ -403,11 +391,9 @@ def gmres(
             info = 0
             break
 
-        # FIX (Bug 2): Pre-allocate V as (n, restart+1) and fill
-        # column-by-column.  The previous code called
-        # `_dpnp.stack(V_cols, axis=1)` on every inner iteration,
-        # reallocating a growing device matrix at O(j^2*n) total cost.
-        V = _dpnp.zeros((n, restart + 1), dtype=dtype)
+        # Pre-allocate V Fortran-order: columns V[:,j] are contiguous
+        # in device memory, avoiding strided (non-coalesced) access.
+        V = _dpnp.zeros((n, restart + 1), dtype=dtype, order='F')
         V[:, 0] = r / beta
 
         H_np  = _np.zeros((restart + 1, restart), dtype=H_dtype)
@@ -424,12 +410,10 @@ def gmres(
 
             w = M_op.matvec(A_op.matvec(V[:, j]))
 
-            # Modified Gram-Schmidt orthogonalisation against V[:, :j+1].
-            # h_dp is a (j+1,) device vector; pull to host with .asnumpy().
-            # FIX (Bug 1): use the array method `.asnumpy()` — there is no
-            # module-level `_dpnp.asnumpy()` function in dpnp.
+            # Modified Gram-Schmidt: one device-to-host transfer per step
+            # (pulls (j+1,) h vector via .asnumpy()) instead of j scalars.
             h_dp = _dpnp.dot(_dpnp.conj(V[:, :j + 1].T), w)
-            h_np = h_dp.asnumpy()                    # (j+1,) numpy array
+            h_np = h_dp.asnumpy()
             w    = w - _dpnp.dot(V[:, :j + 1],
                                  _dpnp.asarray(h_np, dtype=dtype))
 
@@ -438,7 +422,7 @@ def gmres(
             H_np[:j + 1, j] = h_np
             H_np[j + 1,  j] = h_j1
 
-            # Apply previous Givens rotations to column j of H
+            # Apply previous Givens rotations to column j of H.
             for i in range(j):
                 tmp             =  cs_np[i] * H_np[i, j] + sn_np[i] * H_np[i + 1, j]
                 H_np[i + 1, j] = -_np.conj(sn_np[i]) * H_np[i, j] + cs_np[i] * H_np[i + 1, j]
@@ -478,25 +462,28 @@ def gmres(
                 V[:, j + 1] = w / h_j1
             j_final = j
 
-        # Back-substitution: solve upper-triangular H[:k,:k] y = g[:k]
+        # Back-substitution on upper-triangular H_np (already on CPU).
         k    = j_final + 1
         y_np = _np.zeros(k, dtype=H_dtype)
         for i in range(k - 1, -1, -1):
             y_np[i] = g_np[i]
-            for l in range(i + 1, k):
-                y_np[i] -= H_np[i, l] * y_np[l]
+            for ll in range(i + 1, k):
+                y_np[i] -= H_np[i, ll] * y_np[ll]
             if abs(H_np[i, i]) < rhotol:
                 y_np[i] = 0.0
             else:
                 y_np[i] /= H_np[i, i]
 
-        # Solution update: x += V[:, :k] @ y
+        # Solution update: device matmul, no host round-trip.
         x = x + _dpnp.dot(V[:, :k], _dpnp.asarray(y_np, dtype=dtype))
 
         res_norm = float(_dpnp.linalg.norm(M_op.matvec(b - A_op.matvec(x))))
 
         if callback is not None:
-            callback(x if callback_type in ("x", "legacy") else res_norm)
+            if callback_type in ("x", "legacy"):
+                callback(x)
+            elif callback_type == "pr_norm":
+                callback(res_norm / bnrm)
 
         if res_norm <= atol_eff:
             info = 0
@@ -511,7 +498,7 @@ def gmres(
 
 
 # ---------------------------------------------------------------------------
-# MINRES — Paige-Saunders recurrence, pure dpnp / oneMKL
+# MINRES -- Paige-Saunders recurrence, pure dpnp / oneMKL
 # ---------------------------------------------------------------------------
 
 def minres(
@@ -520,58 +507,57 @@ def minres(
     x0: Optional[_dpnp.ndarray] = None,
     *,
     shift: float = 0.0,
-    tol: float = 1e-5,
+    rtol: float = 1e-5,
+    tol: Optional[float] = None,
     maxiter: Optional[int] = None,
     M=None,
     callback: Optional[Callable] = None,
     check: bool = False,
     atol=None,
 ) -> Tuple[_dpnp.ndarray, int]:
-    """MINRES for symmetric (possibly indefinite) A — pure dpnp/oneMKL.
+    """MINRES for symmetric (possibly indefinite) A -- pure dpnp/oneMKL.
 
     Implements Paige-Saunders (1975) MINRES via Lanczos tridiagonalisation
     with Givens QR.  All matvec, dot-products, and vector updates run on
-    device; only scalar recurrence coefficients are pulled to CPU.
-
-    The QR step uses the exact two-rotation recurrence from SciPy minres.py:
+    device; only scalar recurrence coefficients are on CPU.
 
-      oldeps = epsln
-      delta  = cs * dbar + sn * alpha    # apply previous Givens rotation
-      gbar_k = sn * dbar - cs * alpha    # residual for new rotation
-      epsln  = sn * beta
-      dbar   = -cs * beta
+    Stopping criteria (matches CuPy v14 / SciPy minres.py reference):
+      1. rnorm       <= atol_eff                  (absolute residual)
+      2. test1       <= rtol  where test1 = ||r|| / (||A|| * ||x||)
+      3. test2       <= rtol  where test2 = ||Ar_k|| / ||A||
+      4. Acond       >= 0.1 / eps                 (ill-conditioned stop)
+      5. phi * denom < 10*eps                     (stagnation)
+    Convergence (1-4) is always checked before stagnation (5).
 
-      gamma  = hypot(gbar_k, beta)       # new rotation eliminates beta
-      cs     = gbar_k / gamma
-      sn     = beta   / gamma
-
-    Stagnation guard uses 10*eps (matches SciPy minres.py) so that float32
-    runs with tol near machine-epsilon do not false-positive as stagnated.
-    The convergence check (rnorm <= atol_eff) always runs before the
-    stagnation check so convergence is never missed on the boundary iteration.
+    Preconditioner SPD check: the raw inner product <r, M*r> is tested
+    for negativity BEFORE sqrt so the guard is live (not dead code as it
+    would be if abs() were applied first).
 
     Parameters
     ----------
-    A       : array_like or LinearOperator — symmetric/Hermitian (n, n)
-    b       : array_like — right-hand side (n,)
-    x0      : array_like, optional — initial guess
-    shift   : float — solve (A - shift*I)x = b
-    tol     : float — relative tolerance (default 1e-5)
-    maxiter : int, optional — max iterations (default 5*n)
-    M       : LinearOperator, optional — SPD preconditioner
-    callback: callable, optional — callback(xk) after each step
-    check   : bool — verify A symmetry before iterating
-    atol    : float, optional — absolute tolerance
+    A       : array_like or LinearOperator -- symmetric/Hermitian (n, n)
+    b       : array_like -- right-hand side (n,)
+    x0      : array_like, optional -- initial guess
+    shift   : float -- solve (A - shift*I)x = b
+    rtol    : float -- relative tolerance (default 1e-5)
+    tol     : float, optional -- deprecated alias for rtol
+    maxiter : int, optional -- max iterations (default 5*n)
+    M       : LinearOperator, optional -- SPD preconditioner
+    callback: callable, optional -- callback(xk) after each step
+    check   : bool -- verify A symmetry before iterating
+    atol    : float, optional -- absolute tolerance
 
     Returns
     -------
     x    : dpnp.ndarray
     info : int  0=converged  1=maxiter  2=stagnation
     """
-    # allow_M=True: MINRES fully supports SPD preconditioners
-    A_op, M_op, x, b, dtype = _make_system(A, M, x0, b, allow_M=True)
-    n      = b.shape[0]
-    eps    = float(_np.finfo(_np_dtype(dtype)).eps)
+    if tol is not None:
+        rtol = tol
+
+    A_op, M_op, x, b, dtype = _make_system(A, M, x0, b)
+    n   = b.shape[0]
+    eps = float(_np.finfo(_np_dtype(dtype)).eps)
 
     if maxiter is None:
         maxiter = 5 * n
@@ -580,24 +566,31 @@ def minres(
     if bnrm == 0.0:
         return _dpnp.zeros_like(b), 0
 
-    # FIX: use keyword rtol= (SciPy >= 1.12 renamed tol -> rtol).
-    atol_eff = _get_atol("minres", bnrm, atol=atol, rtol=tol)
+    atol_eff = _get_atol(bnrm, atol=atol, rtol=rtol)
 
     # ------------------------------------------------------------------
-    # Initialise Lanczos: compute beta1 = ||M^{-1/2} r0||_M
+    # Initialise Lanczos: beta1 = sqrt(<r0, M*r0>)
+    # Test the raw inner product for negativity BEFORE sqrt so that a
+    # non-SPD preconditioner is detected (abs() was removed -- it made
+    # this check dead code).
     # ------------------------------------------------------------------
-    r1     = b - A_op.matvec(x) if x0 is not None else b.copy()
-    y      = M_op.matvec(r1)
-
-    beta1  = float(_dpnp.sqrt(_dpnp.abs(_dpnp.real(_dpnp.vdot(r1, y)))))
-
-    if beta1 == 0.0:
+    r1          = b - A_op.matvec(x) if x0 is not None else b.copy()
+    y           = M_op.matvec(r1)
+    beta1_inner = float(_dpnp.real(_dpnp.vdot(r1, y)))
+    if beta1_inner < 0.0:
+        raise ValueError(
+            "minres: preconditioner M is not positive semi-definite "
+            f"(<r, M*r> = {beta1_inner:.6g} < 0)"
+        )
+    if beta1_inner == 0.0:
         return x, 0
+    beta1 = _np.sqrt(beta1_inner)
 
     if check:
-        Ay = A_op.matvec(y) - shift * y
+        Ay  = A_op.matvec(y) - shift * y
         lhs = float(_dpnp.linalg.norm(
-            Ay - (_dpnp.vdot(y, Ay) / _dpnp.vdot(y, y)) * y
+            Ay - (_dpnp.real(_dpnp.vdot(y, Ay))
+                  / _dpnp.real(_dpnp.vdot(y, y))) * y
         ))
         rhs = eps ** 0.5 * float(_dpnp.linalg.norm(Ay))
         if lhs > rhs:
@@ -607,34 +600,34 @@ def minres(
             )
 
     # ------------------------------------------------------------------
-    # Paige-Saunders state variables (all scalars on CPU)
+    # Paige-Saunders scalar state (all on CPU)
     # ------------------------------------------------------------------
     beta   = beta1
     oldb   = 0.0
     phibar = beta1
+    cs     = -1.0
+    sn     =  0.0
+    dbar   =  0.0
+    epsln  =  0.0
 
-    # Givens rotation state carried between iterations (SciPy initialisation)
-    cs   = -1.0   # cos of previous rotation
-    sn   =  0.0   # sin of previous rotation
-    dbar =  0.0   # sub-diagonal entry carried forward
-    epsln = 0.0   # sub-sub-diagonal from two steps ago
+    # State for full stopping battery
+    tnorm2 = 0.0
+    gmax   = 0.0
+    gmin   = _np.finfo(_np_dtype(dtype)).max
 
-    # w-vectors for the three-term solution update (on device)
+    # Solution update vectors (on device)
     w  = _dpnp.zeros(n, dtype=dtype)
     w2 = _dpnp.zeros(n, dtype=dtype)
-
-    # Lanczos vectors
     r2 = r1.copy()
     v  = y / beta1
 
-    # Stagnation floor: 10*eps matches SciPy minres.py and prevents
-    # float32 runs near machine-epsilon from false-positive stagnation.
+    # 10*eps stagnation floor (SciPy minres.py convention).
     stag_eps = 10.0 * eps
 
     info = 1
     for itr in range(1, maxiter + 1):
         # ------------------------------------------------------------------
-        # Lanczos step k: produces alpha_k, beta_{k+1}, v_k
+        # Lanczos step k
         # ------------------------------------------------------------------
         s  = 1.0 / beta
         v  = y * s
@@ -648,54 +641,44 @@ def minres(
         r2    = y.copy()
         y     = M_op.matvec(r2)
         oldb  = beta
-        beta  = float(_dpnp.sqrt(_dpnp.abs(_dpnp.real(_dpnp.vdot(r2, y)))))
 
-        if beta < 0.0:
-            raise ValueError("minres: preconditioner M is not positive definite")
+        # SPD check on iteration inner product (live guard, no abs()).
+        inner_r2y = float(_dpnp.real(_dpnp.vdot(r2, y)))
+        if inner_r2y < 0.0:
+            raise ValueError(
+                "minres: preconditioner M is not positive semi-definite "
+                f"(<r, M*r> = {inner_r2y:.6g} < 0 at iteration {itr})"
+            )
+        beta = _np.sqrt(inner_r2y)
 
-        # Lanczos beta-collapse floor: use 10*eps*beta1 (matches SciPy).
-        if beta <= stag_eps * beta1:
-            info = 2
-            break
+        tnorm2 += alpha ** 2 + oldb ** 2 + beta ** 2
 
         # ------------------------------------------------------------------
-        # QR step: correct Paige-Saunders (1975) two-rotation recurrence.
-        #
-        # Apply the PREVIOUS Givens rotation Q_{k-1} to the current
-        # tridiagonal column.  The column is [dbar, alpha, beta].
-        # (alpha already incorporates the shift via the Lanczos matvec above
-        # so the column below uses plain `alpha`.)
-        #
-        # Previous rotation acts on rows (k-1, k):
-        #   delta  = cs_{k-1} * dbar + sn_{k-1} * alpha   <- new diagonal
-        #   gbar_k = sn_{k-1} * dbar - cs_{k-1} * alpha   <- residual
-        #   epsln  = sn_{k-1} * beta                       <- sub-sub-diag
-        #   dbar   = -cs_{k-1} * beta                      <- carry forward
-        #
-        # New rotation Q_k eliminates beta from [gbar_k, beta]:
-        #   gamma = hypot(gbar_k, beta)
-        #   cs_k  = gbar_k / gamma
-        #   sn_k  = beta   / gamma
+        # QR step: Paige-Saunders two-rotation recurrence
         # ------------------------------------------------------------------
         oldeps = epsln
-        delta  = cs * dbar + sn * alpha    # apply previous rotation — diagonal
-        gbar_k = sn * dbar - cs * alpha    # remaining entry -> new rotation
-        epsln  = sn * beta                 # sub-sub-diagonal for next step
-        dbar   = -cs * beta                # carry forward for next step
+        delta  = cs * dbar + sn * alpha
+        gbar_k = sn * dbar - cs * alpha
+        epsln  = sn * beta
+        dbar   = -cs * beta
+
+        # root = ||Ar_k|| proxy used for test2
+        root   = _np.hypot(gbar_k, dbar)
 
-        gamma = _np.hypot(gbar_k, beta)
+        gamma  = _np.hypot(gbar_k, beta)
         if gamma == 0.0:
             gamma = eps
-        cs = gbar_k / gamma               # new cos
-        sn = beta   / gamma               # new sin
+        cs     = gbar_k / gamma
+        sn     = beta   / gamma
 
         phi    = cs * phibar
         phibar = sn * phibar
 
+        gmax = max(gmax, gamma)
+        gmin = min(gmin, gamma)
+
         # ------------------------------------------------------------------
-        # Solution update: three-term w recurrence (Paige-Saunders §5)
-        #   w_new = (v - oldeps * w_{k-2} - delta * w_{k-1}) / gamma
-        #   x    += phi * w_new
+        # Solution update: three-term w recurrence (Paige-Saunders SS5)
         # ------------------------------------------------------------------
         denom = 1.0 / gamma
         w_new = (v - oldeps * w - delta * w2) * denom
@@ -704,21 +687,33 @@ def minres(
         w2    = w_new
 
         rnorm = abs(phibar)
+        Anorm = _np.sqrt(tnorm2)
+        ynorm = float(_dpnp.linalg.norm(x))
 
         if callback is not None:
             callback(x)
 
-        # Convergence check MUST come before stagnation check so that
-        # a boundary iteration that satisfies both conditions is correctly
-        # reported as converged (info=0) rather than stagnated (info=2).
+        # Convergence checks run before stagnation so a boundary iteration
+        # that satisfies both is reported as converged (info=0).
         if rnorm <= atol_eff:
             info = 0
             break
 
-        # Use stag_eps (10*eps) instead of bare eps to prevent
-        # float32 runs with tol near machine-epsilon from false-positive
-        # stagnation before the residual norm has had a chance to converge.
-        if phi * denom < stag_eps:
+        if Anorm > 0.0 and ynorm > 0.0:
+            if rnorm / (Anorm * ynorm) <= rtol:   # test1
+                info = 0
+                break
+
+        if Anorm > 0.0:
+            if root / Anorm <= rtol:               # test2
+                info = 0
+                break
+
+        if Anorm > 0.0 and (gmax / gmin) >= 0.1 / eps:  # Acond stop
+            info = 0
+            break
+
+        if phi * denom < stag_eps:                 # stagnation
             info = 2
             break
     else:

From ea4989b3fd3cb642a561a1ba371fc5e17e64c4b1 Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com>
Date: Tue, 7 Apr 2026 00:26:52 -0500
Subject: [PATCH 35/43] =?UTF-8?q?sparse/linalg:=20fix=20SpMV=20handle=20li?=
 =?UTF-8?q?fecycle,=20complex=20dtypes,=20tol=E2=86=92rtol,=20M=20precond,?=
 =?UTF-8?q?=20MINRES=20SPD=20guard,=20GMRES=20pr=5Fnorm=20callback?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- gemv.cpp: split into _sparse_gemv_init / _sparse_gemv_compute / _sparse_gemv_release
  so optimize_gemv fires exactly once per operator rather than once per iteration.
- types_matrix.hpp: register complex64/complex128 × int32/int64 pairs for oneMKL
  sparse::gemv (std::complex<float/double>).
- gemv.hpp: declare the three new entry points.
- sparse_py.cpp: bind _sparse_gemv_init, _sparse_gemv_compute, _sparse_gemv_release
  and remove the old monolithic _sparse_gemv binding.
- _iterative.py: redesign around _CachedSpMV (init once, compute per matvec,
  release in __del__); rename tol→rtol with backward-compat alias; enable M
  preconditioner for cg/gmres; fix MINRES beta SPD check (check sign before sqrt,
  not after abs); add Paige-Saunders multi-criterion stopping (Anorm/ynorm/Acond);
  implement GMRES callback_type='pr_norm'; fix GMRES maxiter default semantics;
  add order='F' to GMRES Krylov basis V.
---
 dpnp/backend/extensions/sparse/gemv.cpp       | 357 ++++++++++++------
 dpnp/backend/extensions/sparse/gemv.hpp       |  67 +++-
 dpnp/backend/extensions/sparse/sparse_py.cpp  | 203 ++++++----
 .../extensions/sparse/types_matrix.hpp        |  38 +-
 dpnp/scipy/sparse/linalg/_iterative.py        | 232 ++++++------
 5 files changed, 572 insertions(+), 325 deletions(-)

diff --git a/dpnp/backend/extensions/sparse/gemv.cpp b/dpnp/backend/extensions/sparse/gemv.cpp
index 0e8b22e0fa50..fe8d7b20445f 100644
--- a/dpnp/backend/extensions/sparse/gemv.cpp
+++ b/dpnp/backend/extensions/sparse/gemv.cpp
@@ -22,10 +22,12 @@
 // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
-// THE POSSIBILITY OF SUCH DAMAGE.
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
 //*****************************************************************************
 
+#include <complex>
+#include <cstdint>
 #include <stdexcept>
 
 #include <pybind11/pybind11.h>
@@ -49,175 +51,301 @@ namespace type_utils = dpctl::tensor::type_utils;
 
 using ext::common::init_dispatch_table;
 
-typedef sycl::event (*gemv_impl_fn_ptr_t)(
+// ---------------------------------------------------------------------------
+// Dispatch table types
+// ---------------------------------------------------------------------------
+
+/**
+ * init_impl: builds the matrix_handle, calls set_csr_data + optimize_gemv.
+ * Returns (handle_ptr, optimize_event).
+ * All CSR arrays are *not* copied -- they must stay alive until release.
+ */
+typedef std::pair<std::uintptr_t, sycl::event> (*gemv_init_fn_ptr_t)(
+    sycl::queue &,
+    oneapi::mkl::transpose,
+    const char *,          // row_ptr  (typeless)
+    const char *,          // col_ind  (typeless)
+    const char *,          // values   (typeless)
+    std::int64_t,          // num_rows
+    std::int64_t,          // num_cols
+    std::int64_t,          // nnz
+    const std::vector<sycl::event> &);
+
+/**
+ * compute_impl: fires sparse::gemv using a pre-built handle.
+ * Returns (args_keep_alive_event, gemv_event).
+ */
+typedef std::pair<sycl::event, sycl::event> (*gemv_compute_fn_ptr_t)(
     sycl::queue &,
+    oneapi::mkl::sparse::matrix_handle_t,
     oneapi::mkl::transpose,
-    double,                        // alpha (always passed as double; cast inside)
-    const char *,                  // row_ptr  (typeless)
-    const char *,                  // col_ind  (typeless)
-    const char *,                  // values   (typeless)
-    std::int64_t,                  // num_rows
-    std::int64_t,                  // num_cols
-    std::int64_t,                  // nnz
-    const char *,                  // x        (typeless)
-    double,                        // beta     (always passed as double; cast inside)
-    char *,                        // y        (typeless, writable)
+    double,          // alpha (cast to Tv inside)
+    const char *,    // x (typeless)
+    double,          // beta  (cast to Tv inside)
+    char *,          // y (typeless, writable)
+    std::int64_t,    // num_rows (for output validation)
+    std::int64_t,    // num_cols
     const std::vector<sycl::event> &);
 
-static gemv_impl_fn_ptr_t
-    gemv_dispatch_table[dpctl_td_ns::num_types][dpctl_td_ns::num_types];
+static gemv_init_fn_ptr_t
+    gemv_init_dispatch_table[dpctl_td_ns::num_types][dpctl_td_ns::num_types];
+
+static gemv_compute_fn_ptr_t
+    gemv_compute_dispatch_table[dpctl_td_ns::num_types][dpctl_td_ns::num_types];
+
+// ---------------------------------------------------------------------------
+// Per-type init implementation
+// ---------------------------------------------------------------------------
 
 template <typename Tv, typename Ti>
-static sycl::event
-gemv_impl(sycl::queue                      &exec_q,
-          oneapi::mkl::transpose            mkl_trans,
-          double                            alpha_d,
-          const char                       *row_ptr_data,
-          const char                       *col_ind_data,
-          const char                       *values_data,
-          std::int64_t                      num_rows,
-          std::int64_t                      num_cols,
-          std::int64_t                      nnz,
-          const char                       *x_data,
-          double                            beta_d,
-          char                             *y_data,
-          const std::vector<sycl::event>   &depends)
+static std::pair<std::uintptr_t, sycl::event>
+gemv_init_impl(sycl::queue                    &exec_q,
+               oneapi::mkl::transpose          mkl_trans,
+               const char                     *row_ptr_data,
+               const char                     *col_ind_data,
+               const char                     *values_data,
+               std::int64_t                    num_rows,
+               std::int64_t                    num_cols,
+               std::int64_t                    nnz,
+               const std::vector<sycl::event> &depends)
 {
     type_utils::validate_type_for_device<Tv>(exec_q);
 
-    const Tv  alpha = static_cast<Tv>(alpha_d);
-    const Tv  beta  = static_cast<Tv>(beta_d);
     const Ti *row_ptr = reinterpret_cast<const Ti *>(row_ptr_data);
     const Ti *col_ind = reinterpret_cast<const Ti *>(col_ind_data);
     const Tv *values  = reinterpret_cast<const Tv *>(values_data);
-    const Tv *x       = reinterpret_cast<const Tv *>(x_data);
-    Tv       *y       = reinterpret_cast<Tv *>(y_data);
 
     std::stringstream error_msg;
-    bool is_exception_caught = false;
-
     mkl_sparse::matrix_handle_t spmat = nullptr;
-    sycl::event gemv_ev;
 
-    try {
-        mkl_sparse::init_matrix_handle(&spmat);
-
-        auto ev_set = mkl_sparse::set_csr_data(
-            exec_q, spmat,
-            num_rows, num_cols, nnz,
-            oneapi::mkl::index_base::zero,
-            const_cast<Ti *>(row_ptr),
-            const_cast<Ti *>(col_ind),
-            const_cast<Tv *>(values),
-            depends);
+    mkl_sparse::init_matrix_handle(&spmat);
+
+    auto ev_set = mkl_sparse::set_csr_data(
+        exec_q, spmat,
+        num_rows, num_cols, nnz,
+        oneapi::mkl::index_base::zero,
+        const_cast<Ti *>(row_ptr),
+        const_cast<Ti *>(col_ind),
+        const_cast<Tv *>(values),
+        depends);
 
-        auto ev_opt = mkl_sparse::optimize_gemv(
+    sycl::event ev_opt;
+    try {
+        ev_opt = mkl_sparse::optimize_gemv(
             exec_q, mkl_trans, spmat, {ev_set});
+    } catch (oneapi::mkl::exception const &e) {
+        mkl_sparse::release_matrix_handle(exec_q, &spmat, {});
+        throw std::runtime_error(
+            std::string("sparse_gemv_init: MKL exception in optimize_gemv: ")
+            + e.what());
+    } catch (sycl::exception const &e) {
+        mkl_sparse::release_matrix_handle(exec_q, &spmat, {});
+        throw std::runtime_error(
+            std::string("sparse_gemv_init: SYCL exception in optimize_gemv: ")
+            + e.what());
+    }
+
+    auto handle_ptr = reinterpret_cast<std::uintptr_t>(spmat);
+    return {handle_ptr, ev_opt};
+}
+
+// ---------------------------------------------------------------------------
+// Per-type compute implementation
+// ---------------------------------------------------------------------------
 
+template <typename Tv, typename Ti>
+static std::pair<sycl::event, sycl::event>
+gemv_compute_impl(sycl::queue                           &exec_q,
+                  mkl_sparse::matrix_handle_t            spmat,
+                  oneapi::mkl::transpose                 mkl_trans,
+                  double                                 alpha_d,
+                  const char                            *x_data,
+                  double                                 beta_d,
+                  char                                  *y_data,
+                  std::int64_t                           num_rows,
+                  std::int64_t                           /* num_cols */,
+                  const std::vector<sycl::event>        &depends)
+{
+    // Scalars: for complex Tv we construct the complex scalar from the real part.
+    // alpha=1, beta=0 are the common solver values so precision loss is academic,
+    // but we keep the cast path consistent for generality.
+    const Tv alpha = static_cast<Tv>(alpha_d);
+    const Tv beta  = static_cast<Tv>(beta_d);
+
+    const Tv *x = reinterpret_cast<const Tv *>(x_data);
+    Tv       *y = reinterpret_cast<Tv *>(y_data);
+
+    sycl::event gemv_ev;
+    try {
         gemv_ev = mkl_sparse::gemv(
             exec_q, mkl_trans,
             alpha, spmat,
             x, beta, y,
-            {ev_opt});
-
-        mkl_sparse::release_matrix_handle(exec_q, &spmat, {gemv_ev});
-
+            depends);
     } catch (oneapi::mkl::exception const &e) {
-        error_msg << "Unexpected MKL exception caught during sparse_gemv() "
-                     "call:\nreason: " << e.what();
-        is_exception_caught = true;
+        throw std::runtime_error(
+            std::string("sparse_gemv_compute: MKL exception: ") + e.what());
     } catch (sycl::exception const &e) {
-        error_msg << "Unexpected SYCL exception caught during sparse_gemv() "
-                     "call:\n" << e.what();
-        is_exception_caught = true;
+        throw std::runtime_error(
+            std::string("sparse_gemv_compute: SYCL exception: ") + e.what());
     }
 
-    if (is_exception_caught) {
-        if (spmat != nullptr)
-            mkl_sparse::release_matrix_handle(exec_q, &spmat, {});
-        throw std::runtime_error(error_msg.str());
+    // Keep x and y alive until the event completes.
+    // (row_ptr/col_ind/values are kept alive by the handle itself.)
+    sycl::event args_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(gemv_ev);
+        cgh.host_task([x, y]() { (void)x; (void)y; });
+    });
+
+    return {args_ev, gemv_ev};
+}
+
+// ---------------------------------------------------------------------------
+// Public entry points
+// ---------------------------------------------------------------------------
+
+static oneapi::mkl::transpose
+decode_trans(const int trans)
+{
+    switch (trans) {
+        case 0: return oneapi::mkl::transpose::nontrans;
+        case 1: return oneapi::mkl::transpose::trans;
+        case 2: return oneapi::mkl::transpose::conjtrans;
+        default:
+            throw std::invalid_argument(
+                "sparse_gemv: trans must be 0 (N), 1 (T), or 2 (C)");
     }
+}
 
-    return gemv_ev;
+std::pair<std::uintptr_t, sycl::event>
+sparse_gemv_init(sycl::queue                           &exec_q,
+                 const int                              trans,
+                 const dpctl::tensor::usm_ndarray      &row_ptr,
+                 const dpctl::tensor::usm_ndarray      &col_ind,
+                 const dpctl::tensor::usm_ndarray      &values,
+                 const std::int64_t                     num_rows,
+                 const std::int64_t                     num_cols,
+                 const std::int64_t                     nnz,
+                 const std::vector<sycl::event>        &depends)
+{
+    if (!dpctl::utils::queues_are_compatible(
+            exec_q, {row_ptr.get_queue(), col_ind.get_queue(),
+                     values.get_queue()}))
+        throw py::value_error(
+            "sparse_gemv_init: USM allocations are not compatible with the "
+            "execution queue.");
+
+    auto mkl_trans = decode_trans(trans);
+
+    auto array_types = dpctl_td_ns::usm_ndarray_types();
+    const int val_id = array_types.typenum_to_lookup_id(values.get_typenum());
+    const int idx_id = array_types.typenum_to_lookup_id(row_ptr.get_typenum());
+
+    gemv_init_fn_ptr_t init_fn = gemv_init_dispatch_table[val_id][idx_id];
+    if (init_fn == nullptr)
+        throw py::value_error(
+            "sparse_gemv_init: no implementation for the given value/index "
+            "dtype combination. Supported: {float32,float64,complex64,"
+            "complex128} x {int32,int64}.");
+
+    return init_fn(exec_q, mkl_trans,
+                   row_ptr.get_data(), col_ind.get_data(), values.get_data(),
+                   num_rows, num_cols, nnz, depends);
 }
 
 std::pair<sycl::event, sycl::event>
-sparse_gemv(sycl::queue                           &exec_q,
-            const int                              trans,
-            const double                           alpha,
-            const dpctl::tensor::usm_ndarray      &row_ptr,
-            const dpctl::tensor::usm_ndarray      &col_ind,
-            const dpctl::tensor::usm_ndarray      &values,
-            const dpctl::tensor::usm_ndarray      &x,
-            const double                           beta,
-            const dpctl::tensor::usm_ndarray      &y,
-            const std::int64_t                     num_rows,
-            const std::int64_t                     num_cols,
-            const std::int64_t                     nnz,
-            const std::vector<sycl::event>        &depends)
+sparse_gemv_compute(sycl::queue                           &exec_q,
+                    const std::uintptr_t                   handle_ptr,
+                    const int                              trans,
+                    const double                           alpha,
+                    const dpctl::tensor::usm_ndarray      &x,
+                    const double                           beta,
+                    const dpctl::tensor::usm_ndarray      &y,
+                    const std::int64_t                     num_rows,
+                    const std::int64_t                     num_cols,
+                    const std::vector<sycl::event>        &depends)
 {
     if (x.get_ndim() != 1)
-        throw py::value_error("sparse_gemv: x must be a 1-D array.");
+        throw py::value_error("sparse_gemv_compute: x must be a 1-D array.");
     if (y.get_ndim() != 1)
-        throw py::value_error("sparse_gemv: y must be a 1-D array.");
+        throw py::value_error("sparse_gemv_compute: y must be a 1-D array.");
 
     if (!dpctl::utils::queues_are_compatible(
-            exec_q, {row_ptr.get_queue(), col_ind.get_queue(),
-                     values.get_queue(), x.get_queue(), y.get_queue()}))
+            exec_q, {x.get_queue(), y.get_queue()}))
         throw py::value_error(
-            "sparse_gemv: USM allocations are not compatible with the "
+            "sparse_gemv_compute: USM allocations are not compatible with the "
             "execution queue.");
 
     auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
     if (overlap(x, y))
         throw py::value_error(
-            "sparse_gemv: input array x and output array y are overlapping "
-            "segments of memory.");
+            "sparse_gemv_compute: x and y are overlapping memory segments.");
 
     dpctl::tensor::validation::CheckWritable::throw_if_not_writable(y);
     dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(
         y, static_cast<std::size_t>(num_rows));
 
-    oneapi::mkl::transpose mkl_trans;
-    switch (trans) {
-        case 0: mkl_trans = oneapi::mkl::transpose::nontrans;  break;
-        case 1: mkl_trans = oneapi::mkl::transpose::trans;     break;
-        case 2: mkl_trans = oneapi::mkl::transpose::conjtrans; break;
-        default:
-            throw std::invalid_argument(
-                "sparse_gemv: trans must be 0 (N), 1 (T), or 2 (C)");
-    }
+    auto mkl_trans = decode_trans(trans);
+    auto spmat     = reinterpret_cast<mkl_sparse::matrix_handle_t>(handle_ptr);
 
+    // Dispatch on value type (x and y must match; index type is encoded in
+    // the handle from init -- we only need Tv here).
     auto array_types = dpctl_td_ns::usm_ndarray_types();
-    const int val_id = array_types.typenum_to_lookup_id(values.get_typenum());
-    const int idx_id = array_types.typenum_to_lookup_id(row_ptr.get_typenum());
-
-    gemv_impl_fn_ptr_t gemv_fn = gemv_dispatch_table[val_id][idx_id];
-    if (gemv_fn == nullptr)
+    const int val_id = array_types.typenum_to_lookup_id(x.get_typenum());
+    const int idx_id = array_types.typenum_to_lookup_id(y.get_typenum());
+
+    // For compute we only need Tv; re-use the same dispatch table using the
+    // val_id from x and idx_id from y (both are val type so idx_id == val_id
+    // is fine -- the factory only cares about Tv for the gemv call).
+    gemv_compute_fn_ptr_t compute_fn =
+        gemv_compute_dispatch_table[val_id][val_id];
+    if (compute_fn == nullptr)
         throw py::value_error(
-            "sparse_gemv: no implementation for the given value/index dtype "
-            "combination. Supported: float32/float64 with int32/int64 indices.");
+            "sparse_gemv_compute: unsupported value dtype.");
 
-    sycl::event gemv_ev =
-        gemv_fn(exec_q, mkl_trans, alpha,
-                row_ptr.get_data(), col_ind.get_data(), values.get_data(),
-                num_rows, num_cols, nnz,
-                x.get_data(), beta, y.get_data(),
-                depends);
-
-    sycl::event args_ev = dpctl::utils::keep_args_alive(
-        exec_q, {row_ptr, col_ind, values, x, y}, {gemv_ev});
+    return compute_fn(exec_q, spmat, mkl_trans, alpha,
+                      x.get_data(), beta, const_cast<char *>(y.get_data()),
+                      num_rows, num_cols, depends);
+}
 
-    return std::make_pair(args_ev, gemv_ev);
+sycl::event
+sparse_gemv_release(sycl::queue                     &exec_q,
+                    const std::uintptr_t             handle_ptr,
+                    const std::vector<sycl::event>  &depends)
+{
+    auto spmat = reinterpret_cast<mkl_sparse::matrix_handle_t>(handle_ptr);
+    mkl_sparse::release_matrix_handle(exec_q, &spmat, depends);
+    // release_matrix_handle is synchronous in the current oneMKL API;
+    // return a no-op event for API uniformity.
+    return exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+        cgh.host_task([]() {});
+    });
 }
 
+// ---------------------------------------------------------------------------
+// Dispatch table factory and registration
+// ---------------------------------------------------------------------------
+
+template <typename fnT, typename Tv, typename Ti>
+struct GemvInitContigFactory
+{
+    fnT get()
+    {
+        if constexpr (types::SparseGemvTypePairSupportFactory<Tv, Ti>::is_defined)
+            return gemv_init_impl<Tv, Ti>;
+        else
+            return nullptr;
+    }
+};
+
 template <typename fnT, typename Tv, typename Ti>
-struct GemvContigFactory
+struct GemvComputeContigFactory
 {
     fnT get()
     {
         if constexpr (types::SparseGemvTypePairSupportFactory<Tv, Ti>::is_defined)
-            return gemv_impl<Tv, Ti>;
+            return gemv_compute_impl<Tv, Ti>;
         else
             return nullptr;
     }
@@ -225,7 +353,10 @@ struct GemvContigFactory
 
 void init_sparse_gemv_dispatch_table(void)
 {
-    init_dispatch_table<gemv_impl_fn_ptr_t, GemvContigFactory>(
-        gemv_dispatch_table);
+    init_dispatch_table<gemv_init_fn_ptr_t,    GemvInitContigFactory>(
+        gemv_init_dispatch_table);
+    init_dispatch_table<gemv_compute_fn_ptr_t, GemvComputeContigFactory>(
+        gemv_compute_dispatch_table);
 }
+
 } // namespace dpnp::extensions::sparse
diff --git a/dpnp/backend/extensions/sparse/gemv.hpp b/dpnp/backend/extensions/sparse/gemv.hpp
index cd647e6c1734..c5b57305f3f9 100644
--- a/dpnp/backend/extensions/sparse/gemv.hpp
+++ b/dpnp/backend/extensions/sparse/gemv.hpp
@@ -22,8 +22,8 @@
 // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
-// THE POSSIBILITY OF SUCH DAMAGE.
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
 //*****************************************************************************
 
 #pragma once
@@ -36,20 +36,57 @@
 namespace dpnp::extensions::sparse
 {
 
+/**
+ * sparse_gemv_init -- ONE-TIME setup per sparse matrix operator.
+ *
+ * Calls init_matrix_handle + set_csr_data + optimize_gemv.
+ * Returns the opaque matrix_handle_t cast to uintptr_t for safe
+ * Python round-tripping, plus the dependency event from optimize_gemv
+ * (caller must wait on it before calling sparse_gemv_compute).
+ *
+ * Lifetime: the handle owns NO data copies; all CSR arrays must remain
+ * alive (in USM) until sparse_gemv_release is called.
+ */
+extern std::pair<std::uintptr_t, sycl::event>
+sparse_gemv_init(sycl::queue                           &exec_q,
+                 const int                              trans,
+                 const dpctl::tensor::usm_ndarray      &row_ptr,
+                 const dpctl::tensor::usm_ndarray      &col_ind,
+                 const dpctl::tensor::usm_ndarray      &values,
+                 const std::int64_t                     num_rows,
+                 const std::int64_t                     num_cols,
+                 const std::int64_t                     nnz,
+                 const std::vector<sycl::event>        &depends);
+
+/**
+ * sparse_gemv_compute -- PER-ITERATION SpMV.
+ *
+ * Calls only oneapi::mkl::sparse::gemv using the pre-built handle.
+ * alpha and beta are passed as double and cast inside gemv_compute_impl
+ * to the matrix value type.
+ */
 extern std::pair<sycl::event, sycl::event>
-sparse_gemv(sycl::queue &exec_q,
-            const int trans,
-            const double alpha,
-            const dpctl::tensor::usm_ndarray &row_ptr,
-            const dpctl::tensor::usm_ndarray &col_ind,
-            const dpctl::tensor::usm_ndarray &values,
-            const dpctl::tensor::usm_ndarray &x,
-            const double beta,
-            const dpctl::tensor::usm_ndarray &y,
-            const std::int64_t num_rows,
-            const std::int64_t num_cols,
-            const std::int64_t nnz,
-            const std::vector<sycl::event> &depends);
+sparse_gemv_compute(sycl::queue                           &exec_q,
+                    const std::uintptr_t                   handle_ptr,
+                    const int                              trans,
+                    const double                           alpha,
+                    const dpctl::tensor::usm_ndarray      &x,
+                    const double                           beta,
+                    const dpctl::tensor::usm_ndarray      &y,
+                    const std::int64_t                     num_rows,
+                    const std::int64_t                     num_cols,
+                    const std::vector<sycl::event>        &depends);
+
+/**
+ * sparse_gemv_release -- free the matrix_handle created by sparse_gemv_init.
+ *
+ * Must be called exactly once per handle, after all compute calls
+ * that depend on it have completed.
+ */
+extern sycl::event
+sparse_gemv_release(sycl::queue                     &exec_q,
+                    const std::uintptr_t             handle_ptr,
+                    const std::vector<sycl::event>  &depends);
 
 extern void init_sparse_gemv_dispatch_table(void);
 
diff --git a/dpnp/backend/extensions/sparse/sparse_py.cpp b/dpnp/backend/extensions/sparse/sparse_py.cpp
index 35f40d6bad18..0efd1d0da9ae 100644
--- a/dpnp/backend/extensions/sparse/sparse_py.cpp
+++ b/dpnp/backend/extensions/sparse/sparse_py.cpp
@@ -22,99 +22,144 @@
 // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
-// THE POSSIBILITY OF SUCH DAMAGE.
-//*****************************************************************************
-//
-// This file defines functions of dpnp.backend._sparse_impl extensions
-//
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
 //*****************************************************************************
 
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 
+#include <sycl/sycl.hpp>
+#include <dpctl4pybind11.hpp>
+
 #include "gemv.hpp"
 
-namespace sparse_ns = dpnp::extensions::sparse;
 namespace py = pybind11;
 
-static void init_dispatch_vectors_tables(void)
-{
-    sparse_ns::init_sparse_gemv_dispatch_table();
-}
+using dpnp::extensions::sparse::init_sparse_gemv_dispatch_table;
+using dpnp::extensions::sparse::sparse_gemv_init;
+using dpnp::extensions::sparse::sparse_gemv_compute;
+using dpnp::extensions::sparse::sparse_gemv_release;
 
 PYBIND11_MODULE(_sparse_impl, m)
 {
-    init_dispatch_vectors_tables();
+    init_sparse_gemv_dispatch_table();
 
-    using arrayT     = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-
-    {
-        m.def(
-            "_sparse_gemv",
-            [](sycl::queue &exec_q,
-               const int trans,
-               const double alpha,
-               const arrayT &row_ptr,
-               const arrayT &col_ind,
-               const arrayT &values,
-               const arrayT &x,
-               const double beta,
-               const arrayT &y,
-               const std::int64_t num_rows,
-               const std::int64_t num_cols,
-               const std::int64_t nnz,
-               const event_vecT &depends) {
-                return sparse_ns::sparse_gemv(
-                    exec_q, trans, alpha,
-                    row_ptr, col_ind, values,
-                    x, beta, y,
-                    num_rows, num_cols, nnz, depends);
-            },
-            "CSR sparse matrix-vector product y = alpha*op(A)*x + beta*y "
-            "via oneMKL sparse::gemv.\n\n"
-            "Parameters\n"
-            "----------\n"
-            "sycl_queue : dpctl.SyclQueue\n"
-            "trans      : int  0=N, 1=T, 2=C\n"
-            "alpha      : float\n"
-            "row_ptr    : usm_ndarray  CSR row offsets (int32 or int64)\n"
-            "col_ind    : usm_ndarray  CSR column indices (int32 or int64)\n"
-            "values     : usm_ndarray  CSR non-zeros (float32 or float64)\n"
-            "x          : usm_ndarray  input vector\n"
-            "beta       : float\n"
-            "y          : usm_ndarray  output vector (in/out)\n"
-            "num_rows, num_cols, nnz : int64\n"
-            "depends    : list[sycl.Event]\n"
-            "\nReturns\n-------\n"
-            "(host_task_event, compute_event) : pair of sycl.Event",
-            py::arg("sycl_queue"),
-            py::arg("trans"),
-            py::arg("alpha"),
-            py::arg("row_ptr"),
-            py::arg("col_ind"),
-            py::arg("values"),
-            py::arg("x"),
-            py::arg("beta"),
-            py::arg("y"),
-            py::arg("num_rows"),
-            py::arg("num_cols"),
-            py::arg("nnz"),
-            py::arg("depends") = py::list());
-    }
-
-    {
-        m.def(
-            "_using_onemath",
-            []() {
+    // ------------------------------------------------------------------
+    // _using_onemath()
+    // Reports whether the module was compiled against the portable OneMath
+    // interface (USE_ONEMATH) rather than direct oneMKL.
+    // ------------------------------------------------------------------
+    m.def("_using_onemath", []() -> bool {
 #ifdef USE_ONEMATH
-                return true;
+        return true;
 #else
-                return false;
+        return false;
 #endif
-            },
-            "Return True if built against OneMath portable backend, "
-            "False if built directly against oneMKL.");
-    }
+    });
+
+    // ------------------------------------------------------------------
+    // _sparse_gemv_init(exec_q, trans, row_ptr, col_ind, values,
+    //                   num_rows, num_cols, nnz, depends)
+    //     -> (handle: int, event)
+    //
+    // Calls init_matrix_handle + set_csr_data + optimize_gemv ONCE.
+    // The returned handle is an opaque uintptr_t; pass it back to
+    // _sparse_gemv_compute and _sparse_gemv_release.
+    // ------------------------------------------------------------------
+    m.def(
+        "_sparse_gemv_init",
+        [](sycl::queue                                &exec_q,
+           const int                                   trans,
+           const dpctl::tensor::usm_ndarray           &row_ptr,
+           const dpctl::tensor::usm_ndarray           &col_ind,
+           const dpctl::tensor::usm_ndarray           &values,
+           const std::int64_t                          num_rows,
+           const std::int64_t                          num_cols,
+           const std::int64_t                          nnz,
+           const std::vector<sycl::event>             &depends)
+            -> std::pair<std::uintptr_t, sycl::event>
+        {
+            return sparse_gemv_init(
+                exec_q, trans,
+                row_ptr, col_ind, values,
+                num_rows, num_cols, nnz,
+                depends);
+        },
+        py::arg("exec_q"),
+        py::arg("trans"),
+        py::arg("row_ptr"),
+        py::arg("col_ind"),
+        py::arg("values"),
+        py::arg("num_rows"),
+        py::arg("num_cols"),
+        py::arg("nnz"),
+        py::arg("depends"),
+        "Initialise oneMKL sparse matrix handle (set_csr_data + optimize_gemv). "
+        "Returns (handle_ptr: int, event). Call once per operator."
+    );
+
+    // ------------------------------------------------------------------
+    // _sparse_gemv_compute(exec_q, handle, trans, alpha, x, beta, y,
+    //                      num_rows, num_cols, depends)
+    //     -> (args_event, gemv_event)
+    //
+    // Fires sparse::gemv using the pre-built handle.
+    // Only the cheap kernel is dispatched; no analysis overhead.
+    // ------------------------------------------------------------------
+    m.def(
+        "_sparse_gemv_compute",
+        [](sycl::queue                                &exec_q,
+           const std::uintptr_t                        handle_ptr,
+           const int                                   trans,
+           const double                                alpha,
+           const dpctl::tensor::usm_ndarray           &x,
+           const double                                beta,
+           const dpctl::tensor::usm_ndarray           &y,
+           const std::int64_t                          num_rows,
+           const std::int64_t                          num_cols,
+           const std::vector<sycl::event>             &depends)
+            -> std::pair<sycl::event, sycl::event>
+        {
+            return sparse_gemv_compute(
+                exec_q, handle_ptr, trans, alpha,
+                x, beta, y,
+                num_rows, num_cols,
+                depends);
+        },
+        py::arg("exec_q"),
+        py::arg("handle"),
+        py::arg("trans"),
+        py::arg("alpha"),
+        py::arg("x"),
+        py::arg("beta"),
+        py::arg("y"),
+        py::arg("num_rows"),
+        py::arg("num_cols"),
+        py::arg("depends"),
+        "Execute sparse::gemv using a pre-built handle. "
+        "Returns (args_event, gemv_event)."
+    );
+
+    // ------------------------------------------------------------------
+    // _sparse_gemv_release(exec_q, handle, depends) -> event
+    //
+    // Releases the matrix_handle allocated by _sparse_gemv_init.
+    // Must be called exactly once per handle after all compute calls
+    // referencing it are complete.
+    // ------------------------------------------------------------------
+    m.def(
+        "_sparse_gemv_release",
+        [](sycl::queue                                &exec_q,
+           const std::uintptr_t                        handle_ptr,
+           const std::vector<sycl::event>             &depends)
+            -> sycl::event
+        {
+            return sparse_gemv_release(exec_q, handle_ptr, depends);
+        },
+        py::arg("exec_q"),
+        py::arg("handle"),
+        py::arg("depends"),
+        "Release the oneMKL matrix_handle created by _sparse_gemv_init."
+    );
 }
diff --git a/dpnp/backend/extensions/sparse/types_matrix.hpp b/dpnp/backend/extensions/sparse/types_matrix.hpp
index 5abdef85db3c..948d2fbd3c40 100644
--- a/dpnp/backend/extensions/sparse/types_matrix.hpp
+++ b/dpnp/backend/extensions/sparse/types_matrix.hpp
@@ -22,12 +22,13 @@
 // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
-// THE POSSIBILITY OF SUCH DAMAGE.
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
 //*****************************************************************************
 
 #pragma once
 
+#include <complex>
 #include <cstdint>
 #include <type_traits>
 
@@ -45,13 +46,18 @@ namespace dpnp::extensions::sparse::types
  * for oneapi::mkl::sparse::gemv.
  *
  * oneMKL sparse BLAS supports:
- *   - float32  with int32 indices
- *   - float32  with int64 indices
- *   - float64  with int32 indices
- *   - float64  with int64 indices
+ *   - float32              with int32 indices
+ *   - float32              with int64 indices
+ *   - float64              with int32 indices
+ *   - float64              with int64 indices
+ *   - complex<float>  (c64) with int32 indices
+ *   - complex<float>  (c64) with int64 indices
+ *   - complex<double> (c128) with int32 indices
+ *   - complex<double> (c128) with int64 indices
  *
- * Complex value types and other index widths are not supported by
- * oneapi::mkl::sparse::gemv and are intentionally excluded.
+ * Complex support requires oneMKL >= 2023.x (sparse BLAS complex USM API).
+ * The dispatch table entry is non-null only when the pair is registered here;
+ * the Python layer falls back to A.dot(x) when the entry is nullptr.
  *
  * @tparam Tv  Value type of the sparse matrix and dense vectors.
  * @tparam Ti  Index type of the sparse matrix (row_ptr / col_ind arrays).
@@ -60,10 +66,18 @@ template <typename Tv, typename Ti>
 struct SparseGemvTypePairSupportFactory
 {
     static constexpr bool is_defined = std::disjunction<
-        dpctl_td_ns::TypePairDefinedEntry<Tv, float,  Ti, std::int32_t>,
-        dpctl_td_ns::TypePairDefinedEntry<Tv, float,  Ti, std::int64_t>,
-        dpctl_td_ns::TypePairDefinedEntry<Tv, double, Ti, std::int32_t>,
-        dpctl_td_ns::TypePairDefinedEntry<Tv, double, Ti, std::int64_t>,
+        // real single precision
+        dpctl_td_ns::TypePairDefinedEntry<Tv, float,                   Ti, std::int32_t>,
+        dpctl_td_ns::TypePairDefinedEntry<Tv, float,                   Ti, std::int64_t>,
+        // real double precision
+        dpctl_td_ns::TypePairDefinedEntry<Tv, double,                  Ti, std::int32_t>,
+        dpctl_td_ns::TypePairDefinedEntry<Tv, double,                  Ti, std::int64_t>,
+        // complex single precision
+        dpctl_td_ns::TypePairDefinedEntry<Tv, std::complex<float>,     Ti, std::int32_t>,
+        dpctl_td_ns::TypePairDefinedEntry<Tv, std::complex<float>,     Ti, std::int64_t>,
+        // complex double precision
+        dpctl_td_ns::TypePairDefinedEntry<Tv, std::complex<double>,    Ti, std::int32_t>,
+        dpctl_td_ns::TypePairDefinedEntry<Tv, std::complex<double>,    Ti, std::int64_t>,
         // fall-through
         dpctl_td_ns::NotDefinedEntry>::is_defined;
 };
diff --git a/dpnp/scipy/sparse/linalg/_iterative.py b/dpnp/scipy/sparse/linalg/_iterative.py
index 8fc6908fab5a..555c4fa35ad2 100644
--- a/dpnp/scipy/sparse/linalg/_iterative.py
+++ b/dpnp/scipy/sparse/linalg/_iterative.py
@@ -40,27 +40,27 @@
 scipy.sparse.linalg, using ``rtol`` as the primary tolerance keyword
 (``tol`` is accepted as a deprecated alias for backward compatibility).
 
-Algorithm notes
----------------
-* b == 0 early-exit (return x0 or zeros with info=0).
-* Breakdown detection via machine-epsilon rhotol (CG, GMRES).
-* atol normalisation: atol_eff = max(atol, rtol * ||b||).
-* dtype promotion: A.dtype preferred when in fdFD; otherwise b.dtype
-  promoted to float64/complex128 (CuPy v14 compatible).
-* Preconditioner M supported for all three solvers; shape is validated
-  against A inside _make_system; fast CSR SpMV injected for M too.
-* GMRES: Givens-rotation Hessenberg QR on CPU scalars; all matvec and
-  inner-product work stays on device.  V basis pre-allocated as
-  (n, restart+1) Fortran-order for coalesced column access; no per-
-  iteration stack().  callback_type 'x', 'pr_norm', and 'legacy' all
-  implemented.  Happy breakdown detected via h_{j+1,j} < rhotol.
-* MINRES: native Paige-Saunders (1975) recurrence -- no scipy round-trip.
-  Full stopping battery: rnorm <= atol_eff, test1 (relative residual),
-  test2 (residual in range of A), Acond (condition number estimate) --
-  matches CuPy v14 / SciPy minres.py reference.
-  Preconditioner SPD check: raw inner product tested for negativity
-  BEFORE sqrt so the guard fires (abs() removed -- was dead code).
-  Stagnation floor 10*eps; convergence check precedes stagnation check.
+SpMV fast-path
+--------------
+When a CSR dpnp sparse matrix is passed as A or M, _make_fast_matvec()
+constructs a _CachedSpMV object that:
+  1. Calls _sparse_gemv_init() ONCE to create the oneMKL matrix_handle,
+     register CSR pointers via set_csr_data, and run optimize_gemv
+     (the expensive sparsity-analysis phase).
+  2. Calls _sparse_gemv_compute() on every matvec -- only the cheap
+     oneMKL sparse::gemv kernel fires; no handle setup overhead.
+  3. Calls _sparse_gemv_release() in __del__ to free the handle.
+
+This means optimize_gemv runs once per operator, not once per iteration,
+which is the correct usage pattern for oneMKL sparse BLAS.
+
+Supported dtypes for the oneMKL SpMV fast-path:
+  values : float32, float64, complex64, complex128
+  indices: int32, int64
+Complex dtypes require oneMKL sparse BLAS support (available since
+oneMKL 2023.x); if the dispatch table slot is nullptr (types_matrix.hpp
+does not register the pair) a ValueError is raised by the C++ layer.
+_make_fast_matvec catches this and falls back to A.dot(x).
 """
 
 from __future__ import annotations
@@ -74,7 +74,7 @@
 
 
 # ---------------------------------------------------------------------------
-# oneMKL sparse SpMV hook
+# oneMKL sparse SpMV hook -- cached-handle API
 # ---------------------------------------------------------------------------
 
 try:
@@ -92,11 +92,7 @@
 # ---------------------------------------------------------------------------
 
 def _np_dtype(dp_dtype) -> _np.dtype:
-    """Normalise any dtype-like (dpnp type, numpy type, string) to np.dtype.
-
-    dpnp dtype objects (e.g. dpnp.float64) are Python type objects with no
-    .char attribute.  np.dtype() accepts all of them correctly.
-    """
+    """Normalise any dtype-like (dpnp type, numpy type, string) to np.dtype."""
     return _np.dtype(dp_dtype)
 
 
@@ -108,8 +104,83 @@ def _check_dtype(dtype, name: str) -> None:
         )
 
 
+class _CachedSpMV:
+    """Wrap a CSR matrix with a persistent oneMKL matrix_handle.
+
+    The handle is initialised (set_csr_data + optimize_gemv) exactly once
+    in __init__.  Subsequent calls to __call__ only invoke sparse::gemv,
+    paying no analysis overhead.  The handle is released in __del__.
+
+    Parameters
+    ----------
+    A      : dpnp CSR sparse matrix
+    trans  : int  0=N, 1=T, 2=C  (fixed at construction)
+    """
+
+    __slots__ = ("_A", "_exec_q", "_handle", "_trans",
+                 "_nrows", "_ncols", "_nnz")
+
+    def __init__(self, A, trans: int = 0):
+        self._A      = A          # keep alive so USM pointers stay valid
+        self._trans  = int(trans)
+        self._nrows  = int(A.shape[0])
+        self._ncols  = int(A.shape[1])
+        self._nnz    = int(A.data.shape[0])
+        self._exec_q = A.data.sycl_queue
+        self._handle = None
+
+        # init_matrix_handle + set_csr_data + optimize_gemv (once)
+        handle, ev = _si._sparse_gemv_init(
+            self._exec_q,
+            self._trans,
+            A.indptr,
+            A.indices,
+            A.data,
+            self._nrows,
+            self._ncols,
+            self._nnz,
+            [],
+        )
+        ev.wait()
+        self._handle = handle
+
+    def __call__(self, x: _dpnp.ndarray) -> _dpnp.ndarray:
+        """y = op(A) * x  --  only sparse::gemv fires."""
+        y = _dpnp.zeros(self._nrows, dtype=self._A.data.dtype,
+                        sycl_queue=self._exec_q)
+        _, ev = _si._sparse_gemv_compute(
+            self._exec_q,
+            self._handle,
+            self._trans,
+            1.0,
+            x,
+            0.0,
+            y,
+            self._nrows,
+            self._ncols,
+            [],
+        )
+        ev.wait()
+        return y
+
+    def __del__(self):
+        if self._handle is not None and _si is not None:
+            try:
+                _si._sparse_gemv_release(self._exec_q, self._handle, [])
+            except Exception:
+                pass
+            self._handle = None
+
+
 def _make_fast_matvec(A):
-    """Return device-side CSR SpMV callable, or None."""
+    """Return a _CachedSpMV if A is a CSR matrix with oneMKL support,
+    a plain lambda fallback, or None if A is not sparse.
+
+    Falls back gracefully on:
+      - missing _sparse_impl extension
+      - dtype not supported by the C++ dispatch table
+      - any other C++ exception during handle initialisation
+    """
     try:
         from dpnp.scipy import sparse as _sp
         if not (_sp.issparse(A) and A.format == "csr"):
@@ -117,27 +188,16 @@ def _make_fast_matvec(A):
     except (ImportError, AttributeError):
         return None
 
-    if _HAS_SPARSE_IMPL:
-        indptr  = A.indptr
-        indices = A.indices
-        data    = A.data
-        nrows   = int(A.shape[0])
-        ncols   = int(A.shape[1])
-        nnz     = int(data.shape[0])
-        exec_q  = data.sycl_queue
-
-        def _csr_spmv(x: _dpnp.ndarray) -> _dpnp.ndarray:
-            y = _dpnp.zeros(nrows, dtype=data.dtype, sycl_queue=exec_q)
-            _, ev = _si._sparse_gemv(
-                exec_q, 0, 1.0, indptr, indices, data, x,
-                0.0, y, nrows, ncols, nnz, [],
-            )
-            ev.wait()
-            return y
+    if not _HAS_SPARSE_IMPL:
+        return lambda x: A.dot(x)
 
-        return _csr_spmv
-
-    return lambda x: A.dot(x)
+    # Try to build the cached handle; fall back to dot() on any error
+    # (e.g. complex dtype not yet in the dispatch table on older builds).
+    try:
+        spmv = _CachedSpMV(A, trans=0)
+        return spmv
+    except Exception:
+        return lambda x: A.dot(x)
 
 
 def _make_system(A, M, x0, b):
@@ -271,8 +331,6 @@ def cg(
 
     rhotol = float(_np.finfo(_np_dtype(dtype)).eps ** 2)
 
-    # Use `x0 is not None` rather than `_dpnp.any(x)` -- dpnp arrays raise
-    # AmbiguousTruth when used as Python booleans.
     r  = b - A_op.matvec(x) if x0 is not None else b.copy()
     z  = M_op.matvec(r)
     p  = _dpnp.array(z, copy=True)
@@ -346,8 +404,6 @@ def gmres(
     callback      : callable, optional
     atol          : float, optional
     callback_type : {None, 'x', 'pr_norm', 'legacy'}
-                    None / 'x' / 'legacy' -- callback(xk) after each restart
-                    'pr_norm'             -- callback(||r||/||b||) per restart
 
     Returns
     -------
@@ -391,8 +447,8 @@ def gmres(
             info = 0
             break
 
-        # Pre-allocate V Fortran-order: columns V[:,j] are contiguous
-        # in device memory, avoiding strided (non-coalesced) access.
+        # Krylov basis: column-major (order='F') so V[:,j] is contiguous
+        # on the device -- avoids strided non-coalesced memory access.
         V = _dpnp.zeros((n, restart + 1), dtype=dtype, order='F')
         V[:, 0] = r / beta
 
@@ -410,8 +466,6 @@ def gmres(
 
             w = M_op.matvec(A_op.matvec(V[:, j]))
 
-            # Modified Gram-Schmidt: one device-to-host transfer per step
-            # (pulls (j+1,) h vector via .asnumpy()) instead of j scalars.
             h_dp = _dpnp.dot(_dpnp.conj(V[:, :j + 1].T), w)
             h_np = h_dp.asnumpy()
             w    = w - _dpnp.dot(V[:, :j + 1],
@@ -422,7 +476,6 @@ def gmres(
             H_np[:j + 1, j] = h_np
             H_np[j + 1,  j] = h_j1
 
-            # Apply previous Givens rotations to column j of H.
             for i in range(j):
                 tmp             =  cs_np[i] * H_np[i, j] + sn_np[i] * H_np[i + 1, j]
                 H_np[i + 1, j] = -_np.conj(sn_np[i]) * H_np[i, j] + cs_np[i] * H_np[i + 1, j]
@@ -445,7 +498,7 @@ def gmres(
 
             res_norm = abs(g_np[j + 1])
 
-            if h_j1 < rhotol:       # happy breakdown
+            if h_j1 < rhotol:
                 j_final = j
                 happy   = True
                 if res_norm <= atol_eff:
@@ -462,7 +515,6 @@ def gmres(
                 V[:, j + 1] = w / h_j1
             j_final = j
 
-        # Back-substitution on upper-triangular H_np (already on CPU).
         k    = j_final + 1
         y_np = _np.zeros(k, dtype=H_dtype)
         for i in range(k - 1, -1, -1):
@@ -474,7 +526,6 @@ def gmres(
             else:
                 y_np[i] /= H_np[i, i]
 
-        # Solution update: device matmul, no host round-trip.
         x = x + _dpnp.dot(V[:, :k], _dpnp.asarray(y_np, dtype=dtype))
 
         res_norm = float(_dpnp.linalg.norm(M_op.matvec(b - A_op.matvec(x))))
@@ -517,22 +568,6 @@ def minres(
 ) -> Tuple[_dpnp.ndarray, int]:
     """MINRES for symmetric (possibly indefinite) A -- pure dpnp/oneMKL.
 
-    Implements Paige-Saunders (1975) MINRES via Lanczos tridiagonalisation
-    with Givens QR.  All matvec, dot-products, and vector updates run on
-    device; only scalar recurrence coefficients are on CPU.
-
-    Stopping criteria (matches CuPy v14 / SciPy minres.py reference):
-      1. rnorm       <= atol_eff                  (absolute residual)
-      2. test1       <= rtol  where test1 = ||r|| / (||A|| * ||x||)
-      3. test2       <= rtol  where test2 = ||Ar_k|| / ||A||
-      4. Acond       >= 0.1 / eps                 (ill-conditioned stop)
-      5. phi * denom < 10*eps                     (stagnation)
-    Convergence (1-4) is always checked before stagnation (5).
-
-    Preconditioner SPD check: the raw inner product <r, M*r> is tested
-    for negativity BEFORE sqrt so the guard is live (not dead code as it
-    would be if abs() were applied first).
-
     Parameters
     ----------
     A       : array_like or LinearOperator -- symmetric/Hermitian (n, n)
@@ -568,12 +603,6 @@ def minres(
 
     atol_eff = _get_atol(bnrm, atol=atol, rtol=rtol)
 
-    # ------------------------------------------------------------------
-    # Initialise Lanczos: beta1 = sqrt(<r0, M*r0>)
-    # Test the raw inner product for negativity BEFORE sqrt so that a
-    # non-SPD preconditioner is detected (abs() was removed -- it made
-    # this check dead code).
-    # ------------------------------------------------------------------
     r1          = b - A_op.matvec(x) if x0 is not None else b.copy()
     y           = M_op.matvec(r1)
     beta1_inner = float(_dpnp.real(_dpnp.vdot(r1, y)))
@@ -599,9 +628,6 @@ def minres(
                 "set check=False to skip this test."
             )
 
-    # ------------------------------------------------------------------
-    # Paige-Saunders scalar state (all on CPU)
-    # ------------------------------------------------------------------
     beta   = beta1
     oldb   = 0.0
     phibar = beta1
@@ -610,25 +636,19 @@ def minres(
     dbar   =  0.0
     epsln  =  0.0
 
-    # State for full stopping battery
     tnorm2 = 0.0
     gmax   = 0.0
     gmin   = _np.finfo(_np_dtype(dtype)).max
 
-    # Solution update vectors (on device)
     w  = _dpnp.zeros(n, dtype=dtype)
     w2 = _dpnp.zeros(n, dtype=dtype)
     r2 = r1.copy()
     v  = y / beta1
 
-    # 10*eps stagnation floor (SciPy minres.py convention).
     stag_eps = 10.0 * eps
 
     info = 1
     for itr in range(1, maxiter + 1):
-        # ------------------------------------------------------------------
-        # Lanczos step k
-        # ------------------------------------------------------------------
         s  = 1.0 / beta
         v  = y * s
         y  = A_op.matvec(v) - shift * v
@@ -642,7 +662,8 @@ def minres(
         y     = M_op.matvec(r2)
         oldb  = beta
 
-        # SPD check on iteration inner product (live guard, no abs()).
+        # Check preconditioner SPD: compute raw inner product, then check sign
+        # before sqrt -- abs() would hide a non-SPD M.
         inner_r2y = float(_dpnp.real(_dpnp.vdot(r2, y)))
         if inner_r2y < 0.0:
             raise ValueError(
@@ -653,16 +674,12 @@ def minres(
 
         tnorm2 += alpha ** 2 + oldb ** 2 + beta ** 2
 
-        # ------------------------------------------------------------------
-        # QR step: Paige-Saunders two-rotation recurrence
-        # ------------------------------------------------------------------
         oldeps = epsln
         delta  = cs * dbar + sn * alpha
         gbar_k = sn * dbar - cs * alpha
         epsln  = sn * beta
         dbar   = -cs * beta
 
-        # root = ||Ar_k|| proxy used for test2
         root   = _np.hypot(gbar_k, dbar)
 
         gamma  = _np.hypot(gbar_k, beta)
@@ -677,9 +694,6 @@ def minres(
         gmax = max(gmax, gamma)
         gmin = min(gmin, gamma)
 
-        # ------------------------------------------------------------------
-        # Solution update: three-term w recurrence (Paige-Saunders SS5)
-        # ------------------------------------------------------------------
         denom = 1.0 / gamma
         w_new = (v - oldeps * w - delta * w2) * denom
         x     = x + phi * w_new
@@ -693,27 +707,33 @@ def minres(
         if callback is not None:
             callback(x)
 
-        # Convergence checks run before stagnation so a boundary iteration
-        # that satisfies both is reported as converged (info=0).
+        # Stopping criterion 1: absolute residual
         if rnorm <= atol_eff:
             info = 0
             break
 
+        # Stopping criterion 2: relative residual  ||r|| / (||A|| ||x||)
+        # (Paige-Saunders test1 -- catches convergence on ill-conditioned A)
         if Anorm > 0.0 and ynorm > 0.0:
-            if rnorm / (Anorm * ynorm) <= rtol:   # test1
+            if rnorm / (Anorm * ynorm) <= rtol:
                 info = 0
                 break
 
-        if Anorm > 0.0:
-            if root / Anorm <= rtol:               # test2
+        # Stopping criterion 3: range-space residual  ||A^T r|| / (||A|| ||r||)
+        # (Paige-Saunders test2 -- detects convergence in A's range)
+        if Anorm > 0.0 and rnorm > 0.0:
+            if root / Anorm <= rtol:
                 info = 0
                 break
 
-        if Anorm > 0.0 and (gmax / gmin) >= 0.1 / eps:  # Acond stop
+        # Stopping criterion 4: condition number estimate
+        # (gmax/gmin approximates cond(A); stop when near machine precision)
+        if Anorm > 0.0 and (gmax / gmin) >= 0.1 / eps:
             info = 0
             break
 
-        if phi * denom < stag_eps:                 # stagnation
+        # Stagnation detection: step size < 10*eps relative to x
+        if phi * denom < stag_eps:
             info = 2
             break
     else:

From c223ce282b853d68bfa10f427bfc09cd080bc733 Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <abagusetty@anl.gov>
Date: Tue, 7 Apr 2026 18:30:24 +0000
Subject: [PATCH 36/43] update WIP

---
 dpnp/backend/extensions/sparse/gemv.cpp       | 270 ++++++-----
 dpnp/backend/extensions/sparse/gemv.hpp       | 105 +++--
 dpnp/backend/extensions/sparse/sparse_py.cpp  | 109 +++--
 .../extensions/sparse/types_matrix.hpp        |  76 ++-
 dpnp/scipy/sparse/linalg/_interface.py        |  78 +---
 dpnp/scipy/sparse/linalg/_iterative.py        | 440 +++++++++++-------
 6 files changed, 652 insertions(+), 426 deletions(-)

diff --git a/dpnp/backend/extensions/sparse/gemv.cpp b/dpnp/backend/extensions/sparse/gemv.cpp
index fe8d7b20445f..ac87b57a3397 100644
--- a/dpnp/backend/extensions/sparse/gemv.cpp
+++ b/dpnp/backend/extensions/sparse/gemv.cpp
@@ -29,6 +29,10 @@
 #include <complex>
 #include <cstdint>
 #include <stdexcept>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
 
 #include <pybind11/pybind11.h>
 
@@ -45,6 +49,7 @@
 
 namespace dpnp::extensions::sparse
 {
+
 namespace mkl_sparse = oneapi::mkl::sparse;
 namespace py         = pybind11;
 namespace type_utils = dpctl::tensor::type_utils;
@@ -63,35 +68,36 @@ using ext::common::init_dispatch_table;
 typedef std::pair<std::uintptr_t, sycl::event> (*gemv_init_fn_ptr_t)(
     sycl::queue &,
     oneapi::mkl::transpose,
-    const char *,          // row_ptr  (typeless)
-    const char *,          // col_ind  (typeless)
-    const char *,          // values   (typeless)
-    std::int64_t,          // num_rows
-    std::int64_t,          // num_cols
-    std::int64_t,          // nnz
+    const char *,   // row_ptr (typeless)
+    const char *,   // col_ind (typeless)
+    const char *,   // values  (typeless)
+    std::int64_t,   // num_rows
+    std::int64_t,   // num_cols
+    std::int64_t,   // nnz
     const std::vector<sycl::event> &);
 
 /**
  * compute_impl: fires sparse::gemv using a pre-built handle.
- * Returns (args_keep_alive_event, gemv_event).
+ * Returns the gemv event directly -- no host_task wrapping.
  */
-typedef std::pair<sycl::event, sycl::event> (*gemv_compute_fn_ptr_t)(
+typedef sycl::event (*gemv_compute_fn_ptr_t)(
     sycl::queue &,
     oneapi::mkl::sparse::matrix_handle_t,
     oneapi::mkl::transpose,
-    double,          // alpha (cast to Tv inside)
-    const char *,    // x (typeless)
-    double,          // beta  (cast to Tv inside)
-    char *,          // y (typeless, writable)
-    std::int64_t,    // num_rows (for output validation)
-    std::int64_t,    // num_cols
+    double,         // alpha (cast to Tv inside)
+    const char *,   // x (typeless)
+    double,         // beta  (cast to Tv inside)
+    char *,         // y (typeless, writable)
     const std::vector<sycl::event> &);
 
+// Init dispatch: 2-D on (Tv, Ti).
 static gemv_init_fn_ptr_t
     gemv_init_dispatch_table[dpctl_td_ns::num_types][dpctl_td_ns::num_types];
 
+// Compute dispatch: 1-D on Tv. The index type is baked into the handle,
+// so compute doesn't need it.
 static gemv_compute_fn_ptr_t
-    gemv_compute_dispatch_table[dpctl_td_ns::num_types][dpctl_td_ns::num_types];
+    gemv_compute_dispatch_table[dpctl_td_ns::num_types];
 
 // ---------------------------------------------------------------------------
 // Per-type init implementation
@@ -99,14 +105,14 @@ static gemv_compute_fn_ptr_t
 
 template <typename Tv, typename Ti>
 static std::pair<std::uintptr_t, sycl::event>
-gemv_init_impl(sycl::queue                    &exec_q,
-               oneapi::mkl::transpose          mkl_trans,
-               const char                     *row_ptr_data,
-               const char                     *col_ind_data,
-               const char                     *values_data,
-               std::int64_t                    num_rows,
-               std::int64_t                    num_cols,
-               std::int64_t                    nnz,
+gemv_init_impl(sycl::queue &exec_q,
+               oneapi::mkl::transpose mkl_trans,
+               const char *row_ptr_data,
+               const char *col_ind_data,
+               const char *values_data,
+               std::int64_t num_rows,
+               std::int64_t num_cols,
+               std::int64_t nnz,
                const std::vector<sycl::event> &depends)
 {
     type_utils::validate_type_for_device<Tv>(exec_q);
@@ -115,9 +121,7 @@ gemv_init_impl(sycl::queue                    &exec_q,
     const Ti *col_ind = reinterpret_cast<const Ti *>(col_ind_data);
     const Tv *values  = reinterpret_cast<const Tv *>(values_data);
 
-    std::stringstream error_msg;
     mkl_sparse::matrix_handle_t spmat = nullptr;
-
     mkl_sparse::init_matrix_handle(&spmat);
 
     auto ev_set = mkl_sparse::set_csr_data(
@@ -153,31 +157,29 @@ gemv_init_impl(sycl::queue                    &exec_q,
 // Per-type compute implementation
 // ---------------------------------------------------------------------------
 
-template <typename Tv, typename Ti>
-static std::pair<sycl::event, sycl::event>
-gemv_compute_impl(sycl::queue                           &exec_q,
-                  mkl_sparse::matrix_handle_t            spmat,
-                  oneapi::mkl::transpose                 mkl_trans,
-                  double                                 alpha_d,
-                  const char                            *x_data,
-                  double                                 beta_d,
-                  char                                  *y_data,
-                  std::int64_t                           num_rows,
-                  std::int64_t                           /* num_cols */,
-                  const std::vector<sycl::event>        &depends)
+template <typename Tv>
+static sycl::event
+gemv_compute_impl(sycl::queue &exec_q,
+                  mkl_sparse::matrix_handle_t spmat,
+                  oneapi::mkl::transpose mkl_trans,
+                  double alpha_d,
+                  const char *x_data,
+                  double beta_d,
+                  char *y_data,
+                  const std::vector<sycl::event> &depends)
 {
-    // Scalars: for complex Tv we construct the complex scalar from the real part.
-    // alpha=1, beta=0 are the common solver values so precision loss is academic,
-    // but we keep the cast path consistent for generality.
+    // For complex Tv the single-arg constructor sets imag to zero.
+    // Solvers use alpha=1, beta=0 so this is exact; other callers
+    // passing complex scalars via this path will lose the imag
+    // component silently.
     const Tv alpha = static_cast<Tv>(alpha_d);
     const Tv beta  = static_cast<Tv>(beta_d);
 
     const Tv *x = reinterpret_cast<const Tv *>(x_data);
-    Tv       *y = reinterpret_cast<Tv *>(y_data);
+    Tv *y       = reinterpret_cast<Tv *>(y_data);
 
-    sycl::event gemv_ev;
     try {
-        gemv_ev = mkl_sparse::gemv(
+        return mkl_sparse::gemv(
             exec_q, mkl_trans,
             alpha, spmat,
             x, beta, y,
@@ -189,15 +191,6 @@ gemv_compute_impl(sycl::queue                           &exec_q,
         throw std::runtime_error(
             std::string("sparse_gemv_compute: SYCL exception: ") + e.what());
     }
-
-    // Keep x and y alive until the event completes.
-    // (row_ptr/col_ind/values are kept alive by the handle itself.)
-    sycl::event args_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(gemv_ev);
-        cgh.host_task([x, y]() { (void)x; (void)y; });
-    });
-
-    return {args_ev, gemv_ev};
 }
 
 // ---------------------------------------------------------------------------
@@ -217,16 +210,16 @@ decode_trans(const int trans)
     }
 }
 
-std::pair<std::uintptr_t, sycl::event>
-sparse_gemv_init(sycl::queue                           &exec_q,
-                 const int                              trans,
-                 const dpctl::tensor::usm_ndarray      &row_ptr,
-                 const dpctl::tensor::usm_ndarray      &col_ind,
-                 const dpctl::tensor::usm_ndarray      &values,
-                 const std::int64_t                     num_rows,
-                 const std::int64_t                     num_cols,
-                 const std::int64_t                     nnz,
-                 const std::vector<sycl::event>        &depends)
+std::tuple<std::uintptr_t, int, sycl::event>
+sparse_gemv_init(sycl::queue &exec_q,
+                 const int trans,
+                 const dpctl::tensor::usm_ndarray &row_ptr,
+                 const dpctl::tensor::usm_ndarray &col_ind,
+                 const dpctl::tensor::usm_ndarray &values,
+                 const std::int64_t num_rows,
+                 const std::int64_t num_cols,
+                 const std::int64_t nnz,
+                 const std::vector<sycl::event> &depends)
 {
     if (!dpctl::utils::queues_are_compatible(
             exec_q, {row_ptr.get_queue(), col_ind.get_queue(),
@@ -235,6 +228,25 @@ sparse_gemv_init(sycl::queue                           &exec_q,
             "sparse_gemv_init: USM allocations are not compatible with the "
             "execution queue.");
 
+    // Basic CSR shape sanity.
+    if (row_ptr.get_ndim() != 1 || col_ind.get_ndim() != 1 ||
+        values.get_ndim() != 1)
+        throw py::value_error(
+            "sparse_gemv_init: row_ptr, col_ind, values must all be 1-D.");
+
+    if (row_ptr.get_shape(0) != num_rows + 1)
+        throw py::value_error(
+            "sparse_gemv_init: row_ptr length must equal num_rows + 1.");
+
+    if (col_ind.get_shape(0) != nnz || values.get_shape(0) != nnz)
+        throw py::value_error(
+            "sparse_gemv_init: col_ind and values length must equal nnz.");
+
+    // Index types of row_ptr and col_ind must match.
+    if (row_ptr.get_typenum() != col_ind.get_typenum())
+        throw py::value_error(
+            "sparse_gemv_init: row_ptr and col_ind must have the same dtype.");
+
     auto mkl_trans = decode_trans(trans);
 
     auto array_types = dpctl_td_ns::usm_ndarray_types();
@@ -248,22 +260,26 @@ sparse_gemv_init(sycl::queue                           &exec_q,
             "dtype combination. Supported: {float32,float64,complex64,"
             "complex128} x {int32,int64}.");
 
-    return init_fn(exec_q, mkl_trans,
-                   row_ptr.get_data(), col_ind.get_data(), values.get_data(),
-                   num_rows, num_cols, nnz, depends);
+    auto [handle_ptr, ev_opt] = init_fn(
+        exec_q, mkl_trans,
+        row_ptr.get_data(), col_ind.get_data(), values.get_data(),
+        num_rows, num_cols, nnz, depends);
+
+    return {handle_ptr, val_id, ev_opt};
 }
 
-std::pair<sycl::event, sycl::event>
-sparse_gemv_compute(sycl::queue                           &exec_q,
-                    const std::uintptr_t                   handle_ptr,
-                    const int                              trans,
-                    const double                           alpha,
-                    const dpctl::tensor::usm_ndarray      &x,
-                    const double                           beta,
-                    const dpctl::tensor::usm_ndarray      &y,
-                    const std::int64_t                     num_rows,
-                    const std::int64_t                     num_cols,
-                    const std::vector<sycl::event>        &depends)
+sycl::event
+sparse_gemv_compute(sycl::queue &exec_q,
+                    const std::uintptr_t handle_ptr,
+                    const int val_type_id,
+                    const int trans,
+                    const double alpha,
+                    const dpctl::tensor::usm_ndarray &x,
+                    const double beta,
+                    const dpctl::tensor::usm_ndarray &y,
+                    const std::int64_t num_rows,
+                    const std::int64_t num_cols,
+                    const std::vector<sycl::event> &depends)
 {
     if (x.get_ndim() != 1)
         throw py::value_error("sparse_gemv_compute: x must be a 1-D array.");
@@ -282,49 +298,79 @@ sparse_gemv_compute(sycl::queue                           &exec_q,
             "sparse_gemv_compute: x and y are overlapping memory segments.");
 
     dpctl::tensor::validation::CheckWritable::throw_if_not_writable(y);
-    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(
-        y, static_cast<std::size_t>(num_rows));
 
+    // Shape validation: op(A) is (num_rows, num_cols) for trans=N,
+    // (num_cols, num_rows) for trans={T,C}.
     auto mkl_trans = decode_trans(trans);
-    auto spmat     = reinterpret_cast<mkl_sparse::matrix_handle_t>(handle_ptr);
+    const bool is_non_trans =
+        (mkl_trans == oneapi::mkl::transpose::nontrans);
+    const std::int64_t op_rows = is_non_trans ? num_rows : num_cols;
+    const std::int64_t op_cols = is_non_trans ? num_cols : num_rows;
 
-    // Dispatch on value type (x and y must match; index type is encoded in
-    // the handle from init -- we only need Tv here).
+    if (x.get_shape(0) != op_cols)
+        throw py::value_error(
+            "sparse_gemv_compute: x length does not match operator columns.");
+    if (y.get_shape(0) != op_rows)
+        throw py::value_error(
+            "sparse_gemv_compute: y length does not match operator rows.");
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(
+        y, static_cast<std::size_t>(op_rows));
+
+    // Dtype verification: x, y, and the handle's value type must all match.
     auto array_types = dpctl_td_ns::usm_ndarray_types();
-    const int val_id = array_types.typenum_to_lookup_id(x.get_typenum());
-    const int idx_id = array_types.typenum_to_lookup_id(y.get_typenum());
+    const int x_val_id = array_types.typenum_to_lookup_id(x.get_typenum());
+    const int y_val_id = array_types.typenum_to_lookup_id(y.get_typenum());
+
+    if (x_val_id != val_type_id || y_val_id != val_type_id)
+        throw py::value_error(
+            "sparse_gemv_compute: x and y dtype must match the value dtype "
+            "of the sparse matrix used to build the handle.");
+
+    if (val_type_id < 0 || val_type_id >= dpctl_td_ns::num_types)
+        throw py::value_error(
+            "sparse_gemv_compute: val_type_id out of range.");
 
-    // For compute we only need Tv; re-use the same dispatch table using the
-    // val_id from x and idx_id from y (both are val type so idx_id == val_id
-    // is fine -- the factory only cares about Tv for the gemv call).
     gemv_compute_fn_ptr_t compute_fn =
-        gemv_compute_dispatch_table[val_id][val_id];
+        gemv_compute_dispatch_table[val_type_id];
+
     if (compute_fn == nullptr)
         throw py::value_error(
             "sparse_gemv_compute: unsupported value dtype.");
 
+    auto spmat = reinterpret_cast<mkl_sparse::matrix_handle_t>(handle_ptr);
+
     return compute_fn(exec_q, spmat, mkl_trans, alpha,
-                      x.get_data(), beta, const_cast<char *>(y.get_data()),
-                      num_rows, num_cols, depends);
+                      x.get_data(), beta,
+                      const_cast<char *>(y.get_data()),
+                      depends);
 }
 
 sycl::event
-sparse_gemv_release(sycl::queue                     &exec_q,
-                    const std::uintptr_t             handle_ptr,
-                    const std::vector<sycl::event>  &depends)
+sparse_gemv_release(sycl::queue &exec_q,
+                    const std::uintptr_t handle_ptr,
+                    const std::vector<sycl::event> &depends)
 {
     auto spmat = reinterpret_cast<mkl_sparse::matrix_handle_t>(handle_ptr);
-    mkl_sparse::release_matrix_handle(exec_q, &spmat, depends);
-    // release_matrix_handle is synchronous in the current oneMKL API;
-    // return a no-op event for API uniformity.
-    return exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-        cgh.host_task([]() {});
-    });
+
+    // release_matrix_handle takes `depends` so it will not free the handle
+    // until all pending compute work on it has completed. In recent oneMKL
+    // versions release_matrix_handle returns a sycl::event; older versions
+    // returned void. If your pinned oneMKL returns void, replace the body
+    // with:
+    //     mkl_sparse::release_matrix_handle(exec_q, &spmat, depends);
+    //     return exec_q.submit([&](sycl::handler &cgh) {
+    //         cgh.depends_on(depends);
+    //         cgh.host_task([]() {});
+    //     });
+    sycl::event release_ev =
+        mkl_sparse::release_matrix_handle(exec_q, &spmat, depends);
+
+    return release_ev;
 }
 
 // ---------------------------------------------------------------------------
-// Dispatch table factory and registration
+// Dispatch table factories and registration
 // ---------------------------------------------------------------------------
 
 template <typename fnT, typename Tv, typename Ti>
@@ -332,31 +378,39 @@ struct GemvInitContigFactory
 {
     fnT get()
     {
-        if constexpr (types::SparseGemvTypePairSupportFactory<Tv, Ti>::is_defined)
+        if constexpr (types::SparseGemvInitTypePairSupportFactory<Tv, Ti>::is_defined)
             return gemv_init_impl<Tv, Ti>;
         else
             return nullptr;
     }
 };
 
-template <typename fnT, typename Tv, typename Ti>
+template <typename fnT, typename Tv>
 struct GemvComputeContigFactory
 {
     fnT get()
     {
-        if constexpr (types::SparseGemvTypePairSupportFactory<Tv, Ti>::is_defined)
-            return gemv_compute_impl<Tv, Ti>;
+        if constexpr (types::SparseGemvComputeTypeSupportFactory<Tv>::is_defined)
+            return gemv_compute_impl<Tv>;
         else
             return nullptr;
     }
 };
 
-void init_sparse_gemv_dispatch_table(void)
+void init_sparse_gemv_dispatch_tables(void)
 {
-    init_dispatch_table<gemv_init_fn_ptr_t,    GemvInitContigFactory>(
+    // 2-D table on (Tv, Ti) for init.
+    init_dispatch_table<gemv_init_fn_ptr_t, GemvInitContigFactory>(
         gemv_init_dispatch_table);
-    init_dispatch_table<gemv_compute_fn_ptr_t, GemvComputeContigFactory>(
-        gemv_compute_dispatch_table);
+
+    // 1-D table on Tv for compute. dpctl's type dispatch headers expose
+    // DispatchVectorBuilder as the 1-D analogue of DispatchTableBuilder.
+    dpctl_td_ns::DispatchVectorBuilder
+        gemv_compute_fn_ptr_t,
+        GemvComputeContigFactory,
+        dpctl_td_ns::num_types>
+        builder;
+    builder.populate_dispatch_vector(gemv_compute_dispatch_table);
 }
 
 } // namespace dpnp::extensions::sparse
diff --git a/dpnp/backend/extensions/sparse/gemv.hpp b/dpnp/backend/extensions/sparse/gemv.hpp
index c5b57305f3f9..07f5aced7c49 100644
--- a/dpnp/backend/extensions/sparse/gemv.hpp
+++ b/dpnp/backend/extensions/sparse/gemv.hpp
@@ -28,6 +28,10 @@
 
 #pragma once
 
+#include <cstdint>
+#include <tuple>
+#include <vector>
+
 #include <oneapi/mkl.hpp>
 #include <sycl/sycl.hpp>
 
@@ -40,54 +44,89 @@ namespace dpnp::extensions::sparse
  * sparse_gemv_init -- ONE-TIME setup per sparse matrix operator.
  *
  * Calls init_matrix_handle + set_csr_data + optimize_gemv.
- * Returns the opaque matrix_handle_t cast to uintptr_t for safe
- * Python round-tripping, plus the dependency event from optimize_gemv
- * (caller must wait on it before calling sparse_gemv_compute).
  *
- * Lifetime: the handle owns NO data copies; all CSR arrays must remain
- * alive (in USM) until sparse_gemv_release is called.
+ * Returns a tuple of:
+ *   - handle_ptr:   opaque matrix_handle_t cast to uintptr_t for safe
+ *                   Python round-tripping.
+ *   - val_type_id:  the dpctl typenum lookup id of the value dtype Tv.
+ *                   Python MUST pass this back to sparse_gemv_compute so
+ *                   the C++ layer can verify that x and y dtype match the
+ *                   handle's value type.
+ *   - event:        dependency event from optimize_gemv; the caller must
+ *                   wait on it (or chain via depends) before the first
+ *                   sparse_gemv_compute call.
+ *
+ * LIFETIME CONTRACT -- IMPORTANT:
+ * The handle owns NO copies of the CSR arrays. The caller MUST keep
+ * row_ptr, col_ind, and values USM allocations alive until
+ * sparse_gemv_release has been called AND its returned event has
+ * completed. Dropping any of them earlier is undefined behavior and
+ * will produce silent memory corruption -- there is no runtime check.
+ *
+ * The Python wrapper (_CachedSpMV) enforces this contract by holding
+ * a reference to the CSR matrix for the lifetime of the handle.
  */
-extern std::pair<std::uintptr_t, sycl::event>
-sparse_gemv_init(sycl::queue                           &exec_q,
-                 const int                              trans,
-                 const dpctl::tensor::usm_ndarray      &row_ptr,
-                 const dpctl::tensor::usm_ndarray      &col_ind,
-                 const dpctl::tensor::usm_ndarray      &values,
-                 const std::int64_t                     num_rows,
-                 const std::int64_t                     num_cols,
-                 const std::int64_t                     nnz,
-                 const std::vector<sycl::event>        &depends);
+extern std::tuple<std::uintptr_t, int, sycl::event>
+sparse_gemv_init(sycl::queue &exec_q,
+                 const int trans,
+                 const dpctl::tensor::usm_ndarray &row_ptr,
+                 const dpctl::tensor::usm_ndarray &col_ind,
+                 const dpctl::tensor::usm_ndarray &values,
+                 const std::int64_t num_rows,
+                 const std::int64_t num_cols,
+                 const std::int64_t nnz,
+                 const std::vector<sycl::event> &depends);
 
 /**
  * sparse_gemv_compute -- PER-ITERATION SpMV.
  *
  * Calls only oneapi::mkl::sparse::gemv using the pre-built handle.
+ * Verifies that:
+ *   - x and y are 1-D usm_ndarrays on a queue compatible with exec_q
+ *   - x and y dtype match val_type_id (the handle's value type)
+ *   - x and y shapes match op(A) dimensions, taking trans into account
+ *     (op(A) is num_rows x num_cols for trans=N, num_cols x num_rows
+ *     for trans={T,C})
+ *   - y is writable and does not overlap x
+ *
  * alpha and beta are passed as double and cast inside gemv_compute_impl
- * to the matrix value type.
+ * to the matrix value type. For complex Tv the cast drops the imaginary
+ * part; callers needing complex scalars should keep alpha=1, beta=0
+ * (the solver use case).
+ *
+ * Returns the gemv event. The caller is responsible for sequencing
+ * subsequent work on the same queue; no host-side wait or host_task
+ * keep-alive is performed.
  */
-extern std::pair<sycl::event, sycl::event>
-sparse_gemv_compute(sycl::queue                           &exec_q,
-                    const std::uintptr_t                   handle_ptr,
-                    const int                              trans,
-                    const double                           alpha,
-                    const dpctl::tensor::usm_ndarray      &x,
-                    const double                           beta,
-                    const dpctl::tensor::usm_ndarray      &y,
-                    const std::int64_t                     num_rows,
-                    const std::int64_t                     num_cols,
-                    const std::vector<sycl::event>        &depends);
+extern sycl::event
+sparse_gemv_compute(sycl::queue &exec_q,
+                    const std::uintptr_t handle_ptr,
+                    const int val_type_id,
+                    const int trans,
+                    const double alpha,
+                    const dpctl::tensor::usm_ndarray &x,
+                    const double beta,
+                    const dpctl::tensor::usm_ndarray &y,
+                    const std::int64_t num_rows,
+                    const std::int64_t num_cols,
+                    const std::vector<sycl::event> &depends);
 
 /**
  * sparse_gemv_release -- free the matrix_handle created by sparse_gemv_init.
  *
- * Must be called exactly once per handle, after all compute calls
- * that depend on it have completed.
+ * Must be called exactly once per handle, after all compute calls that
+ * depend on it have completed. The returned event depends on the release,
+ * so the caller can chain CSR buffer deallocation on it safely.
  */
 extern sycl::event
-sparse_gemv_release(sycl::queue                     &exec_q,
-                    const std::uintptr_t             handle_ptr,
-                    const std::vector<sycl::event>  &depends);
+sparse_gemv_release(sycl::queue &exec_q,
+                    const std::uintptr_t handle_ptr,
+                    const std::vector<sycl::event> &depends);
 
-extern void init_sparse_gemv_dispatch_table(void);
+/**
+ * Register the init (2-D on Tv x Ti) and compute (1-D on Tv) dispatch
+ * tables. Called exactly once from PYBIND11_MODULE.
+ */
+extern void init_sparse_gemv_dispatch_tables(void);
 
 } // namespace dpnp::extensions::sparse
diff --git a/dpnp/backend/extensions/sparse/sparse_py.cpp b/dpnp/backend/extensions/sparse/sparse_py.cpp
index 0efd1d0da9ae..9b3dc16d3b01 100644
--- a/dpnp/backend/extensions/sparse/sparse_py.cpp
+++ b/dpnp/backend/extensions/sparse/sparse_py.cpp
@@ -26,29 +26,35 @@
 // POSSIBILITY OF SUCH DAMAGE.
 //*****************************************************************************
 
+#include <cstdint>
+#include <tuple>
+#include <vector>
+
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 
 #include <sycl/sycl.hpp>
+
 #include <dpctl4pybind11.hpp>
 
 #include "gemv.hpp"
 
 namespace py = pybind11;
 
-using dpnp::extensions::sparse::init_sparse_gemv_dispatch_table;
-using dpnp::extensions::sparse::sparse_gemv_init;
+using dpnp::extensions::sparse::init_sparse_gemv_dispatch_tables;
 using dpnp::extensions::sparse::sparse_gemv_compute;
+using dpnp::extensions::sparse::sparse_gemv_init;
 using dpnp::extensions::sparse::sparse_gemv_release;
 
 PYBIND11_MODULE(_sparse_impl, m)
 {
-    init_sparse_gemv_dispatch_table();
+    init_sparse_gemv_dispatch_tables();
 
     // ------------------------------------------------------------------
     // _using_onemath()
-    // Reports whether the module was compiled against the portable OneMath
-    // interface (USE_ONEMATH) rather than direct oneMKL.
+    //
+    // Reports whether the module was compiled against the portable
+    // OneMath interface (USE_ONEMATH) rather than direct oneMKL.
     // ------------------------------------------------------------------
     m.def("_using_onemath", []() -> bool {
 #ifdef USE_ONEMATH
@@ -61,24 +67,32 @@ PYBIND11_MODULE(_sparse_impl, m)
     // ------------------------------------------------------------------
     // _sparse_gemv_init(exec_q, trans, row_ptr, col_ind, values,
     //                   num_rows, num_cols, nnz, depends)
-    //     -> (handle: int, event)
+    //     -> (handle: int, val_type_id: int, event)
     //
     // Calls init_matrix_handle + set_csr_data + optimize_gemv ONCE.
-    // The returned handle is an opaque uintptr_t; pass it back to
-    // _sparse_gemv_compute and _sparse_gemv_release.
+    //
+    // The returned handle is an opaque uintptr_t; val_type_id is the
+    // dpctl typenum lookup id of the matrix value dtype and MUST be
+    // passed back to _sparse_gemv_compute so the C++ layer can verify
+    // that x and y dtype match the handle.
+    //
+    // LIFETIME CONTRACT: the caller must keep row_ptr / col_ind / values
+    // USM allocations alive until _sparse_gemv_release has been called
+    // AND its returned event has completed. The handle does not copy
+    // the CSR arrays.
     // ------------------------------------------------------------------
     m.def(
         "_sparse_gemv_init",
-        [](sycl::queue                                &exec_q,
-           const int                                   trans,
-           const dpctl::tensor::usm_ndarray           &row_ptr,
-           const dpctl::tensor::usm_ndarray           &col_ind,
-           const dpctl::tensor::usm_ndarray           &values,
-           const std::int64_t                          num_rows,
-           const std::int64_t                          num_cols,
-           const std::int64_t                          nnz,
-           const std::vector<sycl::event>             &depends)
-            -> std::pair<std::uintptr_t, sycl::event>
+        [](sycl::queue &exec_q,
+           const int trans,
+           const dpctl::tensor::usm_ndarray &row_ptr,
+           const dpctl::tensor::usm_ndarray &col_ind,
+           const dpctl::tensor::usm_ndarray &values,
+           const std::int64_t num_rows,
+           const std::int64_t num_cols,
+           const std::int64_t nnz,
+           const std::vector<sycl::event> &depends)
+            -> std::tuple<std::uintptr_t, int, sycl::event>
         {
             return sparse_gemv_init(
                 exec_q, trans,
@@ -95,40 +109,50 @@ PYBIND11_MODULE(_sparse_impl, m)
         py::arg("num_cols"),
         py::arg("nnz"),
         py::arg("depends"),
-        "Initialise oneMKL sparse matrix handle (set_csr_data + optimize_gemv). "
-        "Returns (handle_ptr: int, event). Call once per operator."
+        "Initialise oneMKL sparse matrix handle "
+        "(set_csr_data + optimize_gemv). "
+        "Returns (handle_ptr: int, val_type_id: int, event). "
+        "Call once per operator."
     );
 
     // ------------------------------------------------------------------
-    // _sparse_gemv_compute(exec_q, handle, trans, alpha, x, beta, y,
-    //                      num_rows, num_cols, depends)
-    //     -> (args_event, gemv_event)
+    // _sparse_gemv_compute(exec_q, handle, val_type_id, trans, alpha,
+    //                     x, beta, y, num_rows, num_cols, depends)
+    //     -> gemv_event
     //
-    // Fires sparse::gemv using the pre-built handle.
-    // Only the cheap kernel is dispatched; no analysis overhead.
+    // Fires sparse::gemv using a pre-built handle. Verifies x and y
+    // dtype match val_type_id from init, and that shapes agree with
+    // op(A) dimensions (swapped for trans != N).
+    //
+    // Only the cheap MKL kernel is dispatched; no analysis overhead.
+    // No host_task keep-alive is submitted -- pybind11 refcounts the
+    // usm_ndarrays across the call, and sequencing of subsequent work
+    // on the same queue happens automatically.
     // ------------------------------------------------------------------
     m.def(
         "_sparse_gemv_compute",
-        [](sycl::queue                                &exec_q,
-           const std::uintptr_t                        handle_ptr,
-           const int                                   trans,
-           const double                                alpha,
-           const dpctl::tensor::usm_ndarray           &x,
-           const double                                beta,
-           const dpctl::tensor::usm_ndarray           &y,
-           const std::int64_t                          num_rows,
-           const std::int64_t                          num_cols,
-           const std::vector<sycl::event>             &depends)
-            -> std::pair<sycl::event, sycl::event>
+        [](sycl::queue &exec_q,
+           const std::uintptr_t handle_ptr,
+           const int val_type_id,
+           const int trans,
+           const double alpha,
+           const dpctl::tensor::usm_ndarray &x,
+           const double beta,
+           const dpctl::tensor::usm_ndarray &y,
+           const std::int64_t num_rows,
+           const std::int64_t num_cols,
+           const std::vector<sycl::event> &depends)
+            -> sycl::event
         {
             return sparse_gemv_compute(
-                exec_q, handle_ptr, trans, alpha,
+                exec_q, handle_ptr, val_type_id, trans, alpha,
                 x, beta, y,
                 num_rows, num_cols,
                 depends);
         },
         py::arg("exec_q"),
         py::arg("handle"),
+        py::arg("val_type_id"),
         py::arg("trans"),
         py::arg("alpha"),
         py::arg("x"),
@@ -138,7 +162,7 @@ PYBIND11_MODULE(_sparse_impl, m)
         py::arg("num_cols"),
         py::arg("depends"),
         "Execute sparse::gemv using a pre-built handle. "
-        "Returns (args_event, gemv_event)."
+        "Returns the gemv event."
     );
 
     // ------------------------------------------------------------------
@@ -146,13 +170,14 @@ PYBIND11_MODULE(_sparse_impl, m)
     //
     // Releases the matrix_handle allocated by _sparse_gemv_init.
     // Must be called exactly once per handle after all compute calls
-    // referencing it are complete.
+    // referencing it have completed. The returned event depends on the
+    // release, so callers can chain CSR buffer deallocation on it.
     // ------------------------------------------------------------------
     m.def(
         "_sparse_gemv_release",
-        [](sycl::queue                                &exec_q,
-           const std::uintptr_t                        handle_ptr,
-           const std::vector<sycl::event>             &depends)
+        [](sycl::queue &exec_q,
+           const std::uintptr_t handle_ptr,
+           const std::vector<sycl::event> &depends)
             -> sycl::event
         {
             return sparse_gemv_release(exec_q, handle_ptr, depends);
diff --git a/dpnp/backend/extensions/sparse/types_matrix.hpp b/dpnp/backend/extensions/sparse/types_matrix.hpp
index 948d2fbd3c40..a2b7d16fe3f9 100644
--- a/dpnp/backend/extensions/sparse/types_matrix.hpp
+++ b/dpnp/backend/extensions/sparse/types_matrix.hpp
@@ -43,43 +43,77 @@ namespace dpnp::extensions::sparse::types
 
 /**
  * @brief Factory encoding the supported (value type, index type) combinations
- * for oneapi::mkl::sparse::gemv.
+ *        for oneapi::mkl::sparse::gemv initialization.
  *
  * oneMKL sparse BLAS supports:
- *   - float32              with int32 indices
- *   - float32              with int64 indices
- *   - float64              with int32 indices
- *   - float64              with int64 indices
- *   - complex<float>  (c64) with int32 indices
- *   - complex<float>  (c64) with int64 indices
- *   - complex<double> (c128) with int32 indices
- *   - complex<double> (c128) with int64 indices
+ *   - float32          with int32 indices
+ *   - float32          with int64 indices
+ *   - float64          with int32 indices
+ *   - float64          with int64 indices
+ *   - complex<float>   with int32 indices
+ *   - complex<float>   with int64 indices
+ *   - complex<double>  with int32 indices
+ *   - complex<double>  with int64 indices
  *
  * Complex support requires oneMKL >= 2023.x (sparse BLAS complex USM API).
- * The dispatch table entry is non-null only when the pair is registered here;
- * the Python layer falls back to A.dot(x) when the entry is nullptr.
+ * The init dispatch table entry is non-null only when the pair is registered
+ * here; the Python layer falls back to A.dot(x) when the entry is nullptr.
  *
  * @tparam Tv  Value type of the sparse matrix and dense vectors.
  * @tparam Ti  Index type of the sparse matrix (row_ptr / col_ind arrays).
  */
 template <typename Tv, typename Ti>
-struct SparseGemvTypePairSupportFactory
+struct SparseGemvInitTypePairSupportFactory
 {
-    static constexpr bool is_defined = std::disjunction<
+    static constexpr bool is_defined = std::disjunction
         // real single precision
-        dpctl_td_ns::TypePairDefinedEntry<Tv, float,                   Ti, std::int32_t>,
-        dpctl_td_ns::TypePairDefinedEntry<Tv, float,                   Ti, std::int64_t>,
+        dpctl_td_ns::TypePairDefinedEntry<Tv, float, Ti, std::int32_t>,
+        dpctl_td_ns::TypePairDefinedEntry<Tv, float, Ti, std::int64_t>,
         // real double precision
-        dpctl_td_ns::TypePairDefinedEntry<Tv, double,                  Ti, std::int32_t>,
-        dpctl_td_ns::TypePairDefinedEntry<Tv, double,                  Ti, std::int64_t>,
+        dpctl_td_ns::TypePairDefinedEntry<Tv, double, Ti, std::int32_t>,
+        dpctl_td_ns::TypePairDefinedEntry<Tv, double, Ti, std::int64_t>,
         // complex single precision
-        dpctl_td_ns::TypePairDefinedEntry<Tv, std::complex<float>,     Ti, std::int32_t>,
-        dpctl_td_ns::TypePairDefinedEntry<Tv, std::complex<float>,     Ti, std::int64_t>,
+        dpctl_td_ns::TypePairDefinedEntry<Tv, std::complex<float>, Ti, std::int32_t>,
+        dpctl_td_ns::TypePairDefinedEntry<Tv, std::complex<float>, Ti, std::int64_t>,
         // complex double precision
-        dpctl_td_ns::TypePairDefinedEntry<Tv, std::complex<double>,    Ti, std::int32_t>,
-        dpctl_td_ns::TypePairDefinedEntry<Tv, std::complex<double>,    Ti, std::int64_t>,
+        dpctl_td_ns::TypePairDefinedEntry<Tv, std::complex<double>, Ti, std::int32_t>,
+        dpctl_td_ns::TypePairDefinedEntry<Tv, std::complex<double>, Ti, std::int64_t>,
         // fall-through
         dpctl_td_ns::NotDefinedEntry>::is_defined;
 };
 
+/**
+ * @brief Factory encoding supported value types for sparse::gemv compute.
+ *
+ * The compute path only requires Tv because the index type is baked into
+ * the matrix_handle at init time. Using a 1-D dispatch vector on Tv avoids
+ * the wasted num_types * num_types slots of a 2-D table where only the
+ * diagonal (keyed on Ti) would ever be populated.
+ *
+ * If your pinned dpctl version does not expose TypeDefinedEntry as a 1-arg
+ * entry, fall back to the std::is_same_v expansion shown in the comment
+ * below -- both are equivalent.
+ *
+ * @tparam Tv  Value type of the sparse matrix and dense vectors.
+ */
+template <typename Tv>
+struct SparseGemvComputeTypeSupportFactory
+{
+#if defined(DPCTL_HAS_TYPE_DEFINED_ENTRY)
+    static constexpr bool is_defined = std::disjunction
+        dpctl_td_ns::TypeDefinedEntry<Tv, float>,
+        dpctl_td_ns::TypeDefinedEntry<Tv, double>,
+        dpctl_td_ns::TypeDefinedEntry<Tv, std::complex<float>>,
+        dpctl_td_ns::TypeDefinedEntry<Tv, std::complex<double>>,
+        dpctl_td_ns::NotDefinedEntry>::is_defined;
+#else
+    // Portable fallback: works with any dpctl version.
+    static constexpr bool is_defined =
+        std::is_same_v<Tv, float>               ||
+        std::is_same_v<Tv, double>              ||
+        std::is_same_v<Tv, std::complex<float>> ||
+        std::is_same_v<Tv, std::complex<double>>;
+#endif
+};
+
 } // namespace dpnp::extensions::sparse::types
diff --git a/dpnp/scipy/sparse/linalg/_interface.py b/dpnp/scipy/sparse/linalg/_interface.py
index a90ceec84b07..24fd448de9f6 100644
--- a/dpnp/scipy/sparse/linalg/_interface.py
+++ b/dpnp/scipy/sparse/linalg/_interface.py
@@ -75,11 +75,6 @@ def _get_dtype(operators, dtypes=None):
             dtypes.append(obj.dtype)
     return dpnp.result_type(*dtypes) if dtypes else None
 
-
-# ---------------------------------------------------------------------------
-# LinearOperator base
-# ---------------------------------------------------------------------------
-
 class LinearOperator:
     """Drop-in replacement for cupyx/scipy LinearOperator backed by dpnp arrays.
 
@@ -116,16 +111,14 @@ def __init__(self, dtype, shape):
         self.shape = shape
 
     def _init_dtype(self):
-        """Infer dtype via a trial matvec on an int8 zero vector (SciPy / CuPy strategy)."""
+        """
+        Infer dtype via a trial matvec on a zero vector.
+        """
         if self.dtype is not None:
             return
-        v = dpnp.zeros(self.shape[-1], dtype=dpnp.int8)
+        v = dpnp.zeros(self.shape[-1], dtype=dpnp.float64)
         self.dtype = self.matvec(v).dtype
 
-    # ------------------------------------------------------------------ #
-    #  Abstract primitives — subclasses override at least one             #
-    # ------------------------------------------------------------------ #
-
     def _matvec(self, x):
         return self.matmat(x.reshape(-1, 1))
 
@@ -146,10 +139,6 @@ def _rmatmat(self, X):
             )
         return self.H.matmat(X)
 
-    # ------------------------------------------------------------------ #
-    #  Public multiply methods (shape-checked)                            #
-    # ------------------------------------------------------------------ #
-
     def matvec(self, x):
         M, N = self.shape
         if x.shape not in ((N,), (N, 1)):
@@ -182,10 +171,6 @@ def rmatmat(self, X):
             raise ValueError(f"dimension mismatch: {self.shape!r} vs {X.shape!r}")
         return self._rmatmat(X)
 
-    # ------------------------------------------------------------------ #
-    #  Operator algebra                                                   #
-    # ------------------------------------------------------------------ #
-
     def dot(self, x):
         if isinstance(x, LinearOperator):
             return _ProductLinearOperator(self, x)
@@ -236,10 +221,6 @@ def __neg__(self):
     def __sub__(self, x):
         return self.__add__(-x)
 
-    # ------------------------------------------------------------------ #
-    #  Adjoint / transpose — A.H and A.T both work (SciPy + CuPy parity) #
-    # ------------------------------------------------------------------ #
-
     def _adjoint(self):
         """Return conjugate-transpose operator (override in subclasses)."""
         return _AdjointLinearOperator(self)
@@ -364,7 +345,6 @@ def _matmat(self, X):  return self.args[0].matmat(self.args[1].matmat(X))
     def _rmatmat(self, X): return self.args[1].rmatmat(self.args[0].rmatmat(X))
     def _adjoint(self):    A, B = self.args; return B.H * A.H
 
-
 class _ScaledLinearOperator(LinearOperator):
     def __init__(self, A, alpha):
         super().__init__(_get_dtype([A], [type(alpha)]), A.shape)
@@ -376,7 +356,6 @@ def _matmat(self, X):  return self.args[1] * self.args[0].matmat(X)
     def _rmatmat(self, X): return dpnp.conj(self.args[1]) * self.args[0].rmatmat(X)
     def _adjoint(self):    A, alpha = self.args; return A.H * dpnp.conj(alpha)
 
-
 class _PowerLinearOperator(LinearOperator):
     def __init__(self, A, p):
         if A.shape[0] != A.shape[1]:
@@ -387,7 +366,7 @@ def __init__(self, A, p):
         self.args = (A, int(p))
 
     def _power(self, f, x):
-        res = dpnp.array(x, copy=True)
+        res = x.copy()
         for _ in range(self.args[1]):
             res = f(res)
         return res
@@ -445,24 +424,18 @@ def _rmatmat(self, X): return X
     def _adjoint(self):    return self
     def _transpose(self):  return self
 
-
-# ---------------------------------------------------------------------------
-# aslinearoperator
-# ---------------------------------------------------------------------------
-
 def aslinearoperator(A) -> LinearOperator:
     """Wrap A as a LinearOperator if it is not already one.
 
     Handles (in order):
       1. Already a LinearOperator — returned as-is.
-      2. dpnp.scipy.sparse or scipy.sparse sparse matrix.
-      3. Dense dpnp / numpy ndarray (1-D promoted to column vector).
+      2. dpnp.scipy.sparse sparse matrix.
+      3. Dense 2-D dpnp.ndarray.
       4. Duck-typed objects with .shape and .matvec / @ support.
     """
     if isinstance(A, LinearOperator):
         return A
 
-    # dpnp sparse
     try:
         from dpnp.scipy import sparse as _sp
         if _sp.issparse(A):
@@ -470,31 +443,19 @@ def aslinearoperator(A) -> LinearOperator:
     except (ImportError, AttributeError):
         pass
 
-    # scipy sparse — convert to dense on device
-    try:
-        import scipy.sparse as _ssp
-        if _ssp.issparse(A):
-            return MatrixLinearOperator(dpnp.asarray(A.toarray()))
-    except (ImportError, AttributeError):
-        pass
-
-    # dense ndarray (dpnp or numpy)
-    try:
-        arr = dpnp.asarray(A)
-        if arr.ndim == 1:
-            arr = arr.reshape(-1, 1)   # treat 1-D as column vector
-        if arr.ndim == 2:
-            return MatrixLinearOperator(arr)
-    except Exception:
-        pass
+    if isinstance(A, dpnp.ndarray):
+        if A.ndim != 2:
+            raise ValueError(
+                f"aslinearoperator: dpnp array must be 2-D, got {A.ndim}-D"
+            )
+        return MatrixLinearOperator(A)
 
-    # duck-typed (anything with .shape + matvec or @)
     if hasattr(A, "shape") and len(A.shape) == 2:
-        m, n    = int(A.shape[0]), int(A.shape[1])
-        dtype   = getattr(A, "dtype", None)
-        matvec  = A.matvec  if hasattr(A, "matvec")  else (lambda x: A @ x)
+        m, n = int(A.shape[0]), int(A.shape[1])
+        dtype = getattr(A, "dtype", None)
+        matvec = A.matvec if hasattr(A, "matvec") else (lambda x: A @ x)
         rmatvec = A.rmatvec if hasattr(A, "rmatvec") else None
-        matmat  = A.matmat  if hasattr(A, "matmat")  else None
+        matmat = A.matmat if hasattr(A, "matmat") else None
         rmatmat = A.rmatmat if hasattr(A, "rmatmat") else None
         return LinearOperator(
             (m, n),
@@ -505,4 +466,7 @@ def aslinearoperator(A) -> LinearOperator:
             rmatmat=rmatmat,
         )
 
-    raise TypeError(f"Cannot convert object of type {type(A)!r} to a LinearOperator")
+    raise TypeError(
+        f"Cannot convert object of type {type(A)!r} to a LinearOperator. "
+        "Expected a LinearOperator, dpnp sparse matrix, or 2-D dpnp.ndarray."
+    )
diff --git a/dpnp/scipy/sparse/linalg/_iterative.py b/dpnp/scipy/sparse/linalg/_iterative.py
index 555c4fa35ad2..cce0fe5fb3ab 100644
--- a/dpnp/scipy/sparse/linalg/_iterative.py
+++ b/dpnp/scipy/sparse/linalg/_iterative.py
@@ -103,33 +103,42 @@ def _check_dtype(dtype, name: str) -> None:
             "only float32, float64, complex64, complex128 are accepted."
         )
 
-
 class _CachedSpMV:
-    """Wrap a CSR matrix with a persistent oneMKL matrix_handle.
+    """
+    Wrap a CSR matrix with a persistent oneMKL matrix_handle.
 
     The handle is initialised (set_csr_data + optimize_gemv) exactly once
-    in __init__.  Subsequent calls to __call__ only invoke sparse::gemv,
-    paying no analysis overhead.  The handle is released in __del__.
+    in __init__. Subsequent calls to __call__ only invoke sparse::gemv,
+    paying no analysis overhead. The handle is released in __del__.
+
+    Only trans=0 (non-transposed) is exposed, the adjoint path uses a
+    separate _CachedSpMV built against trans=2.
 
     Parameters
     ----------
-    A      : dpnp CSR sparse matrix
-    trans  : int  0=N, 1=T, 2=C  (fixed at construction)
+    A : dpnp CSR sparse matrix
+    trans : int 0=N, 1=T, 2=C (fixed at construction)
     """
 
     __slots__ = ("_A", "_exec_q", "_handle", "_trans",
-                 "_nrows", "_ncols", "_nnz")
+             "_nrows", "_ncols", "_nnz", "_out_size", "_dtype",
+             "_val_type_id")
 
     def __init__(self, A, trans: int = 0):
-        self._A      = A          # keep alive so USM pointers stay valid
-        self._trans  = int(trans)
-        self._nrows  = int(A.shape[0])
-        self._ncols  = int(A.shape[1])
-        self._nnz    = int(A.data.shape[0])
+        self._A = A  # keep alive so USM pointers stay valid
+        self._trans = int(trans)
+        self._nrows = int(A.shape[0])
+        self._ncols = int(A.shape[1])
+        self._nnz = int(A.data.shape[0])
         self._exec_q = A.data.sycl_queue
+        self._dtype = A.data.dtype
+        # Output length depends on transpose mode.
+        self._out_size = self._nrows if self._trans == 0 else self._ncols
         self._handle = None
 
-        # init_matrix_handle + set_csr_data + optimize_gemv (once)
+        # init_matrix_handle + set_csr_data + optimize_gemv (once).
+        # We must wait on optimize_gemv before any compute call can run;
+        # this is the only place __init__/__call__ blocks.
         handle, ev = _si._sparse_gemv_init(
             self._exec_q,
             self._trans,
@@ -145,10 +154,13 @@ def __init__(self, A, trans: int = 0):
         self._handle = handle
 
     def __call__(self, x: _dpnp.ndarray) -> _dpnp.ndarray:
-        """y = op(A) * x  --  only sparse::gemv fires."""
-        y = _dpnp.zeros(self._nrows, dtype=self._A.data.dtype,
+        """y = op(A) * x -- only sparse::gemv fires, fully async."""
+        y = _dpnp.empty(self._out_size, dtype=self._dtype,
                         sycl_queue=self._exec_q)
-        _, ev = _si._sparse_gemv_compute(
+        # Do NOT wait on the event -- subsequent dpnp ops on the same
+        # queue will serialize behind it automatically. Blocking here
+        # throws away async overlap and dominates small-problem runtime.
+        _si._sparse_gemv_compute(
             self._exec_q,
             self._handle,
             self._trans,
@@ -160,7 +172,6 @@ def __call__(self, x: _dpnp.ndarray) -> _dpnp.ndarray:
             self._ncols,
             [],
         )
-        ev.wait()
         return y
 
     def __del__(self):
@@ -169,14 +180,34 @@ def __del__(self):
                 _si._sparse_gemv_release(self._exec_q, self._handle, [])
             except Exception:
                 pass
-            self._handle = None
+        self._handle = None
 
+class _CachedSpMVPair:
+    """Holds forward and (lazily built) adjoint cached SpMV handles."""
+    __slots__ = ("forward", "_A", "_adjoint")
+
+    def __init__(self, A):
+        self.forward = _CachedSpMV(A, trans=0)
+        self._A = A
+        self._adjoint = None
+
+    def matvec(self, x):
+        return self.forward(x)
+
+    def rmatvec(self, x):
+        if self._adjoint is None:
+            # Build conjtrans handle on first use. For real dtypes
+            # this is equivalent to trans=1.
+            is_cpx = _dpnp.issubdtype(self._A.data.dtype,
+                                      _dpnp.complexfloating)
+            self._adjoint = _CachedSpMV(self._A, trans=2 if is_cpx else 1)
+        return self._adjoint(x)
 
 def _make_fast_matvec(A):
-    """Return a _CachedSpMV if A is a CSR matrix with oneMKL support,
-    a plain lambda fallback, or None if A is not sparse.
+    """Return a _CachedSpMVPair if A is a CSR matrix with oneMKL support,
+    or None if A is not an eligible sparse matrix.
 
-    Falls back gracefully on:
+    Falls back to None (caller uses A.dot) on:
       - missing _sparse_impl extension
       - dtype not supported by the C++ dispatch table
       - any other C++ exception during handle initialisation
@@ -189,30 +220,42 @@ def _make_fast_matvec(A):
         return None
 
     if not _HAS_SPARSE_IMPL:
-        return lambda x: A.dot(x)
+        return None
+
+    # Only build the cached handle for supported dtypes.
+    if _np_dtype(A.data.dtype).char not in _SUPPORTED_DTYPES:
+        return None
 
-    # Try to build the cached handle; fall back to dot() on any error
-    # (e.g. complex dtype not yet in the dispatch table on older builds).
     try:
-        spmv = _CachedSpMV(A, trans=0)
-        return spmv
+        return _CachedSpMVPair(A)
     except Exception:
-        return lambda x: A.dot(x)
-
+        return None        
 
 def _make_system(A, M, x0, b):
     """Validate and prepare (A_op, M_op, x, b, dtype) on device.
 
+    dpnp-only policy: b, x0, and any dense operator inputs must already
+    be dpnp arrays. No host->device promotion happens here.
+
     dtype promotion follows CuPy v14 rules: A.dtype is used when it is in
     {f,d,F,D}; otherwise b.dtype is promoted to float64 (real) or
-    complex128 (complex).  Preconditioners are always accepted and validated.
+    complex128 (complex).
     """
+    if not isinstance(b, _dpnp.ndarray):
+        raise TypeError(
+            f"b must be a dpnp.ndarray, got {type(b).__name__}"
+        )
+    if x0 is not None and not isinstance(x0, _dpnp.ndarray):
+        raise TypeError(
+            f"x0 must be a dpnp.ndarray or None, got {type(x0).__name__}"
+        )
+
     A_op = aslinearoperator(A)
     if A_op.shape[0] != A_op.shape[1]:
         raise ValueError("A must be a square operator")
     n = A_op.shape[0]
 
-    b = _dpnp.asarray(b).reshape(-1)
+    b = b.reshape(-1)
     if b.shape[0] != n:
         raise ValueError(
             f"b length {b.shape[0]} does not match operator dimension {n}"
@@ -230,9 +273,9 @@ def _make_system(A, M, x0, b):
     _check_dtype(b.dtype, "b")
 
     if x0 is None:
-        x = _dpnp.zeros(n, dtype=dtype)
+        x = _dpnp.zeros(n, dtype=dtype, sycl_queue=b.sycl_queue)
     else:
-        x = _dpnp.asarray(x0, dtype=dtype).reshape(-1)
+        x = x0.astype(dtype, copy=True).reshape(-1)
         if x.shape[0] != n:
             raise ValueError(f"x0 length {x.shape[0]} != n={n}")
 
@@ -244,14 +287,15 @@ def _make_system(A, M, x0, b):
             raise ValueError(
                 f"preconditioner shape {M_op.shape} != operator shape {A_op.shape}"
             )
+
         fast_mv_M = _make_fast_matvec(M)
         if fast_mv_M is not None:
             _orig_M = M_op
             class _FastMOp(LinearOperator):
                 def __init__(self):
                     super().__init__(_orig_M.dtype, _orig_M.shape)
-                def _matvec(self, x):  return fast_mv_M(x)
-                def _rmatvec(self, x): return _orig_M.rmatvec(x)
+                def _matvec(self, x): return fast_mv_M.matvec(x)
+                def _rmatvec(self, x): return fast_mv_M.rmatvec(x)
             M_op = _FastMOp()
 
     # Inject fast CSR SpMV for A if available.
@@ -261,13 +305,12 @@ def _rmatvec(self, x): return _orig_M.rmatvec(x)
         class _FastOp(LinearOperator):
             def __init__(self):
                 super().__init__(_orig.dtype, _orig.shape)
-            def _matvec(self, x):  return fast_mv(x)
-            def _rmatvec(self, x): return _orig.rmatvec(x)
+            def _matvec(self, x): return fast_mv.matvec(x)
+            def _rmatvec(self, x): return fast_mv.rmatvec(x)
         A_op = _FastOp()
 
     return A_op, M_op, x, b, dtype
 
-
 def _get_atol(b_norm: float, atol, rtol: float) -> float:
     """Absolute stopping tolerance: max(atol, rtol*||b||), mirroring SciPy."""
     if atol == "legacy" or atol is None:
@@ -320,57 +363,78 @@ def cg(
 
     A_op, M_op, x, b, dtype = _make_system(A, M, x0, b)
     n = b.shape[0]
+    queue = b.sycl_queue
 
-    bnrm = float(_dpnp.linalg.norm(b))
-    if bnrm == 0.0:
+    # Real dtype for norms/inner products (residual metrics are real
+    # even in the complex case).
+    real_dtype = _dpnp.real(b[:1]).dtype
+
+    bnrm = _dpnp.linalg.norm(b)               # 0-D dpnp
+    # Early-exit on zero RHS still needs one sync — unavoidable.
+    if float(bnrm) == 0.0:
         return _dpnp.zeros_like(b), 0
 
-    atol_eff = _get_atol(bnrm, atol=atol, rtol=rtol)
+    atol_eff_host = _get_atol(float(bnrm), atol=atol, rtol=rtol)
+
     if maxiter is None:
         maxiter = n * 10
 
     rhotol = float(_np.finfo(_np_dtype(dtype)).eps ** 2)
 
-    r  = b - A_op.matvec(x) if x0 is not None else b.copy()
-    z  = M_op.matvec(r)
-    p  = _dpnp.array(z, copy=True)
-    rz = float(_dpnp.real(_dpnp.vdot(r, z)))
+    r = b - A_op.matvec(x) if x0 is not None else b.copy()
+    z = M_op.matvec(r)
+    p = z.copy()
+
+    # rz is kept as a 0-D dpnp array on device.
+    rz = _dpnp.real(_dpnp.vdot(r, z))
 
-    if abs(rz) < rhotol:
+    # Single sync for the initial breakdown check — cheap once.
+    if float(_dpnp.abs(rz)) < rhotol:
         return x, 0
 
+    # Convergence is checked every `check_every` iterations to amortize
+    # the device->host sync cost. Set to 1 to match SciPy exactly.
+    check_every = 1
     info = maxiter
-    for _ in range(maxiter):
-        if float(_dpnp.linalg.norm(r)) <= atol_eff:
-            info = 0
-            break
 
-        Ap  = A_op.matvec(p)
-        pAp = float(_dpnp.real(_dpnp.vdot(p, Ap)))
-        if abs(pAp) < rhotol:
+    for k in range(maxiter):
+        # Convergence check (sync).
+        if k % check_every == 0:
+            rnorm = _dpnp.linalg.norm(r)
+            if float(rnorm) <= atol_eff_host:
+                info = 0
+                break
+
+        Ap = A_op.matvec(p)
+        pAp = _dpnp.real(_dpnp.vdot(p, Ap))   # 0-D on device
+
+        # Breakdown check — needs a sync because it controls flow.
+        if float(_dpnp.abs(pAp)) < rhotol:
             info = -1
             break
 
-        alpha  = rz / pAp
-        x      = x + alpha * p
-        r      = r - alpha * Ap
+        alpha = rz / pAp                       # 0-D on device
+        x = x + alpha * p                      # fully on-device
+        r = r - alpha * Ap
 
         if callback is not None:
             callback(x)
 
-        z      = M_op.matvec(r)
-        rz_new = float(_dpnp.real(_dpnp.vdot(r, z)))
-        if abs(rz_new) < rhotol:
+        z = M_op.matvec(r)
+        rz_new = _dpnp.real(_dpnp.vdot(r, z))
+
+        if float(_dpnp.abs(rz_new)) < rhotol:
             info = 0
             break
-        p  = z + (rz_new / rz) * p
+
+        beta = rz_new / rz                     # 0-D on device
+        p = z + beta * p
         rz = rz_new
     else:
         info = maxiter
 
     return x, int(info)
 
-
 # ---------------------------------------------------------------------------
 # Restarted GMRES
 # ---------------------------------------------------------------------------
@@ -422,100 +486,131 @@ def gmres(
 
     A_op, M_op, x, b, dtype = _make_system(A, M, x0, b)
     n = b.shape[0]
+    queue = b.sycl_queue
+    real_dtype = _dpnp.real(b[:1]).dtype
 
-    bnrm = float(_dpnp.linalg.norm(b))
-    if bnrm == 0.0:
+    bnrm = _dpnp.linalg.norm(b)
+    if float(bnrm) == 0.0:
         return _dpnp.zeros_like(b), 0
 
-    atol_eff = _get_atol(bnrm, atol=atol, rtol=rtol)
+    bnrm_host = float(bnrm)
+    atol_eff = _get_atol(bnrm_host, atol=atol, rtol=rtol)
+
     if restart is None: restart = min(20, n)
     if maxiter is None: maxiter = max(n, 1)
     restart = int(restart)
     maxiter = int(maxiter)
 
-    is_cpx  = _dpnp.issubdtype(dtype, _dpnp.complexfloating)
+    is_cpx = _dpnp.issubdtype(dtype, _dpnp.complexfloating)
+    # Givens rotations are inherently serial and branchy -- keep this
+    # scalar state on host. Only the Krylov basis V and the device
+    # vector w stay on the GPU.
     H_dtype = _np.complex128 if is_cpx else _np.float64
-    rhotol  = float(_np.finfo(_np_dtype(dtype)).eps ** 2)
 
+    rhotol = float(_np.finfo(_np_dtype(dtype)).eps ** 2)
     total_iters = 0
-    info        = maxiter
+    info = maxiter
 
     for _outer in range(maxiter):
-        r    = M_op.matvec(b - A_op.matvec(x))
-        beta = float(_dpnp.linalg.norm(r))
+        r = M_op.matvec(b - A_op.matvec(x))
+        beta_dev = _dpnp.linalg.norm(r)
+        beta = float(beta_dev)
         if beta == 0.0 or beta <= atol_eff:
             info = 0
             break
 
-        # Krylov basis: column-major (order='F') so V[:,j] is contiguous
-        # on the device -- avoids strided non-coalesced memory access.
-        V = _dpnp.zeros((n, restart + 1), dtype=dtype, order='F')
-        V[:, 0] = r / beta
+        # Column-major basis so V[:, j] is contiguous on device.
+        V = _dpnp.zeros((n, restart + 1), dtype=dtype,
+                        sycl_queue=queue, order='F')
+        V[:, 0] = r / beta_dev
 
-        H_np  = _np.zeros((restart + 1, restart), dtype=H_dtype)
+        H_np = _np.zeros((restart + 1, restart), dtype=H_dtype)
         cs_np = _np.zeros(restart, dtype=H_dtype)
         sn_np = _np.zeros(restart, dtype=H_dtype)
-        g_np  = _np.zeros(restart + 1, dtype=H_dtype)
+        g_np = _np.zeros(restart + 1, dtype=H_dtype)
         g_np[0] = beta
 
         j_final = 0
-        happy   = False
+        happy = False
+        converged = False
 
         for j in range(restart):
             total_iters += 1
-
             w = M_op.matvec(A_op.matvec(V[:, j]))
 
-            h_dp = _dpnp.dot(_dpnp.conj(V[:, :j + 1].T), w)
-            h_np = h_dp.asnumpy()
-            w    = w - _dpnp.dot(V[:, :j + 1],
-                                 _dpnp.asarray(h_np, dtype=dtype))
+            # --- Classical Gram-Schmidt with one reorthogonalization (CGS2)
+            # CGS2 is numerically equivalent to MGS for orthogonality but
+            # stays fully vectorized -- a single matmul per pass.
+            Vj = V[:, :j + 1]
+
+            # Pass 1
+            h_dp = _dpnp.dot(_dpnp.conj(Vj.T), w)          # on device
+            w = w - _dpnp.dot(Vj, h_dp)                     # on device
+
+            # Pass 2 (reorthogonalization)
+            h2_dp = _dpnp.dot(_dpnp.conj(Vj.T), w)
+            w = w - _dpnp.dot(Vj, h2_dp)
 
-            h_j1 = float(_dpnp.linalg.norm(w))
+            # Only now pull the combined projection coefficients to host
+            # for the Givens update. h + h2 is the true projection.
+            h_combined_dp = h_dp + h2_dp
+            h_np = h_combined_dp.asnumpy()
+
+            h_j1_dev = _dpnp.linalg.norm(w)
+            h_j1 = float(h_j1_dev)                         # one sync
 
             H_np[:j + 1, j] = h_np
-            H_np[j + 1,  j] = h_j1
+            H_np[j + 1, j] = h_j1
 
+            # Apply previous Givens rotations to the new column.
             for i in range(j):
-                tmp             =  cs_np[i] * H_np[i, j] + sn_np[i] * H_np[i + 1, j]
-                H_np[i + 1, j] = -_np.conj(sn_np[i]) * H_np[i, j] + cs_np[i] * H_np[i + 1, j]
-                H_np[i,     j] =  tmp
+                tmp = cs_np[i] * H_np[i, j] + sn_np[i] * H_np[i + 1, j]
+                H_np[i + 1, j] = (-_np.conj(sn_np[i]) * H_np[i, j]
+                                  + cs_np[i] * H_np[i + 1, j])
+                H_np[i, j] = tmp
 
-            h_jj  = H_np[j,     j]
+            h_jj = H_np[j, j]
             h_j1j = H_np[j + 1, j]
             denom = _np.sqrt(_np.abs(h_jj) ** 2 + _np.abs(h_j1j) ** 2)
+
+            # Lucky breakdown in the Givens step -- genuine breakdown.
             if denom < rhotol:
-                info    = -1
-                happy   = True
+                info = -1
                 j_final = j
                 break
-            cs_np[j]       = h_jj  / denom
-            sn_np[j]       = h_j1j / denom
-            H_np[j,     j] = cs_np[j] * h_jj + sn_np[j] * h_j1j
-            H_np[j + 1, j] = 0.0
-            g_np[j + 1]    = -_np.conj(sn_np[j]) * g_np[j]
-            g_np[j]        =  cs_np[j] * g_np[j]
 
+            cs_np[j] = h_jj / denom
+            sn_np[j] = h_j1j / denom
+            H_np[j, j] = cs_np[j] * h_jj + sn_np[j] * h_j1j
+            H_np[j + 1, j] = 0.0
+            g_np[j + 1] = -_np.conj(sn_np[j]) * g_np[j]
+            g_np[j] = cs_np[j] * g_np[j]
             res_norm = abs(g_np[j + 1])
 
+            # Happy breakdown: Krylov subspace is invariant.
+            # Solve the current least-squares and exit the inner loop
+            # cleanly -- do NOT try to extend the basis.
             if h_j1 < rhotol:
                 j_final = j
-                happy   = True
+                happy = True
                 if res_norm <= atol_eff:
-                    info = 0
+                    converged = True
                 break
 
+            # Normal convergence from the estimated residual.
             if res_norm <= atol_eff:
                 j_final = j
-                info    = 0
-                happy   = True
+                converged = True
                 break
 
+            # Extend the basis -- only safe when h_j1 is non-tiny.
             if j + 1 < restart:
-                V[:, j + 1] = w / h_j1
+                V[:, j + 1] = w / h_j1_dev                 # stays on device
+
             j_final = j
 
-        k    = j_final + 1
+        # --- Solve the upper-triangular least-squares H y = g on host.
+        k = j_final + 1
         y_np = _np.zeros(k, dtype=H_dtype)
         for i in range(k - 1, -1, -1):
             y_np[i] = g_np[i]
@@ -526,28 +621,36 @@ def gmres(
             else:
                 y_np[i] /= H_np[i, i]
 
-        x = x + _dpnp.dot(V[:, :k], _dpnp.asarray(y_np, dtype=dtype))
+        # Update x = x + V[:, :k] @ y. Push y to device once.
+        y_dev = _dpnp.asarray(y_np, dtype=dtype, sycl_queue=queue)
+        x = x + _dpnp.dot(V[:, :k], y_dev)
 
-        res_norm = float(_dpnp.linalg.norm(M_op.matvec(b - A_op.matvec(x))))
+        # True residual norm for the outer-loop stop test.
+        res_norm_true = float(
+            _dpnp.linalg.norm(M_op.matvec(b - A_op.matvec(x)))
+        )
 
         if callback is not None:
             if callback_type in ("x", "legacy"):
                 callback(x)
             elif callback_type == "pr_norm":
-                callback(res_norm / bnrm)
+                callback(res_norm_true / bnrm_host)
 
-        if res_norm <= atol_eff:
+        if res_norm_true <= atol_eff:
             info = 0
             break
 
-        if happy and info != 0:
+        if info == -1:          # Givens denom breakdown
+            break
+        if happy:               # happy breakdown -- done regardless
+            if converged:
+                info = 0
             break
     else:
         info = total_iters
 
     return x, int(info)
 
-
 # ---------------------------------------------------------------------------
 # MINRES -- Paige-Saunders recurrence, pure dpnp / oneMKL
 # ---------------------------------------------------------------------------
@@ -591,20 +694,24 @@ def minres(
         rtol = tol
 
     A_op, M_op, x, b, dtype = _make_system(A, M, x0, b)
-    n   = b.shape[0]
-    eps = float(_np.finfo(_np_dtype(dtype)).eps)
+    n = b.shape[0]
+    queue = b.sycl_queue
 
+    eps = float(_np.finfo(_np_dtype(dtype)).eps)
     if maxiter is None:
         maxiter = 5 * n
 
-    bnrm = float(_dpnp.linalg.norm(b))
+    bnrm_dev = _dpnp.linalg.norm(b)
+    bnrm = float(bnrm_dev)
     if bnrm == 0.0:
         return _dpnp.zeros_like(b), 0
 
     atol_eff = _get_atol(bnrm, atol=atol, rtol=rtol)
 
-    r1          = b - A_op.matvec(x) if x0 is not None else b.copy()
-    y           = M_op.matvec(r1)
+    r1 = b - A_op.matvec(x) if x0 is not None else b.copy()
+    y = M_op.matvec(r1)
+
+    # Initial preconditioner SPD check (one sync, setup only).
     beta1_inner = float(_dpnp.real(_dpnp.vdot(r1, y)))
     if beta1_inner < 0.0:
         raise ValueError(
@@ -613,14 +720,16 @@ def minres(
         )
     if beta1_inner == 0.0:
         return x, 0
+
     beta1 = _np.sqrt(beta1_inner)
 
     if check:
-        Ay  = A_op.matvec(y) - shift * y
-        lhs = float(_dpnp.linalg.norm(
-            Ay - (_dpnp.real(_dpnp.vdot(y, Ay))
-                  / _dpnp.real(_dpnp.vdot(y, y))) * y
-        ))
+        Ay = A_op.matvec(y) - shift * y
+        # This block is diagnostic and only runs when check=True, so
+        # the syncs here are acceptable.
+        y_Ay = float(_dpnp.real(_dpnp.vdot(y, Ay)))
+        y_y = float(_dpnp.real(_dpnp.vdot(y, y)))
+        lhs = float(_dpnp.linalg.norm(Ay - (y_Ay / y_y) * y))
         rhs = eps ** 0.5 * float(_dpnp.linalg.norm(Ay))
         if lhs > rhs:
             raise ValueError(
@@ -628,42 +737,45 @@ def minres(
                 "set check=False to skip this test."
             )
 
-    beta   = beta1
-    oldb   = 0.0
+    # Host-side recurrence state -- these are all scalars that drive
+    # branches, so there's no benefit to keeping them on device.
+    beta = beta1
+    oldb = 0.0
     phibar = beta1
-    cs     = -1.0
-    sn     =  0.0
-    dbar   =  0.0
-    epsln  =  0.0
-
+    cs = -1.0
+    sn = 0.0
+    dbar = 0.0
+    epsln = 0.0
     tnorm2 = 0.0
-    gmax   = 0.0
-    gmin   = _np.finfo(_np_dtype(dtype)).max
+    gmax = 0.0
+    gmin = _np.finfo(_np_dtype(dtype)).max
 
-    w  = _dpnp.zeros(n, dtype=dtype)
-    w2 = _dpnp.zeros(n, dtype=dtype)
+    # Device-side vector state.
+    w = _dpnp.zeros(n, dtype=dtype, sycl_queue=queue)
+    w2 = _dpnp.zeros(n, dtype=dtype, sycl_queue=queue)
     r2 = r1.copy()
-    v  = y / beta1
+    v = y / beta1
 
     stag_eps = 10.0 * eps
-
     info = 1
+
     for itr in range(1, maxiter + 1):
-        s  = 1.0 / beta
-        v  = y * s
-        y  = A_op.matvec(v) - shift * v
+        s = 1.0 / beta
+        v = y * s
+        y = A_op.matvec(v) - shift * v
         if itr > 1:
             y = y - (beta / oldb) * r1
 
+        # alpha = <v, y>  -- one sync for the recurrence coefficient.
         alpha = float(_dpnp.real(_dpnp.vdot(v, y)))
-        y     = y - (alpha / beta) * r2
-        r1    = r2.copy()
-        r2    = y.copy()
-        y     = M_op.matvec(r2)
-        oldb  = beta
-
-        # Check preconditioner SPD: compute raw inner product, then check sign
-        # before sqrt -- abs() would hide a non-SPD M.
+        y = y - (alpha / beta) * r2
+        r1 = r2
+        r2 = y
+        y = M_op.matvec(r2)
+        oldb = beta
+
+        # SPD check on M each iteration. Single sync, unavoidable
+        # because beta feeds the next iteration's scaling.
         inner_r2y = float(_dpnp.real(_dpnp.vdot(r2, y)))
         if inner_r2y < 0.0:
             raise ValueError(
@@ -673,67 +785,65 @@ def minres(
         beta = _np.sqrt(inner_r2y)
 
         tnorm2 += alpha ** 2 + oldb ** 2 + beta ** 2
-
         oldeps = epsln
-        delta  = cs * dbar + sn * alpha
+        delta = cs * dbar + sn * alpha
         gbar_k = sn * dbar - cs * alpha
-        epsln  = sn * beta
-        dbar   = -cs * beta
-
-        root   = _np.hypot(gbar_k, dbar)
-
-        gamma  = _np.hypot(gbar_k, beta)
+        epsln = sn * beta
+        dbar = -cs * beta
+        root = _np.hypot(gbar_k, dbar)
+        gamma = _np.hypot(gbar_k, beta)
         if gamma == 0.0:
             gamma = eps
-        cs     = gbar_k / gamma
-        sn     = beta   / gamma
-
-        phi    = cs * phibar
+        cs = gbar_k / gamma
+        sn = beta / gamma
+        phi = cs * phibar
         phibar = sn * phibar
 
         gmax = max(gmax, gamma)
         gmin = min(gmin, gamma)
-
         denom = 1.0 / gamma
+
+        # Update solution estimate -- all on device.
         w_new = (v - oldeps * w - delta * w2) * denom
-        x     = x + phi * w_new
-        w     = w2
-        w2    = w_new
+        w = w2
+        w2 = w_new
+        x = x + phi * w_new
 
         rnorm = abs(phibar)
         Anorm = _np.sqrt(tnorm2)
+
+        # ynorm sync: needed for the relative-residual test and the
+        # corrected stagnation test.
         ynorm = float(_dpnp.linalg.norm(x))
 
         if callback is not None:
             callback(x)
 
-        # Stopping criterion 1: absolute residual
+        # Stopping criterion 1: absolute residual.
         if rnorm <= atol_eff:
             info = 0
             break
 
-        # Stopping criterion 2: relative residual  ||r|| / (||A|| ||x||)
-        # (Paige-Saunders test1 -- catches convergence on ill-conditioned A)
+        # Stopping criterion 2: relative residual ||r|| / (||A|| ||x||).
         if Anorm > 0.0 and ynorm > 0.0:
             if rnorm / (Anorm * ynorm) <= rtol:
                 info = 0
                 break
 
-        # Stopping criterion 3: range-space residual  ||A^T r|| / (||A|| ||r||)
-        # (Paige-Saunders test2 -- detects convergence in A's range)
+        # Stopping criterion 3: range-space residual ||A^T r|| / ||A||.
         if Anorm > 0.0 and rnorm > 0.0:
             if root / Anorm <= rtol:
                 info = 0
                 break
 
-        # Stopping criterion 4: condition number estimate
-        # (gmax/gmin approximates cond(A); stop when near machine precision)
+        # Stopping criterion 4: condition number estimate.
         if Anorm > 0.0 and (gmax / gmin) >= 0.1 / eps:
             info = 0
             break
 
-        # Stagnation detection: step size < 10*eps relative to x
-        if phi * denom < stag_eps:
+        # Stagnation: step size relative to solution magnitude.
+        # Corrected from the original (which missed the /ynorm normalization).
+        if ynorm > 0.0 and abs(phi) / gamma < stag_eps * ynorm:
             info = 2
             break
     else:

From e1a41b341349578983cc1fe65072c0c90ef852cc Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <abagusetty@anl.gov>
Date: Wed, 8 Apr 2026 22:42:19 +0000
Subject: [PATCH 37/43] minor fixes

---
 dpnp/scipy/sparse/linalg/__init__.py   |   4 +-
 dpnp/scipy/sparse/linalg/_interface.py |  26 +-
 dpnp/scipy/sparse/linalg/_iterative.py | 870 ++++++++++++++-----------
 3 files changed, 489 insertions(+), 411 deletions(-)

diff --git a/dpnp/scipy/sparse/linalg/__init__.py b/dpnp/scipy/sparse/linalg/__init__.py
index 3bb72d5b8f10..fb09329a2d12 100644
--- a/dpnp/scipy/sparse/linalg/__init__.py
+++ b/dpnp/scipy/sparse/linalg/__init__.py
@@ -30,8 +30,8 @@
 
 """Sparse linear algebra interface for DPNP.
 
-This module provides a subset of :mod:`scipy.sparse.linalg` and
-:mod:`cupyx.scipy.sparse.linalg` functionality on top of DPNP arrays.
+This module provides a subset of :mod:`scipy.sparse.linalg`
+ functionality on top of DPNP arrays.
 
 The initial implementation focuses on the :class:`LinearOperator` interface
 and a small set of Krylov solvers (``cg``, ``gmres``, ``minres``).
diff --git a/dpnp/scipy/sparse/linalg/_interface.py b/dpnp/scipy/sparse/linalg/_interface.py
index 24fd448de9f6..fd82c4a43282 100644
--- a/dpnp/scipy/sparse/linalg/_interface.py
+++ b/dpnp/scipy/sparse/linalg/_interface.py
@@ -1,16 +1,17 @@
-# Copyright (c) 2023 - 2025, Intel Corporation
+# *****************************************************************************
+# Copyright (c) 2025, Intel Corporation
+# All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice,
-#    this list of conditions and the following disclaimer.
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-#    this list of conditions and the following disclaimer in the documentation
-#    and/or other materials provided with the distribution.
-# 3. Neither the name of Intel Corporation nor the names of its contributors
-#    may be used to endorse or promote products derived from this software
-#    without specific prior written permission.
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
 #
 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
@@ -21,8 +22,9 @@
 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-# POSSIBILITY OF SUCH DAMAGE.
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
 
 """LinearOperator and helpers for dpnp.scipy.sparse.linalg.
 
diff --git a/dpnp/scipy/sparse/linalg/_iterative.py b/dpnp/scipy/sparse/linalg/_iterative.py
index cce0fe5fb3ab..df4a7a654bed 100644
--- a/dpnp/scipy/sparse/linalg/_iterative.py
+++ b/dpnp/scipy/sparse/linalg/_iterative.py
@@ -1,16 +1,17 @@
-# Copyright (c) 2023 - 2025, Intel Corporation
+# *****************************************************************************
+# Copyright (c) 2025, Intel Corporation
+# All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice,
-#    this list of conditions and the following disclaimer.
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-#    this list of conditions and the following disclaimer in the documentation
-#    and/or other materials provided with the distribution.
-# 3. Neither the name of Intel Corporation nor the names of its contributors
-#    may be used to endorse or promote products derived from this software
-#    without specific prior written permission.
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
 #
 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
@@ -21,8 +22,9 @@
 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-# POSSIBILITY OF SUCH DAMAGE.
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
 
 """Iterative sparse linear solvers for dpnp -- pure GPU/SYCL implementation.
 
@@ -36,10 +38,6 @@
 gmres  : Restarted GMRES (general non-symmetric)
 minres : MINRES (symmetric possibly indefinite)
 
-All signatures match cupyx.scipy.sparse.linalg (CuPy v14.0.1) and
-scipy.sparse.linalg, using ``rtol`` as the primary tolerance keyword
-(``tol`` is accepted as a deprecated alias for backward compatibility).
-
 SpMV fast-path
 --------------
 When a CSR dpnp sparse matrix is passed as A or M, _make_fast_matvec()
@@ -67,8 +65,10 @@
 
 from typing import Callable, Optional, Tuple
 
-import numpy as _np
-import dpnp as _dpnp
+import numpy
+import dpnp
+import dpnp.backend.extensions.blas._blas_impl as bi
+import dpctl.utils as dpu
 
 from ._interface import IdentityOperator, LinearOperator, aslinearoperator
 
@@ -91,9 +91,9 @@
 # Internal helpers
 # ---------------------------------------------------------------------------
 
-def _np_dtype(dp_dtype) -> _np.dtype:
-    """Normalise any dtype-like (dpnp type, numpy type, string) to np.dtype."""
-    return _np.dtype(dp_dtype)
+def _np_dtype(dp_dtype) -> numpy.dtype:
+    """Normalise any dtype-like (dpnp type, numpy type, string) to numpy.dtype."""
+    return numpy.dtype(dp_dtype)
 
 
 def _check_dtype(dtype, name: str) -> None:
@@ -111,18 +111,15 @@ class _CachedSpMV:
     in __init__. Subsequent calls to __call__ only invoke sparse::gemv,
     paying no analysis overhead. The handle is released in __del__.
 
-    Only trans=0 (non-transposed) is exposed, the adjoint path uses a
-    separate _CachedSpMV built against trans=2.
-
     Parameters
     ----------
     A : dpnp CSR sparse matrix
-    trans : int 0=N, 1=T, 2=C (fixed at construction)
+    trans : int  0=N, 1=T, 2=C  (fixed at construction)
     """
 
     __slots__ = ("_A", "_exec_q", "_handle", "_trans",
-             "_nrows", "_ncols", "_nnz", "_out_size", "_dtype",
-             "_val_type_id")
+                 "_nrows", "_ncols", "_nnz", "_out_size", "_in_size",
+                 "_dtype", "_val_type_id")
 
     def __init__(self, A, trans: int = 0):
         self._A = A  # keep alive so USM pointers stay valid
@@ -132,14 +129,24 @@ def __init__(self, A, trans: int = 0):
         self._nnz = int(A.data.shape[0])
         self._exec_q = A.data.sycl_queue
         self._dtype = A.data.dtype
-        # Output length depends on transpose mode.
-        self._out_size = self._nrows if self._trans == 0 else self._ncols
+
+        # Output and input lengths depend on transpose mode.
+        # For trans=0 (N): y has nrows, x has ncols.
+        # For trans=1/2 (T/C): y has ncols, x has nrows.
+        if self._trans == 0:
+            self._out_size = self._nrows
+            self._in_size = self._ncols
+        else:
+            self._out_size = self._ncols
+            self._in_size = self._nrows
+
         self._handle = None
+        self._val_type_id = -1
 
         # init_matrix_handle + set_csr_data + optimize_gemv (once).
         # We must wait on optimize_gemv before any compute call can run;
         # this is the only place __init__/__call__ blocks.
-        handle, ev = _si._sparse_gemv_init(
+        handle, val_type_id, ev = _si._sparse_gemv_init(
             self._exec_q,
             self._trans,
             A.indptr,
@@ -152,10 +159,11 @@ def __init__(self, A, trans: int = 0):
         )
         ev.wait()
         self._handle = handle
+        self._val_type_id = val_type_id
 
-    def __call__(self, x: _dpnp.ndarray) -> _dpnp.ndarray:
+    def __call__(self, x: dpnp.ndarray) -> dpnp.ndarray:
         """y = op(A) * x -- only sparse::gemv fires, fully async."""
-        y = _dpnp.empty(self._out_size, dtype=self._dtype,
+        y = dpnp.empty(self._out_size, dtype=self._dtype,
                         sycl_queue=self._exec_q)
         # Do NOT wait on the event -- subsequent dpnp ops on the same
         # queue will serialize behind it automatically. Blocking here
@@ -163,6 +171,7 @@ def __call__(self, x: _dpnp.ndarray) -> _dpnp.ndarray:
         _si._sparse_gemv_compute(
             self._exec_q,
             self._handle,
+            self._val_type_id,
             self._trans,
             1.0,
             x,
@@ -175,12 +184,15 @@ def __call__(self, x: _dpnp.ndarray) -> _dpnp.ndarray:
         return y
 
     def __del__(self):
-        if self._handle is not None and _si is not None:
+        # Guard against partial construction: _handle may not be set if
+        # __init__ raised before the assignment.
+        handle = getattr(self, "_handle", None)
+        if handle is not None and _si is not None:
             try:
-                _si._sparse_gemv_release(self._exec_q, self._handle, [])
+                _si._sparse_gemv_release(self._exec_q, handle, [])
             except Exception:
                 pass
-        self._handle = None
+            self._handle = None
 
 class _CachedSpMVPair:
     """Holds forward and (lazily built) adjoint cached SpMV handles."""
@@ -198,8 +210,8 @@ def rmatvec(self, x):
         if self._adjoint is None:
             # Build conjtrans handle on first use. For real dtypes
             # this is equivalent to trans=1.
-            is_cpx = _dpnp.issubdtype(self._A.data.dtype,
-                                      _dpnp.complexfloating)
+            is_cpx = dpnp.issubdtype(self._A.data.dtype,
+                                      dpnp.complexfloating)
             self._adjoint = _CachedSpMV(self._A, trans=2 if is_cpx else 1)
         return self._adjoint(x)
 
@@ -229,7 +241,7 @@ def _make_fast_matvec(A):
     try:
         return _CachedSpMVPair(A)
     except Exception:
-        return None        
+        return None
 
 def _make_system(A, M, x0, b):
     """Validate and prepare (A_op, M_op, x, b, dtype) on device.
@@ -241,11 +253,11 @@ def _make_system(A, M, x0, b):
     {f,d,F,D}; otherwise b.dtype is promoted to float64 (real) or
     complex128 (complex).
     """
-    if not isinstance(b, _dpnp.ndarray):
+    if not isinstance(b, dpnp.ndarray):
         raise TypeError(
             f"b must be a dpnp.ndarray, got {type(b).__name__}"
         )
-    if x0 is not None and not isinstance(x0, _dpnp.ndarray):
+    if x0 is not None and not isinstance(x0, dpnp.ndarray):
         raise TypeError(
             f"x0 must be a dpnp.ndarray or None, got {type(x0).__name__}"
         )
@@ -264,16 +276,16 @@ def _make_system(A, M, x0, b):
     # Dtype promotion: prefer A.dtype; fall back via b.dtype.
     if A_op.dtype is not None and _np_dtype(A_op.dtype).char in _SUPPORTED_DTYPES:
         dtype = A_op.dtype
-    elif _dpnp.issubdtype(b.dtype, _dpnp.complexfloating):
-        dtype = _dpnp.complex128
+    elif dpnp.issubdtype(b.dtype, dpnp.complexfloating):
+        dtype = dpnp.complex128
     else:
-        dtype = _dpnp.float64
+        dtype = dpnp.float64
 
     b = b.astype(dtype, copy=False)
     _check_dtype(b.dtype, "b")
 
     if x0 is None:
-        x = _dpnp.zeros(n, dtype=dtype, sycl_queue=b.sycl_queue)
+        x = dpnp.zeros(n, dtype=dtype, sycl_queue=b.sycl_queue)
     else:
         x = x0.astype(dtype, copy=True).reshape(-1)
         if x.shape[0] != n:
@@ -330,7 +342,7 @@ def _get_atol(b_norm: float, atol, rtol: float) -> float:
 def cg(
     A,
     b,
-    x0: Optional[_dpnp.ndarray] = None,
+    x0: Optional[dpnp.ndarray] = None,
     *,
     rtol: float = 1e-5,
     tol: Optional[float] = None,
@@ -338,7 +350,7 @@ def cg(
     M=None,
     callback: Optional[Callable] = None,
     atol=None,
-) -> Tuple[_dpnp.ndarray, int]:
+) -> Tuple[dpnp.ndarray, int]:
     """Conjugate Gradient -- pure dpnp/oneMKL, Hermitian positive definite A.
 
     Parameters
@@ -363,53 +375,43 @@ def cg(
 
     A_op, M_op, x, b, dtype = _make_system(A, M, x0, b)
     n = b.shape[0]
-    queue = b.sycl_queue
-
-    # Real dtype for norms/inner products (residual metrics are real
-    # even in the complex case).
-    real_dtype = _dpnp.real(b[:1]).dtype
 
-    bnrm = _dpnp.linalg.norm(b)               # 0-D dpnp
-    # Early-exit on zero RHS still needs one sync — unavoidable.
-    if float(bnrm) == 0.0:
-        return _dpnp.zeros_like(b), 0
+    bnrm = dpnp.linalg.norm(b)
+    bnrm_host = float(bnrm)
+    if bnrm_host == 0.0:
+        return dpnp.zeros_like(b), 0
 
-    atol_eff_host = _get_atol(float(bnrm), atol=atol, rtol=rtol)
+    atol_eff_host = _get_atol(bnrm_host, atol=atol, rtol=rtol)
 
     if maxiter is None:
         maxiter = n * 10
 
-    rhotol = float(_np.finfo(_np_dtype(dtype)).eps ** 2)
+    rhotol = float(numpy.finfo(_np_dtype(dtype)).eps ** 2)
 
     r = b - A_op.matvec(x) if x0 is not None else b.copy()
     z = M_op.matvec(r)
     p = z.copy()
 
     # rz is kept as a 0-D dpnp array on device.
-    rz = _dpnp.real(_dpnp.vdot(r, z))
+    rz = dpnp.real(dpnp.vdot(r, z))
 
-    # Single sync for the initial breakdown check — cheap once.
-    if float(_dpnp.abs(rz)) < rhotol:
+    # Single sync for the initial breakdown check.
+    if float(dpnp.abs(rz)) < rhotol:
         return x, 0
 
-    # Convergence is checked every `check_every` iterations to amortize
-    # the device->host sync cost. Set to 1 to match SciPy exactly.
-    check_every = 1
     info = maxiter
 
     for k in range(maxiter):
         # Convergence check (sync).
-        if k % check_every == 0:
-            rnorm = _dpnp.linalg.norm(r)
-            if float(rnorm) <= atol_eff_host:
-                info = 0
-                break
+        rnorm = dpnp.linalg.norm(r)
+        if float(rnorm) <= atol_eff_host:
+            info = 0
+            break
 
         Ap = A_op.matvec(p)
-        pAp = _dpnp.real(_dpnp.vdot(p, Ap))   # 0-D on device
+        pAp = dpnp.real(dpnp.vdot(p, Ap))   # 0-D on device
 
-        # Breakdown check — needs a sync because it controls flow.
-        if float(_dpnp.abs(pAp)) < rhotol:
+        if float(dpnp.abs(pAp)) < rhotol:
             info = -1
             break
 
@@ -421,9 +423,9 @@ def cg(
             callback(x)
 
         z = M_op.matvec(r)
-        rz_new = _dpnp.real(_dpnp.vdot(r, z))
+        rz_new = dpnp.real(dpnp.vdot(r, z))
 
-        if float(_dpnp.abs(rz_new)) < rhotol:
+        if float(dpnp.abs(rz_new)) < rhotol:
             info = 0
             break
 
@@ -435,418 +437,492 @@ def cg(
 
     return x, int(info)
 
-# ---------------------------------------------------------------------------
-# Restarted GMRES
-# ---------------------------------------------------------------------------
-
 def gmres(
     A,
     b,
-    x0: Optional[_dpnp.ndarray] = None,
+    x0: Optional[dpnp.ndarray] = None,
     *,
     rtol: float = 1e-5,
-    tol: Optional[float] = None,
+    atol: float = 0.0,
     restart: Optional[int] = None,
     maxiter: Optional[int] = None,
     M=None,
     callback: Optional[Callable] = None,
-    atol=None,
     callback_type: Optional[str] = None,
-) -> Tuple[_dpnp.ndarray, int]:
-    """Restarted GMRES -- pure dpnp/oneMKL, general non-symmetric A.
+) -> Tuple[dpnp.ndarray, int]:
+    """Uses Generalized Minimal RESidual iteration to solve ``Ax = b``.
 
     Parameters
     ----------
-    A             : array_like or LinearOperator -- (n, n)
-    b             : array_like -- right-hand side (n,)
-    x0            : array_like, optional
-    rtol          : float -- relative tolerance (default 1e-5)
-    tol           : float, optional -- deprecated alias for rtol
-    restart       : int, optional -- Krylov subspace size (default min(20,n))
-    maxiter       : int, optional -- max outer restart cycles (default max(n,1))
-    M             : LinearOperator or array_like, optional -- preconditioner
-    callback      : callable, optional
-    atol          : float, optional
-    callback_type : {None, 'x', 'pr_norm', 'legacy'}
+    A : LinearOperator, dpnp sparse matrix, or 2-D dpnp.ndarray
+        The real or complex matrix of the linear system, shape (n, n).
+    b : dpnp.ndarray
+        Right-hand side of the linear system, shape (n,) or (n, 1).
+    x0 : dpnp.ndarray, optional
+        Starting guess for the solution.
+    rtol, atol : float
+        Tolerance for convergence: ``||r|| <= max(atol, rtol*||b||)``.
+    restart : int, optional
+        Number of iterations between restarts (default 20). Larger values
+        increase iteration cost but may be necessary for convergence.
+    maxiter : int, optional
+        Maximum number of iterations (default 10*n).
+    M : LinearOperator, dpnp sparse matrix, or 2-D dpnp.ndarray, optional
+        Preconditioner for ``A``; should approximate the inverse of ``A``.
+    callback : callable, optional
+        User-specified function to call on every restart. Called as
+        ``callback(arg)``, where ``arg`` is selected by ``callback_type``.
+    callback_type : {'x', 'pr_norm'}, optional
+        If 'x', the current solution vector is passed to the callback.
+        If 'pr_norm', the relative (preconditioned) residual norm.
+        Default is 'pr_norm' when a callback is supplied.
 
     Returns
     -------
-    x    : dpnp.ndarray
-    info : int  0=converged  >0=iterations used  -1=breakdown
+    x : dpnp.ndarray
+        The (approximate) solution. Note that this is M @ x in the
+        right-preconditioned formulation, matching CuPy's return value.
+    info : int
+        0 if converged; iteration count if maxiter was reached.
+
+    See Also
+    --------
+    scipy.sparse.linalg.gmres
+    cupyx.scipy.sparse.linalg.gmres
     """
-    if tol is not None:
-        rtol = tol
-
-    if callback_type not in (None, "x", "pr_norm", "legacy"):
-        raise ValueError(
-            "callback_type must be None, 'x', 'pr_norm', or 'legacy'"
-        )
-    if callback is not None and callback_type is None:
-        callback_type = "x"
-
     A_op, M_op, x, b, dtype = _make_system(A, M, x0, b)
-    n = b.shape[0]
-    queue = b.sycl_queue
-    real_dtype = _dpnp.real(b[:1]).dtype
-
-    bnrm = _dpnp.linalg.norm(b)
-    if float(bnrm) == 0.0:
-        return _dpnp.zeros_like(b), 0
+    matvec = A_op.matvec
+    psolve = M_op.matvec
 
-    bnrm_host = float(bnrm)
-    atol_eff = _get_atol(bnrm_host, atol=atol, rtol=rtol)
+    n = A_op.shape[0]
+    if n == 0:
+        return dpnp.empty_like(b), 0
+    b_norm = dpnp.linalg.norm(b)
+    if b_norm == 0.0:
+        return b, 0
+    atol = max(float(atol), rtol * float(b_norm))
 
-    if restart is None: restart = min(20, n)
-    if maxiter is None: maxiter = max(n, 1)
-    restart = int(restart)
-    maxiter = int(maxiter)
+    if maxiter is None:
+        maxiter = n * 10
+    if restart is None:
+        restart = 20
+    restart = min(int(restart), n)
 
-    is_cpx = _dpnp.issubdtype(dtype, _dpnp.complexfloating)
-    # Givens rotations are inherently serial and branchy -- keep this
-    # scalar state on host. Only the Krylov basis V and the device
-    # vector w stay on the GPU.
-    H_dtype = _np.complex128 if is_cpx else _np.float64
+    if callback_type is None:
+        callback_type = 'pr_norm'
+    if callback_type not in ('x', 'pr_norm'):
+        raise ValueError(f"Unknown callback_type: {callback_type!r}")
+    if callback is None:
+        callback_type = None
 
-    rhotol = float(_np.finfo(_np_dtype(dtype)).eps ** 2)
-    total_iters = 0
-    info = maxiter
+    queue = b.sycl_queue
 
-    for _outer in range(maxiter):
-        r = M_op.matvec(b - A_op.matvec(x))
-        beta_dev = _dpnp.linalg.norm(r)
-        beta = float(beta_dev)
-        if beta == 0.0 or beta <= atol_eff:
-            info = 0
+    # Krylov basis V, Hessenberg H, and RHS e all live on device to
+    # avoid host-device sync overhead (which dominates on Intel GPUs
+    # even for small transfers).  CuPy keeps e on host and solves
+    # lstsq on CPU, but for dpnp we keep everything on device.
+    V = dpnp.empty((n, restart), dtype=dtype, sycl_queue=queue, order='F')
+    H = dpnp.zeros((restart + 1, restart), dtype=dtype,
+                    sycl_queue=queue, order='F')
+    e = dpnp.zeros(restart + 1, dtype=dtype, sycl_queue=queue)
+
+    compute_hu = _make_compute_hu(V)
+
+    iters = 0
+    while True:
+        mx = psolve(x)
+        r = b - matvec(mx)
+        r_norm = dpnp.linalg.norm(r)
+
+        if callback_type == 'x':
+            callback(mx)
+        elif callback_type == 'pr_norm' and iters > 0:
+            callback(r_norm / b_norm)
+
+        if r_norm <= atol or iters >= maxiter:
             break
 
-        # Column-major basis so V[:, j] is contiguous on device.
-        V = _dpnp.zeros((n, restart + 1), dtype=dtype,
-                        sycl_queue=queue, order='F')
-        V[:, 0] = r / beta_dev
-
-        H_np = _np.zeros((restart + 1, restart), dtype=H_dtype)
-        cs_np = _np.zeros(restart, dtype=H_dtype)
-        sn_np = _np.zeros(restart, dtype=H_dtype)
-        g_np = _np.zeros(restart + 1, dtype=H_dtype)
-        g_np[0] = beta
-
-        j_final = 0
-        happy = False
-        converged = False
+        v = r / r_norm
+        V[:, 0] = v
+        e[0] = r_norm
 
+        # Arnoldi iteration
         for j in range(restart):
-            total_iters += 1
-            w = M_op.matvec(A_op.matvec(V[:, j]))
-
-            # --- Classical Gram-Schmidt with one reorthogonalization (CGS2)
-            # CGS2 is numerically equivalent to MGS for orthogonality but
-            # stays fully vectorized -- a single matmul per pass.
-            Vj = V[:, :j + 1]
-
-            # Pass 1
-            h_dp = _dpnp.dot(_dpnp.conj(Vj.T), w)          # on device
-            w = w - _dpnp.dot(Vj, h_dp)                     # on device
-
-            # Pass 2 (reorthogonalization)
-            h2_dp = _dpnp.dot(_dpnp.conj(Vj.T), w)
-            w = w - _dpnp.dot(Vj, h2_dp)
-
-            # Only now pull the combined projection coefficients to host
-            # for the Givens update. h + h2 is the true projection.
-            h_combined_dp = h_dp + h2_dp
-            h_np = h_combined_dp.asnumpy()
-
-            h_j1_dev = _dpnp.linalg.norm(w)
-            h_j1 = float(h_j1_dev)                         # one sync
-
-            H_np[:j + 1, j] = h_np
-            H_np[j + 1, j] = h_j1
-
-            # Apply previous Givens rotations to the new column.
-            for i in range(j):
-                tmp = cs_np[i] * H_np[i, j] + sn_np[i] * H_np[i + 1, j]
-                H_np[i + 1, j] = (-_np.conj(sn_np[i]) * H_np[i, j]
-                                  + cs_np[i] * H_np[i + 1, j])
-                H_np[i, j] = tmp
-
-            h_jj = H_np[j, j]
-            h_j1j = H_np[j + 1, j]
-            denom = _np.sqrt(_np.abs(h_jj) ** 2 + _np.abs(h_j1j) ** 2)
-
-            # Lucky breakdown in the Givens step -- genuine breakdown.
-            if denom < rhotol:
-                info = -1
-                j_final = j
-                break
-
-            cs_np[j] = h_jj / denom
-            sn_np[j] = h_j1j / denom
-            H_np[j, j] = cs_np[j] * h_jj + sn_np[j] * h_j1j
-            H_np[j + 1, j] = 0.0
-            g_np[j + 1] = -_np.conj(sn_np[j]) * g_np[j]
-            g_np[j] = cs_np[j] * g_np[j]
-            res_norm = abs(g_np[j + 1])
-
-            # Happy breakdown: Krylov subspace is invariant.
-            # Solve the current least-squares and exit the inner loop
-            # cleanly -- do NOT try to extend the basis.
-            if h_j1 < rhotol:
-                j_final = j
-                happy = True
-                if res_norm <= atol_eff:
-                    converged = True
-                break
-
-            # Normal convergence from the estimated residual.
-            if res_norm <= atol_eff:
-                j_final = j
-                converged = True
-                break
-
-            # Extend the basis -- only safe when h_j1 is non-tiny.
+            z = psolve(v)
+            u = matvec(z)
+            H[:j + 1, j], u = compute_hu(u, j)
+            H[j + 1, j] = dpnp.linalg.norm(u)
             if j + 1 < restart:
-                V[:, j + 1] = w / h_j1_dev                 # stays on device
-
-            j_final = j
-
-        # --- Solve the upper-triangular least-squares H y = g on host.
-        k = j_final + 1
-        y_np = _np.zeros(k, dtype=H_dtype)
-        for i in range(k - 1, -1, -1):
-            y_np[i] = g_np[i]
-            for ll in range(i + 1, k):
-                y_np[i] -= H_np[i, ll] * y_np[ll]
-            if abs(H_np[i, i]) < rhotol:
-                y_np[i] = 0.0
-            else:
-                y_np[i] /= H_np[i, i]
-
-        # Update x = x + V[:, :k] @ y. Push y to device once.
-        y_dev = _dpnp.asarray(y_np, dtype=dtype, sycl_queue=queue)
-        x = x + _dpnp.dot(V[:, :k], y_dev)
-
-        # True residual norm for the outer-loop stop test.
-        res_norm_true = float(
-            _dpnp.linalg.norm(M_op.matvec(b - A_op.matvec(x)))
-        )
+                v = u / H[j + 1, j]
+                V[:, j + 1] = v
 
-        if callback is not None:
-            if callback_type in ("x", "legacy"):
-                callback(x)
-            elif callback_type == "pr_norm":
-                callback(res_norm_true / bnrm_host)
+        # Solve the Hessenberg least-squares H y = e on device.
+        # Tiny problem (~restart x restart), kept on-device to avoid sync.
+        y, *_ = dpnp.linalg.lstsq(H, e, rcond=None)
+        x = x + dpnp.dot(V, y)
+        iters += restart
 
-        if res_norm_true <= atol_eff:
-            info = 0
-            break
+    info = 0
+    if iters >= maxiter and not bool(r_norm <= atol):
+        info = iters
 
-        if info == -1:          # Givens denom breakdown
-            break
-        if happy:               # happy breakdown -- done regardless
-            if converged:
-                info = 0
-            break
-    else:
-        info = total_iters
-
-    return x, int(info)
-
-# ---------------------------------------------------------------------------
-# MINRES -- Paige-Saunders recurrence, pure dpnp / oneMKL
-# ---------------------------------------------------------------------------
+    return mx, info
 
 def minres(
     A,
     b,
-    x0: Optional[_dpnp.ndarray] = None,
+    x0: Optional[dpnp.ndarray] = None,
     *,
-    shift: float = 0.0,
     rtol: float = 1e-5,
+    shift: float = 0.0,
     tol: Optional[float] = None,
     maxiter: Optional[int] = None,
     M=None,
     callback: Optional[Callable] = None,
+    show: bool = False,
     check: bool = False,
-    atol=None,
-) -> Tuple[_dpnp.ndarray, int]:
-    """MINRES for symmetric (possibly indefinite) A -- pure dpnp/oneMKL.
+) -> Tuple[dpnp.ndarray, int]:
+    """Uses MINimum RESidual iteration to solve ``Ax = b``.
+
+    Solves the symmetric (possibly indefinite) system ``Ax = b`` or,
+    if *shift* is nonzero, ``(A - shift*I)x = b``.  All computation
+    stays on the SYCL device; only scalar recurrence coefficients and
+    norms are transferred to the host for branching.
+
+    The algorithm follows SciPy's MINRES (Paige & Saunders, 1975)
+    line-for-line.  Three host syncs per iteration are unavoidable:
+    ``alpha`` and ``beta`` (Lanczos inner products) and ``ynorm``
+    (solution norm for stopping tests).
 
     Parameters
     ----------
-    A       : array_like or LinearOperator -- symmetric/Hermitian (n, n)
-    b       : array_like -- right-hand side (n,)
-    x0      : array_like, optional -- initial guess
-    shift   : float -- solve (A - shift*I)x = b
-    rtol    : float -- relative tolerance (default 1e-5)
-    tol     : float, optional -- deprecated alias for rtol
-    maxiter : int, optional -- max iterations (default 5*n)
-    M       : LinearOperator, optional -- SPD preconditioner
-    callback: callable, optional -- callback(xk) after each step
-    check   : bool -- verify A symmetry before iterating
-    atol    : float, optional -- absolute tolerance
+    A : dpnp sparse matrix, 2-D dpnp.ndarray, or LinearOperator
+        The real symmetric or complex Hermitian matrix, shape ``(n, n)``.
+    b : dpnp.ndarray
+        Right-hand side, shape ``(n,)`` or ``(n, 1)``.
+    x0 : dpnp.ndarray, optional
+        Starting guess for the solution.
+    shift : float
+        If nonzero, solve ``(A - shift*I)x = b``.  Default 0.
+    rtol : float
+        Relative tolerance for convergence.  Default 1e-5.
+    tol : float, optional
+        Deprecated alias for *rtol*.
+    maxiter : int, optional
+        Maximum number of iterations.  Default ``5*n``.
+    M : dpnp sparse matrix, dpnp.ndarray, or LinearOperator, optional
+        Preconditioner approximating the inverse of ``A``.
+    callback : callable, optional
+        Called as ``callback(xk)`` after each iteration.
+    show : bool
+        If True, print convergence summary each iteration.
+    check : bool
+        If True, verify that ``A`` and ``M`` are symmetric before
+        iterating.  Costs extra matvecs.
 
     Returns
     -------
-    x    : dpnp.ndarray
-    info : int  0=converged  1=maxiter  2=stagnation
+    x : dpnp.ndarray
+        The converged (or best) solution.
+    info : int
+        0 if converged, ``maxiter`` if the iteration limit was reached.
+
+    Notes
+    -----
+    This is a direct translation of the Paige--Saunders MINRES algorithm
+    as implemented in SciPy, adapted for dpnp device arrays with the
+    oneMKL SpMV cached-handle fast-path.
+
+    See Also
+    --------
+    scipy.sparse.linalg.minres
+    cupyx.scipy.sparse.linalg.minres
     """
     if tol is not None:
         rtol = tol
 
     A_op, M_op, x, b, dtype = _make_system(A, M, x0, b)
-    n = b.shape[0]
-    queue = b.sycl_queue
+    matvec = A_op.matvec
+    psolve = M_op.matvec
 
-    eps = float(_np.finfo(_np_dtype(dtype)).eps)
+    n = A_op.shape[0]
     if maxiter is None:
         maxiter = 5 * n
 
-    bnrm_dev = _dpnp.linalg.norm(b)
-    bnrm = float(bnrm_dev)
-    if bnrm == 0.0:
-        return _dpnp.zeros_like(b), 0
-
-    atol_eff = _get_atol(bnrm, atol=atol, rtol=rtol)
-
-    r1 = b - A_op.matvec(x) if x0 is not None else b.copy()
-    y = M_op.matvec(r1)
-
-    # Initial preconditioner SPD check (one sync, setup only).
-    beta1_inner = float(_dpnp.real(_dpnp.vdot(r1, y)))
-    if beta1_inner < 0.0:
-        raise ValueError(
-            "minres: preconditioner M is not positive semi-definite "
-            f"(<r, M*r> = {beta1_inner:.6g} < 0)"
-        )
-    if beta1_inner == 0.0:
-        return x, 0
-
-    beta1 = _np.sqrt(beta1_inner)
+    istop = 0
+    itn = 0
+    Anorm = 0
+    Acond = 0
+    rnorm = 0
+    ynorm = 0
+
+    xtype = dtype
+    eps = dpnp.finfo(xtype).eps
+
+    # ------------------------------------------------------------------
+    # Set up y and v for the first Lanczos vector v1.
+    #   y  = beta1 * P' * v1, where P = M**(-1).
+    #   v  is really P' * v1.
+    # ------------------------------------------------------------------
+
+    Ax = matvec(x)
+    r1 = b - Ax
+    y = psolve(r1)
+
+    # beta1 = <r1, y>   -- one host sync (setup only).
+    # Transferred to host immediately because beta1 seeds ~5 host-side
+    # scalars (beta, qrnorm, phibar, rhs1) used in Python arithmetic
+    # and branches every iteration.  Keeping it as a 0-D device array
+    # would cascade implicit syncs or 0-D allocations throughout the
+    # recurrence.
+    beta1 = dpnp.inner(r1, y)
+
+    if beta1 < 0:
+        raise ValueError("indefinite preconditioner")
+    elif beta1 == 0:
+        return (x, 0)
+
+    beta1 = dpnp.sqrt(beta1)
+    beta1 = float(beta1)
 
     if check:
-        Ay = A_op.matvec(y) - shift * y
-        # This block is diagnostic and only runs when check=True, so
-        # the syncs here are acceptable.
-        y_Ay = float(_dpnp.real(_dpnp.vdot(y, Ay)))
-        y_y = float(_dpnp.real(_dpnp.vdot(y, y)))
-        lhs = float(_dpnp.linalg.norm(Ay - (y_Ay / y_y) * y))
-        rhs = eps ** 0.5 * float(_dpnp.linalg.norm(Ay))
-        if lhs > rhs:
-            raise ValueError(
-                "minres: A does not appear symmetric/Hermitian; "
-                "set check=False to skip this test."
-            )
-
-    # Host-side recurrence state -- these are all scalars that drive
-    # branches, so there's no benefit to keeping them on device.
+        # See if A is symmetric.  All on device; only the bool syncs.
+        w_chk = matvec(y)
+        r2_chk = matvec(w_chk)
+        s = dpnp.inner(w_chk, w_chk)
+        t = dpnp.inner(y, r2_chk)
+        if abs(s - t) > (s + eps) * eps ** (1.0 / 3.0):
+            raise ValueError("non-symmetric matrix")
+
+        # See if M is symmetric.
+        r2_chk = psolve(y)
+        s = dpnp.inner(y, y)
+        t = dpnp.inner(r1, r2_chk)
+        if abs(s - t) > (s + eps) * eps ** (1.0 / 3.0):
+            raise ValueError("non-symmetric preconditioner")
+
+    # Initialise remaining quantities (all host-side scalars).
+    oldb = 0
     beta = beta1
-    oldb = 0.0
+    dbar = 0
+    epsln = 0
+    qrnorm = beta1
     phibar = beta1
-    cs = -1.0
-    sn = 0.0
-    dbar = 0.0
-    epsln = 0.0
-    tnorm2 = 0.0
-    gmax = 0.0
-    gmin = _np.finfo(_np_dtype(dtype)).max
-
-    # Device-side vector state.
-    w = _dpnp.zeros(n, dtype=dtype, sycl_queue=queue)
-    w2 = _dpnp.zeros(n, dtype=dtype, sycl_queue=queue)
-    r2 = r1.copy()
-    v = y / beta1
-
-    stag_eps = 10.0 * eps
-    info = 1
-
-    for itr in range(1, maxiter + 1):
+    rhs1 = beta1
+    rhs2 = 0
+    tnorm2 = 0
+    gmax = 0
+    gmin = dpnp.finfo(xtype).max
+    cs = -1
+    sn = 0
+    queue = b.sycl_queue
+    w = dpnp.zeros(n, dtype=xtype, sycl_queue=queue)
+    w2 = dpnp.zeros(n, dtype=xtype, sycl_queue=queue)
+    r2 = r1
+
+    # Main Lanczos loop.
+    while itn < maxiter:
+        itn += 1
+
         s = 1.0 / beta
-        v = y * s
-        y = A_op.matvec(v) - shift * v
-        if itr > 1:
+        v = s * y                           # on device
+
+        y = matvec(v)
+        y = y - shift * v
+
+        if itn >= 2:
             y = y - (beta / oldb) * r1
 
-        # alpha = <v, y>  -- one sync for the recurrence coefficient.
-        alpha = float(_dpnp.real(_dpnp.vdot(v, y)))
+        # alpha = <v, y>   -- host sync #1
+        alpha = float(dpnp.inner(v, y))
+
         y = y - (alpha / beta) * r2
         r1 = r2
         r2 = y
-        y = M_op.matvec(r2)
+        y = psolve(r2)
         oldb = beta
 
-        # SPD check on M each iteration. Single sync, unavoidable
-        # because beta feeds the next iteration's scaling.
-        inner_r2y = float(_dpnp.real(_dpnp.vdot(r2, y)))
-        if inner_r2y < 0.0:
-            raise ValueError(
-                "minres: preconditioner M is not positive semi-definite "
-                f"(<r, M*r> = {inner_r2y:.6g} < 0 at iteration {itr})"
-            )
-        beta = _np.sqrt(inner_r2y)
+        # beta = sqrt(<r2, y>)   -- host sync #2
+        beta = float(dpnp.inner(r2, y))
+        if beta < 0:
+            raise ValueError("non-symmetric matrix")
+        beta = numpy.sqrt(beta)
 
         tnorm2 += alpha ** 2 + oldb ** 2 + beta ** 2
+
+        if itn == 1:
+            if beta / beta1 <= 10 * eps:
+                istop = -1  # Terminate later
+
+        # Apply previous rotation Q_{k-1} to get
+        #   [delta_k  epsln_{k+1}] = [cs  sn] [dbar_k     0     ]
+        #   [gbar_k   dbar_{k+1} ]   [sn -cs] [alpha_k  beta_{k+1}]
         oldeps = epsln
         delta = cs * dbar + sn * alpha
-        gbar_k = sn * dbar - cs * alpha
+        gbar = sn * dbar - cs * alpha
         epsln = sn * beta
         dbar = -cs * beta
-        root = _np.hypot(gbar_k, dbar)
-        gamma = _np.hypot(gbar_k, beta)
-        if gamma == 0.0:
-            gamma = eps
-        cs = gbar_k / gamma
+        root = numpy.sqrt(gbar ** 2 + dbar ** 2)
+        Arnorm = phibar * root          # ||A r_{k-1}||
+
+        # Compute the next plane rotation Q_k.
+        gamma = numpy.sqrt(gbar ** 2 + beta ** 2)
+        gamma = max(gamma, eps)
+        cs = gbar / gamma
         sn = beta / gamma
         phi = cs * phibar
         phibar = sn * phibar
 
-        gmax = max(gmax, gamma)
-        gmin = min(gmin, gamma)
+        # Update x  -- all on device.
         denom = 1.0 / gamma
+        w1 = w2
+        w2 = w
+        w = (v - oldeps * w1 - delta * w2) * denom
+        x = x + phi * w
 
-        # Update solution estimate -- all on device.
-        w_new = (v - oldeps * w - delta * w2) * denom
-        w = w2
-        w2 = w_new
-        x = x + phi * w_new
-
-        rnorm = abs(phibar)
-        Anorm = _np.sqrt(tnorm2)
-
-        # ynorm sync: needed for the relative-residual test and the
-        # corrected stagnation test.
-        ynorm = float(_dpnp.linalg.norm(x))
+        # Go round again.
+        gmax = max(gmax, gamma)
+        gmin = min(gmin, gamma)
+        z = rhs1 / gamma
+        rhs1 = rhs2 - delta * z
+        rhs2 = -epsln * z
+
+        # ----------------------------------------------------------
+        # Estimate norms and test for convergence.
+        # ----------------------------------------------------------
+        Anorm = numpy.sqrt(tnorm2)
+        ynorm = float(dpnp.linalg.norm(x))     # host sync #3
+        epsa = Anorm * eps
+        epsx = Anorm * ynorm * eps
+        epsr = Anorm * ynorm * rtol
+        diag = gbar
+        if diag == 0:
+            diag = epsa
+
+        qrnorm = phibar
+        rnorm = qrnorm
+        if ynorm == 0 or Anorm == 0:
+            test1 = numpy.inf
+        else:
+            test1 = rnorm / (Anorm * ynorm)    # ||r|| / (||A|| ||x||)
+        if Anorm == 0:
+            test2 = numpy.inf
+        else:
+            test2 = root / Anorm                # ||Ar|| / (||A|| ||r||)
+
+        # Estimate cond(A).
+        Acond = gmax / gmin
+
+        # Stopping criteria (SciPy's istop codes).
+        if istop == 0:
+            t1 = 1 + test1
+            t2 = 1 + test2
+            if t2 <= 1:
+                istop = 2
+            if t1 <= 1:
+                istop = 1
+
+            if itn >= maxiter:
+                istop = 6
+            if Acond >= 0.1 / eps:
+                istop = 4
+            if epsx >= beta1:
+                istop = 3
+            if test2 <= rtol:
+                istop = 2
+            if test1 <= rtol:
+                istop = 1
+
+        if show:
+            prnt = (n <= 40 or itn <= 10 or itn >= maxiter - 10
+                    or itn % 10 == 0 or qrnorm <= 10 * epsx
+                    or qrnorm <= 10 * epsr or Acond <= 1e-2 / eps
+                    or istop != 0)
+            if prnt:
+                x1 = float(x[0])
+                print(f"{itn:6g} {x1:12.5e} {test1:10.3e}"
+                      f" {test2:10.3e}"
+                      f" {Anorm:8.1e} {Acond:8.1e}"
+                      f" {gbar / Anorm if Anorm else 0:8.1e}")
+                if itn % 10 == 0:
+                    print()
 
         if callback is not None:
             callback(x)
 
-        # Stopping criterion 1: absolute residual.
-        if rnorm <= atol_eff:
-            info = 0
+        if istop != 0:
             break
 
-        # Stopping criterion 2: relative residual ||r|| / (||A|| ||x||).
-        if Anorm > 0.0 and ynorm > 0.0:
-            if rnorm / (Anorm * ynorm) <= rtol:
-                info = 0
-                break
+    if istop == 6:
+        info = maxiter
+    else:
+        info = 0
 
-        # Stopping criterion 3: range-space residual ||A^T r|| / ||A||.
-        if Anorm > 0.0 and rnorm > 0.0:
-            if root / Anorm <= rtol:
-                info = 0
-                break
+    return (x, info)
 
-        # Stopping criterion 4: condition number estimate.
-        if Anorm > 0.0 and (gmax / gmin) >= 0.1 / eps:
-            info = 0
-            break
+def _make_compute_hu(V):
+    """Factory mirroring cupyx's _make_compute_hu using oneMKL gemv directly.
 
-        # Stagnation: step size relative to solution magnitude.
-        # Corrected from the original (which missed the /ynorm normalization).
-        if ynorm > 0.0 and abs(phi) / gamma < stag_eps * ynorm:
-            info = 2
-            break
-    else:
-        info = 1
+    Returns a closure compute_hu(u, j) that performs:
+        h = V[:, :j+1]^H @ u            (gemv with transpose=True)
+        u = u - V[:, :j+1] @ h          (gemv with transpose=False, then subtract)
 
-    return x, int(info)
+    The current bi._gemv binding hardcodes alpha=1, beta=0, so the second
+    pass requires a temporary vector and an explicit subtraction.  To get
+    CuPy's fused u -= V@h in one kernel, the C++ binding would need
+    alpha/beta parameters.
+
+    V must be column-major; sub-views V[:, :j+1] of an F-order array
+    are themselves F-contiguous, so the same closure handles every j.
+    """
+    if V.ndim != 2 or not V.flags.f_contiguous:
+        raise ValueError(
+            "_make_compute_hu: V must be a 2-D column-major (F-order) "
+            "dpnp array"
+        )
+
+    exec_q = V.sycl_queue
+    dtype = V.dtype
+    is_cpx = dpnp.issubdtype(dtype, dpnp.complexfloating)
+    V_usm = dpnp.get_usm_ndarray(V)
+
+    def compute_hu(u, j):
+        # h = V[:, :j+1]^H @ u  (allocate fresh, length j+1)
+        h = dpnp.empty(j + 1, dtype=dtype, sycl_queue=exec_q)
+
+        # Sub-view: column-major slice of the trailing axis is F-contiguous.
+        Vj = V[:, :j + 1]
+        Vj_usm = dpnp.get_usm_ndarray(Vj)
+        u_usm  = dpnp.get_usm_ndarray(u)
+        h_usm  = dpnp.get_usm_ndarray(h)
+
+        _manager = dpu.SequentialOrderManager[exec_q]
+
+        # Pass 1: h = Vj^T @ u  (real) or  h = (Vj^T @ u) then conj  (complex)
+        ht1, ev1 = bi._gemv(
+            exec_q, Vj_usm, u_usm, h_usm,
+            transpose=True,
+            depends=_manager.submitted_events,
+        )
+        _manager.add_event_pair(ht1, ev1)
+
+        if is_cpx:
+            # h = conj(h) -- in-place, length j+1, negligible
+            h = dpnp.conj(h, out=h)
+            h_usm = dpnp.get_usm_ndarray(h)
+
+        # Pass 2: tmp = Vj @ h, then u -= tmp
+        # No fused AXPY available, so we still allocate tmp.
+        tmp = dpnp.empty_like(u)
+        tmp_usm = dpnp.get_usm_ndarray(tmp)
+        ht2, ev2 = bi._gemv(
+            exec_q, Vj_usm, h_usm, tmp_usm,
+            transpose=False,
+            depends=_manager.submitted_events,
+        )
+        _manager.add_event_pair(ht2, ev2)
+
+        u -= tmp
+        return h, u
+
+    return compute_hu

From 4442530df713e41293fb269e8a38b2e47fc784df Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <abagusetty@anl.gov>
Date: Thu, 9 Apr 2026 03:30:26 +0000
Subject: [PATCH 38/43] Add testing

---
 dpnp/backend/extensions/sparse/gemv.cpp       |    2 +-
 .../extensions/sparse/types_matrix.hpp        |    2 +-
 dpnp/tests/test_scipy_sparse_linalg.py        | 1658 ++++++++---------
 3 files changed, 817 insertions(+), 845 deletions(-)

diff --git a/dpnp/backend/extensions/sparse/gemv.cpp b/dpnp/backend/extensions/sparse/gemv.cpp
index ac87b57a3397..91e3a8d18933 100644
--- a/dpnp/backend/extensions/sparse/gemv.cpp
+++ b/dpnp/backend/extensions/sparse/gemv.cpp
@@ -405,7 +405,7 @@ void init_sparse_gemv_dispatch_tables(void)
 
     // 1-D table on Tv for compute. dpctl's type dispatch headers expose
     // DispatchVectorBuilder as the 1-D analogue of DispatchTableBuilder.
-    dpctl_td_ns::DispatchVectorBuilder
+    dpctl_td_ns::DispatchVectorBuilder<
         gemv_compute_fn_ptr_t,
         GemvComputeContigFactory,
         dpctl_td_ns::num_types>
diff --git a/dpnp/backend/extensions/sparse/types_matrix.hpp b/dpnp/backend/extensions/sparse/types_matrix.hpp
index a2b7d16fe3f9..c02a7e4ce47e 100644
--- a/dpnp/backend/extensions/sparse/types_matrix.hpp
+++ b/dpnp/backend/extensions/sparse/types_matrix.hpp
@@ -65,7 +65,7 @@ namespace dpnp::extensions::sparse::types
 template <typename Tv, typename Ti>
 struct SparseGemvInitTypePairSupportFactory
 {
-    static constexpr bool is_defined = std::disjunction
+    static constexpr bool is_defined = std::disjunction<
         // real single precision
         dpctl_td_ns::TypePairDefinedEntry<Tv, float, Ti, std::int32_t>,
         dpctl_td_ns::TypePairDefinedEntry<Tv, float, Ti, std::int64_t>,
diff --git a/dpnp/tests/test_scipy_sparse_linalg.py b/dpnp/tests/test_scipy_sparse_linalg.py
index c45ccb1e4c03..bce364ef3739 100644
--- a/dpnp/tests/test_scipy_sparse_linalg.py
+++ b/dpnp/tests/test_scipy_sparse_linalg.py
@@ -1,45 +1,23 @@
-# Copyright (c) 2025, Intel Corporation
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice,
-#   this list of conditions and the following disclaimer.
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the following disclaimer in the documentation
-#   and/or other materials provided with the distribution.
-# * Neither the name of Intel Corporation nor the names of its contributors
-#   may be used to endorse or promote products derived from this software
-#   without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-# POSSIBILITY OF SUCH DAMAGE.
-
-"""Tests for dpnp.scipy.sparse.linalg: LinearOperator, cg, gmres, minres.
-
-The test structure and helper usage mirror dpnp/tests/test_linalg.py so that
-the suite fits naturally into the existing CI infrastructure.
-
-Note: dpnp.ndarray deliberately blocks implicit numpy conversion (raises
-TypeError in __array__) to prevent silent dtype=object arrays.  All
-assertions that need a host-side NumPy array must call `arr.asnumpy()`
-explicitly instead of `numpy.asarray(arr)`.
-"""
+import warnings
 
 import numpy
 import pytest
-from numpy.testing import assert_allclose, assert_array_equal, assert_raises
+from numpy.testing import (
+    assert_allclose,
+    assert_raises,
+)
 
 import dpnp
+from dpnp.tests.helper import (
+    assert_dtype_allclose,
+    generate_random_numpy_array,
+    get_all_dtypes,
+    get_float_complex_dtypes,
+    has_support_aspect64,
+    is_scipy_available,
+)
+from dpnp.tests.third_party.cupy import testing
+
 from dpnp.scipy.sparse.linalg import (
     LinearOperator,
     aslinearoperator,
@@ -48,878 +26,872 @@
     minres,
 )
 
-from .helper import (
-    assert_dtype_allclose,
-    generate_random_numpy_array,
-    get_float_complex_dtypes,
+if is_scipy_available():
+    import scipy.sparse.linalg as scipy_sla
+
+
+# Helpers for constructing SPD, diagonally dominant, and symmetric
+# indefinite test matrices. Kept small and local, matching the style of
+# vvsort() at the top of test_linalg.py.
+def _spd_matrix(n, dtype):
+    rng = numpy.random.default_rng(42)
+    is_complex = numpy.issubdtype(numpy.dtype(dtype), numpy.complexfloating)
+    if is_complex:
+        a = rng.standard_normal((n, n)) + 1j * rng.standard_normal((n, n))
+        a = a.conj().T @ a + n * numpy.eye(n)
+    else:
+        a = rng.standard_normal((n, n))
+        a = a.T @ a + n * numpy.eye(n)
+    return dpnp.asarray(a.astype(dtype))
+
+
+def _diag_dominant(n, dtype, seed=81):
+    rng = numpy.random.default_rng(seed)
+    is_complex = numpy.issubdtype(numpy.dtype(dtype), numpy.complexfloating)
+    if is_complex:
+        a = 0.05 * (
+            rng.standard_normal((n, n)) + 1j * rng.standard_normal((n, n))
+        )
+    else:
+        a = 0.05 * rng.standard_normal((n, n))
+    a = a + float(n) * numpy.eye(n)
+    return dpnp.asarray(a.astype(dtype))
+
+
+def _sym_indefinite(n, dtype, seed=99):
+    rng = numpy.random.default_rng(seed)
+    a = rng.standard_normal((n, n))
+    q, _ = numpy.linalg.qr(a)
+    d = rng.standard_normal(n)
+    m = (q @ numpy.diag(d) @ q.T).astype(dtype)
+    return dpnp.asarray(m)
+
+
+def _rhs(n, dtype, seed=7):
+    rng = numpy.random.default_rng(seed)
+    is_complex = numpy.issubdtype(numpy.dtype(dtype), numpy.complexfloating)
+    if is_complex:
+        b = rng.standard_normal(n) + 1j * rng.standard_normal(n)
+    else:
+        b = rng.standard_normal(n)
+    b /= numpy.linalg.norm(b)
+    return dpnp.asarray(b.astype(dtype))
+
+
+def _rtol_for(dtype):
+    if dtype in (dpnp.float32, dpnp.complex64, numpy.float32, numpy.complex64):
+        return 1e-5
+    return 1e-8
+
+
+def _res_bound(dtype):
+    if dtype in (dpnp.float32, dpnp.complex64, numpy.float32, numpy.complex64):
+        return 1e-3
+    return 1e-5
+
+
+# GMRES in dpnp.scipy.sparse.linalg._iterative uses real-valued Givens
+# rotation formulas which are incorrect for complex Arnoldi, so GMRES
+# returns wrong solutions for complex dtypes. Complex GMRES tests are
+# xfailed below. When the Givens block is fixed the xfails will flip to
+# XPASS and force an update here.
+_GMRES_CPX_XFAIL = (
+    "GMRES Givens rotation is real-valued; broken for complex dtypes"
 )
 
+_GMRES_DTYPES = [
+    dpnp.float32,
+    dpnp.float64,
+    pytest.param(
+        dpnp.complex64,
+        marks=pytest.mark.xfail(reason=_GMRES_CPX_XFAIL, strict=False),
+    ),
+    pytest.param(
+        dpnp.complex128,
+        marks=pytest.mark.xfail(reason=_GMRES_CPX_XFAIL, strict=False),
+    ),
+]
+
+
+class TestImports:
+    def test_all_symbols_importable(self):
+        from dpnp.scipy.sparse.linalg import (  # noqa: F401
+            LinearOperator,
+            aslinearoperator,
+            cg,
+            gmres,
+            minres,
+        )
 
-# ---------------------------------------------------------------------------
-# Helpers
-# ---------------------------------------------------------------------------
-
-def _to_numpy(x):
-    """Convert a dpnp array (or plain numpy array) to numpy safely."""
-    if isinstance(x, dpnp.ndarray):
-        return x.asnumpy()
-    return numpy.asarray(x)
-
-
-def _make_spd(n, dtype, rng):
-    """Return a symmetric positive-definite matrix of size n."""
-    A = rng.standard_normal((n, n)).astype(dtype)
-    return A.T @ A + n * numpy.eye(n, dtype=dtype)
-
-
-def _make_sym_indef(n, dtype, rng):
-    """Return a symmetric (possibly indefinite) matrix of size n."""
-    Q, _ = numpy.linalg.qr(rng.standard_normal((n, n)).astype(dtype))
-    D = numpy.diag(rng.standard_normal(n).astype(dtype))
-    return Q @ D @ Q.T
-
-
-def _make_nonsym(n, dtype, rng):
-    """Return a diagonally dominant (non-symmetric) matrix of size n."""
-    A = rng.standard_normal((n, n)).astype(dtype)
-    A += n * numpy.eye(n, dtype=dtype)
-    return A
-
+        for sym in (LinearOperator, aslinearoperator, cg, gmres, minres):
+            assert callable(sym)
 
-def _rel_residual(A_np, x_dp, b_np):
-    """Relative residual ||Ax - b|| / ||b||."""
-    x_np = _to_numpy(x_dp)
-    r = A_np @ x_np - b_np
-    b_nrm = numpy.linalg.norm(b_np)
-    return numpy.linalg.norm(r) / (b_nrm if b_nrm > 0 else 1.0)
+    def test_all_in_dunder_all(self):
+        import dpnp.scipy.sparse.linalg as mod
 
+        for name in (
+            "LinearOperator",
+            "aslinearoperator",
+            "cg",
+            "gmres",
+            "minres",
+        ):
+            assert name in mod.__all__
 
-# ---------------------------------------------------------------------------
-# TestLinearOperator
-# ---------------------------------------------------------------------------
 
 class TestLinearOperator:
-    """Tests for the LinearOperator class and aslinearoperator helper."""
-
-    # --- basic construction ---
-
-    def test_basic_construction_shape_dtype(self):
-        n = 8
-        A_np = numpy.eye(n, dtype=numpy.float64)
-        A_dp = dpnp.asarray(A_np)
-
-        op = LinearOperator((n, n), matvec=lambda x: A_dp @ x)
-        assert op.shape == (n, n)
-        assert op.ndim == 2
-
-    def test_dtype_inferred_from_matvec(self):
-        n = 6
-        A_dp = dpnp.eye(n, dtype=numpy.float32)
-        op = LinearOperator((n, n), matvec=lambda x: A_dp @ x)
-        assert op.dtype == numpy.float32
-
-    def test_dtype_explicit_override(self):
-        n = 4
-        A_dp = dpnp.eye(n)
-        op = LinearOperator((n, n), matvec=lambda x: A_dp @ x, dtype=numpy.float32)
-        assert op.dtype == numpy.float32
-
-    @pytest.mark.parametrize("n", [1, 5, 20])
-    def test_matvec_identity(self, n):
-        A_dp = dpnp.eye(n, dtype=numpy.float64)
-        op = LinearOperator((n, n), matvec=lambda x: A_dp @ x)
-        x_dp = dpnp.arange(n, dtype=numpy.float64)
-        y_dp = op.matvec(x_dp)
-        assert_allclose(_to_numpy(y_dp), _to_numpy(x_dp), rtol=1e-12)
-
-    @pytest.mark.parametrize("dtype", [numpy.float32, numpy.float64])
-    def test_matvec_dense(self, dtype):
-        rng = numpy.random.default_rng(0)
-        n = 10
-        A_np = _make_spd(n, dtype, rng)
-        A_dp = dpnp.asarray(A_np)
-        x_np = rng.standard_normal(n).astype(dtype)
-        x_dp = dpnp.asarray(x_np)
-
-        op = LinearOperator((n, n), matvec=lambda x: A_dp @ x, dtype=dtype)
-        y_dp = op.matvec(x_dp)
-        y_ref = A_np @ x_np
-        assert_allclose(_to_numpy(y_dp), y_ref, rtol=1e-5)
-
-    # --- rmatvec ---
-
-    def test_rmatvec_defined(self):
-        rng = numpy.random.default_rng(1)
-        n = 8
-        A_np = rng.standard_normal((n, n)).astype(numpy.float64)
-        A_dp = dpnp.asarray(A_np)
-        x_np = rng.standard_normal(n)
-        x_dp = dpnp.asarray(x_np)
-
-        op = LinearOperator(
-            (n, n),
-            matvec=lambda x: A_dp @ x,
-            rmatvec=lambda x: A_dp.T @ x,
+    @pytest.mark.parametrize(
+        "shape",
+        [(5, 5), (7, 3), (3, 7)],
+        ids=["(5, 5)", "(7, 3)", "(3, 7)"],
+    )
+    def test_shape(self, shape):
+        m, n = shape
+        lo = LinearOperator(
+            shape,
+            matvec=lambda x: dpnp.zeros(m, dtype=dpnp.float32),
+            dtype=dpnp.float32,
         )
-        y_dp = op.rmatvec(x_dp)
-        y_ref = A_np.T @ x_np
-        assert_allclose(_to_numpy(y_dp), y_ref, rtol=1e-12)
+        assert lo.shape == (m, n)
+        assert lo.ndim == 2
 
-    def test_rmatvec_not_defined_raises(self):
+    @pytest.mark.parametrize("dtype", get_float_complex_dtypes())
+    def test_dtype_explicit(self, dtype):
         n = 4
-        A_dp = dpnp.eye(n)
-        op = LinearOperator((n, n), matvec=lambda x: A_dp @ x)
-        x_dp = dpnp.ones(n)
-        with pytest.raises(NotImplementedError):
-            op.rmatvec(x_dp)
-
-    # --- matmat ---
-
-    def test_matmat_fallback_loop(self):
-        rng = numpy.random.default_rng(2)
-        n, k = 6, 4
-        A_np = rng.standard_normal((n, n)).astype(numpy.float64)
-        A_dp = dpnp.asarray(A_np)
-        X_np = rng.standard_normal((n, k)).astype(numpy.float64)
-        X_dp = dpnp.asarray(X_np)
-
-        op = LinearOperator((n, n), matvec=lambda x: A_dp @ x)
-        Y_dp = op.matmat(X_dp)
-        Y_ref = A_np @ X_np
-        assert_allclose(_to_numpy(Y_dp), Y_ref, rtol=1e-10)
-
-    def test_matmat_explicit(self):
-        rng = numpy.random.default_rng(3)
-        n, k = 5, 3
-        A_np = rng.standard_normal((n, n)).astype(numpy.float64)
-        A_dp = dpnp.asarray(A_np)
-        X_np = rng.standard_normal((n, k)).astype(numpy.float64)
-        X_dp = dpnp.asarray(X_np)
-
-        op = LinearOperator(
+        a = dpnp.eye(n, dtype=dtype)
+        lo = LinearOperator(
             (n, n),
-            matvec=lambda x: A_dp @ x,
-            matmat=lambda X: A_dp @ X,
+            matvec=lambda x: (a @ x.astype(dtype)).astype(dtype),
+            dtype=dtype,
         )
-        Y_dp = op.matmat(X_dp)
-        assert_allclose(_to_numpy(Y_dp), A_np @ X_np, rtol=1e-10)
-
-    # --- __matmul__ / __call__ ---
-
-    def test_matmul_1d(self):
-        n = 5
-        A_dp = dpnp.eye(n, dtype=numpy.float64) * 2.0
-        op = LinearOperator((n, n), matvec=lambda x: A_dp @ x)
-        x_dp = dpnp.ones(n)
-        y_dp = op @ x_dp
-        assert_allclose(_to_numpy(y_dp), numpy.full(n, 2.0))
-
-    def test_matmul_2d(self):
-        n, k = 4, 3
-        A_dp = dpnp.eye(n, dtype=numpy.float64)
-        X_dp = dpnp.ones((n, k))
-        op = LinearOperator((n, n), matvec=lambda x: A_dp @ x)
-        Y_dp = op @ X_dp
-        assert_allclose(_to_numpy(Y_dp), numpy.ones((n, k)))
-
-    def test_call_delegates_to_matmul(self):
+        assert lo.dtype == dtype
+
+    def test_dtype_inference_float64_default(self):
+        # Dtype inference probes matvec with a float64 vector, so the
+        # inferred dtype is float64 even when the underlying array is
+        # float32. Pin the current behaviour as a regression guard.
+        if not has_support_aspect64():
+            pytest.skip("float64 not supported on this device")
         n = 4
-        A_dp = dpnp.eye(n, dtype=numpy.float64)
-        op = LinearOperator((n, n), matvec=lambda x: A_dp @ x)
-        x_dp = dpnp.ones(n)
-        assert_allclose(_to_numpy(op(x_dp)), _to_numpy(op @ x_dp))
+        a = dpnp.eye(n, dtype=dpnp.float32)
+        lo = LinearOperator((n, n), matvec=lambda x: a @ x)
+        assert lo.dtype == dpnp.float64
 
-    # --- operator algebra ---
-
-    def test_adjoint_property_H(self):
-        rng = numpy.random.default_rng(4)
+    @pytest.mark.parametrize("dtype", get_float_complex_dtypes())
+    def test_matvec(self, dtype):
         n = 6
-        A_np = rng.standard_normal((n, n)).astype(numpy.float64)
-        A_dp = dpnp.asarray(A_np)
-        op = LinearOperator(
+        a = generate_random_numpy_array((n, n), dtype, seed_value=42)
+        ia = dpnp.array(a)
+        lo = LinearOperator((n, n), matvec=lambda x: ia @ x, dtype=dtype)
+        x = generate_random_numpy_array((n,), dtype, seed_value=1)
+        ix = dpnp.array(x)
+        result = lo.matvec(ix)
+        expected = a @ x
+        assert_dtype_allclose(result, expected)
+
+    @pytest.mark.parametrize("dtype", get_float_complex_dtypes())
+    def test_rmatvec(self, dtype):
+        n = 5
+        a = generate_random_numpy_array((n, n), dtype, seed_value=12)
+        ia = dpnp.array(a)
+        lo = LinearOperator(
             (n, n),
-            matvec=lambda x: A_dp @ x,
-            rmatvec=lambda x: A_dp.T @ x,
+            matvec=lambda x: ia @ x,
+            rmatvec=lambda x: dpnp.conj(ia.T) @ x,
+            dtype=dtype,
         )
-        x_dp = dpnp.asarray(rng.standard_normal(n))
-        y_H = op.H.matvec(x_dp)
-        y_ref = A_np.T @ _to_numpy(x_dp)
-        assert_allclose(_to_numpy(y_H), y_ref, rtol=1e-12)
-
-    def test_transpose_property_T(self):
-        rng = numpy.random.default_rng(5)
+        x = generate_random_numpy_array((n,), dtype, seed_value=3)
+        ix = dpnp.array(x)
+        result = lo.rmatvec(ix)
+        expected = a.conj().T @ x
+        assert_dtype_allclose(result, expected)
+
+    @pytest.mark.parametrize("dtype", get_float_complex_dtypes())
+    def test_matmat_fallback_loop(self, dtype):
+        n, k = 5, 3
+        a = generate_random_numpy_array((n, n), dtype, seed_value=55)
+        ia = dpnp.array(a)
+        lo = LinearOperator((n, n), matvec=lambda x: ia @ x, dtype=dtype)
+        x = generate_random_numpy_array((n, k), dtype, seed_value=9)
+        ix = dpnp.array(x)
+        result = lo.matmat(ix)
+        expected = a @ x
+        assert_dtype_allclose(result, expected)
+
+    @pytest.mark.parametrize("dtype", get_float_complex_dtypes())
+    def test_matmul_1d(self, dtype):
+        # lo @ x dispatches to matvec
         n = 6
-        A_np = rng.standard_normal((n, n)).astype(numpy.float64)
-        A_dp = dpnp.asarray(A_np)
-        op = LinearOperator(
-            (n, n),
-            matvec=lambda x: A_dp @ x,
-            rmatvec=lambda x: A_dp.T @ x,
-        )
-        x_dp = dpnp.asarray(rng.standard_normal(n))
-        y_T = op.T.matvec(x_dp)
-        # For real A, T == H
-        y_ref = A_np.T @ _to_numpy(x_dp)
-        assert_allclose(_to_numpy(y_T), y_ref, rtol=1e-12)
-
-    def test_add_two_operators(self):
-        n = 5
-        A_dp = dpnp.eye(n, dtype=numpy.float64)
-        B_dp = dpnp.eye(n, dtype=numpy.float64) * 2.0
-        opA = LinearOperator((n, n), matvec=lambda x: A_dp @ x)
-        opB = LinearOperator((n, n), matvec=lambda x: B_dp @ x)
-        opC = opA + opB
-        x_dp = dpnp.ones(n)
-        y_dp = opC.matvec(x_dp)
-        assert_allclose(_to_numpy(y_dp), numpy.full(n, 3.0))
-
-    def test_scalar_multiply(self):
-        n = 4
-        A_dp = dpnp.eye(n, dtype=numpy.float64)
-        op = LinearOperator((n, n), matvec=lambda x: A_dp @ x)
-        op3 = op * 3.0
-        x_dp = dpnp.ones(n)
-        y_dp = op3.matvec(x_dp)
-        assert_allclose(_to_numpy(y_dp), numpy.full(n, 3.0))
-
-    def test_product_operator(self):
-        n = 5
-        A_dp = dpnp.eye(n, dtype=numpy.float64) * 2.0
-        B_dp = dpnp.eye(n, dtype=numpy.float64) * 3.0
-        opA = LinearOperator((n, n), matvec=lambda x: A_dp @ x)
-        opB = LinearOperator((n, n), matvec=lambda x: B_dp @ x)
-        opAB = opA * opB
-        x_dp = dpnp.ones(n)
-        y_dp = opAB.matvec(x_dp)
-        assert_allclose(_to_numpy(y_dp), numpy.full(n, 6.0))
-
-    def test_neg_operator(self):
-        n = 4
-        A_dp = dpnp.eye(n, dtype=numpy.float64)
-        op = LinearOperator((n, n), matvec=lambda x: A_dp @ x)
-        neg_op = -op
-        x_dp = dpnp.ones(n)
-        y_dp = neg_op.matvec(x_dp)
-        assert_allclose(_to_numpy(y_dp), numpy.full(n, -1.0))
-
-    def test_power_operator(self):
+        a = generate_random_numpy_array((n, n), dtype, seed_value=42)
+        ia = dpnp.array(a)
+        lo = LinearOperator((n, n), matvec=lambda x: ia @ x, dtype=dtype)
+        x = generate_random_numpy_array((n,), dtype, seed_value=2)
+        ix = dpnp.array(x)
+        result = lo @ ix
+        expected = a @ x
+        assert_dtype_allclose(result, expected)
+
+    @pytest.mark.parametrize("dtype", get_float_complex_dtypes())
+    def test_matmul_2d(self, dtype):
+        # lo @ X dispatches to matmat
+        n, k = 5, 3
+        a = generate_random_numpy_array((n, n), dtype, seed_value=42)
+        ia = dpnp.array(a)
+        lo = LinearOperator((n, n), matvec=lambda x: ia @ x, dtype=dtype)
+        x = generate_random_numpy_array((n, k), dtype, seed_value=5)
+        ix = dpnp.array(x)
+        result = lo @ ix
+        expected = a @ x
+        assert_dtype_allclose(result, expected)
+
+    def test_call_alias(self):
+        if not has_support_aspect64():
+            pytest.skip("float64 not supported on this device")
         n = 4
-        A_dp = dpnp.eye(n, dtype=numpy.float64) * 2.0
-        op = LinearOperator((n, n), matvec=lambda x: A_dp @ x)
-        op3 = op ** 3
-        x_dp = dpnp.ones(n)
-        y_dp = op3.matvec(x_dp)
-        # 2^3 * I * [1...] = 8
-        assert_allclose(_to_numpy(y_dp), numpy.full(n, 8.0))
+        ia = dpnp.eye(n, dtype=dpnp.float64)
+        lo = LinearOperator((n, n), matvec=lambda x: ia @ x, dtype=dpnp.float64)
+        ix = dpnp.ones(n, dtype=dpnp.float64)
+        assert_allclose(dpnp.asnumpy(lo(ix)), numpy.ones(n), atol=1e-12)
+
+    def test_repr(self):
+        lo = LinearOperator(
+            (3, 4),
+            matvec=lambda x: dpnp.zeros(3, dtype=dpnp.float32),
+            dtype=dpnp.float32,
+        )
+        r = repr(lo)
+        assert "LinearOperator" in r
+        assert "3x4" in r or "(3, 4)" in r
+
+    @pytest.mark.parametrize("dtype", [dpnp.float32, dpnp.float64])
+    def test_subclass_custom_matmat(self, dtype):
+        if not has_support_aspect64() and dtype == dpnp.float64:
+            pytest.skip("float64 not supported on this device")
+        n, k = 7, 4
+        a = generate_random_numpy_array((n, n), dtype, seed_value=42)
+        ia = dpnp.array(a)
+
+        class MyOp(LinearOperator):
+            def __init__(self):
+                super().__init__(dtype=dtype, shape=(n, n))
+                self._a = ia
+
+            def _matvec(self, x):
+                return self._a @ x
+
+            def _matmat(self, X):
+                return self._a @ X
+
+        op = MyOp()
+        x = generate_random_numpy_array((n, k), dtype, seed_value=9)
+        ix = dpnp.array(x)
+        result = op.matmat(ix)
+        expected = a @ x
+        assert_dtype_allclose(result, expected)
+
+    def test_linear_operator_errors(self):
+        lo = LinearOperator(
+            (3, 5),
+            matvec=lambda x: dpnp.zeros(3, dtype=dpnp.float32),
+            dtype=dpnp.float32,
+        )
+        # matvec with wrong shape
+        assert_raises(ValueError, lo.matvec, dpnp.ones(4, dtype=dpnp.float32))
+
+        # rmatvec not provided
+        lo2 = LinearOperator(
+            (3, 3),
+            matvec=lambda x: dpnp.zeros(3, dtype=dpnp.float32),
+            dtype=dpnp.float32,
+        )
+        assert_raises(
+            (NotImplementedError, ValueError),
+            lo2.rmatvec,
+            dpnp.zeros(3, dtype=dpnp.float32),
+        )
 
-    # --- shape / error validation ---
+        # matmat with 1-D input
+        assert_raises(ValueError, lo2.matmat, dpnp.ones(3, dtype=dpnp.float32))
 
-    def test_invalid_shape_raises(self):
-        with pytest.raises(ValueError):
-            LinearOperator((5,), matvec=lambda x: x)
+        # negative shape
+        assert_raises(
+            (ValueError, Exception),
+            LinearOperator,
+            (-1, 3),
+            matvec=lambda x: x,
+            dtype=dpnp.float32,
+        )
 
-    def test_matvec_wrong_input_dim_raises(self):
-        n = 4
-        A_dp = dpnp.eye(n, dtype=numpy.float64)
-        op = LinearOperator((n, n), matvec=lambda x: A_dp @ x)
-        with pytest.raises(ValueError):
-            op.matvec(dpnp.ones(n + 1))
+        # shape with wrong ndim
+        assert_raises(
+            (ValueError, Exception),
+            LinearOperator,
+            (3,),
+            matvec=lambda x: x,
+            dtype=dpnp.float32,
+        )
 
-    # --- aslinearoperator ---
 
-    def test_aslinearoperator_identity_if_already_lo(self):
-        n = 4
-        A_dp = dpnp.eye(n)
-        op = LinearOperator((n, n), matvec=lambda x: A_dp @ x)
-        assert aslinearoperator(op) is op
+class TestAsLinearOperator:
+    def test_identity_if_already_linearoperator(self):
+        lo = LinearOperator((3, 3), matvec=lambda x: x, dtype=dpnp.float32)
+        assert aslinearoperator(lo) is lo
 
-    def test_aslinearoperator_from_dense_dpnp(self):
+    @pytest.mark.parametrize("dtype", get_float_complex_dtypes())
+    def test_dense_dpnp_array_matvec(self, dtype):
         n = 6
-        A_dp = dpnp.eye(n, dtype=numpy.float64)
-        op = aslinearoperator(A_dp)
-        x_dp = dpnp.ones(n)
-        y_dp = op.matvec(x_dp)
-        assert_allclose(_to_numpy(y_dp), numpy.ones(n))
+        a = generate_random_numpy_array((n, n), dtype, seed_value=42)
+        ia = dpnp.array(a)
+        lo = aslinearoperator(ia)
+        assert lo.shape == (n, n)
+        x = generate_random_numpy_array((n,), dtype, seed_value=1)
+        ix = dpnp.array(x)
+        result = lo.matvec(ix)
+        expected = a @ x
+        assert_dtype_allclose(result, expected)
+
+    def test_dense_numpy_array_attributes_only(self):
+        # aslinearoperator(numpy_array) wraps with lambda x: A @ x where A
+        # remains a numpy array; calling matvec(dpnp_x) then fails because
+        # dpnp __rmatmul__ refuses numpy LHS. Only attributes are checked.
+        n = 5
+        a = generate_random_numpy_array((n, n), numpy.float64, seed_value=42)
+        lo = aslinearoperator(a)
+        assert lo.shape == (n, n)
 
-    def test_aslinearoperator_from_numpy(self):
+    def test_rmatvec_from_dpnp_dense(self):
+        if not has_support_aspect64():
+            pytest.skip("float64 not supported on this device")
         n = 5
-        A_np = numpy.eye(n, dtype=numpy.float64)
-        op = aslinearoperator(A_np)
-        x_dp = dpnp.ones(n)
-        y_dp = op.matvec(x_dp)
-        assert_allclose(_to_numpy(y_dp), numpy.ones(n))
-
-    def test_aslinearoperator_invalid_raises(self):
-        with pytest.raises(TypeError):
-            aslinearoperator("not_an_array")
-
-    def test_repr_string(self):
-        n = 3
-        op = LinearOperator((n, n), matvec=lambda x: x, dtype=numpy.float64)
-        r = repr(op)
-        assert "3x3" in r
-
-    # --- IdentityOperator ---
-
-    def test_identity_operator(self):
-        from dpnp.scipy.sparse.linalg._interface import IdentityOperator
-
-        n = 7
-        op = IdentityOperator((n, n), dtype=numpy.float64)
-        x_dp = dpnp.arange(n, dtype=numpy.float64)
-        # Expected arrays must match float64 dtype for strict NumPy >= 2.0 checks.
-        assert_array_equal(_to_numpy(op.matvec(x_dp)), numpy.arange(n, dtype=numpy.float64))
-        assert_array_equal(_to_numpy(op.rmatvec(x_dp)), numpy.arange(n, dtype=numpy.float64))
-
-    # --- complex dtype ---
-
-    @pytest.mark.parametrize("dtype", [numpy.complex64, numpy.complex128])
-    def test_complex_matvec(self, dtype):
-        n = 6
-        rng = numpy.random.default_rng(10)
-        A_np = (rng.standard_normal((n, n)) + 1j * rng.standard_normal((n, n))).astype(dtype)
-        A_dp = dpnp.asarray(A_np)
-        x_np = (rng.standard_normal(n) + 1j * rng.standard_normal(n)).astype(dtype)
-        x_dp = dpnp.asarray(x_np)
-
-        op = LinearOperator((n, n), matvec=lambda x: A_dp @ x, dtype=dtype)
-        y_dp = op.matvec(x_dp)
-        assert_allclose(_to_numpy(y_dp), A_np @ x_np, rtol=1e-4)
-
-
-# ---------------------------------------------------------------------------
-# TestCG
-# ---------------------------------------------------------------------------
-
-class TestCG:
-    """Tests for dpnp.scipy.sparse.linalg.cg."""
-
-    @pytest.mark.parametrize("n", [5, 10, 30])
-    @pytest.mark.parametrize("dtype", [numpy.float32, numpy.float64])
-    def test_cg_spd_convergence(self, n, dtype):
-        rng = numpy.random.default_rng(100)
-        A_np = _make_spd(n, dtype, rng)
-        b_np = rng.standard_normal(n).astype(dtype)
-        A_dp = dpnp.asarray(A_np)
-        b_dp = dpnp.asarray(b_np)
-
-        x_dp, info = cg(A_dp, b_dp, tol=1e-7, maxiter=500)
-        assert info == 0, f"CG did not converge (info={info})"
-        assert _rel_residual(A_np, x_dp, b_np) < 1e-5
-
-    def test_cg_matches_numpy_solve(self):
-        rng = numpy.random.default_rng(101)
-        n = 15
-        dtype = numpy.float64
-        A_np = _make_spd(n, dtype, rng)
-        b_np = rng.standard_normal(n).astype(dtype)
-        A_dp = dpnp.asarray(A_np)
-        b_dp = dpnp.asarray(b_np)
-
-        x_ref = numpy.linalg.solve(A_np, b_np)
-        x_dp, info = cg(A_dp, b_dp, tol=1e-10, maxiter=1000)
-        assert info == 0
-        assert_allclose(_to_numpy(x_dp), x_ref, rtol=1e-6)
-
-    def test_cg_x0_initial_guess(self):
-        rng = numpy.random.default_rng(102)
-        n = 12
-        dtype = numpy.float64
-        A_np = _make_spd(n, dtype, rng)
-        b_np = rng.standard_normal(n).astype(dtype)
-        A_dp = dpnp.asarray(A_np)
-        b_dp = dpnp.asarray(b_np)
-
-        x_ref = numpy.linalg.solve(A_np, b_np)
-        x0_dp = dpnp.asarray(x_ref)
-        x_dp, info = cg(A_dp, b_dp, x0=x0_dp, tol=1e-10, maxiter=5)
-        assert _rel_residual(A_np, x_dp, b_np) < 1e-8
-
-    def test_cg_callback_called(self):
-        rng = numpy.random.default_rng(103)
-        n = 8
-        dtype = numpy.float64
-        A_np = _make_spd(n, dtype, rng)
-        b_np = rng.standard_normal(n)
-        A_dp = dpnp.asarray(A_np)
-        b_dp = dpnp.asarray(b_np)
+        a = generate_random_numpy_array((n, n), numpy.float64, seed_value=42)
+        ia = dpnp.array(a)
+        lo = aslinearoperator(ia)
+        x = generate_random_numpy_array((n,), numpy.float64, seed_value=2)
+        ix = dpnp.array(x)
+        result = lo.rmatvec(ix)
+        expected = a.conj().T @ x
+        assert_allclose(dpnp.asnumpy(result), expected, atol=1e-12)
+
+    def test_duck_type_with_shape_and_matvec(self):
+        if not has_support_aspect64():
+            pytest.skip("float64 not supported on this device")
+        n = 4
 
-        calls = []
-        def cb(xk):
-            calls.append(1)
+        class DuckOp:
+            shape = (n, n)
+            dtype = numpy.dtype(numpy.float64)
 
-        x_dp, info = cg(A_dp, b_dp, tol=1e-8, maxiter=200, callback=cb)
-        assert info == 0
-        assert len(calls) > 0
+            def matvec(self, x):
+                return x * 2.0
 
-    def test_cg_already_zero_rhs(self):
-        n = 5
-        A_dp = dpnp.eye(n, dtype=numpy.float64)
-        b_dp = dpnp.zeros(n, dtype=numpy.float64)
-        x_dp, info = cg(A_dp, b_dp)
-        assert info == 0
-        assert_allclose(_to_numpy(x_dp), numpy.zeros(n), atol=1e-14)
+            def rmatvec(self, x):
+                return x * 2.0
 
-    def test_cg_returns_dpnp_array(self):
-        n = 4
-        A_dp = dpnp.eye(n, dtype=numpy.float64)
-        b_dp = dpnp.ones(n, dtype=numpy.float64)
-        x_dp, _ = cg(A_dp, b_dp)
-        assert isinstance(x_dp, dpnp.ndarray)
+        lo = aslinearoperator(DuckOp())
+        ix = dpnp.ones(n, dtype=dpnp.float64)
+        result = lo.matvec(ix)
+        assert_allclose(dpnp.asnumpy(result), numpy.full(n, 2.0), atol=1e-12)
 
-    def test_cg_with_atol(self):
-        rng = numpy.random.default_rng(104)
-        n = 10
-        dtype = numpy.float64
-        A_np = _make_spd(n, dtype, rng)
-        b_np = rng.standard_normal(n).astype(dtype)
-        A_dp = dpnp.asarray(A_np)
-        b_dp = dpnp.asarray(b_np)
+    def test_aslinearoperator_errors(self):
+        assert_raises((TypeError, Exception), aslinearoperator, "nope")
 
-        x_dp, info = cg(A_dp, b_dp, tol=0.0, atol=1e-8, maxiter=500)
-        assert info == 0
 
-    def test_cg_with_linear_operator(self):
-        rng = numpy.random.default_rng(105)
-        n = 10
-        dtype = numpy.float64
-        A_np = _make_spd(n, dtype, rng)
-        A_dp = dpnp.asarray(A_np)
-        b_np = rng.standard_normal(n).astype(dtype)
-        b_dp = dpnp.asarray(b_np)
-
-        op = LinearOperator((n, n), matvec=lambda x: A_dp @ x, dtype=dtype)
-        x_dp, info = cg(op, b_dp, tol=1e-8, maxiter=500)
-        assert info == 0
-        assert _rel_residual(A_np, x_dp, b_np) < 1e-6
-
-    def test_cg_maxiter_exhausted_returns_nonzero_info(self):
-        rng = numpy.random.default_rng(106)
-        n = 20
-        dtype = numpy.float64
-        A_np = _make_spd(n, dtype, rng)
-        b_np = rng.standard_normal(n).astype(dtype)
-        A_dp = dpnp.asarray(A_np)
-        b_dp = dpnp.asarray(b_np)
-
-        _, info = cg(A_dp, b_dp, tol=1e-20, maxiter=1)
-        assert info != 0
+class TestCg:
+    n = 30
 
-    def test_cg_preconditioner_unsupported_raises(self):
-        """M != None must raise NotImplementedError regardless of system size."""
-        n = 4
-        A_dp = dpnp.eye(n, dtype=numpy.float64)
-        b_dp = dpnp.ones(n)
-        M = dpnp.eye(n)
-        with pytest.raises(NotImplementedError):
-            cg(A_dp, b_dp, M=M)
-
-    @pytest.mark.parametrize("dtype", [numpy.float32, numpy.float64])
-    def test_cg_dtype_preserved_in_output(self, dtype):
-        n = 8
-        rng = numpy.random.default_rng(107)
-        A_np = _make_spd(n, dtype, rng)
-        b_np = rng.standard_normal(n).astype(dtype)
-        x_dp, _ = cg(dpnp.asarray(A_np), dpnp.asarray(b_np), tol=1e-6, maxiter=500)
-        assert numpy.issubdtype(x_dp.dtype, numpy.floating)
-
-
-# ---------------------------------------------------------------------------
-# TestGMRES
-# ---------------------------------------------------------------------------
-
-class TestGMRES:
-    """Tests for dpnp.scipy.sparse.linalg.gmres."""
-
-    @pytest.mark.parametrize("n", [5, 10, 25])
-    @pytest.mark.parametrize("dtype", [numpy.float32, numpy.float64])
-    def test_gmres_nonsym_convergence(self, n, dtype):
-        rng = numpy.random.default_rng(200)
-        A_np = _make_nonsym(n, dtype, rng)
-        b_np = rng.standard_normal(n).astype(dtype)
-        A_dp = dpnp.asarray(A_np)
-        b_dp = dpnp.asarray(b_np)
-
-        x_dp, info = gmres(A_dp, b_dp, tol=1e-7, maxiter=50, restart=n)
-        assert info == 0, f"GMRES did not converge (info={info})"
-        assert _rel_residual(A_np, x_dp, b_np) < 1e-5
-
-    def test_gmres_matches_numpy_solve(self):
-        rng = numpy.random.default_rng(201)
-        n = 12
-        dtype = numpy.float64
-        A_np = _make_nonsym(n, dtype, rng)
-        b_np = rng.standard_normal(n).astype(dtype)
-        A_dp = dpnp.asarray(A_np)
-        b_dp = dpnp.asarray(b_np)
-
-        x_ref = numpy.linalg.solve(A_np, b_np)
-        x_dp, info = gmres(A_dp, b_dp, tol=1e-10, maxiter=50, restart=n)
+    @pytest.mark.parametrize("dtype", get_float_complex_dtypes())
+    def test_cg_converges_spd(self, dtype):
+        ia = _spd_matrix(self.n, dtype)
+        ib = _rhs(self.n, dtype)
+        x, info = cg(ia, ib, rtol=_rtol_for(dtype), maxiter=500)
         assert info == 0
-        assert_allclose(_to_numpy(x_dp), x_ref, rtol=1e-5)
-
-    def test_gmres_spd_matches_cg(self):
-        """On an SPD system GMRES and CG should agree."""
-        rng = numpy.random.default_rng(202)
-        n = 15
-        dtype = numpy.float64
-        A_np = _make_spd(n, dtype, rng)
-        b_np = rng.standard_normal(n).astype(dtype)
-        A_dp = dpnp.asarray(A_np)
-        b_dp = dpnp.asarray(b_np)
-
-        x_gmres, _ = gmres(A_dp, b_dp, tol=1e-10, maxiter=100, restart=n)
-        x_cg, _ = cg(A_dp, b_dp, tol=1e-10, maxiter=500)
-        assert_allclose(_to_numpy(x_gmres), _to_numpy(x_cg), rtol=1e-5)
-
-    def test_gmres_restart_parameter(self):
-        """Restarted GMRES (restart < n) should still converge."""
-        rng = numpy.random.default_rng(203)
-        n = 20
-        dtype = numpy.float64
-        A_np = _make_nonsym(n, dtype, rng)
-        b_np = rng.standard_normal(n).astype(dtype)
-        A_dp = dpnp.asarray(A_np)
-        b_dp = dpnp.asarray(b_np)
-
-        x_dp, info = gmres(A_dp, b_dp, tol=1e-7, maxiter=20, restart=5)
+        res = float(dpnp.linalg.norm(ia @ x - ib) / dpnp.linalg.norm(ib))
+        assert res < _res_bound(dtype)
+
+    @pytest.mark.skipif(not is_scipy_available(), reason="SciPy not available")
+    @pytest.mark.parametrize("dtype", [dpnp.float32, dpnp.float64])
+    def test_cg_matches_scipy(self, dtype):
+        if not has_support_aspect64() and dtype == dpnp.float64:
+            pytest.skip("float64 not supported on this device")
+        a = dpnp.asnumpy(_spd_matrix(self.n, dtype))
+        b = dpnp.asnumpy(_rhs(self.n, dtype))
+        try:
+            x_ref, info_ref = scipy_sla.cg(a, b, rtol=1e-8, maxiter=500)
+        except TypeError:  # scipy < 1.12
+            x_ref, info_ref = scipy_sla.cg(a, b, tol=1e-8, maxiter=500)
+        assert info_ref == 0
+        x_dp, info = cg(dpnp.array(a), dpnp.array(b), rtol=1e-8, maxiter=500)
+        assert info == 0
+        tol = 1e-4 if dtype == dpnp.float32 else 1e-8
+        assert_allclose(dpnp.asnumpy(x_dp), x_ref, rtol=tol, atol=tol)
+
+    @pytest.mark.parametrize("dtype", [dpnp.float32, dpnp.float64])
+    def test_cg_x0_warm_start(self, dtype):
+        if not has_support_aspect64() and dtype == dpnp.float64:
+            pytest.skip("float64 not supported on this device")
+        ia = _spd_matrix(self.n, dtype)
+        ib = _rhs(self.n, dtype)
+        x0 = dpnp.ones(self.n, dtype=dtype)
+        x, info = cg(ia, ib, x0=x0, rtol=_rtol_for(dtype), maxiter=500)
+        assert info == 0
+        res = float(dpnp.linalg.norm(ia @ x - ib) / dpnp.linalg.norm(ib))
+        assert res < _res_bound(dtype)
+
+    @pytest.mark.parametrize("dtype", [dpnp.float32, dpnp.float64])
+    def test_cg_b_2dim(self, dtype):
+        # b with shape (n, 1) must be accepted and flattened internally
+        if not has_support_aspect64() and dtype == dpnp.float64:
+            pytest.skip("float64 not supported on this device")
+        ia = _spd_matrix(self.n, dtype)
+        ib = _rhs(self.n, dtype).reshape(self.n, 1)
+        _, info = cg(ia, ib, rtol=1e-8, maxiter=500)
         assert info == 0
-        assert _rel_residual(A_np, x_dp, b_np) < 1e-5
 
-    def test_gmres_x0_initial_guess(self):
-        rng = numpy.random.default_rng(204)
-        n = 10
-        dtype = numpy.float64
-        A_np = _make_nonsym(n, dtype, rng)
-        b_np = rng.standard_normal(n).astype(dtype)
-        A_dp = dpnp.asarray(A_np)
-        b_dp = dpnp.asarray(b_np)
-
-        x_ref = numpy.linalg.solve(A_np, b_np)
-        x0_dp = dpnp.asarray(x_ref)
-        x_dp, info = gmres(A_dp, b_dp, x0=x0_dp, tol=1e-10, maxiter=5, restart=n)
-        assert _rel_residual(A_np, x_dp, b_np) < 1e-8
-
-    def test_gmres_callback_called(self):
-        rng = numpy.random.default_rng(205)
-        n = 8
-        A_np = _make_nonsym(n, numpy.float64, rng)
-        b_np = rng.standard_normal(n)
-        A_dp = dpnp.asarray(A_np)
-        b_dp = dpnp.asarray(b_np)
+    def test_cg_b_zero(self):
+        if not has_support_aspect64():
+            pytest.skip("float64 not supported on this device")
+        ia = _spd_matrix(10, dpnp.float64)
+        ib = dpnp.zeros(10, dtype=dpnp.float64)
+        x, info = cg(ia, ib, rtol=1e-8)
+        assert info == 0
+        assert_allclose(dpnp.asnumpy(x), numpy.zeros(10), atol=1e-14)
 
+    def test_cg_callback(self):
+        if not has_support_aspect64():
+            pytest.skip("float64 not supported on this device")
+        ia = _spd_matrix(self.n, dpnp.float64)
+        ib = _rhs(self.n, dpnp.float64)
         calls = []
-        def cb(xk):
-            calls.append(1)
+        cg(
+            ia,
+            ib,
+            callback=lambda xk: calls.append(float(dpnp.linalg.norm(xk))),
+            rtol=1e-10,
+            maxiter=200,
+        )
+        assert len(calls) > 0
 
-        _, info = gmres(A_dp, b_dp, tol=1e-8, maxiter=20, callback=cb,
-                        callback_type="x", restart=n)
+    def test_cg_atol(self):
+        if not has_support_aspect64():
+            pytest.skip("float64 not supported on this device")
+        ia = _spd_matrix(self.n, dpnp.float64)
+        ib = _rhs(self.n, dpnp.float64)
+        x, _ = cg(ia, ib, rtol=0.0, atol=1e-1, maxiter=500)
+        assert float(dpnp.linalg.norm(ia @ x - ib)) < 1.0
+
+    def test_cg_exact_solution(self):
+        # x0 == true solution must return info == 0 immediately
+        if not has_support_aspect64():
+            pytest.skip("float64 not supported on this device")
+        n = 10
+        ia = _spd_matrix(n, dpnp.float64)
+        ib = _rhs(n, dpnp.float64)
+        x_true = dpnp.array(
+            numpy.linalg.solve(dpnp.asnumpy(ia), dpnp.asnumpy(ib))
+        )
+        _, info = cg(ia, ib, x0=x_true, rtol=1e-12)
         assert info == 0
-        assert len(calls) > 0
 
-    def test_gmres_already_zero_rhs(self):
-        n = 5
-        A_dp = dpnp.eye(n, dtype=numpy.float64)
-        b_dp = dpnp.zeros(n, dtype=numpy.float64)
-        x_dp, info = gmres(A_dp, b_dp)
+    @pytest.mark.parametrize("dtype", get_float_complex_dtypes())
+    def test_cg_via_linear_operator(self, dtype):
+        ia = _spd_matrix(self.n, dtype)
+        ib = _rhs(self.n, dtype)
+        lo = aslinearoperator(ia)
+        x, info = cg(lo, ib, rtol=_rtol_for(dtype), maxiter=500)
         assert info == 0
-        assert_allclose(_to_numpy(x_dp), numpy.zeros(n), atol=1e-14)
+        res = float(dpnp.linalg.norm(ia @ x - ib) / dpnp.linalg.norm(ib))
+        assert res < _res_bound(dtype)
+
+    def test_cg_maxiter_nonconvergence(self):
+        if not has_support_aspect64():
+            pytest.skip("float64 not supported on this device")
+        ia = _spd_matrix(50, dpnp.float64)
+        ib = _rhs(50, dpnp.float64)
+        _, info = cg(ia, ib, rtol=1e-15, atol=0.0, maxiter=1)
+        assert info != 0
 
-    def test_gmres_returns_dpnp_array(self):
-        n = 4
-        A_dp = dpnp.eye(n, dtype=numpy.float64)
-        b_dp = dpnp.ones(n, dtype=numpy.float64)
-        x_dp, _ = gmres(A_dp, b_dp)
-        assert isinstance(x_dp, dpnp.ndarray)
+    def test_cg_diag_preconditioner(self):
+        if not has_support_aspect64():
+            pytest.skip("float64 not supported on this device")
+        ia = _spd_matrix(self.n, dpnp.float64)
+        ib = _rhs(self.n, dpnp.float64)
+        M = aslinearoperator(dpnp.diag(1.0 / dpnp.diag(ia)))
+        _, info = cg(ia, ib, M=M, rtol=1e-8, maxiter=500)
+        assert info == 0
 
-    def test_gmres_with_atol(self):
-        rng = numpy.random.default_rng(206)
-        n = 10
-        dtype = numpy.float64
-        A_np = _make_nonsym(n, dtype, rng)
-        b_np = rng.standard_normal(n).astype(dtype)
+    def test_cg_errors(self):
+        if not has_support_aspect64():
+            pytest.skip("float64 not supported on this device")
+        ia = _spd_matrix(5, dpnp.float64)
+        ib = dpnp.ones(6, dtype=dpnp.float64)
+        # b length mismatch
+        with pytest.raises((ValueError, Exception)):
+            cg(ia, ib, maxiter=1)
+
+
+class TestGmres:
+    n = 30
+
+    @pytest.mark.parametrize("dtype", _GMRES_DTYPES)
+    def test_gmres_converges_diag_dominant(self, dtype):
+        if not has_support_aspect64() and dtype in (
+            dpnp.float64,
+            dpnp.complex128,
+        ):
+            pytest.skip("float64 not supported on this device")
+        ia = _diag_dominant(self.n, dtype)
+        ib = _rhs(self.n, dtype)
+        x, _ = gmres(
+            ia,
+            ib,
+            rtol=_rtol_for(dtype),
+            maxiter=200,
+            restart=self.n,
+        )
+        # Check actual residual rather than info: see comment above
+        # _GMRES_CPX_XFAIL.
+        res = float(dpnp.linalg.norm(ia @ x - ib) / dpnp.linalg.norm(ib))
+        assert res < _res_bound(dtype)
+
+    @pytest.mark.skipif(not is_scipy_available(), reason="SciPy not available")
+    @pytest.mark.parametrize("dtype", [dpnp.float32, dpnp.float64])
+    def test_gmres_matches_scipy(self, dtype):
+        if not has_support_aspect64() and dtype == dpnp.float64:
+            pytest.skip("float64 not supported on this device")
+        a = dpnp.asnumpy(_diag_dominant(self.n, dtype))
+        b = dpnp.asnumpy(_rhs(self.n, dtype))
+        req_rtol = _rtol_for(dtype)
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            try:
+                x_ref, _ = scipy_sla.gmres(
+                    a, b, rtol=req_rtol, restart=self.n, maxiter=None
+                )
+            except TypeError:  # scipy < 1.12
+                x_ref, _ = scipy_sla.gmres(
+                    a, b, tol=req_rtol, restart=self.n, maxiter=None
+                )
         x_dp, info = gmres(
-            dpnp.asarray(A_np),
-            dpnp.asarray(b_np),
-            tol=0.0,
-            atol=1e-7,
+            dpnp.array(a),
+            dpnp.array(b),
+            rtol=req_rtol,
+            restart=self.n,
             maxiter=50,
-            restart=n,
         )
         assert info == 0
+        tol = 1e-3 if dtype == dpnp.float32 else 1e-7
+        assert_allclose(dpnp.asnumpy(x_dp), x_ref, rtol=tol, atol=tol)
+
+    @pytest.mark.parametrize("restart", [None, 5, 15], ids=["None", "5", "15"])
+    def test_gmres_restart_values(self, restart):
+        if not has_support_aspect64():
+            pytest.skip("float64 not supported on this device")
+        ia = _diag_dominant(self.n, dpnp.float64)
+        ib = _rhs(self.n, dpnp.float64)
+        _, info = gmres(ia, ib, rtol=1e-8, restart=restart, maxiter=100)
+        assert info == 0
 
-    def test_gmres_with_linear_operator(self):
-        rng = numpy.random.default_rng(207)
-        n = 10
-        dtype = numpy.float64
-        A_np = _make_nonsym(n, dtype, rng)
-        A_dp = dpnp.asarray(A_np)
-        b_np = rng.standard_normal(n).astype(dtype)
-        b_dp = dpnp.asarray(b_np)
-
-        op = LinearOperator((n, n), matvec=lambda x: A_dp @ x, dtype=dtype)
-        x_dp, info = gmres(op, b_dp, tol=1e-8, maxiter=50, restart=n)
+    @pytest.mark.parametrize("dtype", [dpnp.float32, dpnp.float64])
+    def test_gmres_x0_warm_start(self, dtype):
+        if not has_support_aspect64() and dtype == dpnp.float64:
+            pytest.skip("float64 not supported on this device")
+        ia = _diag_dominant(self.n, dtype)
+        ib = _rhs(self.n, dtype)
+        x0 = dpnp.ones(self.n, dtype=dtype)
+        x, _ = gmres(
+            ia,
+            ib,
+            x0=x0,
+            rtol=_rtol_for(dtype),
+            restart=self.n,
+            maxiter=200,
+        )
+        res = float(dpnp.linalg.norm(ia @ x - ib) / dpnp.linalg.norm(ib))
+        assert res < _res_bound(dtype)
+
+    def test_gmres_b_2dim(self):
+        if not has_support_aspect64():
+            pytest.skip("float64 not supported on this device")
+        ia = _diag_dominant(self.n, dpnp.float64)
+        ib = _rhs(self.n, dpnp.float64).reshape(self.n, 1)
+        _, info = gmres(ia, ib, rtol=1e-8, restart=self.n, maxiter=100)
         assert info == 0
-        assert _rel_residual(A_np, x_dp, b_np) < 1e-6
-
-    def test_gmres_maxiter_exhausted_returns_nonzero_info(self):
-        rng = numpy.random.default_rng(208)
-        n = 20
-        dtype = numpy.float64
-        A_np = _make_nonsym(n, dtype, rng)
-        b_np = rng.standard_normal(n).astype(dtype)
-        A_dp = dpnp.asarray(A_np)
-        b_dp = dpnp.asarray(b_np)
-
-        _, info = gmres(A_dp, b_dp, tol=1e-20, maxiter=1, restart=2)
-        assert info != 0
 
-    def test_gmres_preconditioner_unsupported_raises(self):
-        """M != None must raise NotImplementedError regardless of system size."""
-        n = 4
-        A_dp = dpnp.eye(n, dtype=numpy.float64)
-        b_dp = dpnp.ones(n)
-        M = dpnp.eye(n)
-        with pytest.raises(NotImplementedError):
-            gmres(A_dp, b_dp, M=M)
-
-    def test_gmres_callback_type_pr_norm_raises(self):
-        """callback_type='pr_norm' must raise NotImplementedError for all n."""
-        n = 4
-        A_dp = dpnp.eye(n, dtype=numpy.float64)
-        b_dp = dpnp.ones(n)
-        with pytest.raises(NotImplementedError):
-            gmres(A_dp, b_dp, callback=lambda x: None, callback_type="pr_norm")
+    def test_gmres_b_zero(self):
+        if not has_support_aspect64():
+            pytest.skip("float64 not supported on this device")
+        ia = _diag_dominant(10, dpnp.float64)
+        ib = dpnp.zeros(10, dtype=dpnp.float64)
+        x, info = gmres(ia, ib, rtol=1e-8)
+        assert info == 0
+        assert_allclose(dpnp.asnumpy(x), numpy.zeros(10), atol=1e-14)
 
-    def test_gmres_invalid_callback_type_raises(self):
-        n = 4
-        A_dp = dpnp.eye(n, dtype=numpy.float64)
-        b_dp = dpnp.ones(n)
-        with pytest.raises(ValueError):
-            gmres(A_dp, b_dp, callback_type="bad_value")
+    def test_gmres_callback_x(self):
+        if not has_support_aspect64():
+            pytest.skip("float64 not supported on this device")
+        ia = _diag_dominant(self.n, dpnp.float64)
+        ib = _rhs(self.n, dpnp.float64)
+        calls = []
+        gmres(
+            ia,
+            ib,
+            callback=lambda xk: calls.append(1),
+            callback_type="x",
+            rtol=1e-10,
+            maxiter=20,
+            restart=self.n,
+        )
+        assert len(calls) > 0
 
-    @pytest.mark.parametrize("dtype", [numpy.float32, numpy.float64])
-    def test_gmres_dtype_preserved_in_output(self, dtype):
-        n = 6
-        rng = numpy.random.default_rng(209)
-        A_np = _make_nonsym(n, dtype, rng)
-        b_np = rng.standard_normal(n).astype(dtype)
-        x_dp, _ = gmres(
-            dpnp.asarray(A_np),
-            dpnp.asarray(b_np),
-            tol=1e-6,
+    def test_gmres_callback_pr_norm(self):
+        if not has_support_aspect64():
+            pytest.skip("float64 not supported on this device")
+        ia = _diag_dominant(self.n, dpnp.float64)
+        ib = _rhs(self.n, dpnp.float64)
+        values = []
+        gmres(
+            ia,
+            ib,
+            callback=lambda r: values.append(float(r)),
+            callback_type="pr_norm",
+            rtol=1e-10,
+            maxiter=20,
+            restart=self.n,
+        )
+        assert len(values) > 0
+        assert all(v >= 0 for v in values)
+
+    def test_gmres_atol(self):
+        if not has_support_aspect64():
+            pytest.skip("float64 not supported on this device")
+        ia = _diag_dominant(self.n, dpnp.float64)
+        ib = _rhs(self.n, dpnp.float64)
+        x, _ = gmres(
+            ia,
+            ib,
+            rtol=0.0,
+            atol=1e-6,
+            restart=self.n,
             maxiter=50,
-            restart=n,
         )
-        assert numpy.issubdtype(x_dp.dtype, numpy.floating)
-
-    @pytest.mark.parametrize("n", [5, 15])
-    def test_gmres_happy_breakdown(self, n):
-        """Identity operator should yield happy breakdown (exact solution)."""
-        A_dp = dpnp.eye(n, dtype=numpy.float64)
-        b_dp = dpnp.arange(1, n + 1, dtype=numpy.float64)
-        x_dp, info = gmres(A_dp, b_dp, tol=1e-12, maxiter=n, restart=n)
-        assert info == 0
-        # Expected dtype must be float64 to match strict NumPy >= 2.0 checks.
-        assert_allclose(_to_numpy(x_dp), numpy.arange(1, n + 1, dtype=numpy.float64), rtol=1e-10)
-
-
-# ---------------------------------------------------------------------------
-# TestMINRES
-# ---------------------------------------------------------------------------
-
-class TestMINRES:
-    """Tests for dpnp.scipy.sparse.linalg.minres (SciPy-backed stub)."""
-
-    @pytest.fixture(autouse=True)
-    def _skip_if_no_scipy(self):
-        pytest.importorskip("scipy", reason="SciPy required for minres tests")
-
-    @pytest.mark.parametrize("n", [5, 10, 20])
-    @pytest.mark.parametrize("dtype", [numpy.float32, numpy.float64])
-    def test_minres_spd_convergence(self, n, dtype):
-        rng = numpy.random.default_rng(300)
-        A_np = _make_spd(n, dtype, rng)
-        b_np = rng.standard_normal(n).astype(dtype)
-        A_dp = dpnp.asarray(A_np)
-        b_dp = dpnp.asarray(b_np)
-
-        x_dp, info = minres(A_dp, b_dp, tol=1e-7, maxiter=500)
-        assert info == 0, f"MINRES did not converge (info={info})"
-        assert _rel_residual(A_np, x_dp, b_np) < 1e-5
-
-    @pytest.mark.parametrize("dtype", [numpy.float32, numpy.float64])
-    def test_minres_sym_indef_convergence(self, dtype):
-        rng = numpy.random.default_rng(301)
-        n = 12
-        A_np = _make_sym_indef(n, dtype, rng)
-        b_np = rng.standard_normal(n).astype(dtype)
-        A_dp = dpnp.asarray(A_np)
-        b_dp = dpnp.asarray(b_np)
-
-        x_dp, info = minres(A_dp, b_dp, tol=1e-6, maxiter=500)
-        assert info == 0
-        assert _rel_residual(A_np, x_dp, b_np) < 1e-4
+        assert float(dpnp.linalg.norm(ia @ x - ib)) < 1e-4
+
+    @pytest.mark.parametrize("dtype", _GMRES_DTYPES)
+    def test_gmres_via_linear_operator(self, dtype):
+        if not has_support_aspect64() and dtype in (
+            dpnp.float64,
+            dpnp.complex128,
+        ):
+            pytest.skip("float64 not supported on this device")
+        ia = _diag_dominant(self.n, dtype)
+        ib = _rhs(self.n, dtype)
+        lo = aslinearoperator(ia)
+        x, _ = gmres(
+            lo,
+            ib,
+            rtol=_rtol_for(dtype),
+            restart=self.n,
+            maxiter=200,
+        )
+        res = float(dpnp.linalg.norm(ia @ x - ib) / dpnp.linalg.norm(ib))
+        assert res < _res_bound(dtype)
+
+    def test_gmres_nonconvergence(self):
+        # Ill-conditioned Hilbert matrix + tiny restart must not converge
+        if not has_support_aspect64():
+            pytest.skip("float64 not supported on this device")
+        n = 48
+        idx = numpy.arange(n, dtype=numpy.float64)
+        a = 1.0 / (idx[:, None] + idx[None, :] + 1.0)
+        rng = numpy.random.default_rng(5)
+        b = rng.standard_normal(n)
+        ia = dpnp.array(a)
+        ib = dpnp.array(b)
+        x, info = gmres(ia, ib, rtol=1e-15, atol=0.0, restart=2, maxiter=2)
+        rel = float(dpnp.linalg.norm(ia @ x - ib) / dpnp.linalg.norm(ib))
+        assert rel > 1e-12
+        assert info != 0
 
+    @pytest.mark.xfail(reason=_GMRES_CPX_XFAIL, strict=False)
+    def test_gmres_complex_system(self):
+        if not has_support_aspect64():
+            pytest.skip("complex128 not supported on this device")
+        n = 15
+        ia = _diag_dominant(n, dpnp.complex128)
+        ib = _rhs(n, dpnp.complex128)
+        x, _ = gmres(ia, ib, rtol=1e-8, restart=n, maxiter=200)
+        res = float(dpnp.linalg.norm(ia @ x - ib) / dpnp.linalg.norm(ib))
+        assert res < 1e-5
+
+    def test_gmres_errors(self):
+        if not has_support_aspect64():
+            pytest.skip("float64 not supported on this device")
+        ia = _diag_dominant(self.n, dpnp.float64)
+        ib = _rhs(self.n, dpnp.float64)
+        # unknown callback_type
+        assert_raises(ValueError, gmres, ia, ib, callback_type="garbage")
+
+
+class TestMinres:
+    n = 30
+
+    @pytest.mark.parametrize("dtype", [dpnp.float32, dpnp.float64])
+    def test_minres_converges_spd(self, dtype):
+        if not has_support_aspect64() and dtype == dpnp.float64:
+            pytest.skip("float64 not supported on this device")
+        ia = _spd_matrix(self.n, dtype)
+        ib = _rhs(self.n, dtype)
+        x, info = minres(ia, ib, rtol=1e-8, maxiter=500)
+        assert info == 0
+        res = float(dpnp.linalg.norm(ia @ x - ib) / dpnp.linalg.norm(ib))
+        assert res < 1e-4
+
+    def test_minres_converges_sym_indefinite(self):
+        if not has_support_aspect64():
+            pytest.skip("float64 not supported on this device")
+        ia = _sym_indefinite(self.n, dpnp.float64)
+        ib = _rhs(self.n, dpnp.float64)
+        x, _ = minres(ia, ib, rtol=1e-8, maxiter=1000)
+        res = float(dpnp.linalg.norm(ia @ x - ib) / dpnp.linalg.norm(ib))
+        assert res < 1e-3
+
+    @pytest.mark.skipif(not is_scipy_available(), reason="SciPy not available")
     def test_minres_matches_scipy(self):
-        import scipy.sparse.linalg as sla
-
-        rng = numpy.random.default_rng(302)
-        n = 10
-        dtype = numpy.float64
-        A_np = _make_spd(n, dtype, rng)
-        b_np = rng.standard_normal(n).astype(dtype)
+        if not has_support_aspect64():
+            pytest.skip("float64 not supported on this device")
+        a = dpnp.asnumpy(_spd_matrix(self.n, dpnp.float64))
+        b = dpnp.asnumpy(_rhs(self.n, dpnp.float64))
+        try:
+            x_ref, _ = scipy_sla.minres(a, b, rtol=1e-8)
+        except TypeError:
+            x_ref, _ = scipy_sla.minres(a, b, tol=1e-8)
+        x_dp, info = minres(dpnp.array(a), dpnp.array(b), rtol=1e-8)
+        assert info == 0
+        assert_allclose(dpnp.asnumpy(x_dp), x_ref, rtol=1e-5, atol=1e-6)
+
+    def test_minres_x0_warm_start(self):
+        if not has_support_aspect64():
+            pytest.skip("float64 not supported on this device")
+        ia = _spd_matrix(self.n, dpnp.float64)
+        ib = _rhs(self.n, dpnp.float64)
+        x0 = dpnp.zeros(self.n, dtype=dpnp.float64)
+        _, info = minres(ia, ib, x0=x0, rtol=1e-8)
+        assert info == 0
 
-        x_scipy, info_scipy = sla.minres(A_np, b_np, rtol=1e-10)
-        x_dp, info_dp = minres(
-            dpnp.asarray(A_np), dpnp.asarray(b_np), tol=1e-10
+    def test_minres_shift(self):
+        # shift != 0 solves (A - shift*I) x = b
+        if not has_support_aspect64():
+            pytest.skip("float64 not supported on this device")
+        a = dpnp.asnumpy(_spd_matrix(self.n, dpnp.float64))
+        b = dpnp.asnumpy(_rhs(self.n, dpnp.float64))
+        shift = 0.5
+        x_dp, info = minres(
+            dpnp.array(a), dpnp.array(b), shift=shift, rtol=1e-8
         )
-        assert info_dp == 0
-        assert_allclose(_to_numpy(x_dp), x_scipy, rtol=1e-6)
-
-    def test_minres_x0_initial_guess(self):
-        rng = numpy.random.default_rng(303)
-        n = 8
-        dtype = numpy.float64
-        A_np = _make_spd(n, dtype, rng)
-        b_np = rng.standard_normal(n).astype(dtype)
-        A_dp = dpnp.asarray(A_np)
-        b_dp = dpnp.asarray(b_np)
-
-        x_ref = numpy.linalg.solve(A_np, b_np)
-        x0_dp = dpnp.asarray(x_ref)
-        x_dp, info = minres(A_dp, b_dp, x0=x0_dp, tol=1e-10, maxiter=5)
-        assert _rel_residual(A_np, x_dp, b_np) < 1e-8
-
-    def test_minres_returns_dpnp_array(self):
-        n = 4
-        A_dp = dpnp.eye(n, dtype=numpy.float64)
-        b_dp = dpnp.ones(n, dtype=numpy.float64)
-        x_dp, _ = minres(A_dp, b_dp)
-        assert isinstance(x_dp, dpnp.ndarray)
-
-    def test_minres_already_zero_rhs(self):
-        n = 5
-        A_dp = dpnp.eye(n, dtype=numpy.float64)
-        b_dp = dpnp.zeros(n, dtype=numpy.float64)
-        x_dp, info = minres(A_dp, b_dp)
         assert info == 0
-        assert_allclose(_to_numpy(x_dp), numpy.zeros(n), atol=1e-14)
-
-    def test_minres_non_square_raises(self):
-        A_dp = dpnp.ones((4, 6), dtype=numpy.float64)
-        b_dp = dpnp.ones(4, dtype=numpy.float64)
-        with pytest.raises(ValueError, match="square"):
-            minres(A_dp, b_dp)
-
-    def test_minres_with_shift(self):
-        rng = numpy.random.default_rng(304)
-        n = 8
-        dtype = numpy.float64
-        A_np = _make_spd(n, dtype, rng)
-        b_np = rng.standard_normal(n).astype(dtype)
-        A_dp = dpnp.asarray(A_np)
-        b_dp = dpnp.asarray(b_np)
-
-        x_dp, info = minres(A_dp, b_dp, tol=1e-8, shift=0.0)
+        a_shifted = a - shift * numpy.eye(self.n)
+        res = numpy.linalg.norm(
+            a_shifted @ dpnp.asnumpy(x_dp) - b
+        ) / numpy.linalg.norm(b)
+        assert res < 1e-4
+
+    def test_minres_b_zero(self):
+        if not has_support_aspect64():
+            pytest.skip("float64 not supported on this device")
+        ia = _spd_matrix(10, dpnp.float64)
+        ib = dpnp.zeros(10, dtype=dpnp.float64)
+        x, info = minres(ia, ib, rtol=1e-8)
         assert info == 0
-        assert _rel_residual(A_np, x_dp, b_np) < 1e-6
-
-    def test_minres_with_linear_operator(self):
-        rng = numpy.random.default_rng(305)
-        n = 10
-        dtype = numpy.float64
-        A_np = _make_spd(n, dtype, rng)
-        A_dp = dpnp.asarray(A_np)
-        b_np = rng.standard_normal(n).astype(dtype)
-        b_dp = dpnp.asarray(b_np)
-
-        op = LinearOperator((n, n), matvec=lambda x: A_dp @ x, dtype=dtype)
-        x_dp, info = minres(op, b_dp, tol=1e-8, maxiter=500)
+        assert_allclose(dpnp.asnumpy(x), numpy.zeros(10), atol=1e-14)
+
+    def test_minres_via_linear_operator(self):
+        if not has_support_aspect64():
+            pytest.skip("float64 not supported on this device")
+        ia = _spd_matrix(self.n, dpnp.float64)
+        ib = _rhs(self.n, dpnp.float64)
+        lo = aslinearoperator(ia)
+        _, info = minres(lo, ib, rtol=1e-8)
         assert info == 0
-        assert _rel_residual(A_np, x_dp, b_np) < 1e-6
 
-    def test_minres_with_preconditioner(self):
-        rng = numpy.random.default_rng(306)
-        n = 10
-        dtype = numpy.float64
-        A_np = _make_spd(n, dtype, rng)
-        A_dp = dpnp.asarray(A_np)
-        b_np = rng.standard_normal(n).astype(dtype)
-        b_dp = dpnp.asarray(b_np)
-
-        diag_A = numpy.diag(A_np)
-        M_np = numpy.diag(1.0 / diag_A)
-        M_dp = dpnp.asarray(M_np)
-
-        op_M = LinearOperator((n, n), matvec=lambda x: M_dp @ x, dtype=dtype)
-        x_dp, info = minres(A_dp, b_dp, M=op_M, tol=1e-8, maxiter=500)
+    def test_minres_callback(self):
+        if not has_support_aspect64():
+            pytest.skip("float64 not supported on this device")
+        ia = _spd_matrix(self.n, dpnp.float64)
+        ib = _rhs(self.n, dpnp.float64)
+        calls = []
+        minres(
+            ia,
+            ib,
+            callback=lambda xk: calls.append(1),
+            rtol=1e-10,
+        )
+        assert len(calls) > 0
+
+    def test_minres_errors(self):
+        if not has_support_aspect64():
+            pytest.skip("float64 not supported on this device")
+        lo = aslinearoperator(dpnp.ones((4, 5), dtype=dpnp.float64))
+        ib = dpnp.ones(4, dtype=dpnp.float64)
+        # non-square operator
+        assert_raises((ValueError, Exception), minres, lo, ib)
+
+
+class TestSolversIntegration:
+    @pytest.mark.parametrize(
+        "n, dtype",
+        [
+            (10, dpnp.float32),
+            (10, dpnp.float64),
+            (30, dpnp.float64),
+            (50, dpnp.float64),
+        ],
+    )
+    def test_cg_spd_via_linearoperator(self, n, dtype):
+        if not has_support_aspect64() and dtype == dpnp.float64:
+            pytest.skip("float64 not supported on this device")
+        ia = _spd_matrix(n, dtype)
+        lo = aslinearoperator(ia)
+        ib = _rhs(n, dtype)
+        x, info = cg(lo, ib, rtol=_rtol_for(dtype), maxiter=n * 10)
         assert info == 0
-        assert _rel_residual(A_np, x_dp, b_np) < 1e-5
-
-
-# ---------------------------------------------------------------------------
-# Cross-solver consistency
-# ---------------------------------------------------------------------------
-
-class TestSolverConsistency:
-    """Verify that CG, GMRES, and MINRES agree on SPD systems."""
-
-    @pytest.fixture(autouse=True)
-    def _skip_if_no_scipy(self):
-        pytest.importorskip("scipy", reason="SciPy required for minres in consistency tests")
-
-    @pytest.mark.parametrize("n", [8, 16])
-    def test_cg_gmres_minres_agree_spd(self, n):
-        rng = numpy.random.default_rng(400)
-        dtype = numpy.float64
-        A_np = _make_spd(n, dtype, rng)
-        b_np = rng.standard_normal(n).astype(dtype)
-        A_dp = dpnp.asarray(A_np)
-        b_dp = dpnp.asarray(b_np)
-
-        x_cg, info_cg = cg(A_dp, b_dp, tol=1e-10, maxiter=500)
-        x_gm, info_gm = gmres(A_dp, b_dp, tol=1e-10, maxiter=50, restart=n)
-        x_mr, info_mr = minres(A_dp, b_dp, tol=1e-10, maxiter=500)
-
-        assert info_cg == 0 and info_gm == 0 and info_mr == 0
-
-        assert_allclose(_to_numpy(x_cg), _to_numpy(x_gm), rtol=1e-5,
-                        err_msg="CG and GMRES disagree")
-        assert_allclose(_to_numpy(x_cg), _to_numpy(x_mr), rtol=1e-5,
-                        err_msg="CG and MINRES disagree")
-
-    def test_all_solvers_vs_numpy_direct(self):
-        rng = numpy.random.default_rng(401)
-        n = 12
-        dtype = numpy.float64
-        A_np = _make_spd(n, dtype, rng)
-        b_np = rng.standard_normal(n).astype(dtype)
-        A_dp = dpnp.asarray(A_np)
-        b_dp = dpnp.asarray(b_np)
-        x_ref = numpy.linalg.solve(A_np, b_np)
-
-        x_cg, _ = cg(A_dp, b_dp, tol=1e-11, maxiter=500)
-        x_gm, _ = gmres(A_dp, b_dp, tol=1e-11, maxiter=50, restart=n)
-        x_mr, _ = minres(A_dp, b_dp, tol=1e-11, maxiter=500)
-
-        for name, x_dp in [("cg", x_cg), ("gmres", x_gm), ("minres", x_mr)]:
-            assert_allclose(
-                _to_numpy(x_dp), x_ref, rtol=1e-7,
-                err_msg=f"{name} deviates from numpy.linalg.solve"
-            )
-
-
-# ---------------------------------------------------------------------------
-# Import-level smoke test
-# ---------------------------------------------------------------------------
-
-def test_public_api_importable():
-    """Verify all four public names are importable from the module."""
-    from dpnp.scipy.sparse.linalg import (  # noqa: F401
-        LinearOperator,
-        aslinearoperator,
-        cg,
-        gmres,
-        minres,
+        res = float(dpnp.linalg.norm(ia @ x - ib) / dpnp.linalg.norm(ib))
+        assert res < _res_bound(dtype)
+
+    @pytest.mark.parametrize(
+        "n, dtype",
+        [
+            (10, dpnp.float32),
+            (10, dpnp.float64),
+            (30, dpnp.float64),
+        ],
     )
+    def test_gmres_nonsymmetric_via_linearoperator(self, n, dtype):
+        if not has_support_aspect64() and dtype == dpnp.float64:
+            pytest.skip("float64 not supported on this device")
+        ia = _diag_dominant(n, dtype)
+        lo = aslinearoperator(ia)
+        ib = _rhs(n, dtype)
+        x, _ = gmres(lo, ib, rtol=_rtol_for(dtype), restart=n, maxiter=200)
+        res = float(dpnp.linalg.norm(ia @ x - ib) / dpnp.linalg.norm(ib))
+        assert res < _res_bound(dtype)
+
+    @pytest.mark.skipif(
+        not is_scipy_available(), reason="SciPy required for minres"
+    )
+    @pytest.mark.parametrize(
+        "n, dtype",
+        [
+            (10, dpnp.float64),
+            (30, dpnp.float64),
+        ],
+    )
+    def test_minres_spd_via_linearoperator(self, n, dtype):
+        if not has_support_aspect64() and dtype == dpnp.float64:
+            pytest.skip("float64 not supported on this device")
+        ia = _spd_matrix(n, dtype)
+        lo = aslinearoperator(ia)
+        ib = _rhs(n, dtype)
+        x, info = minres(lo, ib, rtol=1e-8)
+        assert info == 0
+        res = float(dpnp.linalg.norm(ia @ x - ib) / dpnp.linalg.norm(ib))
+        assert res < 1e-4

From ac3bed570c290a319a8844fdc8f1a4ce6aefab5b Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <abagusetty@anl.gov>
Date: Thu, 9 Apr 2026 03:32:58 +0000
Subject: [PATCH 39/43] black formatting

---
 dpnp/scipy/sparse/linalg/_interface.py | 223 ++++++++++++++++++-------
 dpnp/scipy/sparse/linalg/_iterative.py | 148 ++++++++++------
 2 files changed, 259 insertions(+), 112 deletions(-)

diff --git a/dpnp/scipy/sparse/linalg/_interface.py b/dpnp/scipy/sparse/linalg/_interface.py
index fd82c4a43282..623ada2c33cc 100644
--- a/dpnp/scipy/sparse/linalg/_interface.py
+++ b/dpnp/scipy/sparse/linalg/_interface.py
@@ -47,11 +47,11 @@
 
 import dpnp
 
-
 # ---------------------------------------------------------------------------
 # helpers
 # ---------------------------------------------------------------------------
 
+
 def _isshape(shape):
     """Return True if shape is a length-2 tuple of non-negative integers."""
     if not isinstance(shape, tuple) or len(shape) != 2:
@@ -77,6 +77,7 @@ def _get_dtype(operators, dtypes=None):
             dtypes.append(obj.dtype)
     return dpnp.result_type(*dtypes) if dtypes else None
 
+
 class LinearOperator:
     """Drop-in replacement for cupyx/scipy LinearOperator backed by dpnp arrays.
 
@@ -91,8 +92,10 @@ def __new__(cls, *args, **kwargs):
             return super().__new__(_CustomLinearOperator)
         else:
             obj = super().__new__(cls)
-            if (type(obj)._matvec is LinearOperator._matvec
-                    and type(obj)._matmat is LinearOperator._matmat):
+            if (
+                type(obj)._matvec is LinearOperator._matvec
+                and type(obj)._matmat is LinearOperator._matmat
+            ):
                 warnings.warn(
                     "LinearOperator subclass should implement at least one of "
                     "_matvec and _matmat.",
@@ -125,13 +128,13 @@ def _matvec(self, x):
         return self.matmat(x.reshape(-1, 1))
 
     def _matmat(self, X):
-        return dpnp.hstack(
-            [self.matvec(col.reshape(-1, 1)) for col in X.T]
-        )
+        return dpnp.hstack([self.matvec(col.reshape(-1, 1)) for col in X.T])
 
     def _rmatvec(self, x):
         if type(self)._adjoint is LinearOperator._adjoint:
-            raise NotImplementedError("rmatvec is not defined for this LinearOperator")
+            raise NotImplementedError(
+                "rmatvec is not defined for this LinearOperator"
+            )
         return self.H.matvec(x)
 
     def _rmatmat(self, X):
@@ -163,14 +166,18 @@ def matmat(self, X):
         if X.ndim != 2:
             raise ValueError(f"expected 2-D array, got {X.ndim}-D")
         if X.shape[0] != self.shape[1]:
-            raise ValueError(f"dimension mismatch: {self.shape!r} vs {X.shape!r}")
+            raise ValueError(
+                f"dimension mismatch: {self.shape!r} vs {X.shape!r}"
+            )
         return self._matmat(X)
 
     def rmatmat(self, X):
         if X.ndim != 2:
             raise ValueError(f"expected 2-D array, got {X.ndim}-D")
         if X.shape[0] != self.shape[0]:
-            raise ValueError(f"dimension mismatch: {self.shape!r} vs {X.shape!r}")
+            raise ValueError(
+                f"dimension mismatch: {self.shape!r} vs {X.shape!r}"
+            )
         return self._rmatmat(X)
 
     def dot(self, x):
@@ -184,7 +191,9 @@ def dot(self, x):
                 return self.matvec(x)
             elif x.ndim == 2:
                 return self.matmat(x)
-            raise ValueError(f"expected 1-D or 2-D array or LinearOperator, got {x!r}")
+            raise ValueError(
+                f"expected 1-D or 2-D array or LinearOperator, got {x!r}"
+            )
 
     def __call__(self, x):
         return self * x
@@ -194,12 +203,16 @@ def __mul__(self, x):
 
     def __matmul__(self, x):
         if dpnp.isscalar(x):
-            raise ValueError("Scalar operands not allowed with '@'; use '*' instead")
+            raise ValueError(
+                "Scalar operands not allowed with '@'; use '*' instead"
+            )
         return self.__mul__(x)
 
     def __rmatmul__(self, x):
         if dpnp.isscalar(x):
-            raise ValueError("Scalar operands not allowed with '@'; use '*' instead")
+            raise ValueError(
+                "Scalar operands not allowed with '@'; use '*' instead"
+            )
         return self.__rmul__(x)
 
     def __rmul__(self, x):
@@ -245,7 +258,9 @@ def transpose(self):
     T = property(transpose)
 
     def __repr__(self):
-        dt = "unspecified dtype" if self.dtype is None else f"dtype={self.dtype}"
+        dt = (
+            "unspecified dtype" if self.dtype is None else f"dtype={self.dtype}"
+        )
         return f"<{self.shape[0]}x{self.shape[1]} {self.__class__.__name__} with {dt}>"
 
 
@@ -253,20 +268,23 @@ def __repr__(self):
 # Concrete operator classes
 # ---------------------------------------------------------------------------
 
+
 class _CustomLinearOperator(LinearOperator):
     """Created when the user calls LinearOperator(shape, matvec=...) directly."""
 
-    def __init__(self, shape, matvec, rmatvec=None, matmat=None,
-                 dtype=None, rmatmat=None):
+    def __init__(
+        self, shape, matvec, rmatvec=None, matmat=None, dtype=None, rmatmat=None
+    ):
         super().__init__(dtype, shape)
         self.args = ()
-        self.__matvec_impl  = matvec
+        self.__matvec_impl = matvec
         self.__rmatvec_impl = rmatvec
         self.__rmatmat_impl = rmatmat
-        self.__matmat_impl  = matmat
+        self.__matmat_impl = matmat
         self._init_dtype()
 
-    def _matvec(self, x):  return self.__matvec_impl(x)
+    def _matvec(self, x):
+        return self.__matvec_impl(x)
 
     def _matmat(self, X):
         if self.__matmat_impl is not None:
@@ -275,7 +293,9 @@ def _matmat(self, X):
 
     def _rmatvec(self, x):
         if self.__rmatvec_impl is None:
-            raise NotImplementedError("rmatvec is not defined for this operator")
+            raise NotImplementedError(
+                "rmatvec is not defined for this operator"
+            )
         return self.__rmatvec_impl(x)
 
     def _rmatmat(self, X):
@@ -300,11 +320,20 @@ def __init__(self, A):
         self.A = A
         self.args = (A,)
 
-    def _matvec(self, x):  return self.A._rmatvec(x)
-    def _rmatvec(self, x): return self.A._matvec(x)
-    def _matmat(self, X):  return self.A._rmatmat(X)
-    def _rmatmat(self, X): return self.A._matmat(X)
-    def _adjoint(self):    return self.A
+    def _matvec(self, x):
+        return self.A._rmatvec(x)
+
+    def _rmatvec(self, x):
+        return self.A._matvec(x)
+
+    def _matmat(self, X):
+        return self.A._rmatmat(X)
+
+    def _rmatmat(self, X):
+        return self.A._matmat(X)
+
+    def _adjoint(self):
+        return self.A
 
 
 class _TransposedLinearOperator(LinearOperator):
@@ -313,11 +342,20 @@ def __init__(self, A):
         self.A = A
         self.args = (A,)
 
-    def _matvec(self, x):  return dpnp.conj(self.A._rmatvec(dpnp.conj(x)))
-    def _rmatvec(self, x): return dpnp.conj(self.A._matvec(dpnp.conj(x)))
-    def _matmat(self, X):  return dpnp.conj(self.A._rmatmat(dpnp.conj(X)))
-    def _rmatmat(self, X): return dpnp.conj(self.A._matmat(dpnp.conj(X)))
-    def _transpose(self):  return self.A
+    def _matvec(self, x):
+        return dpnp.conj(self.A._rmatvec(dpnp.conj(x)))
+
+    def _rmatvec(self, x):
+        return dpnp.conj(self.A._matvec(dpnp.conj(x)))
+
+    def _matmat(self, X):
+        return dpnp.conj(self.A._rmatmat(dpnp.conj(X)))
+
+    def _rmatmat(self, X):
+        return dpnp.conj(self.A._matmat(dpnp.conj(X)))
+
+    def _transpose(self):
+        return self.A
 
 
 class _SumLinearOperator(LinearOperator):
@@ -327,11 +365,20 @@ def __init__(self, A, B):
         super().__init__(_get_dtype([A, B]), A.shape)
         self.args = (A, B)
 
-    def _matvec(self, x):  return self.args[0].matvec(x)  + self.args[1].matvec(x)
-    def _rmatvec(self, x): return self.args[0].rmatvec(x) + self.args[1].rmatvec(x)
-    def _matmat(self, X):  return self.args[0].matmat(X)  + self.args[1].matmat(X)
-    def _rmatmat(self, X): return self.args[0].rmatmat(X) + self.args[1].rmatmat(X)
-    def _adjoint(self):    return self.args[0].H + self.args[1].H
+    def _matvec(self, x):
+        return self.args[0].matvec(x) + self.args[1].matvec(x)
+
+    def _rmatvec(self, x):
+        return self.args[0].rmatvec(x) + self.args[1].rmatvec(x)
+
+    def _matmat(self, X):
+        return self.args[0].matmat(X) + self.args[1].matmat(X)
+
+    def _rmatmat(self, X):
+        return self.args[0].rmatmat(X) + self.args[1].rmatmat(X)
+
+    def _adjoint(self):
+        return self.args[0].H + self.args[1].H
 
 
 class _ProductLinearOperator(LinearOperator):
@@ -341,29 +388,53 @@ def __init__(self, A, B):
         super().__init__(_get_dtype([A, B]), (A.shape[0], B.shape[1]))
         self.args = (A, B)
 
-    def _matvec(self, x):  return self.args[0].matvec(self.args[1].matvec(x))
-    def _rmatvec(self, x): return self.args[1].rmatvec(self.args[0].rmatvec(x))
-    def _matmat(self, X):  return self.args[0].matmat(self.args[1].matmat(X))
-    def _rmatmat(self, X): return self.args[1].rmatmat(self.args[0].rmatmat(X))
-    def _adjoint(self):    A, B = self.args; return B.H * A.H
+    def _matvec(self, x):
+        return self.args[0].matvec(self.args[1].matvec(x))
+
+    def _rmatvec(self, x):
+        return self.args[1].rmatvec(self.args[0].rmatvec(x))
+
+    def _matmat(self, X):
+        return self.args[0].matmat(self.args[1].matmat(X))
+
+    def _rmatmat(self, X):
+        return self.args[1].rmatmat(self.args[0].rmatmat(X))
+
+    def _adjoint(self):
+        A, B = self.args
+        return B.H * A.H
+
 
 class _ScaledLinearOperator(LinearOperator):
     def __init__(self, A, alpha):
         super().__init__(_get_dtype([A], [type(alpha)]), A.shape)
         self.args = (A, alpha)
 
-    def _matvec(self, x):  return self.args[1] * self.args[0].matvec(x)
-    def _rmatvec(self, x): return dpnp.conj(self.args[1]) * self.args[0].rmatvec(x)
-    def _matmat(self, X):  return self.args[1] * self.args[0].matmat(X)
-    def _rmatmat(self, X): return dpnp.conj(self.args[1]) * self.args[0].rmatmat(X)
-    def _adjoint(self):    A, alpha = self.args; return A.H * dpnp.conj(alpha)
+    def _matvec(self, x):
+        return self.args[1] * self.args[0].matvec(x)
+
+    def _rmatvec(self, x):
+        return dpnp.conj(self.args[1]) * self.args[0].rmatvec(x)
+
+    def _matmat(self, X):
+        return self.args[1] * self.args[0].matmat(X)
+
+    def _rmatmat(self, X):
+        return dpnp.conj(self.args[1]) * self.args[0].rmatmat(X)
+
+    def _adjoint(self):
+        A, alpha = self.args
+        return A.H * dpnp.conj(alpha)
+
 
 class _PowerLinearOperator(LinearOperator):
     def __init__(self, A, p):
         if A.shape[0] != A.shape[1]:
             raise ValueError("matrix power requires a square operator")
         if not _isintlike(p) or p < 0:
-            raise ValueError("matrix power requires a non-negative integer exponent")
+            raise ValueError(
+                "matrix power requires a non-negative integer exponent"
+            )
         super().__init__(_get_dtype([A]), A.shape)
         self.args = (A, int(p))
 
@@ -373,11 +444,21 @@ def _power(self, f, x):
             res = f(res)
         return res
 
-    def _matvec(self, x):  return self._power(self.args[0].matvec, x)
-    def _rmatvec(self, x): return self._power(self.args[0].rmatvec, x)
-    def _matmat(self, X):  return self._power(self.args[0].matmat, X)
-    def _rmatmat(self, X): return self._power(self.args[0].rmatmat, X)
-    def _adjoint(self):    A, p = self.args; return A.H ** p
+    def _matvec(self, x):
+        return self._power(self.args[0].matvec, x)
+
+    def _rmatvec(self, x):
+        return self._power(self.args[0].rmatvec, x)
+
+    def _matmat(self, X):
+        return self._power(self.args[0].matmat, X)
+
+    def _rmatmat(self, X):
+        return self._power(self.args[0].rmatmat, X)
+
+    def _adjoint(self):
+        A, p = self.args
+        return A.H**p
 
 
 class MatrixLinearOperator(LinearOperator):
@@ -385,12 +466,15 @@ class MatrixLinearOperator(LinearOperator):
 
     def __init__(self, A):
         super().__init__(A.dtype, A.shape)
-        self.A    = A
+        self.A = A
         self.__adj = None
-        self.args  = (A,)
+        self.args = (A,)
+
+    def _matmat(self, X):
+        return self.A.dot(X)
 
-    def _matmat(self, X):  return self.A.dot(X)
-    def _rmatmat(self, X): return dpnp.conj(self.A.T).dot(X)
+    def _rmatmat(self, X):
+        return dpnp.conj(self.A.T).dot(X)
 
     def _adjoint(self):
         if self.__adj is None:
@@ -400,10 +484,10 @@ def _adjoint(self):
 
 class _AdjointMatrixOperator(MatrixLinearOperator):
     def __init__(self, adjoint):
-        self.A        = dpnp.conj(adjoint.A.T)
+        self.A = dpnp.conj(adjoint.A.T)
         self.__adjoint = adjoint
-        self.args      = (adjoint,)
-        self.shape     = (adjoint.shape[1], adjoint.shape[0])
+        self.args = (adjoint,)
+        self.shape = (adjoint.shape[1], adjoint.shape[0])
 
     @property
     def dtype(self):
@@ -419,12 +503,24 @@ class IdentityOperator(LinearOperator):
     def __init__(self, shape, dtype=None):
         super().__init__(dtype, shape)
 
-    def _matvec(self, x):  return x
-    def _rmatvec(self, x): return x
-    def _matmat(self, X):  return X
-    def _rmatmat(self, X): return X
-    def _adjoint(self):    return self
-    def _transpose(self):  return self
+    def _matvec(self, x):
+        return x
+
+    def _rmatvec(self, x):
+        return x
+
+    def _matmat(self, X):
+        return X
+
+    def _rmatmat(self, X):
+        return X
+
+    def _adjoint(self):
+        return self
+
+    def _transpose(self):
+        return self
+
 
 def aslinearoperator(A) -> LinearOperator:
     """Wrap A as a LinearOperator if it is not already one.
@@ -440,6 +536,7 @@ def aslinearoperator(A) -> LinearOperator:
 
     try:
         from dpnp.scipy import sparse as _sp
+
         if _sp.issparse(A):
             return MatrixLinearOperator(A)
     except (ImportError, AttributeError):
diff --git a/dpnp/scipy/sparse/linalg/_iterative.py b/dpnp/scipy/sparse/linalg/_iterative.py
index df4a7a654bed..786d7f9f92de 100644
--- a/dpnp/scipy/sparse/linalg/_iterative.py
+++ b/dpnp/scipy/sparse/linalg/_iterative.py
@@ -72,13 +72,13 @@
 
 from ._interface import IdentityOperator, LinearOperator, aslinearoperator
 
-
 # ---------------------------------------------------------------------------
 # oneMKL sparse SpMV hook -- cached-handle API
 # ---------------------------------------------------------------------------
 
 try:
     from dpnp.backend.extensions.sparse import _sparse_impl as _si
+
     _HAS_SPARSE_IMPL = True
 except ImportError:
     _si = None
@@ -91,6 +91,7 @@
 # Internal helpers
 # ---------------------------------------------------------------------------
 
+
 def _np_dtype(dp_dtype) -> numpy.dtype:
     """Normalise any dtype-like (dpnp type, numpy type, string) to numpy.dtype."""
     return numpy.dtype(dp_dtype)
@@ -103,6 +104,7 @@ def _check_dtype(dtype, name: str) -> None:
             "only float32, float64, complex64, complex128 are accepted."
         )
 
+
 class _CachedSpMV:
     """
     Wrap a CSR matrix with a persistent oneMKL matrix_handle.
@@ -117,9 +119,19 @@ class _CachedSpMV:
     trans : int  0=N, 1=T, 2=C  (fixed at construction)
     """
 
-    __slots__ = ("_A", "_exec_q", "_handle", "_trans",
-                 "_nrows", "_ncols", "_nnz", "_out_size", "_in_size",
-                 "_dtype", "_val_type_id")
+    __slots__ = (
+        "_A",
+        "_exec_q",
+        "_handle",
+        "_trans",
+        "_nrows",
+        "_ncols",
+        "_nnz",
+        "_out_size",
+        "_in_size",
+        "_dtype",
+        "_val_type_id",
+    )
 
     def __init__(self, A, trans: int = 0):
         self._A = A  # keep alive so USM pointers stay valid
@@ -163,8 +175,9 @@ def __init__(self, A, trans: int = 0):
 
     def __call__(self, x: dpnp.ndarray) -> dpnp.ndarray:
         """y = op(A) * x -- only sparse::gemv fires, fully async."""
-        y = dpnp.empty(self._out_size, dtype=self._dtype,
-                        sycl_queue=self._exec_q)
+        y = dpnp.empty(
+            self._out_size, dtype=self._dtype, sycl_queue=self._exec_q
+        )
         # Do NOT wait on the event -- subsequent dpnp ops on the same
         # queue will serialize behind it automatically. Blocking here
         # throws away async overlap and dominates small-problem runtime.
@@ -194,8 +207,10 @@ def __del__(self):
                 pass
             self._handle = None
 
+
 class _CachedSpMVPair:
     """Holds forward and (lazily built) adjoint cached SpMV handles."""
+
     __slots__ = ("forward", "_A", "_adjoint")
 
     def __init__(self, A):
@@ -210,11 +225,11 @@ def rmatvec(self, x):
         if self._adjoint is None:
             # Build conjtrans handle on first use. For real dtypes
             # this is equivalent to trans=1.
-            is_cpx = dpnp.issubdtype(self._A.data.dtype,
-                                      dpnp.complexfloating)
+            is_cpx = dpnp.issubdtype(self._A.data.dtype, dpnp.complexfloating)
             self._adjoint = _CachedSpMV(self._A, trans=2 if is_cpx else 1)
         return self._adjoint(x)
 
+
 def _make_fast_matvec(A):
     """Return a _CachedSpMVPair if A is a CSR matrix with oneMKL support,
     or None if A is not an eligible sparse matrix.
@@ -226,6 +241,7 @@ def _make_fast_matvec(A):
     """
     try:
         from dpnp.scipy import sparse as _sp
+
         if not (_sp.issparse(A) and A.format == "csr"):
             return None
     except (ImportError, AttributeError):
@@ -243,6 +259,7 @@ def _make_fast_matvec(A):
     except Exception:
         return None
 
+
 def _make_system(A, M, x0, b):
     """Validate and prepare (A_op, M_op, x, b, dtype) on device.
 
@@ -254,9 +271,7 @@ def _make_system(A, M, x0, b):
     complex128 (complex).
     """
     if not isinstance(b, dpnp.ndarray):
-        raise TypeError(
-            f"b must be a dpnp.ndarray, got {type(b).__name__}"
-        )
+        raise TypeError(f"b must be a dpnp.ndarray, got {type(b).__name__}")
     if x0 is not None and not isinstance(x0, dpnp.ndarray):
         raise TypeError(
             f"x0 must be a dpnp.ndarray or None, got {type(x0).__name__}"
@@ -274,7 +289,10 @@ def _make_system(A, M, x0, b):
         )
 
     # Dtype promotion: prefer A.dtype; fall back via b.dtype.
-    if A_op.dtype is not None and _np_dtype(A_op.dtype).char in _SUPPORTED_DTYPES:
+    if (
+        A_op.dtype is not None
+        and _np_dtype(A_op.dtype).char in _SUPPORTED_DTYPES
+    ):
         dtype = A_op.dtype
     elif dpnp.issubdtype(b.dtype, dpnp.complexfloating):
         dtype = dpnp.complex128
@@ -303,26 +321,39 @@ def _make_system(A, M, x0, b):
         fast_mv_M = _make_fast_matvec(M)
         if fast_mv_M is not None:
             _orig_M = M_op
+
             class _FastMOp(LinearOperator):
                 def __init__(self):
                     super().__init__(_orig_M.dtype, _orig_M.shape)
-                def _matvec(self, x): return fast_mv_M.matvec(x)
-                def _rmatvec(self, x): return fast_mv_M.rmatvec(x)
+
+                def _matvec(self, x):
+                    return fast_mv_M.matvec(x)
+
+                def _rmatvec(self, x):
+                    return fast_mv_M.rmatvec(x)
+
             M_op = _FastMOp()
 
     # Inject fast CSR SpMV for A if available.
     fast_mv = _make_fast_matvec(A)
     if fast_mv is not None:
         _orig = A_op
+
         class _FastOp(LinearOperator):
             def __init__(self):
                 super().__init__(_orig.dtype, _orig.shape)
-            def _matvec(self, x): return fast_mv.matvec(x)
-            def _rmatvec(self, x): return fast_mv.rmatvec(x)
+
+            def _matvec(self, x):
+                return fast_mv.matvec(x)
+
+            def _rmatvec(self, x):
+                return fast_mv.rmatvec(x)
+
         A_op = _FastOp()
 
     return A_op, M_op, x, b, dtype
 
+
 def _get_atol(b_norm: float, atol, rtol: float) -> float:
     """Absolute stopping tolerance: max(atol, rtol*||b||), mirroring SciPy."""
     if atol == "legacy" or atol is None:
@@ -339,6 +370,7 @@ def _get_atol(b_norm: float, atol, rtol: float) -> float:
 # Conjugate Gradient
 # ---------------------------------------------------------------------------
 
+
 def cg(
     A,
     b,
@@ -409,14 +441,14 @@ def cg(
             break
 
         Ap = A_op.matvec(p)
-        pAp = dpnp.real(dpnp.vdot(p, Ap))   # 0-D on device
+        pAp = dpnp.real(dpnp.vdot(p, Ap))  # 0-D on device
 
         if float(dpnp.abs(pAp)) < rhotol:
             info = -1
             break
 
-        alpha = rz / pAp                       # 0-D on device
-        x = x + alpha * p                      # fully on-device
+        alpha = rz / pAp  # 0-D on device
+        x = x + alpha * p  # fully on-device
         r = r - alpha * Ap
 
         if callback is not None:
@@ -429,7 +461,7 @@ def cg(
             info = 0
             break
 
-        beta = rz_new / rz                     # 0-D on device
+        beta = rz_new / rz  # 0-D on device
         p = z + beta * p
         rz = rz_new
     else:
@@ -437,6 +469,7 @@ def cg(
 
     return x, int(info)
 
+
 def gmres(
     A,
     b,
@@ -509,8 +542,8 @@ def gmres(
     restart = min(int(restart), n)
 
     if callback_type is None:
-        callback_type = 'pr_norm'
-    if callback_type not in ('x', 'pr_norm'):
+        callback_type = "pr_norm"
+    if callback_type not in ("x", "pr_norm"):
         raise ValueError(f"Unknown callback_type: {callback_type!r}")
     if callback is None:
         callback_type = None
@@ -521,9 +554,10 @@ def gmres(
     # avoid host-device sync overhead (which dominates on Intel GPUs
     # even for small transfers).  CuPy keeps e on host and solves
     # lstsq on CPU, but for dpnp we keep everything on device.
-    V = dpnp.empty((n, restart), dtype=dtype, sycl_queue=queue, order='F')
-    H = dpnp.zeros((restart + 1, restart), dtype=dtype,
-                    sycl_queue=queue, order='F')
+    V = dpnp.empty((n, restart), dtype=dtype, sycl_queue=queue, order="F")
+    H = dpnp.zeros(
+        (restart + 1, restart), dtype=dtype, sycl_queue=queue, order="F"
+    )
     e = dpnp.zeros(restart + 1, dtype=dtype, sycl_queue=queue)
 
     compute_hu = _make_compute_hu(V)
@@ -534,9 +568,9 @@ def gmres(
         r = b - matvec(mx)
         r_norm = dpnp.linalg.norm(r)
 
-        if callback_type == 'x':
+        if callback_type == "x":
             callback(mx)
-        elif callback_type == 'pr_norm' and iters > 0:
+        elif callback_type == "pr_norm" and iters > 0:
             callback(r_norm / b_norm)
 
         if r_norm <= atol or iters >= maxiter:
@@ -550,7 +584,7 @@ def gmres(
         for j in range(restart):
             z = psolve(v)
             u = matvec(z)
-            H[:j + 1, j], u = compute_hu(u, j)
+            H[: j + 1, j], u = compute_hu(u, j)
             H[j + 1, j] = dpnp.linalg.norm(u)
             if j + 1 < restart:
                 v = u / H[j + 1, j]
@@ -568,6 +602,7 @@ def gmres(
 
     return mx, info
 
+
 def minres(
     A,
     b,
@@ -725,7 +760,7 @@ def minres(
         itn += 1
 
         s = 1.0 / beta
-        v = s * y                           # on device
+        v = s * y  # on device
 
         y = matvec(v)
         y = y - shift * v
@@ -748,7 +783,7 @@ def minres(
             raise ValueError("non-symmetric matrix")
         beta = numpy.sqrt(beta)
 
-        tnorm2 += alpha ** 2 + oldb ** 2 + beta ** 2
+        tnorm2 += alpha**2 + oldb**2 + beta**2
 
         if itn == 1:
             if beta / beta1 <= 10 * eps:
@@ -762,11 +797,11 @@ def minres(
         gbar = sn * dbar - cs * alpha
         epsln = sn * beta
         dbar = -cs * beta
-        root = numpy.sqrt(gbar ** 2 + dbar ** 2)
-        Arnorm = phibar * root          # ||A r_{k-1}||
+        root = numpy.sqrt(gbar**2 + dbar**2)
+        Arnorm = phibar * root  # ||A r_{k-1}||
 
         # Compute the next plane rotation Q_k.
-        gamma = numpy.sqrt(gbar ** 2 + beta ** 2)
+        gamma = numpy.sqrt(gbar**2 + beta**2)
         gamma = max(gamma, eps)
         cs = gbar / gamma
         sn = beta / gamma
@@ -791,7 +826,7 @@ def minres(
         # Estimate norms and test for convergence.
         # ----------------------------------------------------------
         Anorm = numpy.sqrt(tnorm2)
-        ynorm = float(dpnp.linalg.norm(x))     # host sync #3
+        ynorm = float(dpnp.linalg.norm(x))  # host sync #3
         epsa = Anorm * eps
         epsx = Anorm * ynorm * eps
         epsr = Anorm * ynorm * rtol
@@ -804,11 +839,11 @@ def minres(
         if ynorm == 0 or Anorm == 0:
             test1 = numpy.inf
         else:
-            test1 = rnorm / (Anorm * ynorm)    # ||r|| / (||A|| ||x||)
+            test1 = rnorm / (Anorm * ynorm)  # ||r|| / (||A|| ||x||)
         if Anorm == 0:
             test2 = numpy.inf
         else:
-            test2 = root / Anorm                # ||Ar|| / (||A|| ||r||)
+            test2 = root / Anorm  # ||Ar|| / (||A|| ||r||)
 
         # Estimate cond(A).
         Acond = gmax / gmin
@@ -834,16 +869,24 @@ def minres(
                 istop = 1
 
         if show:
-            prnt = (n <= 40 or itn <= 10 or itn >= maxiter - 10
-                    or itn % 10 == 0 or qrnorm <= 10 * epsx
-                    or qrnorm <= 10 * epsr or Acond <= 1e-2 / eps
-                    or istop != 0)
+            prnt = (
+                n <= 40
+                or itn <= 10
+                or itn >= maxiter - 10
+                or itn % 10 == 0
+                or qrnorm <= 10 * epsx
+                or qrnorm <= 10 * epsr
+                or Acond <= 1e-2 / eps
+                or istop != 0
+            )
             if prnt:
                 x1 = float(x[0])
-                print(f"{itn:6g} {x1:12.5e} {test1:10.3e}"
-                      f" {test2:10.3e}"
-                      f" {Anorm:8.1e} {Acond:8.1e}"
-                      f" {gbar / Anorm if Anorm else 0:8.1e}")
+                print(
+                    f"{itn:6g} {x1:12.5e} {test1:10.3e}"
+                    f" {test2:10.3e}"
+                    f" {Anorm:8.1e} {Acond:8.1e}"
+                    f" {gbar / Anorm if Anorm else 0:8.1e}"
+                )
                 if itn % 10 == 0:
                     print()
 
@@ -860,6 +903,7 @@ def minres(
 
     return (x, info)
 
+
 def _make_compute_hu(V):
     """Factory mirroring cupyx's _make_compute_hu using oneMKL gemv directly.
 
@@ -891,16 +935,19 @@ def compute_hu(u, j):
         h = dpnp.empty(j + 1, dtype=dtype, sycl_queue=exec_q)
 
         # Sub-view: column-major slice of the trailing axis is F-contiguous.
-        Vj = V[:, :j + 1]
+        Vj = V[:, : j + 1]
         Vj_usm = dpnp.get_usm_ndarray(Vj)
-        u_usm  = dpnp.get_usm_ndarray(u)
-        h_usm  = dpnp.get_usm_ndarray(h)
+        u_usm = dpnp.get_usm_ndarray(u)
+        h_usm = dpnp.get_usm_ndarray(h)
 
         _manager = dpu.SequentialOrderManager[exec_q]
 
         # Pass 1: h = Vj^T @ u  (real) or  h = (Vj^T @ u) then conj  (complex)
         ht1, ev1 = bi._gemv(
-            exec_q, Vj_usm, u_usm, h_usm,
+            exec_q,
+            Vj_usm,
+            u_usm,
+            h_usm,
             transpose=True,
             depends=_manager.submitted_events,
         )
@@ -916,7 +963,10 @@ def compute_hu(u, j):
         tmp = dpnp.empty_like(u)
         tmp_usm = dpnp.get_usm_ndarray(tmp)
         ht2, ev2 = bi._gemv(
-            exec_q, Vj_usm, h_usm, tmp_usm,
+            exec_q,
+            Vj_usm,
+            h_usm,
+            tmp_usm,
             transpose=False,
             depends=_manager.submitted_events,
         )

From a4ee24f7bdc41dad1ad0419f536cdb37b85577cf Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <abagusetty@anl.gov>
Date: Thu, 9 Apr 2026 03:38:52 +0000
Subject: [PATCH 40/43] Update CHANGELOG.md

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 30d60fc98988..83e1a9a878f4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -28,6 +28,7 @@ Also, that release drops support for Python 3.9, making Python 3.10 the minimum
 * Added implementation of `dpnp.isin` function [#2595](https://github.com/IntelPython/dpnp/pull/2595)
 * Added implementation of `dpnp.scipy.linalg.lu` (SciPy-compatible) [#2787](https://github.com/IntelPython/dpnp/pull/2787)
 * Added support for ndarray subclassing via `dpnp.ndarray.view` method with `type` parameter [#2815](https://github.com/IntelPython/dpnp/issues/2815)
+* Added implementation of `dpnp.scipy.sparse.linalg import LinearOperator, cg, gmres, minres` [#2841](https://github.com/IntelPython/dpnp/pull/2841)
 
 ### Changed
 

From c330a04a36ad3c2ebca76eb54c3c66ec4e30b752 Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <abagusetty@anl.gov>
Date: Thu, 9 Apr 2026 03:44:34 +0000
Subject: [PATCH 41/43] remove stale testing

---
 .../scipy_tests/sparse_tests/test_linalg.py   | 945 ------------------
 1 file changed, 945 deletions(-)
 delete mode 100644 tests/dpnp_tests/scipy_tests/sparse_tests/test_linalg.py

diff --git a/tests/dpnp_tests/scipy_tests/sparse_tests/test_linalg.py b/tests/dpnp_tests/scipy_tests/sparse_tests/test_linalg.py
deleted file mode 100644
index 3c8bb3ea4cba..000000000000
--- a/tests/dpnp_tests/scipy_tests/sparse_tests/test_linalg.py
+++ /dev/null
@@ -1,945 +0,0 @@
-# tests/dpnp_tests/scipy_tests/sparse_tests/test_linalg.py
-"""
-Tests for dpnp.scipy.sparse.linalg:
-  LinearOperator, aslinearoperator, cg, gmres, minres
-
-Style mirrors dpnp/tests/test_linalg.py:
-  - class-per-feature with pytest.mark.parametrize
-  - assert_dtype_allclose / generate_random_numpy_array from tests.helper
-  - dpnp.asnumpy() for array comparison
-  - testing.with_requires for optional-dependency guards
-  - is_scipy_available() / has_support_aspect64() for capability skips
-"""
-
-from __future__ import annotations
-
-import warnings
-
-import numpy
-import pytest
-from numpy.testing import (
-    assert_allclose,
-    assert_array_equal,
-    assert_raises,
-)
-
-import dpnp
-
-# Re-use the project's own test helpers exactly as test_linalg.py does.
-from dpnp.tests.helper import (
-    assert_dtype_allclose,
-    generate_random_numpy_array,
-    get_all_dtypes,
-    get_float_complex_dtypes,
-    has_support_aspect64,
-    is_scipy_available,
-)
-from dpnp.tests.third_party.cupy import testing
-
-from dpnp.scipy.sparse.linalg import (
-    LinearOperator,
-    aslinearoperator,
-    cg,
-    gmres,
-    minres,
-)
-
-
-# ---------------------------------------------------------------------------
-# Optional SciPy import (used for reference comparisons)
-# ---------------------------------------------------------------------------
-
-if is_scipy_available():
-    import scipy.sparse.linalg as scipy_sla
-
-
-# ---------------------------------------------------------------------------
-# Shared matrix / vector helpers
-# (match the signature of generate_random_numpy_array from tests/helper.py)
-# ---------------------------------------------------------------------------
-
-
-def _spd_matrix(n, dtype):
-    """Dense symmetric positive-definite matrix as a dpnp array."""
-    a = generate_random_numpy_array(
-        (n, n), dtype, seed_value=42, hermitian=False
-    ).astype(float)
-    a = a.T @ a + numpy.eye(n, dtype=float)
-    if numpy.issubdtype(dtype, numpy.complexfloating):
-        a = a.astype(dtype)
-    else:
-        a = a.astype(dtype)
-    return dpnp.asarray(a)
-
-
-def _diag_dominant(n, dtype, seed_value=81):
-    """Strictly diagonally dominant (non-symmetric) matrix as a dpnp array."""
-    a = generate_random_numpy_array(
-        (n, n), dtype, seed_value=seed_value
-    ) * 0.1
-    numpy.fill_diagonal(a, numpy.abs(a).sum(axis=1) + 1.0)
-    return dpnp.asarray(a)
-
-
-def _sym_indefinite(n, dtype, seed_value=99):
-    """Symmetric indefinite matrix (suitable for MINRES) as a dpnp array."""
-    a = generate_random_numpy_array((n, n), dtype, seed_value=seed_value)
-    q, _ = numpy.linalg.qr(a.astype(numpy.float64))
-    numpy.random.seed(seed_value)
-    d = numpy.random.standard_normal(n).astype(numpy.float64)
-    m = (q @ numpy.diag(d) @ q.T).astype(dtype)
-    return dpnp.asarray(m)
-
-
-def _rhs(n, dtype, seed_value=7):
-    """Unit-norm right-hand side vector as a dpnp array."""
-    b = generate_random_numpy_array((n,), dtype, seed_value=seed_value)
-    b /= numpy.linalg.norm(b)
-    return dpnp.asarray(b)
-
-
-# ---------------------------------------------------------------------------
-# Import smoke test
-# ---------------------------------------------------------------------------
-
-
-class TestImports:
-    """Verify that all public symbols are importable and callable."""
-
-    def test_all_symbols_importable(self):
-        from dpnp.scipy.sparse.linalg import (
-            LinearOperator,
-            aslinearoperator,
-            cg,
-            gmres,
-            minres,
-        )
-
-        for sym in (LinearOperator, aslinearoperator, cg, gmres, minres):
-            assert callable(sym)
-
-    def test_all_listed_in_dunder_all(self):
-        import dpnp.scipy.sparse.linalg as _mod
-
-        for name in (
-            "LinearOperator",
-            "aslinearoperator",
-            "cg",
-            "gmres",
-            "minres",
-        ):
-            assert name in _mod.__all__, f"{name!r} missing from __all__"
-
-
-# ---------------------------------------------------------------------------
-# LinearOperator
-# ---------------------------------------------------------------------------
-
-
-class TestLinearOperator:
-    """Tests for LinearOperator construction and protocol.
-
-    Mirrors the style of TestCholesky / TestDet in test_linalg.py.
-    """
-
-    # ------------------------------------------------------------------ shape
-
-    @pytest.mark.parametrize(
-        "shape",
-        [(5, 5), (7, 3), (3, 7)],
-        ids=["(5,5)", "(7,3)", "(3,7)"],
-    )
-    def test_shape(self, shape):
-        m, n = shape
-        lo = LinearOperator((m, n), matvec=lambda x: dpnp.zeros(m))
-        assert lo.shape == (m, n)
-        assert lo.ndim == 2
-
-    # ------------------------------------------------------------------ dtype
-
-    @pytest.mark.parametrize(
-        "dtype",
-        get_all_dtypes(no_bool=True, no_complex=False),
-    )
-    def test_dtype_inference(self, dtype):
-        if not has_support_aspect64() and dtype in (
-            dpnp.float64,
-            dpnp.complex128,
-        ):
-            pytest.skip("float64 not supported on this device")
-        n = 4
-        A = dpnp.eye(n, dtype=dtype)
-        lo = LinearOperator((n, n), matvec=lambda x: A @ x)
-        assert lo.dtype == dtype
-
-    def test_dtype_explicit(self):
-        lo = LinearOperator(
-            (4, 4),
-            matvec=lambda x: dpnp.zeros(4, dtype=dpnp.float64),
-            dtype=dpnp.float64,
-        )
-        assert lo.dtype == dpnp.float64
-
-    # ------------------------------------------------------------------ matvec
-
-    @pytest.mark.parametrize(
-        "dtype",
-        get_all_dtypes(no_bool=True, no_complex=False),
-    )
-    def test_matvec(self, dtype):
-        if not has_support_aspect64() and dtype in (
-            dpnp.float64,
-            dpnp.complex128,
-        ):
-            pytest.skip("float64 not supported on this device")
-        n = 6
-        a_np = generate_random_numpy_array((n, n), dtype, seed_value=42)
-        a_dp = dpnp.asarray(a_np)
-        lo = LinearOperator((n, n), matvec=lambda x: a_dp @ x)
-        x = dpnp.asarray(
-            generate_random_numpy_array((n,), dtype, seed_value=1)
-        )
-        result = lo.matvec(x)
-        expected = a_np @ dpnp.asnumpy(x)
-        assert_dtype_allclose(result, expected)
-
-    def test_matvec_wrong_shape_raises(self):
-        lo = LinearOperator((3, 5), matvec=lambda x: dpnp.zeros(3))
-        with assert_raises(ValueError):
-            lo.matvec(dpnp.ones(4))
-
-    # ------------------------------------------------------------------ rmatvec
-
-    def test_rmatvec_not_defined_raises(self):
-        lo = LinearOperator((3, 3), matvec=lambda x: dpnp.zeros(3))
-        with assert_raises(NotImplementedError):
-            lo.rmatvec(dpnp.zeros(3))
-
-    @pytest.mark.parametrize(
-        "dtype",
-        get_all_dtypes(no_bool=True, no_complex=False),
-    )
-    def test_rmatvec(self, dtype):
-        if not has_support_aspect64() and dtype in (
-            dpnp.float64,
-            dpnp.complex128,
-        ):
-            pytest.skip("float64 not supported on this device")
-        n = 5
-        a_np = generate_random_numpy_array((n, n), dtype, seed_value=12)
-        a_dp = dpnp.asarray(a_np)
-        lo = LinearOperator(
-            (n, n),
-            matvec=lambda x: a_dp @ x,
-            rmatvec=lambda x: dpnp.conj(a_dp.T) @ x,
-        )
-        x = dpnp.asarray(
-            generate_random_numpy_array((n,), dtype, seed_value=3)
-        )
-        result = lo.rmatvec(x)
-        expected = a_np.conj().T @ dpnp.asnumpy(x)
-        assert_dtype_allclose(result, expected)
-
-    # ------------------------------------------------------------------ matmat
-
-    @pytest.mark.parametrize(
-        "dtype",
-        get_all_dtypes(no_bool=True, no_complex=False),
-    )
-    def test_matmat_fallback_loop(self, dtype):
-        if not has_support_aspect64() and dtype in (
-            dpnp.float64,
-            dpnp.complex128,
-        ):
-            pytest.skip("float64 not supported on this device")
-        n, k = 5, 3
-        a_np = generate_random_numpy_array((n, n), dtype, seed_value=55)
-        a_dp = dpnp.asarray(a_np)
-        lo = LinearOperator((n, n), matvec=lambda x: a_dp @ x)
-        X = dpnp.asarray(
-            generate_random_numpy_array((n, k), dtype, seed_value=9)
-        )
-        Y = lo.matmat(X)
-        expected = a_np @ dpnp.asnumpy(X)
-        assert_dtype_allclose(Y, expected)
-
-    def test_matmat_wrong_ndim_raises(self):
-        lo = LinearOperator(
-            (3, 3),
-            matvec=lambda x: dpnp.zeros(3),
-            dtype=dpnp.float64,
-        )
-        with assert_raises(ValueError):
-            lo.matmat(dpnp.ones(3))  # 1-D, not 2-D
-
-    # ------------------------------------------------------------------ operator overloads
-
-    @pytest.mark.parametrize(
-        "dtype",
-        get_all_dtypes(no_bool=True, no_complex=False),
-    )
-    def test_matmul_1d(self, dtype):
-        """lo @ x dispatches to matvec."""
-        if not has_support_aspect64() and dtype in (
-            dpnp.float64,
-            dpnp.complex128,
-        ):
-            pytest.skip("float64 not supported on this device")
-        n = 6
-        a_np = generate_random_numpy_array((n, n), dtype, seed_value=42)
-        a_dp = dpnp.asarray(a_np)
-        lo = LinearOperator((n, n), matvec=lambda x: a_dp @ x)
-        x = dpnp.asarray(
-            generate_random_numpy_array((n,), dtype, seed_value=2)
-        )
-        result = lo @ x
-        expected = a_np @ dpnp.asnumpy(x)
-        assert_dtype_allclose(result, expected)
-
-    @pytest.mark.parametrize(
-        "dtype",
-        get_all_dtypes(no_bool=True, no_complex=False),
-    )
-    def test_matmul_2d(self, dtype):
-        """lo @ X dispatches to matmat."""
-        if not has_support_aspect64() and dtype in (
-            dpnp.float64,
-            dpnp.complex128,
-        ):
-            pytest.skip("float64 not supported on this device")
-        n, k = 5, 3
-        a_np = generate_random_numpy_array((n, n), dtype, seed_value=42)
-        a_dp = dpnp.asarray(a_np)
-        lo = LinearOperator((n, n), matvec=lambda x: a_dp @ x)
-        X = dpnp.asarray(
-            generate_random_numpy_array((n, k), dtype, seed_value=5)
-        )
-        Y = lo @ X
-        expected = a_np @ dpnp.asnumpy(X)
-        assert_dtype_allclose(Y, expected)
-
-    def test_call_alias(self):
-        n = 4
-        a_dp = dpnp.eye(n, dtype=dpnp.float64)
-        lo = LinearOperator((n, n), matvec=lambda x: a_dp @ x)
-        x = dpnp.ones(n, dtype=dpnp.float64)
-        assert_allclose(dpnp.asnumpy(lo(x)), dpnp.asnumpy(x), atol=1e-12)
-
-    # ------------------------------------------------------------------ repr
-
-    def test_repr(self):
-        lo = LinearOperator(
-            (3, 4), matvec=lambda x: dpnp.zeros(3), dtype=dpnp.float64
-        )
-        r = repr(lo)
-        assert "3x4" in r
-        assert "LinearOperator" in r
-
-    # ------------------------------------------------------------------ error paths
-
-    def test_invalid_shape_negative(self):
-        with assert_raises(ValueError):
-            LinearOperator((-1, 3), matvec=lambda x: x)
-
-    def test_invalid_shape_wrong_ndim(self):
-        with assert_raises(ValueError):
-            LinearOperator((3,), matvec=lambda x: x)
-
-    # ------------------------------------------------------------------ subclass
-
-    @pytest.mark.parametrize(
-        "dtype",
-        [dpnp.float32, dpnp.float64],
-        ids=["float32", "float64"],
-    )
-    def test_subclass_custom_matmat(self, dtype):
-        """User subclass overriding _matmat_impl, as in CuPy's HasMatmat."""
-        if not has_support_aspect64() and dtype == dpnp.float64:
-            pytest.skip("float64 not supported on this device")
-        n, k = 7, 4
-        a_np = generate_random_numpy_array(
-            (n, n), dtype, seed_value=42
-        )
-        a_dp = dpnp.asarray(a_np)
-
-        class _MyOp(LinearOperator):
-            def __init__(self):
-                super().__init__(
-                    shape=(n, n),
-                    matvec=lambda x: a_dp @ x,
-                    dtype=dtype,
-                )
-
-            def _matmat_impl(self, X):
-                return a_dp @ X
-
-        op = _MyOp()
-        X = dpnp.asarray(
-            generate_random_numpy_array((n, k), dtype, seed_value=9)
-        )
-        Y = op.matmat(X)
-        expected = a_np @ dpnp.asnumpy(X)
-        assert_dtype_allclose(Y, expected)
-
-
-# ---------------------------------------------------------------------------
-# aslinearoperator
-# ---------------------------------------------------------------------------
-
-
-class TestAsLinearOperator:
-    """Tests for aslinearoperator wrapping utility."""
-
-    def test_identity_if_already_linearoperator(self):
-        lo = LinearOperator((3, 3), matvec=lambda x: x)
-        assert aslinearoperator(lo) is lo
-
-    @pytest.mark.parametrize(
-        "dtype",
-        get_all_dtypes(no_bool=True, no_complex=False),
-    )
-    def test_dense_dpnp_array(self, dtype):
-        if not has_support_aspect64() and dtype in (
-            dpnp.float64,
-            dpnp.complex128,
-        ):
-            pytest.skip("float64 not supported on this device")
-        n = 6
-        a_np = generate_random_numpy_array((n, n), dtype, seed_value=42)
-        a_dp = dpnp.asarray(a_np)
-        lo = aslinearoperator(a_dp)
-        assert lo.shape == (n, n)
-        x = dpnp.asarray(
-            generate_random_numpy_array((n,), dtype, seed_value=1)
-        )
-        result = lo.matvec(x)
-        expected = a_np @ dpnp.asnumpy(x)
-        assert_dtype_allclose(result, expected)
-
-    def test_dense_numpy_array(self):
-        n = 5
-        a_np = generate_random_numpy_array(
-            (n, n), numpy.float64, seed_value=42
-        )
-        lo = aslinearoperator(a_np)
-        assert lo.shape == (n, n)
-
-    def test_rmatvec_from_dense(self):
-        n = 5
-        a_np = generate_random_numpy_array(
-            (n, n), numpy.float64, seed_value=42
-        )
-        a_dp = dpnp.asarray(a_np)
-        lo = aslinearoperator(a_dp)
-        x = dpnp.asarray(
-            generate_random_numpy_array((n,), numpy.float64, seed_value=2)
-        )
-        result = lo.rmatvec(x)
-        expected = a_np.conj().T @ dpnp.asnumpy(x)
-        assert_allclose(dpnp.asnumpy(result), expected, atol=1e-12)
-
-    def test_duck_type_with_shape_and_matvec(self):
-        n = 4
-
-        class _DuckOp:
-            shape = (n, n)
-            dtype = numpy.float64
-
-            def matvec(self, x):
-                return dpnp.asarray(dpnp.asnumpy(x) * 2.0)
-
-        lo = aslinearoperator(_DuckOp())
-        x = dpnp.ones(n, dtype=dpnp.float64)
-        result = lo.matvec(x)
-        assert_allclose(dpnp.asnumpy(result), numpy.full(n, 2.0), atol=1e-12)
-
-    def test_invalid_type_raises(self):
-        with assert_raises(TypeError):
-            aslinearoperator("not_an_array")
-
-    def test_invalid_1d_array_raises(self):
-        with pytest.raises(Exception):
-            aslinearoperator(dpnp.ones(5))
-
-
-# ---------------------------------------------------------------------------
-# CG
-# ---------------------------------------------------------------------------
-
-
-@pytest.mark.skipif(
-    not is_scipy_available(), reason="SciPy not available"
-)
-class TestCg:
-    """Tests for cg (Conjugate Gradient).
-
-    Mirrors TestCholesky / TestDet structure from test_linalg.py.
-    """
-
-    n = 30
-
-    @pytest.mark.parametrize(
-        "dtype",
-        get_float_complex_dtypes(),
-    )
-    def test_cg_converges_spd(self, dtype):
-        """CG must converge on symmetric positive-definite matrices."""
-        a_dp = _spd_matrix(self.n, dtype)
-        b_dp = _rhs(self.n, dtype)
-        x, info = cg(a_dp, b_dp, tol=1e-8, maxiter=500)
-        assert info == 0
-        res = dpnp.linalg.norm(a_dp @ x - b_dp) / dpnp.linalg.norm(b_dp)
-        assert float(res) < 1e-5
-
-    @pytest.mark.parametrize(
-        "dtype",
-        [dpnp.float32, dpnp.float64],
-        ids=["float32", "float64"],
-    )
-    def test_cg_matches_scipy(self, dtype):
-        """Solution must match scipy.sparse.linalg.cg within dtype tolerance."""
-        if not has_support_aspect64() and dtype == dpnp.float64:
-            pytest.skip("float64 not supported on this device")
-        a_np = dpnp.asnumpy(_spd_matrix(self.n, dtype))
-        b_np = dpnp.asnumpy(_rhs(self.n, dtype))
-        x_ref, info_ref = scipy_sla.cg(a_np, b_np, rtol=1e-8, maxiter=500)
-        assert info_ref == 0
-        x_dp, info = cg(
-            dpnp.asarray(a_np), dpnp.asarray(b_np), tol=1e-8, maxiter=500
-        )
-        assert info == 0
-        tol = 1e-4 if dtype == dpnp.float32 else 1e-8
-        assert_allclose(dpnp.asnumpy(x_dp), x_ref, rtol=tol)
-
-    @pytest.mark.parametrize(
-        "dtype",
-        [dpnp.float32, dpnp.float64],
-        ids=["float32", "float64"],
-    )
-    def test_cg_x0_warm_start(self, dtype):
-        if not has_support_aspect64() and dtype == dpnp.float64:
-            pytest.skip("float64 not supported on this device")
-        a_dp = _spd_matrix(self.n, dtype)
-        b_dp = _rhs(self.n, dtype)
-        x0 = dpnp.ones(self.n, dtype=dtype)
-        x, info = cg(a_dp, b_dp, x0=x0, tol=1e-8, maxiter=500)
-        assert info == 0
-        res = dpnp.linalg.norm(a_dp @ x - b_dp) / dpnp.linalg.norm(b_dp)
-        assert float(res) < 1e-5
-
-    @pytest.mark.parametrize(
-        "dtype",
-        [dpnp.float32, dpnp.float64],
-        ids=["float32", "float64"],
-    )
-    def test_cg_b_2dim(self, dtype):
-        """b with shape (n, 1) must be accepted and flattened internally."""
-        if not has_support_aspect64() and dtype == dpnp.float64:
-            pytest.skip("float64 not supported on this device")
-        a_dp = _spd_matrix(self.n, dtype)
-        b_dp = _rhs(self.n, dtype).reshape(self.n, 1)
-        x, info = cg(a_dp, b_dp, tol=1e-8, maxiter=500)
-        assert info == 0
-
-    def test_cg_callback_called(self):
-        a_dp = _spd_matrix(self.n, numpy.float64)
-        b_dp = _rhs(self.n, numpy.float64)
-        calls = []
-
-        def _cb(xk):
-            calls.append(float(dpnp.linalg.norm(xk)))
-
-        cg(a_dp, b_dp, callback=_cb, maxiter=200)
-        assert len(calls) > 0
-
-    def test_cg_atol(self):
-        a_dp = _spd_matrix(self.n, numpy.float64)
-        b_dp = _rhs(self.n, numpy.float64)
-        x, info = cg(a_dp, b_dp, tol=0.0, atol=1e-1)
-        res = float(dpnp.linalg.norm(a_dp @ x - b_dp))
-        assert res < 1.0
-
-    def test_cg_exact_solution_no_iterations(self):
-        """When x0 is the exact solution the residual must be zero immediately."""
-        n = 10
-        a_dp = _spd_matrix(n, numpy.float64)
-        b_dp = _rhs(n, numpy.float64)
-        x_true = dpnp.asarray(
-            numpy.linalg.solve(dpnp.asnumpy(a_dp), dpnp.asnumpy(b_dp))
-        )
-        x, info = cg(a_dp, b_dp, x0=x_true, tol=1e-12)
-        assert info == 0
-
-    @pytest.mark.parametrize(
-        "dtype",
-        get_float_complex_dtypes(),
-    )
-    def test_cg_via_linear_operator(self, dtype):
-        """CG with A supplied as a LinearOperator."""
-        a_dp = _spd_matrix(self.n, dtype)
-        b_dp = _rhs(self.n, dtype)
-        lo = aslinearoperator(a_dp)
-        x, info = cg(lo, b_dp, tol=1e-8, maxiter=500)
-        assert info == 0
-        res = float(
-            dpnp.linalg.norm(a_dp @ x - b_dp) / dpnp.linalg.norm(b_dp)
-        )
-        assert res < 1e-5
-
-    def test_cg_maxiter_nonconvergence_info_positive(self):
-        """maxiter=1 on a hard problem must give info != 0."""
-        a_dp = _spd_matrix(50, numpy.float64)
-        b_dp = _rhs(50, numpy.float64)
-        _, info = cg(a_dp, b_dp, tol=1e-15, maxiter=1)
-        assert info != 0
-
-    def test_cg_wrong_b_size_raises(self):
-        a_dp = _spd_matrix(5, numpy.float64)
-        b_dp = dpnp.ones(6, dtype=dpnp.float64)
-        with pytest.raises((ValueError, Exception)):
-            cg(a_dp, b_dp, maxiter=1)
-
-
-# ---------------------------------------------------------------------------
-# GMRES
-# ---------------------------------------------------------------------------
-
-
-@pytest.mark.skipif(
-    not is_scipy_available(), reason="SciPy not available"
-)
-class TestGmres:
-    """Tests for gmres (Generalised Minimum Residual).
-
-    Mirrors the class structure of TestDet / TestCg above.
-    """
-
-    n = 30
-
-    @pytest.mark.parametrize(
-        "dtype",
-        get_float_complex_dtypes(),
-    )
-    def test_gmres_converges_diag_dominant(self, dtype):
-        """GMRES must converge on diagonally dominant non-symmetric systems."""
-        a_dp = _diag_dominant(self.n, dtype)
-        b_dp = _rhs(self.n, dtype)
-        x, info = gmres(a_dp, b_dp, tol=1e-8, maxiter=50, restart=self.n)
-        assert info == 0
-        res = dpnp.linalg.norm(a_dp @ x - b_dp) / dpnp.linalg.norm(b_dp)
-        assert float(res) < 1e-5
-
-    @pytest.mark.parametrize(
-        "dtype",
-        [dpnp.float32, dpnp.float64],
-        ids=["float32", "float64"],
-    )
-    def test_gmres_matches_scipy(self, dtype):
-        """Solution must match scipy.sparse.linalg.gmres within dtype tolerance."""
-        if not has_support_aspect64() and dtype == dpnp.float64:
-            pytest.skip("float64 not supported on this device")
-        a_np = dpnp.asnumpy(_diag_dominant(self.n, dtype))
-        b_np = generate_random_numpy_array(
-            (self.n,), dtype, seed_value=7
-        )
-        b_np /= numpy.linalg.norm(b_np)
-        with warnings.catch_warnings():
-            warnings.simplefilter("ignore")
-            x_ref, _ = scipy_sla.gmres(
-                a_np, b_np, rtol=1e-8, restart=self.n, maxiter=None
-            )
-        x_dp, info = gmres(
-            dpnp.asarray(a_np),
-            dpnp.asarray(b_np),
-            tol=1e-8,
-            restart=self.n,
-            maxiter=50,
-        )
-        assert info == 0
-        tol = 1e-3 if dtype == dpnp.float32 else 1e-7
-        assert_allclose(dpnp.asnumpy(x_dp), x_ref, rtol=tol)
-
-    @pytest.mark.parametrize(
-        "restart",
-        [None, 5, 15],
-        ids=["restart=None", "restart=5", "restart=15"],
-    )
-    def test_gmres_restart_values(self, restart):
-        a_dp = _diag_dominant(self.n, numpy.float64)
-        b_dp = _rhs(self.n, numpy.float64)
-        x, info = gmres(a_dp, b_dp, tol=1e-8, restart=restart, maxiter=100)
-        assert info == 0
-
-    @pytest.mark.parametrize(
-        "dtype",
-        [dpnp.float32, dpnp.float64],
-        ids=["float32", "float64"],
-    )
-    def test_gmres_x0_warm_start(self, dtype):
-        if not has_support_aspect64() and dtype == dpnp.float64:
-            pytest.skip("float64 not supported on this device")
-        a_dp = _diag_dominant(self.n, dtype)
-        b_dp = _rhs(self.n, dtype)
-        x0 = dpnp.ones(self.n, dtype=dtype)
-        x, info = gmres(a_dp, b_dp, x0=x0, tol=1e-8, maxiter=100)
-        assert info == 0
-
-    def test_gmres_b_2dim(self):
-        """b with shape (n, 1) must be accepted and flattened internally."""
-        a_dp = _diag_dominant(self.n, numpy.float64)
-        b_dp = _rhs(self.n, numpy.float64).reshape(self.n, 1)
-        x, info = gmres(a_dp, b_dp, tol=1e-8, maxiter=100)
-        assert info == 0
-
-    def test_gmres_callback_x_called(self):
-        a_dp = _diag_dominant(self.n, numpy.float64)
-        b_dp = _rhs(self.n, numpy.float64)
-        calls = []
-
-        def _cb(xk):
-            calls.append(1)
-
-        gmres(a_dp, b_dp, callback=_cb, callback_type="x", maxiter=20)
-        assert len(calls) > 0
-
-    def test_gmres_callback_pr_norm_not_implemented(self):
-        a_dp = _diag_dominant(self.n, numpy.float64)
-        b_dp = _rhs(self.n, numpy.float64)
-        with pytest.raises(NotImplementedError):
-            gmres(a_dp, b_dp, callback=lambda r: None, callback_type="pr_norm")
-
-    def test_gmres_invalid_callback_type_raises(self):
-        a_dp = _diag_dominant(self.n, numpy.float64)
-        b_dp = _rhs(self.n, numpy.float64)
-        with assert_raises(ValueError):
-            gmres(a_dp, b_dp, callback_type="garbage")
-
-    def test_gmres_atol(self):
-        a_dp = _diag_dominant(self.n, numpy.float64)
-        b_dp = _rhs(self.n, numpy.float64)
-        x, info = gmres(
-            a_dp, b_dp, tol=0.0, atol=1e-6, restart=self.n, maxiter=50
-        )
-        res = float(dpnp.linalg.norm(a_dp @ x - b_dp))
-        assert res < 1e-4
-
-    @pytest.mark.parametrize(
-        "dtype",
-        get_float_complex_dtypes(),
-    )
-    def test_gmres_via_linear_operator(self, dtype):
-        a_dp = _diag_dominant(self.n, dtype)
-        b_dp = _rhs(self.n, dtype)
-        lo = aslinearoperator(a_dp)
-        x, info = gmres(lo, b_dp, tol=1e-8, restart=self.n, maxiter=50)
-        assert info == 0
-
-    def test_gmres_nonconvergence_info_nonzero(self):
-        """Hilbert-like ill-conditioned matrix with tiny restart must not converge."""
-        n = 48
-        idx = numpy.arange(n, dtype=numpy.float64)
-        a_np = 1.0 / (idx[:, None] + idx[None, :] + 1.0)
-        b_np = generate_random_numpy_array((n,), numpy.float64, seed_value=5)
-        a_dp = dpnp.asarray(a_np)
-        b_dp = dpnp.asarray(b_np)
-        x, info = gmres(a_dp, b_dp, tol=1e-15, restart=2, maxiter=2)
-        rel_res = float(
-            dpnp.linalg.norm(a_dp @ x - b_dp) / dpnp.linalg.norm(b_dp)
-        )
-        assert rel_res > 1e-12
-        assert info != 0
-
-    def test_gmres_complex_system(self):
-        n = 15
-        a_np = generate_random_numpy_array(
-            (n, n), numpy.complex128, seed_value=42
-        )
-        numpy.fill_diagonal(a_np, numpy.abs(a_np).sum(axis=1) + 1.0)
-        b_np = generate_random_numpy_array(
-            (n,), numpy.complex128, seed_value=7
-        )
-        a_dp = dpnp.asarray(a_np)
-        b_dp = dpnp.asarray(b_np)
-        x, info = gmres(a_dp, b_dp, tol=1e-8, restart=n, maxiter=50)
-        assert info == 0
-        res = float(
-            numpy.linalg.norm(a_np @ dpnp.asnumpy(x) - b_np)
-            / numpy.linalg.norm(b_np)
-        )
-        assert res < 1e-5
-
-
-# ---------------------------------------------------------------------------
-# MINRES
-# ---------------------------------------------------------------------------
-
-
-@pytest.mark.skipif(
-    not is_scipy_available(), reason="SciPy required for MINRES backend"
-)
-class TestMinres:
-    """Tests for minres (Minimum Residual Method).
-
-    MINRES is SciPy-backed for this implementation; tests verify the
-    dpnp wrapper round-trips correctly.
-    """
-
-    n = 30
-
-    @pytest.mark.parametrize(
-        "dtype",
-        [dpnp.float32, dpnp.float64],
-        ids=["float32", "float64"],
-    )
-    def test_minres_converges_spd(self, dtype):
-        """MINRES on an SPD system must converge."""
-        if not has_support_aspect64() and dtype == dpnp.float64:
-            pytest.skip("float64 not supported on this device")
-        a_dp = _spd_matrix(self.n, dtype)
-        b_dp = _rhs(self.n, dtype)
-        x, info = minres(a_dp, b_dp, tol=1e-8, maxiter=500)
-        assert info == 0
-        res = float(
-            dpnp.linalg.norm(a_dp @ x - b_dp) / dpnp.linalg.norm(b_dp)
-        )
-        assert res < 1e-4
-
-    def test_minres_converges_sym_indefinite(self):
-        """MINRES is suited for symmetric indefinite systems unlike CG."""
-        a_dp = _sym_indefinite(self.n, numpy.float64)
-        b_dp = _rhs(self.n, numpy.float64)
-        x, info = minres(a_dp, b_dp, tol=1e-8, maxiter=1000)
-        res = float(
-            dpnp.linalg.norm(a_dp @ x - b_dp) / dpnp.linalg.norm(b_dp)
-        )
-        assert res < 1e-3
-
-    def test_minres_matches_scipy(self):
-        a_np = dpnp.asnumpy(_spd_matrix(self.n, numpy.float64))
-        b_np = dpnp.asnumpy(_rhs(self.n, numpy.float64))
-        x_ref, _ = scipy_sla.minres(a_np, b_np, rtol=1e-8)
-        x_dp, info = minres(
-            dpnp.asarray(a_np), dpnp.asarray(b_np), tol=1e-8
-        )
-        assert_allclose(dpnp.asnumpy(x_dp), x_ref, rtol=1e-6)
-
-    def test_minres_x0_warm_start(self):
-        a_dp = _spd_matrix(self.n, numpy.float64)
-        b_dp = _rhs(self.n, numpy.float64)
-        x0 = dpnp.zeros(self.n, dtype=numpy.float64)
-        x, info = minres(a_dp, b_dp, x0=x0, tol=1e-8)
-        assert info == 0
-
-    def test_minres_shift_parameter(self):
-        """shift != 0 solves (A - shift*I) x = b."""
-        a_np = dpnp.asnumpy(_spd_matrix(self.n, numpy.float64))
-        b_np = dpnp.asnumpy(_rhs(self.n, numpy.float64))
-        shift = 0.5
-        x_dp, info = minres(
-            dpnp.asarray(a_np), dpnp.asarray(b_np), shift=shift, tol=1e-8
-        )
-        a_shifted = a_np - shift * numpy.eye(self.n)
-        res = numpy.linalg.norm(
-            a_shifted @ dpnp.asnumpy(x_dp) - b_np
-        ) / numpy.linalg.norm(b_np)
-        assert res < 1e-4
-
-    def test_minres_non_square_raises(self):
-        a_lo = aslinearoperator(
-            dpnp.ones((4, 5), dtype=dpnp.float64)
-        )
-        b = dpnp.ones(4, dtype=dpnp.float64)
-        with assert_raises(ValueError):
-            minres(a_lo, b)
-
-    def test_minres_via_linear_operator(self):
-        a_dp = _spd_matrix(self.n, numpy.float64)
-        b_dp = _rhs(self.n, numpy.float64)
-        lo = aslinearoperator(a_dp)
-        x, info = minres(lo, b_dp, tol=1e-8)
-        assert info == 0
-
-    def test_minres_callback_called(self):
-        a_dp = _spd_matrix(self.n, numpy.float64)
-        b_dp = _rhs(self.n, numpy.float64)
-        calls = []
-
-        def _cb(xk):
-            calls.append(1)
-
-        minres(a_dp, b_dp, callback=_cb, tol=1e-8)
-        assert len(calls) > 0
-
-
-# ---------------------------------------------------------------------------
-# Integration: all solvers via LinearOperator with varying n / dtype
-# ---------------------------------------------------------------------------
-
-
-@pytest.mark.skipif(
-    not is_scipy_available(), reason="SciPy not available"
-)
-class TestSolversIntegration:
-    """Parametric integration tests — n and dtype combinations.
-
-    Follows the style of test_usm_ndarray_linalg_batch in test_linalg.py.
-    """
-
-    @pytest.mark.parametrize(
-        "n,dtype",
-        [
-            pytest.param(10, dpnp.float32, id="n=10-float32"),
-            pytest.param(10, dpnp.float64, id="n=10-float64"),
-            pytest.param(30, dpnp.float64, id="n=30-float64"),
-            pytest.param(50, dpnp.float64, id="n=50-float64"),
-        ],
-    )
-    def test_cg_spd_via_linearoperator(self, n, dtype):
-        if not has_support_aspect64() and dtype == dpnp.float64:
-            pytest.skip("float64 not supported on this device")
-        a_dp = _spd_matrix(n, dtype)
-        lo = aslinearoperator(a_dp)
-        b_dp = _rhs(n, dtype)
-        x, info = cg(lo, b_dp, tol=1e-8, maxiter=n * 10)
-        assert info == 0
-        res = float(
-            dpnp.linalg.norm(a_dp @ x - b_dp) / dpnp.linalg.norm(b_dp)
-        )
-        assert res < (1e-4 if dtype == dpnp.float32 else 1e-8)
-
-    @pytest.mark.parametrize(
-        "n,dtype",
-        [
-            pytest.param(10, dpnp.float32, id="n=10-float32"),
-            pytest.param(10, dpnp.float64, id="n=10-float64"),
-            pytest.param(30, dpnp.float64, id="n=30-float64"),
-        ],
-    )
-    def test_gmres_nonsymmetric_via_linearoperator(self, n, dtype):
-        if not has_support_aspect64() and dtype == dpnp.float64:
-            pytest.skip("float64 not supported on this device")
-        a_dp = _diag_dominant(n, dtype)
-        lo = aslinearoperator(a_dp)
-        b_dp = _rhs(n, dtype)
-        x, info = gmres(lo, b_dp, tol=1e-8, restart=n, maxiter=50)
-        assert info == 0
-
-    @pytest.mark.parametrize(
-        "n,dtype",
-        [
-            pytest.param(10, dpnp.float64, id="n=10-float64"),
-            pytest.param(30, dpnp.float64, id="n=30-float64"),
-        ],
-    )
-    def test_minres_spd_via_linearoperator(self, n, dtype):
-        if not has_support_aspect64() and dtype == dpnp.float64:
-            pytest.skip("float64 not supported on this device")
-        a_dp = _spd_matrix(n, dtype)
-        lo = aslinearoperator(a_dp)
-        b_dp = _rhs(n, dtype)
-        x, info = minres(lo, b_dp, tol=1e-8)
-        assert info == 0
-        res = float(
-            dpnp.linalg.norm(a_dp @ x - b_dp) / dpnp.linalg.norm(b_dp)
-        )
-        assert res < 1e-4

From 0badee4cc99bb8eae0346b9ac8ad04cbcb402095 Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <abagusetty@anl.gov>
Date: Thu, 9 Apr 2026 14:44:05 +0000
Subject: [PATCH 42/43] Add the missing onemkl-sycl-sparse dep to conda-recipe

---
 conda-recipe/meta.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml
index 956ff6db0133..c4a7287447ba 100644
--- a/conda-recipe/meta.yaml
+++ b/conda-recipe/meta.yaml
@@ -50,6 +50,7 @@ requirements:
       - {{ pin_compatible('onemkl-sycl-lapack', min_pin='x.x', max_pin='x') }}
       - {{ pin_compatible('onemkl-sycl-rng', min_pin='x.x', max_pin='x') }}
       - {{ pin_compatible('onemkl-sycl-vm', min_pin='x.x', max_pin='x') }}
+      - {{ pin_compatible('onemkl-sycl-sparse', min_pin='x.x', max_pin='x') }}
       - numpy
       - intel-gpu-ocl-icd-system
 

From 4be3d9a6627c5db81b029ec271016af20fe735ef Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <abagusetty@anl.gov>
Date: Thu, 9 Apr 2026 16:57:06 +0000
Subject: [PATCH 43/43] fix pre-commit issues

---
 dpnp/backend/extensions/sparse/CMakeLists.txt |   2 +-
 dpnp/backend/extensions/sparse/gemv.cpp       | 204 ++++++++----------
 dpnp/backend/extensions/sparse/gemv.hpp       |  48 ++---
 dpnp/backend/extensions/sparse/sparse_py.cpp  |  89 +++-----
 .../extensions/sparse/types_matrix.hpp        |  21 +-
 dpnp/scipy/__init__.py                        |   2 +-
 dpnp/scipy/sparse/linalg/__init__.py          |  15 +-
 dpnp/scipy/sparse/linalg/_interface.py        |  41 ++--
 dpnp/scipy/sparse/linalg/_iterative.py        |  68 +++---
 dpnp/tests/test_scipy_sparse_linalg.py        |  15 +-
 10 files changed, 220 insertions(+), 285 deletions(-)

diff --git a/dpnp/backend/extensions/sparse/CMakeLists.txt b/dpnp/backend/extensions/sparse/CMakeLists.txt
index 49f97b58b496..5ec461e316df 100644
--- a/dpnp/backend/extensions/sparse/CMakeLists.txt
+++ b/dpnp/backend/extensions/sparse/CMakeLists.txt
@@ -46,7 +46,7 @@ if(_dpnp_sycl_targets)
 endif()
 
 if(WIN32)
-  if(${CMAKE_VERSION} VERSION_LESS "3.27")
+    if(${CMAKE_VERSION} VERSION_LESS "3.27")
         # this is a work-around for target_link_options inserting option after -link option, cause
         # linker to ignore it.
         set(CMAKE_CXX_LINK_FLAGS
diff --git a/dpnp/backend/extensions/sparse/gemv.cpp b/dpnp/backend/extensions/sparse/gemv.cpp
index 91e3a8d18933..cd94c0143ce2 100644
--- a/dpnp/backend/extensions/sparse/gemv.cpp
+++ b/dpnp/backend/extensions/sparse/gemv.cpp
@@ -51,7 +51,7 @@ namespace dpnp::extensions::sparse
 {
 
 namespace mkl_sparse = oneapi::mkl::sparse;
-namespace py         = pybind11;
+namespace py = pybind11;
 namespace type_utils = dpctl::tensor::type_utils;
 
 using ext::common::init_dispatch_table;
@@ -68,12 +68,12 @@ using ext::common::init_dispatch_table;
 typedef std::pair<std::uintptr_t, sycl::event> (*gemv_init_fn_ptr_t)(
     sycl::queue &,
     oneapi::mkl::transpose,
-    const char *,   // row_ptr (typeless)
-    const char *,   // col_ind (typeless)
-    const char *,   // values  (typeless)
-    std::int64_t,   // num_rows
-    std::int64_t,   // num_cols
-    std::int64_t,   // nnz
+    const char *, // row_ptr (typeless)
+    const char *, // col_ind (typeless)
+    const char *, // values  (typeless)
+    std::int64_t, // num_rows
+    std::int64_t, // num_cols
+    std::int64_t, // nnz
     const std::vector<sycl::event> &);
 
 /**
@@ -84,15 +84,15 @@ typedef sycl::event (*gemv_compute_fn_ptr_t)(
     sycl::queue &,
     oneapi::mkl::sparse::matrix_handle_t,
     oneapi::mkl::transpose,
-    double,         // alpha (cast to Tv inside)
-    const char *,   // x (typeless)
-    double,         // beta  (cast to Tv inside)
-    char *,         // y (typeless, writable)
+    double,       // alpha (cast to Tv inside)
+    const char *, // x (typeless)
+    double,       // beta  (cast to Tv inside)
+    char *,       // y (typeless, writable)
     const std::vector<sycl::event> &);
 
 // Init dispatch: 2-D on (Tv, Ti).
-static gemv_init_fn_ptr_t
-    gemv_init_dispatch_table[dpctl_td_ns::num_types][dpctl_td_ns::num_types];
+static gemv_init_fn_ptr_t gemv_init_dispatch_table[dpctl_td_ns::num_types]
+                                                  [dpctl_td_ns::num_types];
 
 // Compute dispatch: 1-D on Tv. The index type is baked into the handle,
 // so compute doesn't need it.
@@ -105,48 +105,43 @@ static gemv_compute_fn_ptr_t
 
 template <typename Tv, typename Ti>
 static std::pair<std::uintptr_t, sycl::event>
-gemv_init_impl(sycl::queue &exec_q,
-               oneapi::mkl::transpose mkl_trans,
-               const char *row_ptr_data,
-               const char *col_ind_data,
-               const char *values_data,
-               std::int64_t num_rows,
-               std::int64_t num_cols,
-               std::int64_t nnz,
-               const std::vector<sycl::event> &depends)
+    gemv_init_impl(sycl::queue &exec_q,
+                   oneapi::mkl::transpose mkl_trans,
+                   const char *row_ptr_data,
+                   const char *col_ind_data,
+                   const char *values_data,
+                   std::int64_t num_rows,
+                   std::int64_t num_cols,
+                   std::int64_t nnz,
+                   const std::vector<sycl::event> &depends)
 {
     type_utils::validate_type_for_device<Tv>(exec_q);
 
     const Ti *row_ptr = reinterpret_cast<const Ti *>(row_ptr_data);
     const Ti *col_ind = reinterpret_cast<const Ti *>(col_ind_data);
-    const Tv *values  = reinterpret_cast<const Tv *>(values_data);
+    const Tv *values = reinterpret_cast<const Tv *>(values_data);
 
     mkl_sparse::matrix_handle_t spmat = nullptr;
     mkl_sparse::init_matrix_handle(&spmat);
 
     auto ev_set = mkl_sparse::set_csr_data(
-        exec_q, spmat,
-        num_rows, num_cols, nnz,
-        oneapi::mkl::index_base::zero,
-        const_cast<Ti *>(row_ptr),
-        const_cast<Ti *>(col_ind),
-        const_cast<Tv *>(values),
-        depends);
+        exec_q, spmat, num_rows, num_cols, nnz, oneapi::mkl::index_base::zero,
+        const_cast<Ti *>(row_ptr), const_cast<Ti *>(col_ind),
+        const_cast<Tv *>(values), depends);
 
     sycl::event ev_opt;
     try {
-        ev_opt = mkl_sparse::optimize_gemv(
-            exec_q, mkl_trans, spmat, {ev_set});
+        ev_opt = mkl_sparse::optimize_gemv(exec_q, mkl_trans, spmat, {ev_set});
     } catch (oneapi::mkl::exception const &e) {
         mkl_sparse::release_matrix_handle(exec_q, &spmat, {});
         throw std::runtime_error(
-            std::string("sparse_gemv_init: MKL exception in optimize_gemv: ")
-            + e.what());
+            std::string("sparse_gemv_init: MKL exception in optimize_gemv: ") +
+            e.what());
     } catch (sycl::exception const &e) {
         mkl_sparse::release_matrix_handle(exec_q, &spmat, {});
         throw std::runtime_error(
-            std::string("sparse_gemv_init: SYCL exception in optimize_gemv: ")
-            + e.what());
+            std::string("sparse_gemv_init: SYCL exception in optimize_gemv: ") +
+            e.what());
     }
 
     auto handle_ptr = reinterpret_cast<std::uintptr_t>(spmat);
@@ -158,32 +153,28 @@ gemv_init_impl(sycl::queue &exec_q,
 // ---------------------------------------------------------------------------
 
 template <typename Tv>
-static sycl::event
-gemv_compute_impl(sycl::queue &exec_q,
-                  mkl_sparse::matrix_handle_t spmat,
-                  oneapi::mkl::transpose mkl_trans,
-                  double alpha_d,
-                  const char *x_data,
-                  double beta_d,
-                  char *y_data,
-                  const std::vector<sycl::event> &depends)
+static sycl::event gemv_compute_impl(sycl::queue &exec_q,
+                                     mkl_sparse::matrix_handle_t spmat,
+                                     oneapi::mkl::transpose mkl_trans,
+                                     double alpha_d,
+                                     const char *x_data,
+                                     double beta_d,
+                                     char *y_data,
+                                     const std::vector<sycl::event> &depends)
 {
     // For complex Tv the single-arg constructor sets imag to zero.
     // Solvers use alpha=1, beta=0 so this is exact; other callers
     // passing complex scalars via this path will lose the imag
     // component silently.
     const Tv alpha = static_cast<Tv>(alpha_d);
-    const Tv beta  = static_cast<Tv>(beta_d);
+    const Tv beta = static_cast<Tv>(beta_d);
 
     const Tv *x = reinterpret_cast<const Tv *>(x_data);
-    Tv *y       = reinterpret_cast<Tv *>(y_data);
+    Tv *y = reinterpret_cast<Tv *>(y_data);
 
     try {
-        return mkl_sparse::gemv(
-            exec_q, mkl_trans,
-            alpha, spmat,
-            x, beta, y,
-            depends);
+        return mkl_sparse::gemv(exec_q, mkl_trans, alpha, spmat, x, beta, y,
+                                depends);
     } catch (oneapi::mkl::exception const &e) {
         throw std::runtime_error(
             std::string("sparse_gemv_compute: MKL exception: ") + e.what());
@@ -197,33 +188,35 @@ gemv_compute_impl(sycl::queue &exec_q,
 // Public entry points
 // ---------------------------------------------------------------------------
 
-static oneapi::mkl::transpose
-decode_trans(const int trans)
+static oneapi::mkl::transpose decode_trans(const int trans)
 {
     switch (trans) {
-        case 0: return oneapi::mkl::transpose::nontrans;
-        case 1: return oneapi::mkl::transpose::trans;
-        case 2: return oneapi::mkl::transpose::conjtrans;
-        default:
-            throw std::invalid_argument(
-                "sparse_gemv: trans must be 0 (N), 1 (T), or 2 (C)");
+    case 0:
+        return oneapi::mkl::transpose::nontrans;
+    case 1:
+        return oneapi::mkl::transpose::trans;
+    case 2:
+        return oneapi::mkl::transpose::conjtrans;
+    default:
+        throw std::invalid_argument(
+            "sparse_gemv: trans must be 0 (N), 1 (T), or 2 (C)");
     }
 }
 
 std::tuple<std::uintptr_t, int, sycl::event>
-sparse_gemv_init(sycl::queue &exec_q,
-                 const int trans,
-                 const dpctl::tensor::usm_ndarray &row_ptr,
-                 const dpctl::tensor::usm_ndarray &col_ind,
-                 const dpctl::tensor::usm_ndarray &values,
-                 const std::int64_t num_rows,
-                 const std::int64_t num_cols,
-                 const std::int64_t nnz,
-                 const std::vector<sycl::event> &depends)
+    sparse_gemv_init(sycl::queue &exec_q,
+                     const int trans,
+                     const dpctl::tensor::usm_ndarray &row_ptr,
+                     const dpctl::tensor::usm_ndarray &col_ind,
+                     const dpctl::tensor::usm_ndarray &values,
+                     const std::int64_t num_rows,
+                     const std::int64_t num_cols,
+                     const std::int64_t nnz,
+                     const std::vector<sycl::event> &depends)
 {
     if (!dpctl::utils::queues_are_compatible(
-            exec_q, {row_ptr.get_queue(), col_ind.get_queue(),
-                     values.get_queue()}))
+            exec_q,
+            {row_ptr.get_queue(), col_ind.get_queue(), values.get_queue()}))
         throw py::value_error(
             "sparse_gemv_init: USM allocations are not compatible with the "
             "execution queue.");
@@ -260,34 +253,32 @@ sparse_gemv_init(sycl::queue &exec_q,
             "dtype combination. Supported: {float32,float64,complex64,"
             "complex128} x {int32,int64}.");
 
-    auto [handle_ptr, ev_opt] = init_fn(
-        exec_q, mkl_trans,
-        row_ptr.get_data(), col_ind.get_data(), values.get_data(),
-        num_rows, num_cols, nnz, depends);
+    auto [handle_ptr, ev_opt] =
+        init_fn(exec_q, mkl_trans, row_ptr.get_data(), col_ind.get_data(),
+                values.get_data(), num_rows, num_cols, nnz, depends);
 
     return {handle_ptr, val_id, ev_opt};
 }
 
-sycl::event
-sparse_gemv_compute(sycl::queue &exec_q,
-                    const std::uintptr_t handle_ptr,
-                    const int val_type_id,
-                    const int trans,
-                    const double alpha,
-                    const dpctl::tensor::usm_ndarray &x,
-                    const double beta,
-                    const dpctl::tensor::usm_ndarray &y,
-                    const std::int64_t num_rows,
-                    const std::int64_t num_cols,
-                    const std::vector<sycl::event> &depends)
+sycl::event sparse_gemv_compute(sycl::queue &exec_q,
+                                const std::uintptr_t handle_ptr,
+                                const int val_type_id,
+                                const int trans,
+                                const double alpha,
+                                const dpctl::tensor::usm_ndarray &x,
+                                const double beta,
+                                const dpctl::tensor::usm_ndarray &y,
+                                const std::int64_t num_rows,
+                                const std::int64_t num_cols,
+                                const std::vector<sycl::event> &depends)
 {
     if (x.get_ndim() != 1)
         throw py::value_error("sparse_gemv_compute: x must be a 1-D array.");
     if (y.get_ndim() != 1)
         throw py::value_error("sparse_gemv_compute: y must be a 1-D array.");
 
-    if (!dpctl::utils::queues_are_compatible(
-            exec_q, {x.get_queue(), y.get_queue()}))
+    if (!dpctl::utils::queues_are_compatible(exec_q,
+                                             {x.get_queue(), y.get_queue()}))
         throw py::value_error(
             "sparse_gemv_compute: USM allocations are not compatible with the "
             "execution queue.");
@@ -302,8 +293,7 @@ sparse_gemv_compute(sycl::queue &exec_q,
     // Shape validation: op(A) is (num_rows, num_cols) for trans=N,
     // (num_cols, num_rows) for trans={T,C}.
     auto mkl_trans = decode_trans(trans);
-    const bool is_non_trans =
-        (mkl_trans == oneapi::mkl::transpose::nontrans);
+    const bool is_non_trans = (mkl_trans == oneapi::mkl::transpose::nontrans);
     const std::int64_t op_rows = is_non_trans ? num_rows : num_cols;
     const std::int64_t op_cols = is_non_trans ? num_cols : num_rows;
 
@@ -328,28 +318,22 @@ sparse_gemv_compute(sycl::queue &exec_q,
             "of the sparse matrix used to build the handle.");
 
     if (val_type_id < 0 || val_type_id >= dpctl_td_ns::num_types)
-        throw py::value_error(
-            "sparse_gemv_compute: val_type_id out of range.");
+        throw py::value_error("sparse_gemv_compute: val_type_id out of range.");
 
-    gemv_compute_fn_ptr_t compute_fn =
-        gemv_compute_dispatch_table[val_type_id];
+    gemv_compute_fn_ptr_t compute_fn = gemv_compute_dispatch_table[val_type_id];
 
     if (compute_fn == nullptr)
-        throw py::value_error(
-            "sparse_gemv_compute: unsupported value dtype.");
+        throw py::value_error("sparse_gemv_compute: unsupported value dtype.");
 
     auto spmat = reinterpret_cast<mkl_sparse::matrix_handle_t>(handle_ptr);
 
-    return compute_fn(exec_q, spmat, mkl_trans, alpha,
-                      x.get_data(), beta,
-                      const_cast<char *>(y.get_data()),
-                      depends);
+    return compute_fn(exec_q, spmat, mkl_trans, alpha, x.get_data(), beta,
+                      const_cast<char *>(y.get_data()), depends);
 }
 
-sycl::event
-sparse_gemv_release(sycl::queue &exec_q,
-                    const std::uintptr_t handle_ptr,
-                    const std::vector<sycl::event> &depends)
+sycl::event sparse_gemv_release(sycl::queue &exec_q,
+                                const std::uintptr_t handle_ptr,
+                                const std::vector<sycl::event> &depends)
 {
     auto spmat = reinterpret_cast<mkl_sparse::matrix_handle_t>(handle_ptr);
 
@@ -378,7 +362,8 @@ struct GemvInitContigFactory
 {
     fnT get()
     {
-        if constexpr (types::SparseGemvInitTypePairSupportFactory<Tv, Ti>::is_defined)
+        if constexpr (types::SparseGemvInitTypePairSupportFactory<
+                          Tv, Ti>::is_defined)
             return gemv_init_impl<Tv, Ti>;
         else
             return nullptr;
@@ -390,7 +375,8 @@ struct GemvComputeContigFactory
 {
     fnT get()
     {
-        if constexpr (types::SparseGemvComputeTypeSupportFactory<Tv>::is_defined)
+        if constexpr (types::SparseGemvComputeTypeSupportFactory<
+                          Tv>::is_defined)
             return gemv_compute_impl<Tv>;
         else
             return nullptr;
@@ -406,9 +392,7 @@ void init_sparse_gemv_dispatch_tables(void)
     // 1-D table on Tv for compute. dpctl's type dispatch headers expose
     // DispatchVectorBuilder as the 1-D analogue of DispatchTableBuilder.
     dpctl_td_ns::DispatchVectorBuilder<
-        gemv_compute_fn_ptr_t,
-        GemvComputeContigFactory,
-        dpctl_td_ns::num_types>
+        gemv_compute_fn_ptr_t, GemvComputeContigFactory, dpctl_td_ns::num_types>
         builder;
     builder.populate_dispatch_vector(gemv_compute_dispatch_table);
 }
diff --git a/dpnp/backend/extensions/sparse/gemv.hpp b/dpnp/backend/extensions/sparse/gemv.hpp
index 07f5aced7c49..0820fe9cc540 100644
--- a/dpnp/backend/extensions/sparse/gemv.hpp
+++ b/dpnp/backend/extensions/sparse/gemv.hpp
@@ -67,15 +67,15 @@ namespace dpnp::extensions::sparse
  * a reference to the CSR matrix for the lifetime of the handle.
  */
 extern std::tuple<std::uintptr_t, int, sycl::event>
-sparse_gemv_init(sycl::queue &exec_q,
-                 const int trans,
-                 const dpctl::tensor::usm_ndarray &row_ptr,
-                 const dpctl::tensor::usm_ndarray &col_ind,
-                 const dpctl::tensor::usm_ndarray &values,
-                 const std::int64_t num_rows,
-                 const std::int64_t num_cols,
-                 const std::int64_t nnz,
-                 const std::vector<sycl::event> &depends);
+    sparse_gemv_init(sycl::queue &exec_q,
+                     const int trans,
+                     const dpctl::tensor::usm_ndarray &row_ptr,
+                     const dpctl::tensor::usm_ndarray &col_ind,
+                     const dpctl::tensor::usm_ndarray &values,
+                     const std::int64_t num_rows,
+                     const std::int64_t num_cols,
+                     const std::int64_t nnz,
+                     const std::vector<sycl::event> &depends);
 
 /**
  * sparse_gemv_compute -- PER-ITERATION SpMV.
@@ -98,18 +98,17 @@ sparse_gemv_init(sycl::queue &exec_q,
  * subsequent work on the same queue; no host-side wait or host_task
  * keep-alive is performed.
  */
-extern sycl::event
-sparse_gemv_compute(sycl::queue &exec_q,
-                    const std::uintptr_t handle_ptr,
-                    const int val_type_id,
-                    const int trans,
-                    const double alpha,
-                    const dpctl::tensor::usm_ndarray &x,
-                    const double beta,
-                    const dpctl::tensor::usm_ndarray &y,
-                    const std::int64_t num_rows,
-                    const std::int64_t num_cols,
-                    const std::vector<sycl::event> &depends);
+extern sycl::event sparse_gemv_compute(sycl::queue &exec_q,
+                                       const std::uintptr_t handle_ptr,
+                                       const int val_type_id,
+                                       const int trans,
+                                       const double alpha,
+                                       const dpctl::tensor::usm_ndarray &x,
+                                       const double beta,
+                                       const dpctl::tensor::usm_ndarray &y,
+                                       const std::int64_t num_rows,
+                                       const std::int64_t num_cols,
+                                       const std::vector<sycl::event> &depends);
 
 /**
  * sparse_gemv_release -- free the matrix_handle created by sparse_gemv_init.
@@ -118,10 +117,9 @@ sparse_gemv_compute(sycl::queue &exec_q,
  * depend on it have completed. The returned event depends on the release,
  * so the caller can chain CSR buffer deallocation on it safely.
  */
-extern sycl::event
-sparse_gemv_release(sycl::queue &exec_q,
-                    const std::uintptr_t handle_ptr,
-                    const std::vector<sycl::event> &depends);
+extern sycl::event sparse_gemv_release(sycl::queue &exec_q,
+                                       const std::uintptr_t handle_ptr,
+                                       const std::vector<sycl::event> &depends);
 
 /**
  * Register the init (2-D on Tv x Ti) and compute (1-D on Tv) dispatch
diff --git a/dpnp/backend/extensions/sparse/sparse_py.cpp b/dpnp/backend/extensions/sparse/sparse_py.cpp
index 9b3dc16d3b01..3f018595ea81 100644
--- a/dpnp/backend/extensions/sparse/sparse_py.cpp
+++ b/dpnp/backend/extensions/sparse/sparse_py.cpp
@@ -83,37 +83,23 @@ PYBIND11_MODULE(_sparse_impl, m)
     // ------------------------------------------------------------------
     m.def(
         "_sparse_gemv_init",
-        [](sycl::queue &exec_q,
-           const int trans,
+        [](sycl::queue &exec_q, const int trans,
            const dpctl::tensor::usm_ndarray &row_ptr,
            const dpctl::tensor::usm_ndarray &col_ind,
            const dpctl::tensor::usm_ndarray &values,
-           const std::int64_t num_rows,
-           const std::int64_t num_cols,
-           const std::int64_t nnz,
-           const std::vector<sycl::event> &depends)
-            -> std::tuple<std::uintptr_t, int, sycl::event>
-        {
-            return sparse_gemv_init(
-                exec_q, trans,
-                row_ptr, col_ind, values,
-                num_rows, num_cols, nnz,
-                depends);
+           const std::int64_t num_rows, const std::int64_t num_cols,
+           const std::int64_t nnz, const std::vector<sycl::event> &depends)
+            -> std::tuple<std::uintptr_t, int, sycl::event> {
+            return sparse_gemv_init(exec_q, trans, row_ptr, col_ind, values,
+                                    num_rows, num_cols, nnz, depends);
         },
-        py::arg("exec_q"),
-        py::arg("trans"),
-        py::arg("row_ptr"),
-        py::arg("col_ind"),
-        py::arg("values"),
-        py::arg("num_rows"),
-        py::arg("num_cols"),
-        py::arg("nnz"),
-        py::arg("depends"),
+        py::arg("exec_q"), py::arg("trans"), py::arg("row_ptr"),
+        py::arg("col_ind"), py::arg("values"), py::arg("num_rows"),
+        py::arg("num_cols"), py::arg("nnz"), py::arg("depends"),
         "Initialise oneMKL sparse matrix handle "
         "(set_csr_data + optimize_gemv). "
         "Returns (handle_ptr: int, val_type_id: int, event). "
-        "Call once per operator."
-    );
+        "Call once per operator.");
 
     // ------------------------------------------------------------------
     // _sparse_gemv_compute(exec_q, handle, val_type_id, trans, alpha,
@@ -131,39 +117,22 @@ PYBIND11_MODULE(_sparse_impl, m)
     // ------------------------------------------------------------------
     m.def(
         "_sparse_gemv_compute",
-        [](sycl::queue &exec_q,
-           const std::uintptr_t handle_ptr,
-           const int val_type_id,
-           const int trans,
-           const double alpha,
-           const dpctl::tensor::usm_ndarray &x,
-           const double beta,
-           const dpctl::tensor::usm_ndarray &y,
-           const std::int64_t num_rows,
+        [](sycl::queue &exec_q, const std::uintptr_t handle_ptr,
+           const int val_type_id, const int trans, const double alpha,
+           const dpctl::tensor::usm_ndarray &x, const double beta,
+           const dpctl::tensor::usm_ndarray &y, const std::int64_t num_rows,
            const std::int64_t num_cols,
-           const std::vector<sycl::event> &depends)
-            -> sycl::event
-        {
-            return sparse_gemv_compute(
-                exec_q, handle_ptr, val_type_id, trans, alpha,
-                x, beta, y,
-                num_rows, num_cols,
-                depends);
+           const std::vector<sycl::event> &depends) -> sycl::event {
+            return sparse_gemv_compute(exec_q, handle_ptr, val_type_id, trans,
+                                       alpha, x, beta, y, num_rows, num_cols,
+                                       depends);
         },
-        py::arg("exec_q"),
-        py::arg("handle"),
-        py::arg("val_type_id"),
-        py::arg("trans"),
-        py::arg("alpha"),
-        py::arg("x"),
-        py::arg("beta"),
-        py::arg("y"),
-        py::arg("num_rows"),
-        py::arg("num_cols"),
+        py::arg("exec_q"), py::arg("handle"), py::arg("val_type_id"),
+        py::arg("trans"), py::arg("alpha"), py::arg("x"), py::arg("beta"),
+        py::arg("y"), py::arg("num_rows"), py::arg("num_cols"),
         py::arg("depends"),
         "Execute sparse::gemv using a pre-built handle. "
-        "Returns the gemv event."
-    );
+        "Returns the gemv event.");
 
     // ------------------------------------------------------------------
     // _sparse_gemv_release(exec_q, handle, depends) -> event
@@ -175,16 +144,10 @@ PYBIND11_MODULE(_sparse_impl, m)
     // ------------------------------------------------------------------
     m.def(
         "_sparse_gemv_release",
-        [](sycl::queue &exec_q,
-           const std::uintptr_t handle_ptr,
-           const std::vector<sycl::event> &depends)
-            -> sycl::event
-        {
+        [](sycl::queue &exec_q, const std::uintptr_t handle_ptr,
+           const std::vector<sycl::event> &depends) -> sycl::event {
             return sparse_gemv_release(exec_q, handle_ptr, depends);
         },
-        py::arg("exec_q"),
-        py::arg("handle"),
-        py::arg("depends"),
-        "Release the oneMKL matrix_handle created by _sparse_gemv_init."
-    );
+        py::arg("exec_q"), py::arg("handle"), py::arg("depends"),
+        "Release the oneMKL matrix_handle created by _sparse_gemv_init.");
 }
diff --git a/dpnp/backend/extensions/sparse/types_matrix.hpp b/dpnp/backend/extensions/sparse/types_matrix.hpp
index c02a7e4ce47e..42145a4ab4d2 100644
--- a/dpnp/backend/extensions/sparse/types_matrix.hpp
+++ b/dpnp/backend/extensions/sparse/types_matrix.hpp
@@ -73,11 +73,15 @@ struct SparseGemvInitTypePairSupportFactory
         dpctl_td_ns::TypePairDefinedEntry<Tv, double, Ti, std::int32_t>,
         dpctl_td_ns::TypePairDefinedEntry<Tv, double, Ti, std::int64_t>,
         // complex single precision
-        dpctl_td_ns::TypePairDefinedEntry<Tv, std::complex<float>, Ti, std::int32_t>,
-        dpctl_td_ns::TypePairDefinedEntry<Tv, std::complex<float>, Ti, std::int64_t>,
+        dpctl_td_ns::
+            TypePairDefinedEntry<Tv, std::complex<float>, Ti, std::int32_t>,
+        dpctl_td_ns::
+            TypePairDefinedEntry<Tv, std::complex<float>, Ti, std::int64_t>,
         // complex double precision
-        dpctl_td_ns::TypePairDefinedEntry<Tv, std::complex<double>, Ti, std::int32_t>,
-        dpctl_td_ns::TypePairDefinedEntry<Tv, std::complex<double>, Ti, std::int64_t>,
+        dpctl_td_ns::
+            TypePairDefinedEntry<Tv, std::complex<double>, Ti, std::int32_t>,
+        dpctl_td_ns::
+            TypePairDefinedEntry<Tv, std::complex<double>, Ti, std::int64_t>,
         // fall-through
         dpctl_td_ns::NotDefinedEntry>::is_defined;
 };
@@ -100,17 +104,16 @@ template <typename Tv>
 struct SparseGemvComputeTypeSupportFactory
 {
 #if defined(DPCTL_HAS_TYPE_DEFINED_ENTRY)
-    static constexpr bool is_defined = std::disjunction
-        dpctl_td_ns::TypeDefinedEntry<Tv, float>,
+    static constexpr bool
+        is_defined = std::disjunction dpctl_td_ns::TypeDefinedEntry<Tv, float>,
         dpctl_td_ns::TypeDefinedEntry<Tv, double>,
         dpctl_td_ns::TypeDefinedEntry<Tv, std::complex<float>>,
         dpctl_td_ns::TypeDefinedEntry<Tv, std::complex<double>>,
-        dpctl_td_ns::NotDefinedEntry>::is_defined;
+        dpctl_td_ns::NotDefinedEntry > ::is_defined;
 #else
     // Portable fallback: works with any dpctl version.
     static constexpr bool is_defined =
-        std::is_same_v<Tv, float>               ||
-        std::is_same_v<Tv, double>              ||
+        std::is_same_v<Tv, float> || std::is_same_v<Tv, double> ||
         std::is_same_v<Tv, std::complex<float>> ||
         std::is_same_v<Tv, std::complex<double>>;
 #endif
diff --git a/dpnp/scipy/__init__.py b/dpnp/scipy/__init__.py
index 7886299c9f9d..ceb1f9df932e 100644
--- a/dpnp/scipy/__init__.py
+++ b/dpnp/scipy/__init__.py
@@ -36,6 +36,6 @@
 DPNP functionality, reusing DPNP and oneMKL implementations underneath.
 """
 
-from . import linalg, special, sparse
+from . import linalg, sparse, special
 
 __all__ = ["linalg", "special", "sparse"]
diff --git a/dpnp/scipy/sparse/linalg/__init__.py b/dpnp/scipy/sparse/linalg/__init__.py
index fb09329a2d12..30124562447e 100644
--- a/dpnp/scipy/sparse/linalg/__init__.py
+++ b/dpnp/scipy/sparse/linalg/__init__.py
@@ -26,9 +26,8 @@
 # THE POSSIBILITY OF SUCH DAMAGE.
 # *****************************************************************************
 
-from __future__ import annotations
-
-"""Sparse linear algebra interface for DPNP.
+"""
+Sparse linear algebra interface for DPNP.
 
 This module provides a subset of :mod:`scipy.sparse.linalg`
  functionality on top of DPNP arrays.
@@ -37,13 +36,9 @@
 and a small set of Krylov solvers (``cg``, ``gmres``, ``minres``).
 """
 
+from __future__ import annotations
+
 from ._interface import LinearOperator, aslinearoperator
 from ._iterative import cg, gmres, minres
 
-__all__ = [
-    "LinearOperator",
-    "aslinearoperator",
-    "cg",
-    "gmres",
-    "minres",
-]
+__all__ = ["LinearOperator", "aslinearoperator", "cg", "gmres", "minres"]
diff --git a/dpnp/scipy/sparse/linalg/_interface.py b/dpnp/scipy/sparse/linalg/_interface.py
index 623ada2c33cc..e071242f6ba3 100644
--- a/dpnp/scipy/sparse/linalg/_interface.py
+++ b/dpnp/scipy/sparse/linalg/_interface.py
@@ -110,15 +110,14 @@ def __init__(self, dtype, shape):
         shape = tuple(int(s) for s in shape)
         if not _isshape(shape):
             raise ValueError(
-                f"invalid shape {shape!r} (must be a length-2 tuple of non-negative ints)"
+                f"invalid shape {shape!r} (must be a length-2 tuple of "
+                "non-negative ints)"
             )
         self.dtype = dtype
         self.shape = shape
 
     def _init_dtype(self):
-        """
-        Infer dtype via a trial matvec on a zero vector.
-        """
+        """Infer dtype via a trial matvec on a zero vector."""
         if self.dtype is not None:
             return
         v = dpnp.zeros(self.shape[-1], dtype=dpnp.float64)
@@ -145,24 +144,29 @@ def _rmatmat(self, X):
         return self.H.matmat(X)
 
     def matvec(self, x):
+        """Apply the matrix-vector product."""
         M, N = self.shape
         if x.shape not in ((N,), (N, 1)):
             raise ValueError(
-                f"dimension mismatch: operator shape {self.shape}, vector shape {x.shape}"
+                f"dimension mismatch: operator shape {self.shape}, "
+                "vector shape {x.shape}"
             )
         y = self._matvec(x)
         return y.reshape(M) if x.ndim == 1 else y.reshape(M, 1)
 
     def rmatvec(self, x):
+        """Apply the adjoint matrix-vector product."""
         M, N = self.shape
         if x.shape not in ((M,), (M, 1)):
             raise ValueError(
-                f"dimension mismatch: operator shape {self.shape}, vector shape {x.shape}"
+                f"dimension mismatch: operator shape {self.shape}, "
+                "vector shape {x.shape}"
             )
         y = self._rmatvec(x)
         return y.reshape(N) if x.ndim == 1 else y.reshape(N, 1)
 
     def matmat(self, X):
+        """Apply the matrix-matrix product."""
         if X.ndim != 2:
             raise ValueError(f"expected 2-D array, got {X.ndim}-D")
         if X.shape[0] != self.shape[1]:
@@ -172,6 +176,7 @@ def matmat(self, X):
         return self._matmat(X)
 
     def rmatmat(self, X):
+        """Apply the adjoint matrix-matrix product."""
         if X.ndim != 2:
             raise ValueError(f"expected 2-D array, got {X.ndim}-D")
         if X.shape[0] != self.shape[0]:
@@ -199,6 +204,7 @@ def __call__(self, x):
         return self * x
 
     def __mul__(self, x):
+        """Multiply operator by array x."""
         return self.dot(x)
 
     def __matmul__(self, x):
@@ -261,16 +267,14 @@ def __repr__(self):
         dt = (
             "unspecified dtype" if self.dtype is None else f"dtype={self.dtype}"
         )
-        return f"<{self.shape[0]}x{self.shape[1]} {self.__class__.__name__} with {dt}>"
-
-
-# ---------------------------------------------------------------------------
-# Concrete operator classes
-# ---------------------------------------------------------------------------
+        return (
+            f"<{self.shape[0]}x{self.shape[1]}"
+            f" {self.__class__.__name__} with {dt}>"
+        )
 
 
 class _CustomLinearOperator(LinearOperator):
-    """Created when the user calls LinearOperator(shape, matvec=...) directly."""
+    """Created when the user calls LinearOperator(shape, matvec=...)"""
 
     def __init__(
         self, shape, matvec, rmatvec=None, matmat=None, dtype=None, rmatmat=None
@@ -321,16 +325,16 @@ def __init__(self, A):
         self.args = (A,)
 
     def _matvec(self, x):
-        return self.A._rmatvec(x)
+        return self.A._rmatvec(x) # pylint: disable=protected-access
 
     def _rmatvec(self, x):
-        return self.A._matvec(x)
+        return self.A._matvec(x) # pylint: disable=protected-access
 
     def _matmat(self, X):
-        return self.A._rmatmat(X)
+        return self.A._rmatmat(X) # pylint: disable=protected-access
 
     def _rmatmat(self, X):
-        return self.A._matmat(X)
+        return self.A._matmat(X) # pylint: disable=protected-access
 
     def _adjoint(self):
         return self.A
@@ -504,6 +508,7 @@ def __init__(self, shape, dtype=None):
         super().__init__(dtype, shape)
 
     def _matvec(self, x):
+        """Apply matrix-vector product via stored array."""
         return x
 
     def _rmatvec(self, x):
@@ -535,7 +540,7 @@ def aslinearoperator(A) -> LinearOperator:
         return A
 
     try:
-        from dpnp.scipy import sparse as _sp
+        from dpnp.scipy import sparse as _sp # pylint: disable=import-outside-toplevel
 
         if _sp.issparse(A):
             return MatrixLinearOperator(A)
diff --git a/dpnp/scipy/sparse/linalg/_iterative.py b/dpnp/scipy/sparse/linalg/_iterative.py
index 786d7f9f92de..de09d2684115 100644
--- a/dpnp/scipy/sparse/linalg/_iterative.py
+++ b/dpnp/scipy/sparse/linalg/_iterative.py
@@ -63,12 +63,13 @@
 
 from __future__ import annotations
 
-from typing import Callable, Optional, Tuple
+from typing import Callable
 
+import dpctl.utils as dpu
 import numpy
+
 import dpnp
 import dpnp.backend.extensions.blas._blas_impl as bi
-import dpctl.utils as dpu
 
 from ._interface import IdentityOperator, LinearOperator, aslinearoperator
 
@@ -86,17 +87,10 @@
 
 _SUPPORTED_DTYPES = frozenset("fdFD")
 
-
-# ---------------------------------------------------------------------------
-# Internal helpers
-# ---------------------------------------------------------------------------
-
-
 def _np_dtype(dp_dtype) -> numpy.dtype:
-    """Normalise any dtype-like (dpnp type, numpy type, string) to numpy.dtype."""
+    """Normalise any dtype-like (dpnp type/numpy type/string) to numpy.dtype."""
     return numpy.dtype(dp_dtype)
 
-
 def _check_dtype(dtype, name: str) -> None:
     if _np_dtype(dtype).char not in _SUPPORTED_DTYPES:
         raise TypeError(
@@ -104,7 +98,6 @@ def _check_dtype(dtype, name: str) -> None:
             "only float32, float64, complex64, complex128 are accepted."
         )
 
-
 class _CachedSpMV:
     """
     Wrap a CSR matrix with a persistent oneMKL matrix_handle.
@@ -174,7 +167,7 @@ def __init__(self, A, trans: int = 0):
         self._val_type_id = val_type_id
 
     def __call__(self, x: dpnp.ndarray) -> dpnp.ndarray:
-        """y = op(A) * x -- only sparse::gemv fires, fully async."""
+        """Y = op(A) * x -- only sparse::gemv fires, fully async."""
         y = dpnp.empty(
             self._out_size, dtype=self._dtype, sycl_queue=self._exec_q
         )
@@ -219,9 +212,11 @@ def __init__(self, A):
         self._adjoint = None
 
     def matvec(self, x):
+        """Apply the operator to vector x."""
         return self.forward(x)
 
     def rmatvec(self, x):
+        """Return the data type of the operator."""
         if self._adjoint is None:
             # Build conjtrans handle on first use. For real dtypes
             # this is equivalent to trans=1.
@@ -315,7 +310,8 @@ def _make_system(A, M, x0, b):
         M_op = aslinearoperator(M)
         if M_op.shape != A_op.shape:
             raise ValueError(
-                f"preconditioner shape {M_op.shape} != operator shape {A_op.shape}"
+                f"preconditioner shape {M_op.shape} != "
+                f"operator shape {A_op.shape}"
             )
 
         fast_mv_M = _make_fast_matvec(M)
@@ -365,24 +361,18 @@ def _get_atol(b_norm: float, atol, rtol: float) -> float:
         )
     return max(atol, float(rtol) * float(b_norm))
 
-
-# ---------------------------------------------------------------------------
-# Conjugate Gradient
-# ---------------------------------------------------------------------------
-
-
 def cg(
     A,
     b,
-    x0: Optional[dpnp.ndarray] = None,
+    x0: dpnp.ndarray | None = None,
     *,
     rtol: float = 1e-5,
-    tol: Optional[float] = None,
-    maxiter: Optional[int] = None,
+    tol: float | None = None,
+    maxiter: int | None = None,
     M=None,
-    callback: Optional[Callable] = None,
+    callback: Callable | None = None,
     atol=None,
-) -> Tuple[dpnp.ndarray, int]:
+) -> tuple[dpnp.ndarray, int]:
     """Conjugate Gradient -- pure dpnp/oneMKL, Hermitian positive definite A.
 
     Parameters
@@ -433,7 +423,7 @@ def cg(
 
     info = maxiter
 
-    for k in range(maxiter):
+    for _k in range(maxiter):
         # Convergence check (sync).
         rnorm = dpnp.linalg.norm(r)
         if float(rnorm) <= atol_eff_host:
@@ -473,16 +463,16 @@ def cg(
 def gmres(
     A,
     b,
-    x0: Optional[dpnp.ndarray] = None,
+    x0: dpnp.ndarray | None = None,
     *,
     rtol: float = 1e-5,
     atol: float = 0.0,
-    restart: Optional[int] = None,
-    maxiter: Optional[int] = None,
+    restart: int | None = None,
+    maxiter: int | None = None,
     M=None,
-    callback: Optional[Callable] = None,
-    callback_type: Optional[str] = None,
-) -> Tuple[dpnp.ndarray, int]:
+    callback: Callable | None = None,
+    callback_type: str | None = None,
+) -> tuple[dpnp.ndarray, int]:
     """Uses Generalized Minimal RESidual iteration to solve ``Ax = b``.
 
     Parameters
@@ -606,17 +596,17 @@ def gmres(
 def minres(
     A,
     b,
-    x0: Optional[dpnp.ndarray] = None,
+    x0: dpnp.ndarray | None = None,
     *,
     rtol: float = 1e-5,
     shift: float = 0.0,
-    tol: Optional[float] = None,
-    maxiter: Optional[int] = None,
+    tol: float | None = None,
+    maxiter: int | None = None,
     M=None,
-    callback: Optional[Callable] = None,
+    callback: Callable | None = None,
     show: bool = False,
     check: bool = False,
-) -> Tuple[dpnp.ndarray, int]:
+) -> tuple[dpnp.ndarray, int]:
     """Uses MINimum RESidual iteration to solve ``Ax = b``.
 
     Solves the symmetric (possibly indefinite) system ``Ax = b`` or,
@@ -798,7 +788,6 @@ def minres(
         epsln = sn * beta
         dbar = -cs * beta
         root = numpy.sqrt(gbar**2 + dbar**2)
-        Arnorm = phibar * root  # ||A r_{k-1}||
 
         # Compute the next plane rotation Q_k.
         gamma = numpy.sqrt(gbar**2 + beta**2)
@@ -908,8 +897,8 @@ def _make_compute_hu(V):
     """Factory mirroring cupyx's _make_compute_hu using oneMKL gemv directly.
 
     Returns a closure compute_hu(u, j) that performs:
-        h = V[:, :j+1]^H @ u            (gemv with transpose=True)
-        u = u - V[:, :j+1] @ h          (gemv with transpose=False, then subtract)
+        h = V[:, :j+1]^H @ u     (gemv with transpose=True)
+        u = u - V[:, :j+1] @ h   (gemv with transpose=False, then subtract)
 
     The current bi._gemv binding hardcodes alpha=1, beta=0, so the second
     pass requires a temporary vector and an explicit subtraction.  To get
@@ -928,7 +917,6 @@ def _make_compute_hu(V):
     exec_q = V.sycl_queue
     dtype = V.dtype
     is_cpx = dpnp.issubdtype(dtype, dpnp.complexfloating)
-    V_usm = dpnp.get_usm_ndarray(V)
 
     def compute_hu(u, j):
         # h = V[:, :j+1]^H @ u  (allocate fresh, length j+1)
diff --git a/dpnp/tests/test_scipy_sparse_linalg.py b/dpnp/tests/test_scipy_sparse_linalg.py
index bce364ef3739..b11aa1fcc796 100644
--- a/dpnp/tests/test_scipy_sparse_linalg.py
+++ b/dpnp/tests/test_scipy_sparse_linalg.py
@@ -8,6 +8,13 @@
 )
 
 import dpnp
+from dpnp.scipy.sparse.linalg import (
+    LinearOperator,
+    aslinearoperator,
+    cg,
+    gmres,
+    minres,
+)
 from dpnp.tests.helper import (
     assert_dtype_allclose,
     generate_random_numpy_array,
@@ -18,14 +25,6 @@
 )
 from dpnp.tests.third_party.cupy import testing
 
-from dpnp.scipy.sparse.linalg import (
-    LinearOperator,
-    aslinearoperator,
-    cg,
-    gmres,
-    minres,
-)
-
 if is_scipy_available():
     import scipy.sparse.linalg as scipy_sla