Skip to content

Commit 2ae5387

Browse files
committed
Restrict matmul fast path to supported 2D cases and fall back to chunked path
1 parent f84b360 commit 2ae5387

4 files changed

Lines changed: 185 additions & 84 deletions

File tree

src/blosc2/blosc2_ext.pyx

Lines changed: 14 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -2102,6 +2102,7 @@ cdef class SChunk:
21022102
cpdef remove_prefilter(self, func_name, _new_ctx=True):
21032103
cdef udf_udata* udf_data
21042104
cdef user_filters_udata* udata
2105+
cdef mm_udata* mm_data
21052106

21062107
if func_name is not None and func_name in blosc2.prefilter_funcs:
21072108
del blosc2.prefilter_funcs[func_name]
@@ -2123,6 +2124,13 @@ cdef class SChunk:
21232124
if me_data.eval_params != NULL:
21242125
free(me_data.eval_params)
21252126
free(me_data)
2127+
elif self.schunk.storage.cparams.prefilter == <blosc2_prefilter_fn>matmul_prefilter:
2128+
if self.schunk.storage.cparams.preparams != NULL:
2129+
mm_data = <mm_udata*>self.schunk.storage.cparams.preparams.user_data
2130+
if mm_data != NULL:
2131+
if mm_data.inputs != NULL:
2132+
free(mm_data.inputs)
2133+
free(mm_data)
21262134
elif self.schunk.storage.cparams.prefilter != NULL:
21272135
# From Python the preparams->udata will always have the field py_func
21282136
if self.schunk.storage.cparams.preparams != NULL:
@@ -2408,6 +2416,8 @@ cdef int aux_matmul(mm_udata *udata, int64_t nchunk, int32_t nblock, void *param
24082416
out_block_nrows = out_arr.blockshape[ndim - 2]
24092417
out_block_ncols = out_arr.blockshape[ndim - 1]
24102418

2419+
memset(params_output, 0, out_arr.blocknitems * typesize)
2420+
24112421
dctx = blosc2_create_dctx(BLOSC2_DPARAMS_DEFAULTS)
24122422

24132423
first_run = True
@@ -2464,13 +2474,13 @@ cdef int aux_matmul(mm_udata *udata, int64_t nchunk, int32_t nblock, void *param
24642474
if rc < 0:
24652475
raise ValueError("matmul: error decompressing the B chunk")
24662476
batch = 0
2467-
offsetA = 0
2468-
offsetB = 0
2469-
offset = 0
24702477
while batch < batches:
24712478
batch_ = batch
2479+
offsetA = 0
2480+
offsetB = 0
2481+
offset = 0
24722482
for i in range(ndim - 2):
2473-
coord = batch // udata.el_strides[0][i]
2483+
coord = batch_ // udata.el_strides[0][i]
24742484
batch_ = batch_ % udata.el_strides[0][i]
24752485
offsetA += coord * udata.el_strides[1][i]
24762486
offsetB += coord * udata.el_strides[2][i]

src/blosc2/linalg.py

Lines changed: 81 additions & 79 deletions
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,79 @@
2323
from collections.abc import Sequence
2424

2525

26-
def matmul(x1: blosc2.Array, x2: blosc2.NDArray, **kwargs: Any) -> blosc2.NDArray: # noqa: C901
26+
def _matmul_chunked(
27+
x1: blosc2.Array, x2: blosc2.NDArray, result: blosc2.NDArray, n: int, m: int, k: int
28+
) -> None:
29+
p, q = result.chunks[-2:]
30+
r = x2.chunks[-1]
31+
32+
intersecting_chunks = get_intersecting_chunks((), result.shape[:-2], result.chunks[:-2])
33+
for chunk in intersecting_chunks:
34+
chunk = chunk.raw
35+
for row in range(0, n, p):
36+
row_end = builtins.min(row + p, n)
37+
for col in range(0, m, q):
38+
col_end = builtins.min(col + q, m)
39+
for aux in range(0, k, r):
40+
aux_end = builtins.min(aux + r, k)
41+
bx1 = (
42+
x1[chunk[-x1.ndim + 2 :] + (slice(row, row_end), slice(aux, aux_end))]
43+
if x1.ndim > 2
44+
else x1[row:row_end, aux:aux_end]
45+
)
46+
bx2 = (
47+
x2[chunk[-x2.ndim + 2 :] + (slice(aux, aux_end), slice(col, col_end))]
48+
if x2.ndim > 2
49+
else x2[aux:aux_end, col:col_end]
50+
)
51+
result[chunk + (slice(row, row_end), slice(col, col_end))] += np.matmul(bx1, bx2)
52+
53+
54+
def _matmul_can_use_fast_path(
55+
x1: blosc2.Array, x2: blosc2.NDArray, result: blosc2.NDArray, use_miniexpr: bool
56+
) -> bool:
57+
if not use_miniexpr:
58+
return False
59+
60+
ops = (x1, x2, result)
61+
all_ndarray = all(isinstance(value, blosc2.NDArray) and value.shape != () for value in ops)
62+
if not all_ndarray:
63+
return False
64+
65+
# The current prefilter-backed implementation is only supported for 2-D layouts.
66+
if result.ndim != 2 or x1.ndim != 2 or x2.ndim != 2:
67+
return False
68+
69+
if any(op.dtype != ops[0].dtype for op in ops):
70+
return False
71+
72+
chunks_aligned = x1.chunks[-2] % x1.blocks[-2] == 0
73+
chunks_aligned &= x2.chunks[-1] % x2.blocks[-1] == 0
74+
chunks_aligned &= x2.chunks[-2] % x1.blocks[-1] == 0
75+
if not chunks_aligned:
76+
return False
77+
78+
same_blocks = x2.blocks[-2] == x1.blocks[-1]
79+
same_blocks &= x2.blocks[-1] == result.blocks[-1]
80+
same_blocks &= result.blocks[-2] == x1.blocks[-2]
81+
if not same_blocks:
82+
return False
83+
84+
try:
85+
result_blocks = np.broadcast_shapes(x1.blocks, x2.blocks)
86+
except ValueError:
87+
return False
88+
if result_blocks[:-2] != result.blocks[:-2]:
89+
return False
90+
91+
if x1.dtype.kind not in ("i", "f"):
92+
return False
93+
if x2.dtype.kind not in ("i", "f"):
94+
return False
95+
return x1.dtype == x2.dtype
96+
97+
98+
def matmul(x1: blosc2.Array, x2: blosc2.NDArray, **kwargs: Any) -> blosc2.NDArray:
2799
"""
28100
Computes the matrix product between two Blosc2 NDArrays.
29101
@@ -112,60 +184,10 @@ def matmul(x1: blosc2.Array, x2: blosc2.NDArray, **kwargs: Any) -> blosc2.NDArra
112184
kwargs["_chunksize_reduc_factor"] = 1
113185
result = blosc2.zeros(result_shape, dtype=blosc2.result_type(x1, x2), **kwargs)
114186

115-
# multithreaded matmul
116-
# TODO: handle a) type promotion, b) padding (explicitly), c) (improved) >2D
117-
ops = (x1, x2, result)
118-
all_ndarray = all(isinstance(value, blosc2.NDArray) and value.shape != () for value in ops)
119187
global try_miniexpr
120188

121-
# Use a local copy so we don't modify the global
122-
use_miniexpr = try_miniexpr
123189
if 0 not in result.shape + x1.shape + x2.shape: # if any array is empty, return array of 0s
124-
if all_ndarray:
125-
if any(op.dtype != ops[0].dtype for op in ops): # TODO: Remove this condition
126-
use_miniexpr = False
127-
128-
# TODO: We can relax this to even just load according to result blockshape, but that's difficult.
129-
# Just force same chunk/block shapes
130-
# same_chunks = all(op.chunks == result.chunks for op in (x1, x2))
131-
# same_blocks = all(op.blocks == result.blocks for op in (x1, x2))
132-
# same_shape = all(op.shape == result.shape for op in (x1, x2))
133-
134-
# use_miniexpr &= same_blocks & same_chunks & same_shape
135-
# Two easier cases are presented below
136-
# Case 1: Might want to restrict loading across chunk boundaries, in which case would require:
137-
# x1.chunks[-2] % result.blocks[-2] == 0
138-
# x2.chunks[-1] % result.blocks[-1] == 0
139-
# x2.chunks[-2] % x1.blocks[-1] == 0
140-
# Can then load in x1 as slices of size [result.blocks[-2], x1.blocks[-1]]
141-
# and x2 in slices of [x1.blocks[-1], result.blocks[-1]]
142-
143-
# Case 2: Slightly easier to implement this maybe
144-
# Require that blocks are matmul compatible and broadcastable directly to result
145-
# (M, K) x (K, N) = (M, N)
146-
# so can load block-by-block for inputs and calculate block of output
147-
# Also need to avoid loading across chunk boundaries
148-
chunks_aligned = x1.chunks[-2] % x1.blocks[-2] == 0
149-
chunks_aligned &= x2.chunks[-1] % x2.blocks[-1] == 0
150-
chunks_aligned &= x2.chunks[-2] % x1.blocks[-1] == 0
151-
same_blocks = x2.blocks[-2] == x1.blocks[-1]
152-
same_blocks &= x2.blocks[-1] == result.blocks[-1]
153-
same_blocks &= result.blocks[-2] == x1.blocks[-2]
154-
try:
155-
result_blocks = np.broadcast_shapes(x1.blocks, x2.blocks)
156-
if not (same_blocks and chunks_aligned and result_blocks[:-2] == result.blocks[:-2]):
157-
use_miniexpr = False
158-
except ValueError:
159-
use_miniexpr = False
160-
161-
use_miniexpr &= x1.dtype.kind in ("i", "f")
162-
use_miniexpr &= x2.dtype.kind in ("i", "f")
163-
use_miniexpr &= x1.dtype == x2.dtype
164-
165-
else:
166-
use_miniexpr = False
167-
168-
if use_miniexpr:
190+
if _matmul_can_use_fast_path(x1, x2, result, try_miniexpr):
169191
prefilter_set = False
170192
try:
171193
result._set_pref_matmul({"x1": x1, "x2": x2}, fp_accuracy=blosc2.FPAccuracy.DEFAULT)
@@ -174,36 +196,16 @@ def matmul(x1: blosc2.Array, x2: blosc2.NDArray, **kwargs: Any) -> blosc2.NDArra
174196
data = np.empty(result.schunk.chunksize, dtype=np.uint8)
175197
for nchunk_out in range(result.schunk.nchunks):
176198
result.schunk.update_data(nchunk_out, data, copy=False)
177-
except Exception as e:
178-
raise Exception from e
199+
except Exception as exc:
200+
warnings.warn(
201+
f"Fast matmul path unavailable; falling back to chunked path: {exc}", RuntimeWarning
202+
)
203+
_matmul_chunked(x1, x2, result, n, m, k)
179204
finally:
180205
if prefilter_set:
181206
result.schunk.remove_prefilter("miniexpr")
182-
else: # couldn't do multithreading
183-
print("multithreading failed :( ")
184-
p, q = result.chunks[-2:]
185-
r = x2.chunks[-1]
186-
187-
intersecting_chunks = get_intersecting_chunks((), result.shape[:-2], result.chunks[:-2])
188-
for chunk in intersecting_chunks:
189-
chunk = chunk.raw
190-
for row in range(0, n, p):
191-
row_end = builtins.min(row + p, n)
192-
for col in range(0, m, q):
193-
col_end = builtins.min(col + q, m)
194-
for aux in range(0, k, r):
195-
aux_end = builtins.min(aux + r, k)
196-
bx1 = (
197-
x1[chunk[-x1.ndim + 2 :] + (slice(row, row_end), slice(aux, aux_end))]
198-
if x1.ndim > 2
199-
else x1[row:row_end, aux:aux_end]
200-
)
201-
bx2 = (
202-
x2[chunk[-x2.ndim + 2 :] + (slice(aux, aux_end), slice(col, col_end))]
203-
if x2.ndim > 2
204-
else x2[aux:aux_end, col:col_end]
205-
)
206-
result[chunk + (slice(row, row_end), slice(col, col_end))] += np.matmul(bx1, bx2)
207+
else:
208+
_matmul_chunked(x1, x2, result, n, m, k)
207209

208210
if x1_is_vector:
209211
result = result.squeeze(axis=-2)

src/blosc2/utils.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@
99
import builtins
1010
import inspect
1111
import math
12+
import sys
1213
import warnings
1314
from itertools import product
1415

@@ -26,6 +27,10 @@
2627
def _toggle_miniexpr(FLAG):
2728
global try_miniexpr
2829
try_miniexpr = FLAG
30+
for module_name in ("blosc2.lazyexpr", "blosc2.linalg"):
31+
module = sys.modules.get(module_name)
32+
if module is not None:
33+
module.try_miniexpr = FLAG
2934

3035

3136
# NumPy version and a convenient boolean flag

tests/ndarray/test_linalg.py

Lines changed: 85 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -12,8 +12,10 @@
1212
import pytest
1313

1414
import blosc2
15+
import blosc2.linalg as blosc2_linalg
16+
import blosc2.utils as utils_mod
1517
from blosc2.lazyexpr import linalg_funcs
16-
from blosc2.utils import npvecdot
18+
from blosc2.utils import _toggle_miniexpr, npvecdot
1719

1820
# Conditionally import torch for proxy tests
1921
try:
@@ -69,6 +71,88 @@ def test_matmul(ashape, achunks, ablocks, bshape, bchunks, bblocks, dtype):
6971
np.testing.assert_allclose(b2_res[()], np_res, rtol=1e-6)
7072

7173

74+
def test_toggle_miniexpr_updates_linalg_runtime_flag():
75+
old_flag = utils_mod.try_miniexpr
76+
try:
77+
_toggle_miniexpr(False)
78+
assert utils_mod.try_miniexpr is False
79+
assert blosc2_linalg.try_miniexpr is False
80+
81+
_toggle_miniexpr(True)
82+
assert utils_mod.try_miniexpr is True
83+
assert blosc2_linalg.try_miniexpr is True
84+
finally:
85+
_toggle_miniexpr(old_flag)
86+
87+
88+
def test_matmul_uses_fast_path_for_supported_2d(monkeypatch):
89+
old_flag = utils_mod.try_miniexpr
90+
calls = []
91+
original = blosc2.NDArray._set_pref_matmul
92+
93+
def wrapped_set_pref_matmul(self, inputs, fp_accuracy):
94+
calls.append((self.shape, inputs["x1"].shape, inputs["x2"].shape))
95+
return original(self, inputs, fp_accuracy)
96+
97+
monkeypatch.setattr(blosc2.NDArray, "_set_pref_matmul", wrapped_set_pref_matmul)
98+
try:
99+
_toggle_miniexpr(True)
100+
a = blosc2.ones(shape=(400, 400), dtype=np.int64, chunks=(200, 200), blocks=(100, 100))
101+
b = blosc2.full(shape=(400, 400), fill_value=2, dtype=np.int64, chunks=(200, 200), blocks=(100, 100))
102+
103+
c = blosc2.matmul(a, b, chunks=(200, 200), blocks=(100, 100))
104+
105+
assert calls == [((400, 400), (400, 400), (400, 400))]
106+
np.testing.assert_allclose(c[:], np.matmul(a[:], b[:]), rtol=1e-6, atol=1e-6)
107+
finally:
108+
_toggle_miniexpr(old_flag)
109+
110+
111+
def test_matmul_falls_back_for_nd_inputs(monkeypatch):
112+
old_flag = utils_mod.try_miniexpr
113+
calls = []
114+
original = blosc2.NDArray._set_pref_matmul
115+
116+
def wrapped_set_pref_matmul(self, inputs, fp_accuracy):
117+
calls.append((self.shape, inputs["x1"].shape, inputs["x2"].shape))
118+
return original(self, inputs, fp_accuracy)
119+
120+
monkeypatch.setattr(blosc2.NDArray, "_set_pref_matmul", wrapped_set_pref_matmul)
121+
try:
122+
_toggle_miniexpr(True)
123+
a = blosc2.ones(shape=(2, 40, 40), dtype=np.int64, chunks=(1, 20, 20), blocks=(1, 10, 10))
124+
b = blosc2.full(
125+
shape=(2, 40, 40), fill_value=2, dtype=np.int64, chunks=(1, 20, 20), blocks=(1, 10, 10)
126+
)
127+
128+
c = blosc2.matmul(a, b, chunks=(1, 20, 20), blocks=(1, 10, 10))
129+
130+
assert calls == []
131+
np.testing.assert_allclose(c[:], np.matmul(a[:], b[:]), rtol=1e-6, atol=1e-6)
132+
finally:
133+
_toggle_miniexpr(old_flag)
134+
135+
136+
def test_matmul_fast_path_failure_falls_back(monkeypatch):
137+
old_flag = utils_mod.try_miniexpr
138+
139+
def failing_set_pref_matmul(self, inputs, fp_accuracy):
140+
raise RuntimeError("boom")
141+
142+
monkeypatch.setattr(blosc2.NDArray, "_set_pref_matmul", failing_set_pref_matmul)
143+
try:
144+
_toggle_miniexpr(True)
145+
a = blosc2.ones(shape=(200, 200), dtype=np.int64, chunks=(100, 100), blocks=(50, 50))
146+
b = blosc2.full(shape=(200, 200), fill_value=2, dtype=np.int64, chunks=(100, 100), blocks=(50, 50))
147+
148+
with pytest.warns(RuntimeWarning, match="falling back to chunked path"):
149+
c = blosc2.matmul(a, b, chunks=(100, 100), blocks=(50, 50))
150+
151+
np.testing.assert_allclose(c[:], np.matmul(a[:], b[:]), rtol=1e-6, atol=1e-6)
152+
finally:
153+
_toggle_miniexpr(old_flag)
154+
155+
72156
@pytest.mark.parametrize(
73157
("ashape", "achunks", "ablocks"),
74158
{

0 commit comments

Comments (0)