Skip to content

Commit 8f6d215

Browse files
committed
Optimize chunksize for matmul and machines with L3
1 parent 77ab92e commit 8f6d215

3 files changed

Lines changed: 15 additions & 8 deletions

File tree

src/blosc2/core.py

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1318,9 +1318,12 @@ def get_cbuffer_sizes(src: object) -> tuple[(int, int, int)]:
13181318

13191319

13201320
# Compute a decent value for chunksize based on L3 and/or heuristics
1321-
def get_chunksize(blocksize, l3_minimum=4 * 2**20, l3_maximum=2**26):
1322-
# Find a decent default when L3 cannot be detected by cpuinfo
1323-
# Based mainly in heuristics
1321+
def get_chunksize(blocksize, l3_minimum=4 * 2**20, l3_maximum=2**26, reduc_factor=4):
1322+
# Find a decent default when L3 cannot be detected by cpuinfo.
1323+
# `reduc_factor` means that the chunk will be divided by this factor
1324+
# 4 stems from 3 operands + 1 result, but some functions (e.g., linalg ones) may
1325+
# decide to use another one (e.g., 1 for matmul has proved to be better).
1326+
# Most of this is based mainly on heuristics and experimentation.
13241327
chunksize = blocksize
13251328
if blocksize * 32 < l3_maximum:
13261329
chunksize = blocksize * 32
@@ -1339,15 +1342,14 @@ def get_chunksize(blocksize, l3_minimum=4 * 2**20, l3_maximum=2**26):
13391342
if isinstance(l2_cache_size, int) and l3_cache_size > l2_cache_size:
13401343
chunksize = l3_cache_size
13411344
# When computing expressions, it is convenient to keep chunks for all operands
1342-
# in L3 cache, so let's divide by 4 (3 operands + result is a typical situation
1343-
# for moderately complex expressions)
1344-
chunksize //= 4
1345+
# in L3 cache (reduc_factor will account for this).
1346+
chunksize //= reduc_factor
13451347

13461348
# Chunksize should be at least the size of L2
13471349
l2_cache_size = cpu_info.get("l2_cache_size", "Not found")
13481350
if isinstance(l2_cache_size, int) and l2_cache_size > chunksize:
13491351
# Apple Silicon has a large L2 cache, and memory bandwidth is high,
1350-
# so we can use a larger chunksize based on L2 cache size
1352+
# so we can use a larger chunksize based on L2 cache size.
13511353
chunksize = l2_cache_size * 4
13521354

13531355
# Ensure a minimum size
@@ -1577,7 +1579,8 @@ def compute_chunks_blocks( # noqa: C901
15771579
# Finally, the chunks
15781580
if chunks is None:
15791581
blocksize = math.prod(blocks) * itemsize
1580-
chunksize = get_chunksize(blocksize)
1582+
reduc_factor = kwargs.get("_chunksize_reduc_factor", 4)
1583+
chunksize = get_chunksize(blocksize, reduc_factor=reduc_factor)
15811584
# Make chunksize to be a multiple of the blocksize. This allows for:
15821585
# 1. Avoid unnecessary padding in chunks
15831586
# 2. Avoid exceeding the maximum buffer size (see #392)

src/blosc2/linalg.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,9 @@ def matmul(x1: blosc2.Array, x2: blosc2.NDArray, **kwargs: Any) -> blosc2.NDArra
9999
n, k = x1.shape[-2:]
100100
m = x2.shape[-1]
101101
result_shape = np.broadcast_shapes(x1.shape[:-2], x2.shape[:-2]) + (n, m)
102+
# For matmul, we don't want to reduce the chunksize, as experiments show that
103+
# the larger, the better (as long as some limits are not exceeded).
104+
kwargs["_chunksize_reduc_factor"] = 1
102105
result = blosc2.zeros(result_shape, dtype=blosc2.result_type(x1, x2), **kwargs)
103106

104107
if 0 not in result.shape + x1.shape + x2.shape: # if any array is empty, return array of 0s

src/blosc2/ndarray.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5839,6 +5839,7 @@ def _check_ndarray_kwargs(**kwargs):
58395839
"initial_mapping_size",
58405840
"storage",
58415841
"out",
5842+
"_chunksize_reduc_factor",
58425843
]
58435844
_ = kwargs.pop("device", None) # pop device (not used, but needs to be discarded)
58445845
for key in kwargs:

0 commit comments

Comments
 (0)