Merge pull request #435 from Blosc/fancyIndex

lshaw8317 · web-flow · commit 3c6fc802cf31 · 2025-07-09T13:31:27.000+02:00
Fixing reductions and slicing for lazy expressions
diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
@@ -568,7 +568,7 @@ def compute_broadcast_shape(arrays):
     return np.broadcast_shapes(*shapes) if shapes else None
 
 
-def check_smaller_shape(value_shape, shape, slice_shape):
+def check_smaller_shape(value_shape, shape, slice_shape, slice_):
     """Check whether the shape of the value is smaller than the shape of the array.
 
     This follows the NumPy broadcasting rules.
@@ -579,7 +579,10 @@ def check_smaller_shape(value_shape, shape, slice_shape):
     is_smaller_shape = any(
         s > (1 if i >= len(value_shape) else value_shape[i]) for i, s in enumerate(slice_shape)
     )
-    return len(value_shape) < len(shape) or is_smaller_shape
+    slice_past_bounds = any(
+        s.stop > (1 if i >= len(value_shape) else value_shape[i]) for i, s in enumerate(slice_)
+    )
+    return len(value_shape) < len(shape) or is_smaller_shape or slice_past_bounds
 
 
 def _compute_smaller_slice(larger_shape, smaller_shape, larger_slice):
@@ -1396,29 +1399,35 @@ def slices_eval(  # noqa: C901
 
     dtype = kwargs.pop("dtype", None)
     shape_slice = None
-    need_orig_slice = False
+    need_final_slice = False
 
-    # keep orig_slice for final getitem
+    # keep orig_slice
     _slice = _slice.raw
     orig_slice = _slice
+    full_slice = ()  # by default the full_slice is the whole array
+    final_slice = ()  # by default the final_slice is the whole array
+
+    # Compute the shape and chunks of the output array, including broadcasting
+    shape = compute_broadcast_shape(operands.values())
     if out is None:
-        # Compute the shape and chunks of the output array, including broadcasting
-        shape = compute_broadcast_shape(operands.values())
         if _slice != ():
             # Check whether _slice contains an integer, or any step that are not None or 1
             if any(
                 (isinstance(s, int)) or (isinstance(s, slice) and s.step not in (None, 1)) for s in _slice
             ):
-                need_orig_slice = True
+                need_final_slice = True
             _slice = tuple(slice(i, i + 1) if isinstance(i, int) else i for i in _slice)
             full_slice = tuple(
-                slice(s.start or 0, s.stop or shape[i], None) for i, s in enumerate(_slice)
+                slice(s.start or 0, s.stop or shape[i], 1) for i, s in enumerate(_slice)
             )  # get rid of non-unit steps
             # shape_slice in general not equal to final shape:
-            # dummy dims (due to ints) or non-unit steps will be dealt with by taking orig_slice
+            # dummy dims (due to ints) or non-unit steps will be dealt with by taking final_slice
             shape_slice = ndindex.ndindex(full_slice).newshape(shape)
+            final_slice = ndindex.ndindex(orig_slice).as_subindex(full_slice).raw
     else:
-        # TODO: check that this is fine since __out__ could have shape_slice and __shape__ should refer to full array
+        # # out should always have shape of full array
+        # if shape is not None and shape != out.shape:
+        #     raise ValueError("Provided output shape does not match the operands' shape.")
         shape = out.shape
 
     if chunks is None:  # Guess chunk shape
@@ -1457,9 +1466,8 @@ def slices_eval(  # noqa: C901
     intersecting_chunks = chunk_size.as_subchunks(_slice, shape)  # if _slice is (), returns all chunks
 
     for nchunk, chunk_slice in enumerate(intersecting_chunks):
-        # Check whether current chunk_slice intersects with _slice
+        # get intersection of chunk and target
         if _slice != ():
-            # get intersection of chunk and target
             cslice = tuple(
                 slice(max(s1.start, s2.start), min(s1.stop, s2.stop))
                 for s1, s2 in zip(chunk_slice.raw, _slice, strict=True)
@@ -1469,6 +1477,10 @@ def slices_eval(  # noqa: C901
 
         cslice_shape = tuple(s.stop - s.start for s in cslice)
         len_chunk = math.prod(cslice_shape)
+        # get local index of part of out that is to be updated
+        cslice_subidx = (
+            ndindex.ndindex(cslice).as_subindex(full_slice).raw
+        )  # in the case full_slice=(), just gives cslice
 
         # Get the starts and stops for the slice
         starts = [s.start if s.start is not None else 0 for s in cslice]
@@ -1482,7 +1494,7 @@ def slices_eval(  # noqa: C901
             if value.shape == ():
                 chunk_operands[key] = value[()]
                 continue
-            if check_smaller_shape(value.shape, shape, cslice_shape):
+            if check_smaller_shape(value.shape, shape, cslice_shape, cslice):
                 # We need to fetch the part of the value that broadcasts with the operand
                 smaller_slice = compute_smaller_slice(shape, value.shape, cslice)
                 chunk_operands[key] = value[smaller_slice]
@@ -1502,6 +1514,7 @@ def slices_eval(  # noqa: C901
 
         if callable(expression):
             result = np.empty(cslice_shape, dtype=out.dtype)  # raises error if out is None
+            # cslice should be equal to cslice_subidx
             # Call the udf directly and use result as the output array
             offset = tuple(s.start for s in cslice)
             expression(tuple(chunk_operands.values()), result, offset=offset)
@@ -1573,12 +1586,12 @@ def slices_eval(  # noqa: C901
                 out.schunk.update_data(nchunk, result, copy=False)
             else:
                 try:
-                    out[cslice] = result
+                    out[cslice_subidx] = result
                 except ComplexWarning:
                     # The result is a complex number, so we need to convert it to real.
                     # This is a workaround for rigidness of numpy with type casting.
                     result = result.real.astype(out.dtype)
-                    out[cslice] = result
+                    out[cslice_subidx] = result
         elif len(where) == 1:
             lenres = len(result)
             out[lenout : lenout + lenres] = result
@@ -1588,7 +1601,7 @@ def slices_eval(  # noqa: C901
         else:
             raise ValueError("The where condition must be a tuple with one or two elements")
 
-    if where is not None and len(where) < 2:  # Don't need to take orig_slice since filled up from 0 index
+    if where is not None and len(where) < 2:  # Don't need to take final_slice since filled up from 0 index
         if _order is not None:
             # argsort the result following _order
             new_order = np.argsort(out[:lenout])
@@ -1600,16 +1613,16 @@ def slices_eval(  # noqa: C901
         else:
             out.resize((lenout,))
 
-    else:  # Need to take orig_slice since filled up array according to slice_ for each chunk
-        if orig_slice != ():
+    else:  # Need to take final_slice since filled up array according to slice_ for each chunk
+        if final_slice != ():
             if isinstance(out, np.ndarray):
-                if need_orig_slice:
-                    out = out[orig_slice]
+                if need_final_slice:  # only called if out was None
+                    out = out[final_slice]
             elif isinstance(out, blosc2.NDArray):
                 # It *seems* better to choose an automatic chunks and blocks for the output array
                 # out = out.slice(_slice, chunks=out.chunks, blocks=out.blocks)
-                if need_orig_slice:
-                    out = out.slice(orig_slice)
+                if need_final_slice:  # only called if out was None
+                    out = out.slice(final_slice)
             else:
                 raise ValueError("The output array is not a NumPy array or a NDArray")
 
@@ -1679,7 +1692,7 @@ def slices_eval_getitem(
         if value.shape == ():
             slice_operands[key] = value[()]
             continue
-        if check_smaller_shape(value.shape, shape, slice_shape):
+        if check_smaller_shape(value.shape, shape, slice_shape, _slice_bcast):
             # We need to fetch the part of the value that broadcasts with the operand
             smaller_slice = compute_smaller_slice(shape, value.shape, _slice)
             slice_operands[key] = value[smaller_slice]
@@ -1703,7 +1716,8 @@ def slices_eval_getitem(
             # This is a workaround for rigidness of numpy with type casting.
             return result.real.astype(dtype, copy=False)
     else:
-        out[()] = result
+        # out should always have maximal shape
+        out[_slice] = result
         return out
 
 
@@ -1721,6 +1735,19 @@ def infer_reduction_dtype(dtype, operation):
         raise ValueError(f"Unsupported operation: {operation}")
 
 
+def step_handler(s1start, s2start, s1stop, s2stop, s2step):
+    # assume s1step = 1
+    newstart = max(s1start, s2start)
+    newstop = min(s1stop, s2stop)
+    rem = (newstart - s2start) % s2step
+    if rem != 0:  # only pass through here if s2step is not 1
+        newstart += s2step - rem
+        # true_stop = start + n*step + 1 -> stop = start + n * step + 1 + residual
+        # so n = (stop - start - 1) // step
+        newstop = newstart + (newstop - newstart - 1) // s2step * s2step + 1
+    return slice(newstart, newstop, s2step)
+
+
 def reduce_slices(  # noqa: C901
     expression: str | Callable[[tuple, np.ndarray, tuple[int]], None],
     operands: dict,
@@ -1770,14 +1797,26 @@ def reduce_slices(  # noqa: C901
     # Compute the shape and chunks of the output array, including broadcasting
     shape = compute_broadcast_shape(operands.values())
 
+    _slice = _slice.raw
+    shape_slice = shape
+    full_slice = ()  # by default the full_slice is the whole array
+    if out is None and _slice != ():
+        shape_slice = ndindex.ndindex(_slice).newshape(shape)
+        full_slice = _slice
+
+    # after slicing, we reduce to calculate shape of output
     if axis is None:
-        axis = tuple(range(len(shape)))
+        axis = tuple(range(len(shape_slice)))
     elif not isinstance(axis, tuple):
         axis = (axis,)
+    axis = tuple(a if a >= 0 else a + len(shape_slice) for a in axis)
     if keepdims:
-        reduced_shape = tuple(1 if i in axis else s for i, s in enumerate(shape))
+        reduced_shape = tuple(1 if i in axis else s for i, s in enumerate(shape_slice))
     else:
-        reduced_shape = tuple(s for i, s in enumerate(shape) if i not in axis)
+        reduced_shape = tuple(s for i, s in enumerate(shape_slice) if i not in axis)
+
+    if out is not None and reduced_shape != out.shape:
+        raise ValueError("Provided output shape does not match the reduced shape.")
 
     if is_inside_new_expr():
         # We already have the dtype and reduced_shape, so return immediately
@@ -1820,23 +1859,17 @@ def reduce_slices(  # noqa: C901
     chunk_operands = {}
     # Check which chunks intersect with _slice
     chunk_size = ndindex.ChunkSize(chunks)
-    _slice = _slice.raw
     intersecting_chunks = chunk_size.as_subchunks(_slice, shape)  # if _slice is (), returns all chunks
     out_init = False
 
     for nchunk, chunk_slice in enumerate(intersecting_chunks):
         cslice = chunk_slice.raw
-        if keepdims:
-            reduced_slice = tuple(slice(None) if i in axis else sl for i, sl in enumerate(cslice))
-        else:
-            reduced_slice = tuple(sl for i, sl in enumerate(cslice) if i not in axis)
         offset = tuple(s.start for s in cslice)  # offset for the udf
-        # TODO: Is this necessary, shouldn't slice always be None for a reduction?
         # Check whether current cslice intersects with _slice
         if cslice != () and _slice != ():
             # get intersection of chunk and target
             cslice = tuple(
-                slice(max(s1.start, s2.start), min(s1.stop, s2.stop))
+                step_handler(s1.start, s2.start, s1.stop, s2.stop, s2.step)
                 for s1, s2 in zip(cslice, _slice, strict=True)
             )
         chunks_ = tuple(s.stop - s.start for s in cslice)
@@ -1859,7 +1892,7 @@ def reduce_slices(  # noqa: C901
                 if value.shape == ():
                     chunk_operands[key] = value[()]
                     continue
-                if check_smaller_shape(value.shape, shape, chunks_):
+                if check_smaller_shape(value.shape, shape, chunks_, cslice):
                     # We need to fetch the part of the value that broadcasts with the operand
                     smaller_slice = compute_smaller_slice(operand.shape, value.shape, cslice)
                     chunk_operands[key] = value[smaller_slice]
@@ -1874,6 +1907,15 @@ def reduce_slices(  # noqa: C901
                     continue
                 chunk_operands[key] = value[cslice]
 
+        # get local index of part of out that is to be updated
+        cslice_subidx = (
+            ndindex.ndindex(cslice).as_subindex(full_slice).raw
+        )  # if full_slice is (), just gives cslice
+        if keepdims:
+            reduced_slice = tuple(slice(None) if i in axis else sl for i, sl in enumerate(cslice_subidx))
+        else:
+            reduced_slice = tuple(sl for i, sl in enumerate(cslice_subidx) if i not in axis)
+
         # Evaluate and reduce the expression using chunks of operands
 
         if callable(expression):
@@ -2622,23 +2664,16 @@ def get_num_elements(self, axis, item):
             num_elements = self.sum(axis=axis, dtype=np.int64, item=item)
             self._where_args = orig_where_args
             return num_elements
-        if np.isscalar(axis):
-            axis = (axis,)
         # Compute the number of elements in the array
         shape = self.shape
+        if np.isscalar(axis):
+            axis = (axis,)
         if item is not None:
             # Compute the shape of the slice
-            if not isinstance(item, tuple):
-                item = (item,)
-            # Ensure that the limits in item slices are not None
-            item = tuple(slice(s.start or 0, s.stop or self.shape[i], s.step) for i, s in enumerate(item))
-            # Compute the intersection of the slice with the shape
-            item = tuple(slice(s1.start, min(s1.stop, s2)) for s1, s2 in zip(item, shape, strict=True))
-            if axis is None:
-                shape = [s.stop - s.start for s in item]
-            else:
-                shape = [s.stop - s.start for i, s in enumerate(item) if i in axis]
-        return math.prod(shape) if axis is None else math.prod([shape[i] for i in axis])
+            shape = ndindex.ndindex(item).newshape(shape)
+        axis = tuple(range(len(shape))) if axis is None else axis
+        axis = tuple(a if a >= 0 else a + len(shape) for a in axis)  # handle negative indexing
+        return math.prod([shape[i] for i in axis])
 
     def mean(self, axis=None, dtype=None, keepdims=False, **kwargs):
         item = kwargs.pop("item", None)
@@ -2657,9 +2692,15 @@ def mean(self, axis=None, dtype=None, keepdims=False, **kwargs):
 
     def std(self, axis=None, dtype=None, keepdims=False, ddof=0, **kwargs):
         item = kwargs.pop("item", None)
-        mean_value = self.mean(axis=axis, dtype=dtype, keepdims=True, item=item)
-        expr = (self - mean_value) ** 2
-        out = expr.mean(axis=axis, dtype=dtype, keepdims=keepdims, item=item)
+        if item is None:  # fast path
+            mean_value = self.mean(axis=axis, dtype=dtype, keepdims=True)
+            expr = (self - mean_value) ** 2
+        else:
+            mean_value = self.mean(axis=axis, dtype=dtype, keepdims=True, item=item)
+            # TODO: Not optimal because we load the whole slice in memory. Would have to write
+            #  a bespoke std function that executed within slice_eval to avoid this probably.
+            expr = (self.slice(item) - mean_value) ** 2
+        out = expr.mean(axis=axis, dtype=dtype, keepdims=keepdims)
         if ddof != 0:
             num_elements = self.get_num_elements(axis, item)
             out = np.sqrt(out * num_elements / (num_elements - ddof))
@@ -2675,14 +2716,18 @@ def std(self, axis=None, dtype=None, keepdims=False, ddof=0, **kwargs):
 
     def var(self, axis=None, dtype=None, keepdims=False, ddof=0, **kwargs):
         item = kwargs.pop("item", None)
-        mean_value = self.mean(axis=axis, dtype=dtype, keepdims=True, item=item)
-        expr = (self - mean_value) ** 2
+        if item is None:  # fast path
+            mean_value = self.mean(axis=axis, dtype=dtype, keepdims=True)
+            expr = (self - mean_value) ** 2
+        else:
+            mean_value = self.mean(axis=axis, dtype=dtype, keepdims=True, item=item)
+            # TODO: Not optimal because we load the whole slice in memory. Would have to write
+            #  a bespoke var function that executed within slice_eval to avoid this probably.
+            expr = (self.slice(item) - mean_value) ** 2
+        out = expr.mean(axis=axis, dtype=dtype, keepdims=keepdims)
         if ddof != 0:
-            out = expr.mean(axis=axis, dtype=dtype, keepdims=keepdims, item=item)
             num_elements = self.get_num_elements(axis, item)
             out = out * num_elements / (num_elements - ddof)
-        else:
-            out = expr.mean(axis=axis, dtype=dtype, keepdims=keepdims, item=item)
         out2 = kwargs.pop("out", None)
         if out2 is not None:
             out2[:] = out
@@ -3241,6 +3286,7 @@ def __getitem__(self, item):
             # TODO: as this creates a big array, this can potentially consume a lot of memory
             output = np.empty(self.shape, self.dtype)
             # It is important to pass kwargs here, because chunks can be used internally
+            # fills numpy array with desired slice
             chunked_eval(self.func, self.inputs_dict, item, _getitem=True, _output=output, **self.kwargs)
             return output[item]
         return self.res_getitem[item]
diff --git a/tests/ndarray/test_lazyexpr.py b/tests/ndarray/test_lazyexpr.py
@@ -1078,23 +1078,27 @@ def test_eval_getitem(array_fixture):
     np.testing.assert_allclose(expr[:10], nres[:10])
     np.testing.assert_allclose(expr[0:10:2], nres[0:10:2])
 
+
+def test_eval_getitem2():
     # Small test for non-isomorphic shape
     shape = (2, 10, 5)
-    test_arr = blosc2.linspace(0, 10, np.prod(shape), shape=shape)
+    test_arr = blosc2.linspace(0, 10, np.prod(shape), shape=shape, chunks=(1, 5, 1))
     expr = test_arr * 30
     nres = test_arr[:] * 30
     np.testing.assert_allclose(expr[0], nres[0])
-    np.testing.assert_allclose(expr[:10], nres[:10])
+    np.testing.assert_allclose(expr[1:, :7], nres[1:, :7])
     np.testing.assert_allclose(expr[0:10:2], nres[0:10:2])
+    # This works, but it is not very efficient since it relies on blosc2.ndarray.slice for non-unit steps
+    np.testing.assert_allclose(expr.slice((slice(None, None, None), slice(0, 10, 2)))[:], nres[:, 0:10:2])
 
     # Small test for broadcasting
-    shape = (2, 10, 5)
-    test_arr = blosc2.linspace(0, 10, np.prod(shape), shape=shape)
-    expr = test_arr + test_arr.slice(slice(1, 2))
+    expr = test_arr + test_arr.slice(1)
     nres = test_arr[:] + test_arr[1]
     np.testing.assert_allclose(expr[0], nres[0])
-    np.testing.assert_allclose(expr[:10], nres[:10])
-    np.testing.assert_allclose(expr[0:10:2], nres[0:10:2])
+    np.testing.assert_allclose(expr[1:, :7], nres[1:, :7])
+    np.testing.assert_allclose(expr[:, 0:10:2], nres[:, 0:10:2])
+    # This works, but it is not very efficient since it relies on blosc2.ndarray.slice for non-unit steps
+    np.testing.assert_allclose(expr.slice((slice(None, None, None), slice(0, 10, 2)))[:], nres[:, 0:10:2])
 
 
 # Test lazyexpr's slice method
diff --git a/tests/ndarray/test_reductions.py b/tests/ndarray/test_reductions.py