Remove Numpy optimisation and reformat benchmarks

lshaw8317 · lshaw8317 · commit 6cb1be273407 · 2025-07-17T10:39:59.000+02:00
diff --git a/bench/ndarray/fancy_index.py b/bench/ndarray/fancy_index.py
@@ -18,142 +18,125 @@
 import pickle
 import os
 plt.rcParams.update({'text.usetex':False,'font.serif': ['cm'],'font.size':16})
-plt.rcParams['figure.dpi'] = 1000
-plt.rcParams['savefig.dpi'] = 1000
+plt.rcParams['figure.dpi'] = 300
+plt.rcParams['savefig.dpi'] = 300
 plt.rc('text', usetex=False)
 plt.rc('font',**{'serif':['cm']})
 plt.style.use('seaborn-v0_8-paper')
 
-NUMPY_BLOSC = False # activate NUMPY and BLOSC tests
-NUMPY_BLOSC_ZARR = False # activate NUMPY, BLOSC and Zarr tests
-# default if both are false is to run tests for Numpy, Blosc, Zarr and HDF5
+NUMPY = True
+BLOSC = True
+ZARR = True
+HDF5 = True
 
-def genarray(r, ndims=1, verbose=True):
+NDIMS = 2 # must be at least 2
+
+def genarray(r, ndims=2, verbose=True):
     d = int((r*2**30/8)**(1/ndims))
     shape = (d,) * ndims
     chunks = (d // 4,) * ndims
     blocks = (max(d // 10, 1),) * ndims
     t = time.time()
-    if os.path.exists(f'linspace{r}.b2nd'):
-        arr = blosc2.open(urlpath=f'linspace{r}.b2nd')
-    else:
-        arr = blosc2.linspace(0, 1000, num=np.prod(shape), shape=shape, dtype=np.float64, urlpath=f'linspace{r}.b2nd', mode='w')
+    arr = blosc2.linspace(0, 1000, num=np.prod(shape), shape=shape, dtype=np.float64, urlpath=f'linspace{r}{ndims}D.b2nd', mode='w')
     t = time.time() - t
+    arrsize = np.prod(arr.shape) * arr.dtype.itemsize / 2 ** 30
     if verbose:
         print(f"Array shape: {arr.shape}")
-        print(f"Array size: {np.prod(arr.shape) * arr.dtype.itemsize / 2 ** 30:.6f} GB")
+        print(f"Array size: {arrsize:.6f} GB")
         print(f"Time to create array: {t:.6f} seconds")
-    return arr
+    return arr, arrsize
 
 
-sizes = np.int64(np.array([1, 2, 4, 8, 16, 24, 32]))
+target_sizes = np.int64(np.array([1, 2, 4, 8, 16, 24, 32]))
 rng = np.random.default_rng()
 blosctimes = []
 nptimes = []
 zarrtimes = []
 h5pytimes = []
-x = np.arange(len(sizes))
-width = 0.2
-labs = 'NumpyBlosc2' if NUMPY_BLOSC else 'NumpyBlosc2ZarrHDF5'
-labs = 'NumpyBlosc2Zarr' if NUMPY_BLOSC_ZARR else labs
-try:
-    with open(f"results{labs}.pkl", 'rb') as f:
-        result_tuple = pickle.load(f)
-    labs = ''
-    for i, r in enumerate(result_tuple):
-        if r[1].shape != (0,):
-            label,times,w = r
-            c = ['b', 'r', 'g', 'm'][i]
-            mean = times.mean(axis=1)
-            err = [mean - times.min(axis=1), times.max(axis=1)-mean]
-            plt.bar(x + w, mean , width, color=c, label=label, yerr=err, capsize=5, ecolor='k',
-            error_kw=dict(lw=2, capthick=2, ecolor='k'))
-            labs+=label
-except:
-    for d in sizes:
-        arr = genarray(d, ndims=2)
-        idx = rng.integers(low=0, high=arr.shape[0], size=(arr.shape[0]//4,))
-        row = np.sort(np.unique(idx))
-        col = np.sort(np.unique(rng.integers(low=0, high=arr.shape[0], size=(arr.shape[0]//4,))))
-        mask = rng.integers(low=0, high=2, size=(arr.shape[0],)) == 1
+genuine_sizes = []
+for d in target_sizes:
+    arr, arrsize = genarray(d, ndims=NDIMS)
+    genuine_sizes += [arrsize]
+    idx = rng.integers(low=0, high=arr.shape[0], size=(arr.shape[0]//4,))
+    sorted_idx = np.sort(np.unique(idx))
+    col = np.sort(np.unique(rng.integers(low=0, high=arr.shape[0], size=(arr.shape[0]//4,))))
+    mask = rng.integers(low=0, high=2, size=(arr.shape[0],)) == 1
 
-        ## Test fancy indexing for different use cases
-        m, M = np.min(idx), np.max(idx)
-        res = arr[:][row]
-        def timer(arr, row=row, col=col):
-            time_list = []
-            if NUMPY_BLOSC or NUMPY_BLOSC_ZARR:
-                t = time.time()
-                b = arr[row, col]
-                time_list += [time.time() - t]
-            if NUMPY_BLOSC:
+    ## Test fancy indexing for different use cases
+    m, M = sorted_idx[0], sorted_idx[-1]
+    def timer(arr):
+        time_list = []
+        if not HDF5:
+            t = time.time()
+            b = arr[idx, idx]  # index arrays must share a length; col is de-duplicated so it is shorter than idx
+            time_list += [time.time() - t]
+            if not ZARR:
                 t = time.time()
                 b = arr[slice(1, M // 2, 5), col]
                 time_list += [time.time() - t]
                 t = time.time()
-                b = arr[[[row], [col]]]
+                b = arr[[[idx], [idx]]]  # equal-length rows only: [[idx], [col]] would be a ragged nested list
                 time_list += [time.time() - t]
                 t = time.time()
-                b = arr[row[:10, None], col[:10]]
+                b = arr[idx[:10, None], col[:10]]
                 time_list += [time.time() - t]
                 t = time.time()
-                b = arr[row[:10, None], mask]
+                b = arr[idx[:10, None], mask]
                 time_list += [time.time() - t]
-            t = time.time()
-            b = arr[row]
-            time_list += [time.time() - t]
-            t = time.time()
-            b = arr[m, col]
-            time_list += [time.time() - t]
-            return np.array(time_list)
+        t = time.time()
+        b = arr[idx] if not HDF5 else arr[sorted_idx]
+        time_list += [time.time() - t]
+        t = time.time()
+        b = arr[m, idx] if not HDF5 else arr[m, col]
+        time_list += [time.time() - t]
+        return np.array(time_list)
+
+    nparr = arr[:]
+    if BLOSC:
+        blosctimes += [timer(arr)]
+    if NUMPY:
+        nptimes += [timer(nparr)]
+    if ZARR:
+        z_test = zarr.create_array(store='data/example.zarr', shape=nparr.shape, dtype=nparr.dtype, overwrite=True)
+        z_test[:] = nparr
+        zarrtimes += [timer(z_test)]
+    if HDF5:
+        with h5py.File('my_hdf5_file.h5', 'w') as f:
+                dset = f.create_dataset("init", data=nparr)
+                h5pytimes += [timer(dset)]
 
-        if NUMPY_BLOSC or NUMPY_BLOSC_ZARR:
-            blosctimes += [timer(arr, row=idx, col=idx)]
-            arr=arr[:]
-            nptimes += [timer(arr, row=idx, col=idx)]
-            if NUMPY_BLOSC_ZARR:
-                z_test = zarr.create_array(store='data/example.zarr', shape=arr.shape, dtype=arr.dtype, overwrite=True)
-                z_test[:] = arr
-                zarrtimes += [timer(z_test, row=idx, col=idx)]
-        else:
-            blosctimes += [timer(arr)]
-            arr=arr[:]
-            nptimes += [timer(arr)]
-            z_test = zarr.create_array(store='data/example.zarr', shape=arr.shape, dtype=arr.dtype, overwrite=True)
-            z_test[:] = arr
-            zarrtimes += [timer(z_test)]
-            with h5py.File('my_hdf5_file.h5', 'w') as f:
-                    dset = f.create_dataset("init", data=arr)
-                    h5pytimes += [timer(dset)]
+blosctimes = np.array(blosctimes)
+nptimes = np.array(nptimes)
+zarrtimes = np.array(zarrtimes)
+h5pytimes = np.array(h5pytimes)
+labs=''
+width = 0.2  # bar width; assigned before result_tuple because its bar offsets are multiples of it
+result_tuple = (["Numpy",nptimes,-2*width],["Blosc2",blosctimes, -width],["Zarr",zarrtimes, 0],["HDF5",h5pytimes, width])
 
-    blosctimes = np.array(blosctimes)
-    nptimes = np.array(nptimes)
-    zarrtimes = np.array(zarrtimes)
-    h5pytimes = np.array(h5pytimes)
-    labs=''
-    result_tuple = (["Numpy",nptimes,-2*width],["Blosc2",blosctimes, -width],["Zarr",zarrtimes, 0],["HDF5",h5pytimes, width])
+x = np.arange(len(genuine_sizes))
+# Create barplot for Numpy vs Blosc vs Zarr vs H5py
+for i, r in enumerate(result_tuple):
+    if r[1].shape != (0,):
+        label, times, w = r
+        c = ['b', 'r', 'g', 'm'][i]
+        mean = times.mean(axis=1)
+        err = (mean - times.min(axis=1), times.max(axis=1)-mean)
+        plt.bar(x + w, mean , width, color=c, label=label, yerr=err, capsize=5, ecolor='k',
+        error_kw=dict(lw=2, capthick=2, ecolor='k'))
+        labs+=label
 
-    # Create barplot for Numpy vs Blosc vs Zarr vs H5py
-    for i, r in enumerate(result_tuple):
-        if r[1].shape != (0,):
-            label, times, w = r
-            c = ['b', 'r', 'g', 'm'][i]
-            mean = times.mean(axis=1)
-            err = (mean - times.min(axis=1), times.max(axis=1)-mean)
-            plt.bar(x + w, mean , width, color=c, label=label, yerr=err, capsize=5, ecolor='k',
-            error_kw=dict(lw=2, capthick=2, ecolor='k'))
-            labs+=label
+filename = f"results{labs}{NDIMS}D"  # f-string so the backend labels and dimensionality are interpolated
 
-    with open(f"results{labs}.pkl", 'wb') as f:
-        pickle.dump(result_tuple, f)
+with open(f"{filename}.pkl", 'wb') as f:
+    pickle.dump(result_tuple, f)
 
 plt.xlabel('Array size (GB)')
 plt.legend()
-plt.xticks(x-width, np.round(sizes, 2))
+plt.xticks(x-width, np.round(genuine_sizes, 2))
 plt.ylabel("Time (s)")
-plt.title('Fancy indexing performance comparison')
+plt.title(f'Fancy indexing performance comparison, {NDIMS}D')
 plt.gca().set_yscale('log')
-plt.savefig(f'plots/fancyIdx{labs}.png', format="png")
+plt.savefig(f'plots/fancyIdx{filename}.png', format="png")
 plt.show()
 
 print("Finished everything!")
diff --git a/bench/ndarray/fancy_index1D.py b/bench/ndarray/fancy_index1D.py
@@ -0,0 +1,123 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
+# All rights reserved.
+#
+# This source code is licensed under a BSD-style license (found in the
+# LICENSE file in the root directory of this source tree)
+#######################################################################
+
+# Benchmark for computing a fancy index of a blosc2 array
+
+import numpy as np
+import ndindex
+import blosc2
+import time
+import matplotlib.pyplot as plt
+import zarr
+import h5py
+import pickle
+import os
+plt.rcParams.update({'text.usetex':False,'font.serif': ['cm'],'font.size':16})
+plt.rcParams['figure.dpi'] = 300
+plt.rcParams['savefig.dpi'] = 300
+plt.rc('text', usetex=False)
+plt.rc('font',**{'serif':['cm']})
+plt.style.use('seaborn-v0_8-paper')
+
+NUMPY = True
+BLOSC = True
+ZARR = False
+HDF5 = True
+SPARSE = True
+
+if HDF5:
+    SPARSE = True # HDF5 takes too long for non-sparse indexing
+
+def genarray(r, verbose=True):
+    """Create an on-disk 1D blosc2 linspace array of about ``r`` GiB of float64 and
+    return ``(arr, arrsize)``, where ``arrsize`` is the actual array size in GiB."""
+    d = int(r * 2**30 / 8)  # element count: target bytes divided by the 8-byte float64 itemsize
+    shape = (d,)
+    t = time.time()
+    arr = blosc2.linspace(0, 1000, num=np.prod(shape), shape=shape, dtype=np.float64, urlpath=f'linspace{r}1D.b2nd', mode='w')
+    t = time.time() - t
+    arrsize = np.prod(arr.shape) * arr.dtype.itemsize / 2 ** 30
+    if verbose:
+        print(f"Array shape: {arr.shape}")
+        print(f"Array size: {arrsize:.6f} GB")
+        print(f"Time to create array: {t:.6f} seconds")
+    return arr, arrsize
+
+
+target_sizes = np.float64(np.array([.1, .2, .4, .8, 1, 1.2]))
+rng = np.random.default_rng()
+blosctimes = []
+nptimes = []
+zarrtimes = []
+h5pytimes = []
+genuine_sizes = []
+for d in target_sizes:
+    arr, arrsize = genarray(d)
+    genuine_sizes += [arrsize]
+    idx = rng.integers(low=0, high=arr.shape[0], size=(1000,)) if SPARSE else rng.integers(low=0, high=arr.shape[0], size=(arr.shape[0]//4,))
+    sorted_idx = np.sort(idx)  # NOTE(review): may contain duplicates - confirm h5py accepts repeated indices
+    ## Time each fancy-indexing case on `arr`; returns an array with one elapsed time (s) per case
+    def timer(arr):
+        time_list = []
+        if not (HDF5 or ZARR):
+            t = time.time()  # start the clock before the indexing call it measures
+            b = arr[[[sorted_idx], [idx]]]
+            time_list += [time.time() - t]
+        t = time.time()
+        b = arr[sorted_idx] if HDF5 else arr[idx]
+        time_list += [time.time() - t]
+        return np.array(time_list)
+
+    nparr = arr[:]
+    if BLOSC:
+        blosctimes += [timer(arr)]  # timer() takes only the array; idx/sorted_idx come from the loop scope
+    if NUMPY:
+        nptimes += [timer(nparr)]
+    if ZARR:
+        z_test = zarr.create_array(store='data/example.zarr', shape=nparr.shape, dtype=nparr.dtype, overwrite=True)
+        z_test[:] = nparr
+        zarrtimes += [timer(z_test)]
+    if HDF5:
+        with h5py.File('my_hdf5_file.h5', 'w') as f:
+            dset = f.create_dataset("init", data=nparr)
+            h5pytimes += [timer(dset)]
+
+blosctimes = np.array(blosctimes)
+nptimes = np.array(nptimes)
+zarrtimes = np.array(zarrtimes)
+h5pytimes = np.array(h5pytimes)
+labs=''
+width = 0.2  # bar width; assigned before result_tuple because its bar offsets are multiples of it
+result_tuple = (["Numpy",nptimes,-2*width],["Blosc2",blosctimes, -width],["Zarr",zarrtimes, 0],["HDF5",h5pytimes, width])
+
+x = np.arange(len(genuine_sizes))
+# Create barplot for Numpy vs Blosc vs Zarr vs H5py
+for i, r in enumerate(result_tuple):
+    if r[1].shape != (0,):
+        label, times, w = r
+        c = ['b', 'r', 'g', 'm'][i]
+        mean = times.mean(axis=1)
+        err = (mean - times.min(axis=1), times.max(axis=1)-mean)
+        plt.bar(x + w, mean , width, color=c, label=label, yerr=err, capsize=5, ecolor='k',
+        error_kw=dict(lw=2, capthick=2, ecolor='k'))
+        labs+=label
+
+filename = f"results{labs}1Dsparse" if SPARSE else f"results{labs}1D"  # f-strings so the backend labels are interpolated
+with open(filename+".pkl", 'wb') as f:
+    pickle.dump({'times':result_tuple, 'sizes':genuine_sizes}, f)
+
+plt.xlabel('Array size (GB)')
+plt.legend()
+plt.xticks(x-width, np.round(genuine_sizes, 2))
+plt.ylabel("Time (s)")
+plt.title('Fancy indexing performance comparison, 1D' + (" sparse" if SPARSE else ""))  # parentheses, not braces: braces build a set
+plt.gca().set_yscale('log')
+plt.savefig(f'plots/{filename}.png', format="png")
+plt.show()
+
+print("Finished everything!")
diff --git a/src/blosc2/ndarray.py b/src/blosc2/ndarray.py
@@ -1441,8 +1441,9 @@ def T(self):
 
     def get_fselection_numpy(self, key):
         # TODO: Make this faster for broadcasted keys
-        if math.prod(self.shape) * self.dtype.itemsize < blosc2.MAX_FAST_PATH_SIZE:
-            return self[:][key]  # load into memory for smallish arrays
+        ## Can't do this because ndindex doesn't support all the same indexing cases as NumPy
+        # if math.prod(self.shape) * self.dtype.itemsize < blosc2.MAX_FAST_PATH_SIZE:
+        #     return self[:][key]  # load into memory for smallish arrays
         shape = self.shape
         chunks = self.chunks
         _slice = ndindex.ndindex(key).expand(shape)
@@ -1527,7 +1528,9 @@ def __getitem__(  # noqa: C901
         """
         Retrieve a (multidimensional) slice as specified by the key.
 
-        Note that this __getitem__ matches NumPy fancy indexing behaviour.
+        Note that this __getitem__ closely matches NumPy fancy indexing behaviour, except in
+        some edge cases which are not supported by ndindex.
+        Array indices separated by a slice object - e.g. arr[0, :10, [0,1]] - are NOT supported.
 
         Parameters
         ----------