Blosc
diff --git a/‎bench/ndarray/fancy_index.py‎
Lines changed: 153 additions & 0 deletions b/‎bench/ndarray/fancy_index.py‎
Lines changed: 153 additions & 0 deletions
diff --git a/‎bench/ndarray/fancy_index1D.py‎
Lines changed: 130 additions & 0 deletions b/‎bench/ndarray/fancy_index1D.py‎
Lines changed: 130 additions & 0 deletions
diff --git a/‎src/blosc2/blosc2_ext.pyx‎
Lines changed: 70 additions & 0 deletions b/‎src/blosc2/blosc2_ext.pyx‎
Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,153 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
+# All rights reserved.
+#
+# This source code is licensed under a BSD-style license (found in the
+# LICENSE file in the root directory of this source tree)
+#######################################################################
+
+# Benchmark for computing a fancy index of a blosc2 array
+
+import numpy as np
+import ndindex
+import blosc2
+import time
+import matplotlib.pyplot as plt
+import zarr
+import h5py
+import pickle
+import os
+# Matplotlib appearance settings: LaTeX text rendering off, 300 dpi for both
+# on-screen figures and saved files.
+plt.rcParams.update({'text.usetex':False,'font.serif': ['cm'],'font.size':16})
+plt.rcParams['figure.dpi'] = 300
+plt.rcParams['savefig.dpi'] = 300
+plt.rc('text', usetex=False)
+plt.rc('font',**{'serif':['cm']})
+plt.style.use('seaborn-v0_8-paper')
+
+# Toggles selecting which backends get timed in the benchmark loop below.
+NUMPY = True
+BLOSC = True
+ZARR = False
+HDF5 = False
+# When True, 1000 random indices are drawn per axis instead of shape[0] // 4.
+SPARSE = False
+
+# Number of dimensions of the generated test array.
+NDIMS = 2 # must be at least 2
+
+def genarray(r, ndims=2, verbose=True):
+    """Create an on-disk blosc2 ``linspace`` array of roughly ``r`` GiB.
+
+    Parameters
+    ----------
+    r : number
+        Target size in GiB; every axis gets ``int((r * 2**30 / 8) ** (1 / ndims))``
+        float64 items.
+    ndims : int
+        Number of dimensions of the (hyper-cubic) array.
+    verbose : bool
+        If True, print the shape, the size and the creation time.
+
+    Returns
+    -------
+    tuple
+        ``(arr, arrsize)``: the array returned by ``blosc2.linspace`` and its
+        uncompressed size in GiB computed from ``arr.shape`` and ``arr.dtype``.
+    """
+    # 8 bytes per float64 item -> side length giving about r GiB in total.
+    d = int((r * 2**30 / 8) ** (1 / ndims))
+    shape = (d,) * ndims
+    # NOTE(review): earlier code built `chunks`/`blocks` tuples here but never
+    # passed them on, so blosc2 has always picked its own partitioning; the
+    # dead locals were dropped.  Pass chunks=/blocks= explicitly if wanted.
+    t0 = time.perf_counter()  # monotonic clock, unaffected by system clock changes
+    arr = blosc2.linspace(0, 1000, num=np.prod(shape), shape=shape, dtype=np.float64,
+                          urlpath=f'linspace{r}{ndims}D.b2nd', mode='w')
+    elapsed = time.perf_counter() - t0
+    arrsize = np.prod(arr.shape) * arr.dtype.itemsize / 2 ** 30
+    if verbose:
+        print(f"Array shape: {arr.shape}")
+        print(f"Array size: {arrsize:.6f} GB")
+        print(f"Time to create array: {elapsed:.6f} seconds")
+    return arr, arrsize
+
+
+target_sizes = np.int64(np.array([1, 2, 4, 8, 16, 24]))
+#target_sizes = np.int64(np.array([1, 2, 4, 8]))  # for quick testing
+rng = np.random.default_rng()
+# One entry per array size; each entry is the array of timings returned by timer().
+blosctimes = []
+nptimes = []
+zarrtimes = []
+h5pytimes = []
+genuine_sizes = []
+for d in target_sizes:
+    arr, arrsize = genarray(d, ndims=NDIMS)
+    genuine_sizes.append(arrsize)
+    sparseness = 1000 if SPARSE else arr.shape[0] // 4
+    idx = rng.integers(low=0, high=arr.shape[0], size=(sparseness,))
+    sorted_idx = np.unique(idx)  # np.unique already returns sorted values
+    col = rng.integers(low=0, high=arr.shape[0], size=(sparseness,))
+    col_sorted = np.unique(col)
+    mask = rng.integers(low=0, high=2, size=(arr.shape[0],)) == 1
+
+    ## Test fancy indexing for different use cases
+    m, M = sorted_idx[0], sorted_idx[-1]
+
+    def timer(arr):
+        """Time a fixed series of fancy-index reads on ``arr``.
+
+        Which reads are performed depends on the module-level ``HDF5`` and
+        ``ZARR`` flags (not on the type of ``arr``): with ``HDF5`` set only
+        the two sorted/unique-index reads run, with ``ZARR`` set the mixed
+        slice/broadcast/mask reads are skipped.
+
+        Returns a 1-D numpy array with one elapsed time (seconds) per read.
+        """
+        def clock(key):
+            # Evaluate arr[key] once and return the elapsed seconds.
+            t0 = time.perf_counter()
+            arr[key]
+            return time.perf_counter() - t0
+
+        time_list = []
+        if not HDF5:
+            time_list.append(clock((idx, col)))
+            if not ZARR:
+                time_list.append(clock((slice(1, M // 2, 5), col)))
+                time_list.append(clock([[idx], [col]]))
+                time_list.append(clock((idx[:10, None], col[:10])))
+                time_list.append(clock((idx[:10, None], mask)))
+        time_list.append(clock(sorted_idx if HDF5 else idx))
+        time_list.append(clock((m, col_sorted) if HDF5 else (m, idx)))
+        return np.array(time_list)
+
+    # Full in-memory copy, used as the NumPy baseline and to fill the other stores.
+    nparr = arr[:]
+    if BLOSC:
+        blosctimes.append(timer(arr))
+    if NUMPY:
+        nptimes.append(timer(nparr))
+    if ZARR:
+        z_test = zarr.create_array(store='data/example.zarr', shape=arr.shape, chunks=arr.chunks,
+                                   dtype=nparr.dtype, overwrite=True)
+        z_test[:] = nparr
+        zarrtimes.append(timer(z_test))
+    if HDF5:
+        with h5py.File('my_hdf5_file.h5', 'w') as f:
+            dset = f.create_dataset("init", data=nparr, chunks=arr.chunks)
+            # Must be timed while the file is still open.
+            h5pytimes.append(timer(dset))
+
+blosctimes = np.array(blosctimes)
+nptimes = np.array(nptimes)
+zarrtimes = np.array(zarrtimes)
+h5pytimes = np.array(h5pytimes)
+labs = ''
+width = 0.2
+# One [label, timings, horizontal bar offset] entry per backend.
+result_tuple = (
+    ["Numpy", nptimes, -2 * width],
+    ["Blosc2", blosctimes, -width],
+    ["Zarr", zarrtimes, 0],
+    ["HDF5", h5pytimes, width]
+)
+
+x = np.arange(len(genuine_sizes))
+# Create barplot for Numpy vs Blosc vs Zarr vs H5py
+for (label, times, w), c in zip(result_tuple, ['b', 'r', 'g', 'm']):
+    if times.shape == (0,):
+        continue  # backend was disabled, nothing was timed
+    # Bar height: mean over the timed use cases; error bars span min..max.
+    mean = times.mean(axis=1)
+    err = (mean - times.min(axis=1), times.max(axis=1) - mean)
+    plt.bar(x + w, mean, width, color=c, label=label, yerr=err, capsize=5, ecolor='k',
+            error_kw=dict(lw=2, capthick=2, ecolor='k'))
+    labs += label
+
+filename = f"results{labs}{NDIMS}D" + ("sparse" if SPARSE else "")
+
+with open(f"{filename}.pkl", 'wb') as f:
+    pickle.dump(result_tuple, f)
+
+plt.xlabel('Array size (GB)')
+plt.legend()
+plt.xticks(x-width, np.round(genuine_sizes, 2))
+plt.ylabel("Time (s)")
+# Single quotes inside the f-string: reusing the outer quote type is a
+# SyntaxError on Python versions before 3.12.
+plt.title(f"Fancy indexing performance comparison, {NDIMS}D{' sparse' if SPARSE else ''}")
+plt.gca().set_yscale('log')
+# savefig does not create missing directories.
+os.makedirs('plots', exist_ok=True)
+plt.savefig(f'plots/fancyIdx{filename}.png', format="png")
+plt.show()
+
+print("Finished everything!")
@@ -0,0 +1,130 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
+# All rights reserved.
+#
+# This source code is licensed under a BSD-style license (found in the
+# LICENSE file in the root directory of this source tree)
+#######################################################################
+
+# Benchmark for computing a fancy index of a 1D blosc2 array
+
+import numpy as np
+import ndindex
+import blosc2
+import time
+import matplotlib.pyplot as plt
+import zarr
+import h5py
+import pickle
+import os
+
+# Matplotlib appearance settings: LaTeX text rendering off, 300 dpi for both
+# on-screen figures and saved files.
+plt.rcParams.update({'text.usetex':False,'font.serif': ['cm'],'font.size':16})
+plt.rcParams['figure.dpi'] = 300
+plt.rcParams['savefig.dpi'] = 300
+plt.rc('text', usetex=False)
+plt.rc('font',**{'serif':['cm']})
+plt.style.use('seaborn-v0_8-paper')
+
+# Toggles selecting which backends get timed in the benchmark loop below.
+NUMPY = True
+BLOSC = True
+ZARR = True
+HDF5 = True
+# When True, 1000 random indices are drawn instead of shape[0] // 4.
+SPARSE = True
+
+# Enabling HDF5 always forces the sparse index set.
+if HDF5:
+    SPARSE = True # HDF5 takes too long for non-sparse indexing
+
+def genarray(r, verbose=True):
+    """Create an on-disk 1-D blosc2 ``linspace`` array of roughly ``r`` GiB.
+
+    Parameters
+    ----------
+    r : number
+        Target size in GiB; the array holds ``int(r * 2**30 / 8)`` float64 items.
+    verbose : bool
+        If True, print the shape, the size and the creation time.
+
+    Returns
+    -------
+    tuple
+        ``(arr, arrsize)``: the array returned by ``blosc2.linspace`` and its
+        uncompressed size in GiB computed from ``arr.shape`` and ``arr.dtype``.
+    """
+    # 8 bytes per float64 item -> number of items giving about r GiB.
+    d = int((r * 2**30 / 8))
+    shape = (d,)
+    # NOTE(review): earlier code built `chunks`/`blocks` tuples here but never
+    # passed them on, so blosc2 has always picked its own partitioning; the
+    # dead locals were dropped.  Pass chunks=/blocks= explicitly if wanted.
+    t0 = time.perf_counter()  # monotonic clock, unaffected by system clock changes
+    arr = blosc2.linspace(0, 1000, num=np.prod(shape), shape=shape, dtype=np.float64, urlpath=f'linspace{r}1D.b2nd', mode='w')
+    elapsed = time.perf_counter() - t0
+    arrsize = np.prod(arr.shape) * arr.dtype.itemsize / 2 ** 30
+    if verbose:
+        print(f"Array shape: {arr.shape}")
+        print(f"Array size: {arrsize:.6f} GB")
+        print(f"Time to create array: {elapsed:.6f} seconds")
+    return arr, arrsize
+
+
+target_sizes = np.float64(np.array([.2, .5, 1, 2, 5, 10]))
+rng = np.random.default_rng()
+# One entry per array size; each entry is the array of timings returned by timer().
+blosctimes = []
+nptimes = []
+zarrtimes = []
+h5pytimes = []
+genuine_sizes = []
+for d in target_sizes:
+    arr, arrsize = genarray(d)
+    genuine_sizes.append(arrsize)
+    nidx = 1000 if SPARSE else arr.shape[0] // 4
+    idx = rng.integers(low=0, high=arr.shape[0], size=(nidx,))
+    sorted_idx = np.unique(idx)  # np.unique already returns sorted values
+
+    ## Test fancy indexing for different use cases
+    def timer(arr):
+        """Time fancy-index reads on ``arr``.
+
+        Which reads are performed depends on the module-level ``HDF5`` and
+        ``ZARR`` flags (not on the type of ``arr``).  Returns a 1-D numpy
+        array with one elapsed time (seconds) per read.
+        """
+        time_list = []
+        if not (HDF5 or ZARR):
+            # The clock must be started *before* the read; previously `t` was
+            # used here before being assigned, raising UnboundLocalError.
+            # NOTE(review): sorted_idx is de-duplicated and may be shorter
+            # than idx, which would make this nested list ragged -- confirm
+            # the intended index.
+            t = time.perf_counter()
+            arr[[[sorted_idx], [idx]]]
+            time_list.append(time.perf_counter() - t)
+        t = time.perf_counter()
+        arr[sorted_idx] if HDF5 else arr[idx]
+        time_list.append(time.perf_counter() - t)
+        return np.array(time_list)
+
+    # Full in-memory copy, used as the NumPy baseline and to fill the other stores.
+    nparr = arr[:]
+    if BLOSC:
+        blosctimes.append(timer(arr))
+    if NUMPY:
+        nptimes.append(timer(nparr))
+    if ZARR:
+        z_test = zarr.create_array(store='data/example.zarr', shape=arr.shape, chunks=arr.chunks,
+                                   dtype=nparr.dtype, overwrite=True)
+        z_test[:] = nparr
+        zarrtimes.append(timer(z_test))
+    if HDF5:
+        with h5py.File('my_hdf5_file.h5', 'w') as f:
+            dset = f.create_dataset("init", data=nparr, chunks=arr.chunks)
+            # Must be timed while the file is still open.
+            h5pytimes.append(timer(dset))
+
+blosctimes = np.array(blosctimes)
+nptimes = np.array(nptimes)
+zarrtimes = np.array(zarrtimes)
+h5pytimes = np.array(h5pytimes)
+labs = ''
+width = 0.2
+# One [label, timings, horizontal bar offset] entry per backend.
+result_tuple = (
+    ["Numpy", nptimes, -2 * width],
+    ["Blosc2", blosctimes, -width],
+    ["Zarr", zarrtimes, 0],
+    ["HDF5", h5pytimes, width]
+)
+
+x = np.arange(len(genuine_sizes))
+# Create barplot for Numpy vs Blosc vs Zarr vs H5py
+for (label, times, w), c in zip(result_tuple, ['b', 'r', 'g', 'm']):
+    if times.shape == (0,):
+        continue  # backend was disabled, nothing was timed
+    # Bar height: mean over the timed use cases; error bars span min..max.
+    mean = times.mean(axis=1)
+    err = (mean - times.min(axis=1), times.max(axis=1) - mean)
+    plt.bar(x + w, mean, width, color=c, label=label, yerr=err, capsize=5, ecolor='k',
+            error_kw=dict(lw=2, capthick=2, ecolor='k'))
+    labs += label
+
+filename = f"results{labs}1Dsparse" if SPARSE else f"results{labs}1D"
+with open(filename+".pkl", 'wb') as f:
+    pickle.dump({'times':result_tuple, 'sizes':genuine_sizes}, f)
+
+plt.xlabel('Array size (GB)')
+plt.legend()
+plt.xticks(x-width, np.round(genuine_sizes, 2))
+plt.ylabel("Time (s)")
+# The suffix carries its own leading space, so the title has neither a
+# doubled space ("1D  sparse") nor a trailing one ("1D ").
+plt.title(f"Fancy indexing performance comparison, 1D{' sparse' if SPARSE else ''}")
+plt.gca().set_yscale('log')
+# savefig does not create missing directories.
+os.makedirs('plots', exist_ok=True)
+plt.savefig(f'plots/{filename}.png', format="png")
+plt.show()
+
+print("Finished everything!")
@@ -510,6 +510,12 @@ cdef extern from "b2nd.h":
     int b2nd_concatenate(b2nd_context_t *ctx, b2nd_array_t *src1, b2nd_array_t *src2,
                          int8_t axis, c_bool copy, b2nd_array_t **array)
     int b2nd_expand_dims(const b2nd_array_t *array, b2nd_array_t ** view, const int8_t axis)
+    # Orthogonal-selection read/write entry points. The wrappers in NDArray
+    # pass one int64 index array per dimension in `selection`, the length of
+    # each in `selection_size`, and the buffer size in bytes in `buffersize`.
+    int b2nd_get_orthogonal_selection(const b2nd_array_t *array, int64_t ** selection,
+                                      int64_t *selection_size, void *buffer,
+                                      int64_t *buffershape, int64_t buffersize)
+    int b2nd_set_orthogonal_selection(const b2nd_array_t *array, int64_t ** selection,
+                                      int64_t *selection_size, void *buffer,
+                                      int64_t *buffershape, int64_t buffersize)
     int b2nd_from_schunk(blosc2_schunk *schunk, b2nd_array_t **array)
 
     void blosc2_unidim_to_multidim(uint8_t ndim, int64_t *shape, int64_t i, int64_t *index)
@@ -2419,6 +2425,70 @@ cdef class NDArray:
 
         return arr
 
+    def get_oindex_numpy(self, arr, key):
+        """
+        Orthogonal indexing. Key is a tuple of lists of integer indices.
+
+        Parameters
+        ----------
+        arr: object exposing the buffer protocol
+            Destination buffer; it must hold at least
+            ``typesize * prod(len(k) for k in key)`` bytes.
+        key: sequence of sequences of int
+            One index sequence per dimension of the array.
+
+        Returns
+        -------
+        out: object
+            ``arr`` itself, after the C library call has filled its buffer.
+
+        Raises
+        ------
+        ValueError
+            If ``key`` does not have one entry per dimension, or if ``arr``
+            is too small for the selection.
+        MemoryError
+            If the temporary index arrays cannot be allocated.
+        """
+        if len(key) != self.array.ndim:
+            raise ValueError(f"Key must have {self.array.ndim} dimensions, got {len(key)}.")
+        cdef int64_t[B2ND_MAX_DIM] buffershape_
+        cdef int64_t[B2ND_MAX_DIM] sel_size
+        cdef int64_t** key_
+        cdef int64_t buffersize_ = self.array.sc.typesize
+        cdef Py_buffer buf
+        cdef int ndim = self.array.ndim
+        cdef int i
+        cdef int64_t j
+
+        key_ = <int64_t**> malloc(ndim * sizeof(int64_t *))
+        if key_ == NULL and ndim > 0:
+            raise MemoryError()
+        # NULL-initialise so the cleanup below is safe after a partial failure.
+        for i in range(ndim):
+            key_[i] = NULL
+
+        try:
+            for i in range(ndim):
+                sel_size[i] = len(key[i])
+                buffershape_[i] = sel_size[i]
+                buffersize_ *= sel_size[i]
+                key_[i] = <int64_t *> malloc(sel_size[i] * sizeof(int64_t))
+                if key_[i] == NULL and sel_size[i] > 0:
+                    raise MemoryError()
+                for j in range(sel_size[i]):
+                    key_[i][j] = key[i][j]
+
+            PyObject_GetBuffer(arr, &buf, PyBUF_SIMPLE)
+            try:
+                # Refuse to let the C library write past the end of arr.
+                if buf.len < buffersize_:
+                    raise ValueError(f"Buffer too small: need {buffersize_} bytes, got {buf.len}.")
+                _check_rc(b2nd_get_orthogonal_selection(self.array, key_, sel_size, buf.buf,
+                                                        buffershape_, buffersize_),
+                          "Error while getting orthogonal selection")
+            finally:
+                PyBuffer_Release(&buf)
+        finally:
+            # Runs on success and on any exception, so nothing leaks.
+            for i in range(ndim):
+                free(key_[i])  # free(NULL) is a no-op
+            free(key_)
+        return arr
+
+    def set_oindex_numpy(self, key, arr):
+        """
+        Orthogonal indexing. Set elements of self with arr using key.
+
+        Parameters
+        ----------
+        key: sequence of sequences of int
+            One index sequence per dimension of the array.
+        arr: object exposing the buffer protocol
+            Source buffer; it must hold at least
+            ``typesize * prod(len(k) for k in key)`` bytes.
+
+        Returns
+        -------
+        out: object
+            ``arr`` itself.
+
+        Raises
+        ------
+        ValueError
+            If ``key`` does not have one entry per dimension, or if ``arr``
+            is too small for the selection.
+        MemoryError
+            If the temporary index arrays cannot be allocated.
+        """
+        if len(key) != self.array.ndim:
+            raise ValueError(f"Key must have {self.array.ndim} dimensions, got {len(key)}.")
+        cdef int64_t[B2ND_MAX_DIM] buffershape_
+        cdef int64_t[B2ND_MAX_DIM] sel_size
+        cdef int64_t** key_
+        cdef int64_t buffersize_ = self.array.sc.typesize
+        cdef Py_buffer buf
+        cdef int ndim = self.array.ndim
+        cdef int i
+        cdef int64_t j
+
+        key_ = <int64_t**> malloc(ndim * sizeof(int64_t *))
+        if key_ == NULL and ndim > 0:
+            raise MemoryError()
+        # NULL-initialise so the cleanup below is safe after a partial failure.
+        for i in range(ndim):
+            key_[i] = NULL
+
+        try:
+            for i in range(ndim):
+                sel_size[i] = len(key[i])
+                buffershape_[i] = sel_size[i]
+                buffersize_ *= sel_size[i]
+                key_[i] = <int64_t *> malloc(sel_size[i] * sizeof(int64_t))
+                if key_[i] == NULL and sel_size[i] > 0:
+                    raise MemoryError()
+                for j in range(sel_size[i]):
+                    key_[i][j] = key[i][j]
+
+            PyObject_GetBuffer(arr, &buf, PyBUF_SIMPLE)
+            try:
+                # Refuse to let the C library read past the end of arr.
+                if buf.len < buffersize_:
+                    raise ValueError(f"Buffer too small: need {buffersize_} bytes, got {buf.len}.")
+                _check_rc(b2nd_set_orthogonal_selection(self.array, key_, sel_size, buf.buf,
+                                                        buffershape_, buffersize_),
+                          "Error while setting orthogonal selection")
+            finally:
+                PyBuffer_Release(&buf)
+        finally:
+            # Runs on success and on any exception, so nothing leaks.
+            for i in range(ndim):
+                free(key_[i])  # free(NULL) is a no-op
+            free(key_)
+        return arr
+
 
     def get_slice(self, key, mask, **kwargs):
         start, stop = key