Skip to content

Commit 1a6ba28

Browse files
authored
Merge pull request #437 from Blosc/fancyIndex
Fancy index
2 parents 2f058a4 + d000f15 commit 1a6ba28

5 files changed

Lines changed: 580 additions & 15 deletions

File tree

bench/ndarray/fancy_index.py

Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
#######################################################################
2+
# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
3+
# All rights reserved.
4+
#
5+
# This source code is licensed under a BSD-style license (found in the
6+
# LICENSE file in the root directory of this source tree)
7+
#######################################################################
8+
9+
# Benchmark for computing a fancy index of a blosc2 array
10+
11+
import numpy as np
12+
import ndindex
13+
import blosc2
14+
import time
15+
import matplotlib.pyplot as plt
16+
import zarr
17+
import h5py
18+
import pickle
19+
import os
20+
plt.rcParams.update({'text.usetex':False,'font.serif': ['cm'],'font.size':16})
21+
plt.rcParams['figure.dpi'] = 300
22+
plt.rcParams['savefig.dpi'] = 300
23+
plt.rc('text', usetex=False)
24+
plt.rc('font',**{'serif':['cm']})
25+
plt.style.use('seaborn-v0_8-paper')
26+
27+
NUMPY = True
28+
BLOSC = True
29+
ZARR = False
30+
HDF5 = False
31+
SPARSE = False
32+
33+
NDIMS = 2 # must be at least 2
34+
35+
def genarray(r, ndims=2, verbose=True):
36+
d = int((r*2**30/8)**(1/ndims))
37+
shape = (d,) * ndims
38+
chunks = (d // 4,) * ndims
39+
blocks = (max(d // 10, 1),) * ndims
40+
t = time.time()
41+
arr = blosc2.linspace(0, 1000, num=np.prod(shape), shape=shape, dtype=np.float64,
42+
urlpath=f'linspace{r}{ndims}D.b2nd', mode='w')
43+
t = time.time() - t
44+
arrsize = np.prod(arr.shape) * arr.dtype.itemsize / 2 ** 30
45+
if verbose:
46+
print(f"Array shape: {arr.shape}")
47+
print(f"Array size: {arrsize:.6f} GB")
48+
print(f"Time to create array: {t:.6f} seconds")
49+
return arr, arrsize
50+
51+
52+
target_sizes = np.int64(np.array([1, 2, 4, 8, 16, 24]))
53+
#target_sizes = np.int64(np.array([1, 2, 4, 8])) # for quick testing
54+
rng = np.random.default_rng()
55+
blosctimes = []
56+
nptimes = []
57+
zarrtimes = []
58+
h5pytimes = []
59+
genuine_sizes = []
60+
for d in target_sizes:
61+
arr, arrsize = genarray(d, ndims=NDIMS)
62+
genuine_sizes += [arrsize]
63+
sparseness = 1000 if SPARSE else arr.shape[0]//4
64+
idx = rng.integers(low=0, high=arr.shape[0], size=(sparseness,))
65+
sorted_idx = np.sort(np.unique(idx))
66+
col = rng.integers(low=0, high=arr.shape[0], size=(sparseness,))
67+
col_sorted = np.sort(np.unique(col))
68+
mask = rng.integers(low=0, high=2, size=(arr.shape[0],)) == 1
69+
70+
## Test fancy indexing for different use cases
71+
m, M = sorted_idx[0], sorted_idx[-1]
72+
def timer(arr):
73+
time_list = []
74+
if not HDF5:
75+
t = time.time()
76+
b = arr[idx, col]
77+
time_list += [time.time() - t]
78+
if not ZARR:
79+
t = time.time()
80+
b = arr[slice(1, M // 2, 5), col]
81+
time_list += [time.time() - t]
82+
t = time.time()
83+
b = arr[[[idx], [col]]]
84+
time_list += [time.time() - t]
85+
t = time.time()
86+
b = arr[idx[:10, None], col[:10]]
87+
time_list += [time.time() - t]
88+
t = time.time()
89+
b = arr[idx[:10, None], mask]
90+
time_list += [time.time() - t]
91+
t = time.time()
92+
b = arr[idx] if not HDF5 else arr[sorted_idx]
93+
time_list += [time.time() - t]
94+
t = time.time()
95+
b = arr[m, idx] if not HDF5 else arr[m, col_sorted]
96+
time_list += [time.time() - t]
97+
return np.array(time_list)
98+
99+
nparr = arr[:]
100+
if BLOSC:
101+
blosctimes += [timer(arr)]
102+
if NUMPY:
103+
nptimes += [timer(nparr)]
104+
if ZARR:
105+
z_test = zarr.create_array(store='data/example.zarr', shape=arr.shape, chunks=arr.chunks,
106+
dtype=nparr.dtype, overwrite=True)
107+
z_test[:] = nparr
108+
zarrtimes += [timer(z_test)]
109+
if HDF5:
110+
with h5py.File('my_hdf5_file.h5', 'w') as f:
111+
dset = f.create_dataset("init", data=nparr, chunks=arr.chunks)
112+
h5pytimes += [timer(dset)]
113+
114+
blosctimes = np.array(blosctimes)
115+
nptimes = np.array(nptimes)
116+
zarrtimes = np.array(zarrtimes)
117+
h5pytimes = np.array(h5pytimes)
118+
labs=''
119+
width = 0.2
120+
result_tuple = (
121+
["Numpy", nptimes, -2 * width],
122+
["Blosc2", blosctimes, -width],
123+
["Zarr", zarrtimes, 0],
124+
["HDF5", h5pytimes, width]
125+
)
126+
127+
x = np.arange(len(genuine_sizes))
128+
# Create barplot for Numpy vs Blosc vs Zarr vs H5py
129+
for i, r in enumerate(result_tuple):
130+
if r[1].shape != (0,):
131+
label, times, w = r
132+
c = ['b', 'r', 'g', 'm'][i]
133+
mean = times.mean(axis=1)
134+
err = (mean - times.min(axis=1), times.max(axis=1)-mean)
135+
plt.bar(x + w, mean , width, color=c, label=label, yerr=err, capsize=5, ecolor='k',
136+
error_kw=dict(lw=2, capthick=2, ecolor='k'))
137+
labs+=label
138+
139+
filename = f"results{labs}{NDIMS}D" + "sparse" if SPARSE else f"results{labs}{NDIMS}D"
140+
141+
with open(f"{filename}.pkl", 'wb') as f:
142+
pickle.dump(result_tuple, f)
143+
144+
plt.xlabel('Array size (GB)')
145+
plt.legend()
146+
plt.xticks(x-width, np.round(genuine_sizes, 2))
147+
plt.ylabel("Time (s)")
148+
plt.title(f"Fancy indexing performance comparison, {NDIMS}D" +f"{" sparse" if SPARSE else ""}")
149+
plt.gca().set_yscale('log')
150+
plt.savefig(f'plots/fancyIdx{filename}.png', format="png")
151+
plt.show()
152+
153+
print("Finished everything!")

bench/ndarray/fancy_index1D.py

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
#######################################################################
2+
# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
3+
# All rights reserved.
4+
#
5+
# This source code is licensed under a BSD-style license (found in the
6+
# LICENSE file in the root directory of this source tree)
7+
#######################################################################
8+
9+
# Benchmark for computing a fancy index of a blosc2 array
10+
11+
import numpy as np
12+
import ndindex
13+
import blosc2
14+
import time
15+
import matplotlib.pyplot as plt
16+
import zarr
17+
import h5py
18+
import pickle
19+
import os
20+
21+
plt.rcParams.update({'text.usetex':False,'font.serif': ['cm'],'font.size':16})
22+
plt.rcParams['figure.dpi'] = 300
23+
plt.rcParams['savefig.dpi'] = 300
24+
plt.rc('text', usetex=False)
25+
plt.rc('font',**{'serif':['cm']})
26+
plt.style.use('seaborn-v0_8-paper')
27+
28+
NUMPY = True
29+
BLOSC = True
30+
ZARR = True
31+
HDF5 = True
32+
SPARSE = True
33+
34+
if HDF5:
35+
SPARSE = True # HDF5 takes too long for non-sparse indexing
36+
37+
def genarray(r, verbose=True):
38+
d = int((r*2**30/8))
39+
shape = (d,)
40+
chunks = (d // 4,)
41+
blocks = (max(d // 10, 1),)
42+
t = time.time()
43+
arr = blosc2.linspace(0, 1000, num=np.prod(shape), shape=shape, dtype=np.float64, urlpath=f'linspace{r}1D.b2nd', mode='w')
44+
t = time.time() - t
45+
arrsize = np.prod(arr.shape) * arr.dtype.itemsize / 2 ** 30
46+
if verbose:
47+
print(f"Array shape: {arr.shape}")
48+
print(f"Array size: {arrsize:.6f} GB")
49+
print(f"Time to create array: {t:.6f} seconds")
50+
return arr, arrsize
51+
52+
53+
target_sizes = np.float64(np.array([.2, .5, 1, 2, 5, 10]))
54+
rng = np.random.default_rng()
55+
blosctimes = []
56+
nptimes = []
57+
zarrtimes = []
58+
h5pytimes = []
59+
genuine_sizes = []
60+
for d in target_sizes:
61+
arr, arrsize = genarray(d)
62+
genuine_sizes += [arrsize]
63+
idx = rng.integers(low=0, high=arr.shape[0], size=(1000,)) if SPARSE else rng.integers(low=0, high=arr.shape[0], size=(arr.shape[0]//4,))
64+
sorted_idx = np.unique(np.sort(idx))
65+
## Test fancy indexing for different use cases
66+
def timer(arr):
67+
time_list = []
68+
if not (HDF5 or ZARR):
69+
b = arr[[[sorted_idx], [idx]]]
70+
time_list += [time.time() - t]
71+
t = time.time()
72+
t = time.time()
73+
b = arr[sorted_idx] if HDF5 else arr[idx]
74+
time_list += [time.time() - t]
75+
return np.array(time_list)
76+
77+
nparr = arr[:]
78+
if BLOSC:
79+
blosctimes += [timer(arr)]
80+
if NUMPY:
81+
nptimes += [timer(nparr)]
82+
if ZARR:
83+
z_test = zarr.create_array(store='data/example.zarr', shape=arr.shape, chunks=arr.chunks,
84+
dtype=nparr.dtype, overwrite=True)
85+
z_test[:] = nparr
86+
zarrtimes += [timer(z_test)]
87+
if HDF5:
88+
with h5py.File('my_hdf5_file.h5', 'w') as f:
89+
dset = f.create_dataset("init", data=nparr, chunks=arr.chunks)
90+
h5pytimes += [timer(dset)]
91+
92+
blosctimes = np.array(blosctimes)
93+
nptimes = np.array(nptimes)
94+
zarrtimes = np.array(zarrtimes)
95+
h5pytimes = np.array(h5pytimes)
96+
labs=''
97+
width = 0.2
98+
result_tuple = (
99+
["Numpy", nptimes, -2 * width],
100+
["Blosc2", blosctimes, -width],
101+
["Zarr", zarrtimes, 0],
102+
["HDF5", h5pytimes, width]
103+
)
104+
105+
x = np.arange(len(genuine_sizes))
106+
# Create barplot for Numpy vs Blosc vs Zarr vs H5py
107+
for i, r in enumerate(result_tuple):
108+
if r[1].shape != (0,):
109+
label, times, w = r
110+
c = ['b', 'r', 'g', 'm'][i]
111+
mean = times.mean(axis=1)
112+
err = (mean - times.min(axis=1), times.max(axis=1)-mean)
113+
plt.bar(x + w, mean, width, color=c, label=label, yerr=err, capsize=5, ecolor='k',
114+
error_kw=dict(lw=2, capthick=2, ecolor='k'))
115+
labs+=label
116+
117+
filename = f"results{labs}1Dsparse" if SPARSE else f"results{labs}1D"
118+
with open(filename+".pkl", 'wb') as f:
119+
pickle.dump({'times':result_tuple, 'sizes':genuine_sizes}, f)
120+
121+
plt.xlabel('Array size (GB)')
122+
plt.legend()
123+
plt.xticks(x-width, np.round(genuine_sizes, 2))
124+
plt.ylabel("Time (s)")
125+
plt.title(f"Fancy indexing performance comparison, 1D {' sparse' if SPARSE else ''}")
126+
plt.gca().set_yscale('log')
127+
plt.savefig(f'plots/{filename}.png', format="png")
128+
plt.show()
129+
130+
print("Finished everything!")

src/blosc2/blosc2_ext.pyx

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -510,6 +510,12 @@ cdef extern from "b2nd.h":
510510
int b2nd_concatenate(b2nd_context_t *ctx, b2nd_array_t *src1, b2nd_array_t *src2,
511511
int8_t axis, c_bool copy, b2nd_array_t **array)
512512
int b2nd_expand_dims(const b2nd_array_t *array, b2nd_array_t ** view, const int8_t axis)
513+
int b2nd_get_orthogonal_selection(const b2nd_array_t *array, int64_t ** selection,
514+
int64_t *selection_size, void *buffer,
515+
int64_t *buffershape, int64_t buffersize)
516+
int b2nd_set_orthogonal_selection(const b2nd_array_t *array, int64_t ** selection,
517+
int64_t *selection_size, void *buffer,
518+
int64_t *buffershape, int64_t buffersize)
513519
int b2nd_from_schunk(blosc2_schunk *schunk, b2nd_array_t **array)
514520

515521
void blosc2_unidim_to_multidim(uint8_t ndim, int64_t *shape, int64_t i, int64_t *index)
@@ -2419,6 +2425,70 @@ cdef class NDArray:
24192425

24202426
return arr
24212427

2428+
def get_oindex_numpy(self, arr, key):
2429+
"""
2430+
Orthogonal indexing. Key is a tuple of lists of integer indices.
2431+
"""
2432+
if len(key) != self.array.ndim:
2433+
raise ValueError(f"Key must have {self.array.ndim} dimensions, got {len(key)}.")
2434+
cdef int64_t[B2ND_MAX_DIM] buffershape_
2435+
cdef int64_t** key_
2436+
cdef int64_t buffersize_ = self.array.sc.typesize
2437+
cdef int64_t[B2ND_MAX_DIM] sel_size
2438+
2439+
key_ = <int64_t**> malloc(len(key) * sizeof(int64_t *))
2440+
2441+
for i in range(self.array.ndim):
2442+
buffershape_[i] = len(key[i])
2443+
buffersize_ *= buffershape_[i]
2444+
sel_size[i] = len(key[i])
2445+
key_[i] = <int64_t *> malloc(sel_size[i] * sizeof(int64_t))
2446+
for j in range(len(key[i])):
2447+
key_[i][j] = key[i][j]
2448+
2449+
cdef Py_buffer buf
2450+
PyObject_GetBuffer(arr, &buf, PyBUF_SIMPLE)
2451+
2452+
_check_rc(b2nd_get_orthogonal_selection(self.array, key_, sel_size, buf.buf,
2453+
buffershape_, buffersize_), "Error while getting orthogonal selection")
2454+
PyBuffer_Release(&buf)
2455+
for i in range(len(key)):
2456+
free(key_[i]) # Free the allocated memory for each key
2457+
free(key_)
2458+
return arr
2459+
2460+
def set_oindex_numpy(self, key, arr):
2461+
"""
2462+
Orthogonal indexing. Set elements of self with arr using key.
2463+
"""
2464+
if len(key) != self.array.ndim:
2465+
raise ValueError(f"Key must have {self.array.ndim} dimensions, got {len(key)}.")
2466+
cdef int64_t[B2ND_MAX_DIM] buffershape_
2467+
cdef int64_t** key_
2468+
cdef int64_t buffersize_ = self.array.sc.typesize
2469+
cdef int64_t[B2ND_MAX_DIM] sel_size
2470+
2471+
key_ = <int64_t**> malloc(len(key) * sizeof(int64_t *))
2472+
2473+
for i in range(self.array.ndim):
2474+
buffershape_[i] = len(key[i])
2475+
buffersize_ *= buffershape_[i]
2476+
sel_size[i] = len(key[i])
2477+
key_[i] = <int64_t *> malloc(sel_size[i] * sizeof(int64_t))
2478+
for j in range(len(key[i])):
2479+
key_[i][j] = key[i][j]
2480+
2481+
cdef Py_buffer buf
2482+
PyObject_GetBuffer(arr, &buf, PyBUF_SIMPLE)
2483+
2484+
_check_rc(b2nd_set_orthogonal_selection(self.array, key_, sel_size, buf.buf,
2485+
buffershape_, buffersize_), "Error while getting orthogonal selection")
2486+
PyBuffer_Release(&buf)
2487+
for i in range(len(key)):
2488+
free(key_[i]) # Free the allocated memory for each key
2489+
free(key_)
2490+
return arr
2491+
24222492

24232493
def get_slice(self, key, mask, **kwargs):
24242494
start, stop = key

0 commit comments

Comments
 (0)