Skip to content

Commit 6cb1be2

Browse files
committed
Remove Numpy optimisation and reformat benchmarks
1 parent 05ab284 commit 6cb1be2

3 files changed

Lines changed: 208 additions & 99 deletions

File tree

bench/ndarray/fancy_index.py

Lines changed: 79 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -18,142 +18,125 @@
1818
import pickle
1919
import os
2020
plt.rcParams.update({'text.usetex':False,'font.serif': ['cm'],'font.size':16})
21-
plt.rcParams['figure.dpi'] = 1000
22-
plt.rcParams['savefig.dpi'] = 1000
21+
plt.rcParams['figure.dpi'] = 300
22+
plt.rcParams['savefig.dpi'] = 300
2323
plt.rc('text', usetex=False)
2424
plt.rc('font',**{'serif':['cm']})
2525
plt.style.use('seaborn-v0_8-paper')
2626

27-
NUMPY_BLOSC = False # activate NUMPY and BLOSC tests
28-
NUMPY_BLOSC_ZARR = False # activate NUMPY, BLOSC and Zarr tests
29-
# default if both are false is to run tests for Numpy, Blosc, Zarr and HDF5
27+
NUMPY = True
28+
BLOSC = True
29+
ZARR = True
30+
HDF5 = True
3031

31-
def genarray(r, ndims=1, verbose=True):
32+
NDIMS = 2 # must be at least 2
33+
34+
def genarray(r, ndims=2, verbose=True):
3235
d = int((r*2**30/8)**(1/ndims))
3336
shape = (d,) * ndims
3437
chunks = (d // 4,) * ndims
3538
blocks = (max(d // 10, 1),) * ndims
3639
t = time.time()
37-
if os.path.exists(f'linspace{r}.b2nd'):
38-
arr = blosc2.open(urlpath=f'linspace{r}.b2nd')
39-
else:
40-
arr = blosc2.linspace(0, 1000, num=np.prod(shape), shape=shape, dtype=np.float64, urlpath=f'linspace{r}.b2nd', mode='w')
40+
arr = blosc2.linspace(0, 1000, num=np.prod(shape), shape=shape, dtype=np.float64, urlpath=f'linspace{r}{ndims}D.b2nd', mode='w')
4141
t = time.time() - t
42+
arrsize = np.prod(arr.shape) * arr.dtype.itemsize / 2 ** 30
4243
if verbose:
4344
print(f"Array shape: {arr.shape}")
44-
print(f"Array size: {np.prod(arr.shape) * arr.dtype.itemsize / 2 ** 30:.6f} GB")
45+
print(f"Array size: {arrsize:.6f} GB")
4546
print(f"Time to create array: {t:.6f} seconds")
46-
return arr
47+
return arr, arrsize
4748

4849

49-
sizes = np.int64(np.array([1, 2, 4, 8, 16, 24, 32]))
50+
target_sizes = np.int64(np.array([1, 2, 4, 8, 16, 24, 32]))
5051
rng = np.random.default_rng()
5152
blosctimes = []
5253
nptimes = []
5354
zarrtimes = []
5455
h5pytimes = []
55-
x = np.arange(len(sizes))
56-
width = 0.2
57-
labs = 'NumpyBlosc2' if NUMPY_BLOSC else 'NumpyBlosc2ZarrHDF5'
58-
labs = 'NumpyBlosc2Zarr' if NUMPY_BLOSC_ZARR else labs
59-
try:
60-
with open(f"results{labs}.pkl", 'rb') as f:
61-
result_tuple = pickle.load(f)
62-
labs = ''
63-
for i, r in enumerate(result_tuple):
64-
if r[1].shape != (0,):
65-
label,times,w = r
66-
c = ['b', 'r', 'g', 'm'][i]
67-
mean = times.mean(axis=1)
68-
err = [mean - times.min(axis=1), times.max(axis=1)-mean]
69-
plt.bar(x + w, mean , width, color=c, label=label, yerr=err, capsize=5, ecolor='k',
70-
error_kw=dict(lw=2, capthick=2, ecolor='k'))
71-
labs+=label
72-
except:
73-
for d in sizes:
74-
arr = genarray(d, ndims=2)
75-
idx = rng.integers(low=0, high=arr.shape[0], size=(arr.shape[0]//4,))
76-
row = np.sort(np.unique(idx))
77-
col = np.sort(np.unique(rng.integers(low=0, high=arr.shape[0], size=(arr.shape[0]//4,))))
78-
mask = rng.integers(low=0, high=2, size=(arr.shape[0],)) == 1
56+
genuine_sizes = []
57+
for d in target_sizes:
58+
arr, arrsize = genarray(d, ndims=NDIMS)
59+
genuine_sizes += [arrsize]
60+
idx = rng.integers(low=0, high=arr.shape[0], size=(arr.shape[0]//4,))
61+
sorted_idx = np.sort(np.unique(idx))
62+
col = np.sort(np.unique(rng.integers(low=0, high=arr.shape[0], size=(arr.shape[0]//4,))))
63+
mask = rng.integers(low=0, high=2, size=(arr.shape[0],)) == 1
7964

80-
## Test fancy indexing for different use cases
81-
m, M = np.min(idx), np.max(idx)
82-
res = arr[:][row]
83-
def timer(arr, row=row, col=col):
84-
time_list = []
85-
if NUMPY_BLOSC or NUMPY_BLOSC_ZARR:
86-
t = time.time()
87-
b = arr[row, col]
88-
time_list += [time.time() - t]
89-
if NUMPY_BLOSC:
65+
## Test fancy indexing for different use cases
66+
m, M = sorted_idx[0], sorted_idx[-1]
67+
def timer(arr):
68+
time_list = []
69+
if not HDF5:
70+
t = time.time()
71+
b = arr[idx, col]
72+
time_list += [time.time() - t]
73+
if not ZARR:
9074
t = time.time()
9175
b = arr[slice(1, M // 2, 5), col]
9276
time_list += [time.time() - t]
9377
t = time.time()
94-
b = arr[[[row], [col]]]
78+
b = arr[[[idx], [col]]]
9579
time_list += [time.time() - t]
9680
t = time.time()
97-
b = arr[row[:10, None], col[:10]]
81+
b = arr[idx[:10, None], col[:10]]
9882
time_list += [time.time() - t]
9983
t = time.time()
100-
b = arr[row[:10, None], mask]
84+
b = arr[idx[:10, None], mask]
10185
time_list += [time.time() - t]
102-
t = time.time()
103-
b = arr[row]
104-
time_list += [time.time() - t]
105-
t = time.time()
106-
b = arr[m, col]
107-
time_list += [time.time() - t]
108-
return np.array(time_list)
86+
t = time.time()
87+
b = arr[idx] if not HDF5 else arr[sorted_idx]
88+
time_list += [time.time() - t]
89+
t = time.time()
90+
b = arr[m, idx] if not HDF5 else arr[m, col]
91+
time_list += [time.time() - t]
92+
return np.array(time_list)
93+
94+
nparr = arr[:]
95+
if BLOSC:
96+
blosctimes += [timer(arr)]
97+
if NUMPY:
98+
nptimes += [timer(nparr)]
99+
if ZARR:
100+
z_test = zarr.create_array(store='data/example.zarr', shape=nparr.shape, dtype=nparr.dtype, overwrite=True)
101+
z_test[:] = nparr
102+
zarrtimes += [timer(z_test)]
103+
if HDF5:
104+
with h5py.File('my_hdf5_file.h5', 'w') as f:
105+
dset = f.create_dataset("init", data=nparr)
106+
h5pytimes += [timer(dset)]
109107

110-
if NUMPY_BLOSC or NUMPY_BLOSC_ZARR:
111-
blosctimes += [timer(arr, row=idx, col=idx)]
112-
arr=arr[:]
113-
nptimes += [timer(arr, row=idx, col=idx)]
114-
if NUMPY_BLOSC_ZARR:
115-
z_test = zarr.create_array(store='data/example.zarr', shape=arr.shape, dtype=arr.dtype, overwrite=True)
116-
z_test[:] = arr
117-
zarrtimes += [timer(z_test, row=idx, col=idx)]
118-
else:
119-
blosctimes += [timer(arr)]
120-
arr=arr[:]
121-
nptimes += [timer(arr)]
122-
z_test = zarr.create_array(store='data/example.zarr', shape=arr.shape, dtype=arr.dtype, overwrite=True)
123-
z_test[:] = arr
124-
zarrtimes += [timer(z_test)]
125-
with h5py.File('my_hdf5_file.h5', 'w') as f:
126-
dset = f.create_dataset("init", data=arr)
127-
h5pytimes += [timer(dset)]
108+
blosctimes = np.array(blosctimes)
109+
nptimes = np.array(nptimes)
110+
zarrtimes = np.array(zarrtimes)
111+
h5pytimes = np.array(h5pytimes)
112+
labs=''
113+
result_tuple = (["Numpy",nptimes,-2*width],["Blosc2",blosctimes, -width],["Zarr",zarrtimes, 0],["HDF5",h5pytimes, width])
128114

129-
blosctimes = np.array(blosctimes)
130-
nptimes = np.array(nptimes)
131-
zarrtimes = np.array(zarrtimes)
132-
h5pytimes = np.array(h5pytimes)
133-
labs=''
134-
result_tuple = (["Numpy",nptimes,-2*width],["Blosc2",blosctimes, -width],["Zarr",zarrtimes, 0],["HDF5",h5pytimes, width])
115+
x = np.arange(len(genuine_sizes))
116+
width = 0.2
117+
# Create barplot for Numpy vs Blosc vs Zarr vs H5py
118+
for i, r in enumerate(result_tuple):
119+
if r[1].shape != (0,):
120+
label, times, w = r
121+
c = ['b', 'r', 'g', 'm'][i]
122+
mean = times.mean(axis=1)
123+
err = (mean - times.min(axis=1), times.max(axis=1)-mean)
124+
plt.bar(x + w, mean , width, color=c, label=label, yerr=err, capsize=5, ecolor='k',
125+
error_kw=dict(lw=2, capthick=2, ecolor='k'))
126+
labs+=label
135127

136-
# Create barplot for Numpy vs Blosc vs Zarr vs H5py
137-
for i, r in enumerate(result_tuple):
138-
if r[1].shape != (0,):
139-
label, times, w = r
140-
c = ['b', 'r', 'g', 'm'][i]
141-
mean = times.mean(axis=1)
142-
err = (mean - times.min(axis=1), times.max(axis=1)-mean)
143-
plt.bar(x + w, mean , width, color=c, label=label, yerr=err, capsize=5, ecolor='k',
144-
error_kw=dict(lw=2, capthick=2, ecolor='k'))
145-
labs+=label
128+
filename = "results{labs}{NDIMS}D"
146129

147-
with open(f"results{labs}.pkl", 'wb') as f:
148-
pickle.dump(result_tuple, f)
130+
with open(f"{filename}.pkl", 'wb') as f:
131+
pickle.dump(result_tuple, f)
149132

150133
plt.xlabel('Array size (GB)')
151134
plt.legend()
152-
plt.xticks(x-width, np.round(sizes, 2))
135+
plt.xticks(x-width, np.round(genuine_sizes, 2))
153136
plt.ylabel("Time (s)")
154-
plt.title('Fancy indexing performance comparison')
137+
plt.title('Fancy indexing performance comparison, {NDIMS}D')
155138
plt.gca().set_yscale('log')
156-
plt.savefig(f'plots/fancyIdx{labs}.png', format="png")
139+
plt.savefig(f'plots/fancyIdx{filename}.png', format="png")
157140
plt.show()
158141

159142
print("Finished everything!")

bench/ndarray/fancy_index1D.py

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
#######################################################################
2+
# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
3+
# All rights reserved.
4+
#
5+
# This source code is licensed under a BSD-style license (found in the
6+
# LICENSE file in the root directory of this source tree)
7+
#######################################################################
8+
9+
# Benchmark for computing a fancy index of a blosc2 array
10+
11+
import numpy as np
12+
import ndindex
13+
import blosc2
14+
import time
15+
import matplotlib.pyplot as plt
16+
import zarr
17+
import h5py
18+
import pickle
19+
import os
20+
plt.rcParams.update({'text.usetex':False,'font.serif': ['cm'],'font.size':16})
21+
plt.rcParams['figure.dpi'] = 300
22+
plt.rcParams['savefig.dpi'] = 300
23+
plt.rc('text', usetex=False)
24+
plt.rc('font',**{'serif':['cm']})
25+
plt.style.use('seaborn-v0_8-paper')
26+
27+
NUMPY = True
28+
BLOSC = True
29+
ZARR = False
30+
HDF5 = True
31+
SPARSE = True
32+
33+
if HDF5:
34+
SPARSE = True # HDF5 takes too long for non-sparse indexing
35+
36+
def genarray(r, verbose=True):
37+
d = int((r*2**30/8))
38+
shape = (d,)
39+
chunks = (d // 4,)
40+
blocks = (max(d // 10, 1),)
41+
t = time.time()
42+
arr = blosc2.linspace(0, 1000, num=np.prod(shape), shape=shape, dtype=np.float64, urlpath=f'linspace{r}1D.b2nd', mode='w')
43+
t = time.time() - t
44+
arrsize = np.prod(arr.shape) * arr.dtype.itemsize / 2 ** 30
45+
if verbose:
46+
print(f"Array shape: {arr.shape}")
47+
print(f"Array size: {arrsize:.6f} GB")
48+
print(f"Time to create array: {t:.6f} seconds")
49+
return arr, arrsize
50+
51+
52+
target_sizes = np.float64(np.array([.1, .2, .4, .8, 1, 1.2]))
53+
rng = np.random.default_rng()
54+
blosctimes = []
55+
nptimes = []
56+
zarrtimes = []
57+
h5pytimes = []
58+
genuine_sizes = []
59+
for d in target_sizes:
60+
arr, arrsize = genarray(d)
61+
genuine_sizes += [arrsize]
62+
idx = rng.integers(low=0, high=arr.shape[0], size=(1000,)) if SPARSE else rng.integers(low=0, high=arr.shape[0], size=(arr.shape[0]//4,))
63+
sorted_idx = np.sort(idx)
64+
## Test fancy indexing for different use cases
65+
def timer(arr):
66+
time_list = []
67+
if not (HDF5 or ZARR):
68+
b = arr[[[sorted_idx], [idx]]]
69+
time_list += [time.time() - t]
70+
t = time.time()
71+
t = time.time()
72+
b = arr[sorted_idx] if HDF5 else arr[idx]
73+
time_list += [time.time() - t]
74+
return np.array(time_list)
75+
76+
nparr = arr[:]
77+
if BLOSC:
78+
blosctimes += [timer(arr, row=idx, col=idx)]
79+
if NUMPY:
80+
nptimes += [timer(nparr, row=idx, col=idx)]
81+
if ZARR:
82+
z_test = zarr.create_array(store='data/example.zarr', shape=nparr.shape, dtype=nparr.dtype, overwrite=True)
83+
z_test[:] = nparr
84+
zarrtimes += [timer(z_test, row=idx, col=idx)]
85+
if HDF5:
86+
with h5py.File('my_hdf5_file.h5', 'w') as f:
87+
dset = f.create_dataset("init", data=nparr)
88+
h5pytimes += [timer(dset)]
89+
90+
blosctimes = np.array(blosctimes)
91+
nptimes = np.array(nptimes)
92+
zarrtimes = np.array(zarrtimes)
93+
h5pytimes = np.array(h5pytimes)
94+
labs=''
95+
result_tuple = (["Numpy",nptimes,-2*width],["Blosc2",blosctimes, -width],["Zarr",zarrtimes, 0],["HDF5",h5pytimes, width])
96+
97+
x = np.arange(len(genuine_sizes))
98+
width = 0.2
99+
# Create barplot for Numpy vs Blosc vs Zarr vs H5py
100+
for i, r in enumerate(result_tuple):
101+
if r[1].shape != (0,):
102+
label, times, w = r
103+
c = ['b', 'r', 'g', 'm'][i]
104+
mean = times.mean(axis=1)
105+
err = (mean - times.min(axis=1), times.max(axis=1)-mean)
106+
plt.bar(x + w, mean , width, color=c, label=label, yerr=err, capsize=5, ecolor='k',
107+
error_kw=dict(lw=2, capthick=2, ecolor='k'))
108+
labs+=label
109+
110+
filename = "results{labs}1Dsparse" if SPARSE else "results{labs}1D"
111+
with open(filename+".pkl", 'wb') as f:
112+
pickle.dump({'times':result_tuple, 'sizes':genuine_sizes}, f)
113+
114+
plt.xlabel('Array size (GB)')
115+
plt.legend()
116+
plt.xticks(x-width, np.round(genuine_sizes, 2))
117+
plt.ylabel("Time (s)")
118+
plt.title('Fancy indexing performance comparison, 1D' + {" sparse" if SPARSE else ""})
119+
plt.gca().set_yscale('log')
120+
plt.savefig(f'plots/{filename}.png', format="png")
121+
plt.show()
122+
123+
print("Finished everything!")

src/blosc2/ndarray.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1441,8 +1441,9 @@ def T(self):
14411441

14421442
def get_fselection_numpy(self, key):
14431443
# TODO: Make this faster for broadcasted keys
1444-
if math.prod(self.shape) * self.dtype.itemsize < blosc2.MAX_FAST_PATH_SIZE:
1445-
return self[:][key] # load into memory for smallish arrays
1444+
## Can`t do this because ndindex doesn't support all the same indexing cases as Numpy
1445+
# if math.prod(self.shape) * self.dtype.itemsize < blosc2.MAX_FAST_PATH_SIZE:
1446+
# return self[:][key] # load into memory for smallish arrays
14461447
shape = self.shape
14471448
chunks = self.chunks
14481449
_slice = ndindex.ndindex(key).expand(shape)
@@ -1527,7 +1528,9 @@ def __getitem__( # noqa: C901
15271528
"""
15281529
Retrieve a (multidimensional) slice as specified by the key.
15291530
1530-
Note that this __getitem__ matches NumPy fancy indexing behaviour.
1531+
Note that this __getitem__ closely matches NumPy fancy indexing behaviour, except in
1532+
some edge cases which are not supported by ndindex.
1533+
Array indeices eparated by slice object - e.g. arr[0, :10, [0,1]] - are NOT supported.
15311534
15321535
Parameters
15331536
----------

0 commit comments

Comments
 (0)