Skip to content

Commit d4d9320

Browse files
committed
Remove refs to vindex and finalise bench and tests. Add oindex setitem.
1 parent 1f06f4b commit d4d9320

3 files changed

Lines changed: 150 additions & 104 deletions

File tree

bench/ndarray/fancy_index.py

Lines changed: 99 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -12,130 +12,143 @@
1212
import ndindex
1313
import blosc2
1414
import time
15-
from memory_profiler import memory_usage, profile
1615
import matplotlib.pyplot as plt
1716
import zarr
1817
import h5py
19-
plt.rcParams.update({'text.usetex':True,'font.serif': ['cm'],'font.size':16})
18+
import pickle
19+
plt.rcParams.update({'text.usetex':False,'font.serif': ['cm'],'font.size':16})
2020
plt.rcParams['figure.dpi'] = 1000
2121
plt.rcParams['savefig.dpi'] = 1000
22-
plt.rc('text', usetex=True)
22+
plt.rc('text', usetex=False)
2323
plt.rc('font',**{'serif':['cm']})
2424
plt.style.use('seaborn-v0_8-paper')
2525

26-
NUMPY_BLOSC = False
26+
NUMPY_BLOSC = True
27+
NUMPY_BLOSC_ZARR = False
2728

2829
def genarray(r, ndims=1, verbose=True):
2930
d = int((r*2**30/8)**(1/ndims))
3031
shape = (d,) * ndims
3132
chunks = (d // 4,) * ndims
3233
blocks = (max(d // 10, 1),) * ndims
3334
t = time.time()
34-
arr = blosc2.ones(shape=shape, chunks=chunks, blocks=blocks, dtype=np.int64) # , urlpath=file, mode="w")
35+
arr = blosc2.ones(shape=shape, dtype=np.int64)
3536
t = time.time() - t
3637
if verbose:
3738
print(f"Array shape: {arr.shape}")
3839
print(f"Array size: {np.prod(arr.shape) * arr.dtype.itemsize / 2 ** 30:.6f} GB")
3940
print(f"Time to create array: {t:.6f} seconds")
4041
return arr
4142

42-
blosc_times = []
43-
np_times = []
44-
zarr_times = []
45-
sizes = []
46-
dims = np.int64(np.array([1, 2, 4, 6, 8]))
43+
44+
sizes = np.int64(np.array([1, 2, 4, 8, 16]))
4745
rng = np.random.default_rng()
4846
blosctimes = []
4947
nptimes = []
5048
zarrtimes = []
5149
h5pytimes = []
52-
53-
for d in dims:
54-
arr = genarray(d, ndims=2)
55-
sizes.append(d)
56-
idx = rng.integers(low=0, high=arr.shape[0], size=(arr.shape[0],))
57-
row = np.arange(arr.shape[0])
58-
col = row
59-
mask = rng.integers(low=0, high=2, size=(d,)) == 1
50+
x = np.arange(len(sizes))
51+
width = 0.2
52+
labs = 'NumpyBlosc2' if NUMPY_BLOSC else 'NumpyBlosc2ZarrHDF5'
53+
labs = 'NumpyBlosc2Zarr' if NUMPY_BLOSC_ZARR else labs
54+
try:
55+
with open(f"results{labs}.pkl", 'rb') as f:
56+
result_tuple = pickle.load(f)
57+
labs = ''
58+
for i, r in enumerate(result_tuple):
59+
if r[1].shape != (0,):
60+
label,times,w = r
61+
c = ['b', 'r', 'g', 'm'][i]
62+
mean = times.mean(axis=1)
63+
err = [mean - times.min(axis=1), times.max(axis=1)-mean]
64+
plt.bar(x + w, mean , width, color=c, label=label, yerr=err, capsize=5, ecolor='k',
65+
error_kw=dict(lw=2, capthick=2, ecolor='k'))
66+
labs+=label
67+
except:
68+
for d in sizes:
69+
arr = genarray(d, ndims=2)
70+
idx = rng.integers(low=0, high=arr.shape[0], size=(arr.shape[0],))
71+
row = np.sort(np.unique(idx))
72+
col = np.sort(np.unique(rng.integers(low=0, high=arr.shape[0], size=(arr.shape[0],))))
73+
mask = rng.integers(low=0, high=2, size=(arr.shape[0],)) == 1
6074

6175

62-
## Test fancy indexing for different use cases
63-
m, M = np.min(idx), np.max(idx)
64-
def timer(arr, skip_flag=True, row=row, col=col):
65-
time_list = []
66-
if not skip_flag:
67-
t = time.time()
68-
b = arr[row, col]
69-
time_list += [time.time() - t]
76+
## Test fancy indexing for different use cases
77+
m, M = np.min(idx), np.max(idx)
78+
def timer(arr, row=row, col=col):
79+
time_list = []
80+
if NUMPY_BLOSC or NUMPY_BLOSC_ZARR:
81+
t = time.time()
82+
b = arr[row, col]
83+
time_list += [time.time() - t]
84+
if NUMPY_BLOSC:
85+
t = time.time()
86+
b = arr[slice(1, M // 2, 5), col]
87+
time_list += [time.time() - t]
88+
t = time.time()
89+
b = arr[[[row], [col]]]
90+
time_list += [time.time() - t]
91+
t = time.time()
92+
b = arr[row[:10, None], col[:10]]
93+
time_list += [time.time() - t]
94+
t = time.time()
95+
b = arr[row[:10, None], mask]
96+
time_list += [time.time() - t]
7097
t = time.time()
71-
b = arr[slice(1, M // 2, 5), col]
98+
b = arr[row]
7299
time_list += [time.time() - t]
73100
t = time.time()
74-
b = arr[[[m // 2, M // 2], [m // 4, M // 4]]]
101+
b = arr[m, col]
75102
time_list += [time.time() - t]
76-
t = time.time()
77-
b = arr[[m, M//2, M]]
78-
time_list += [time.time() - t]
79-
t = time.time()
80-
b = arr[m, col]
81-
time_list += [time.time() - t]
82-
return np.array(time_list)
103+
return np.array(time_list)
83104

84-
if NUMPY_BLOSC:
85-
blosctimes += [timer(arr, skip_flag=False, row=idx, col=idx)]
86-
arr=arr[:]
87-
nptimes += [timer(arr, skip_flag=False, row=idx, col=idx)]
88-
else:
89-
blosctimes += [timer(arr)]
90-
arr = arr[:]
91-
nptimes += [timer(arr)]
92-
z_test = zarr.zeros(shape=arr.shape, dtype=arr.dtype)
93-
z_test[:] = arr
94-
# zarr is more limited, as must provide same number of coord arrays as dims of array
95-
# also cannot mix with slices
96-
zarrtimes += [timer(z_test)]
97-
with h5py.File('my_hdf5_file.h5', 'w') as f:
98-
dset = f.create_dataset("init", data=arr)
99-
h5pytimes += [timer(dset)]
105+
if NUMPY_BLOSC or NUMPY_BLOSC_ZARR:
106+
blosctimes += [timer(arr, row=idx, col=idx)]
107+
arr=arr[:]
108+
nptimes += [timer(arr, row=idx, col=idx)]
109+
if NUMPY_BLOSC_ZARR:
110+
z_test = zarr.zeros(shape=arr.shape, dtype=arr.dtype)
111+
z_test[:] = arr
112+
zarrtimes += [timer(z_test, row=idx, col=idx)]
113+
else:
114+
blosctimes += [timer(arr)]
115+
arr=arr[:]
116+
nptimes += [timer(arr)]
117+
z_test = zarr.zeros(shape=arr.shape, dtype=arr.dtype)
118+
z_test[:] = arr
119+
zarrtimes += [timer(z_test)]
120+
with h5py.File('my_hdf5_file.h5', 'w') as f:
121+
dset = f.create_dataset("init", data=arr)
122+
h5pytimes += [timer(dset)]
100123

101-
x = np.arange(len(sizes))
102-
width = 0.2
103-
blosctimes = np.array(blosctimes)
104-
nptimes = np.array(nptimes)
105-
if NUMPY_BLOSC:
106-
# Create bars for axis 0 plot
107-
for i, r in enumerate((["Numpy", nptimes, -width], ["Blosc2", blosctimes, 0])):
108-
label, times, w = r
109-
c = ['b', 'r'][i]
110-
plt.bar(x + w, times.mean(axis=1), width, color=c, alpha=0.5)
111-
plt.bar(x + w, times.max(axis=1), width, color=c, alpha=0.25)
112-
plt.bar(x + w, times.min(axis=1), width, label=label, color=c, alpha=1)
113-
114-
plt.xlabel('Array size (GB)')
115-
plt.legend()
116-
plt.xticks(x, np.round(sizes, 2))
117-
plt.ylabel("Time (s)")
118-
plt.title('Fancy indexing NumPy vs Blosc2')
119-
plt.savefig('plots/fancyIdxNumpyVsBlosc.png', format="png")
120-
plt.show()
121-
else:
124+
blosctimes = np.array(blosctimes)
125+
nptimes = np.array(nptimes)
122126
zarrtimes = np.array(zarrtimes)
123127
h5pytimes = np.array(h5pytimes)
128+
labs=''
129+
result_tuple = (["Numpy",nptimes,-2*width],["Blosc2",blosctimes, -width],["Zarr",zarrtimes, 0],["HDF5",h5pytimes, width])
130+
131+
# Create barplot for Numpy vs Blosc vs Zarr vs H5py
132+
for i, r in enumerate(result_tuple):
133+
if r[1].shape != (0,):
134+
label, times, w = r
135+
c = ['b', 'r', 'g', 'm'][i]
136+
mean = times.mean(axis=1)
137+
err = (mean - times.min(axis=1), times.max(axis=1)-mean)
138+
plt.bar(x + w, mean , width, color=c, label=label, yerr=err, capsize=5, ecolor='k',
139+
error_kw=dict(lw=2, capthick=2, ecolor='k'))
140+
labs+=label
124141

125-
# Create bars for axis 0 plot
126-
for i, r in enumerate((["Numpy",nptimes,-2*width],["Blosc2",blosctimes, -width],["Zarr",zarrtimes, 0],["HDF5",h5pytimes, width])):
127-
label,times,w = r
128-
c = ['b', 'r', 'g', 'm'][i]
129-
plt.bar(x + w, times.mean(axis=1), width, color=c, alpha=0.5)
130-
plt.bar(x + w, times.max(axis=1), width, color=c, alpha=0.25)
131-
plt.bar(x + w, times.min(axis=1), width, label=label, color=c, alpha=1)
142+
with open(f"results{labs}.pkl", 'wb') as f:
143+
pickle.dump(result_tuple, f)
132144

133-
plt.xlabel('Array size (GB)')
134-
plt.legend()
135-
plt.xticks(x, np.round(sizes, 2))
136-
plt.ylabel("Time (s)")
137-
plt.title('Fancy indexing performance comparison')
138-
plt.savefig('plots/fancyIdx.png', format="png")
139-
plt.show()
145+
plt.xlabel('Array size (GB)')
146+
plt.legend()
147+
plt.xticks(x-width, np.round(sizes, 2))
148+
plt.ylabel("Time (s)")
149+
plt.title('Fancy indexing performance comparison')
150+
plt.ylim([0,10])
151+
plt.savefig(f'plots/fancyIdx{labs}.png', format="png")
152+
plt.show()
140153

141154
print("Finished everything!")

src/blosc2/ndarray.py

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1429,7 +1429,7 @@ def oindex(self) -> OIndex:
14291429

14301430
@property
14311431
def vindex(self) -> VIndex:
1432-
"""Shortcut for fancy indexing, see :func:`get_vselection_numpy`"""
1432+
"""Shortcut for vectorised indexing. Not yet supported."""
14331433
return VIndex(self)
14341434

14351435
@property
@@ -1439,7 +1439,7 @@ def T(self):
14391439
raise ValueError("This property only works for 2-dimensional arrays.")
14401440
return permute_dims(self)
14411441

1442-
def get_vselection_numpy(self, key):
1442+
def get_fselection_numpy(self, key):
14431443
# TODO: Make this faster for broadcasted keys
14441444
shape = self.shape
14451445
if math.prod(shape) * self.dtype.itemsize < blosc2.MAX_FAST_PATH_SIZE:
@@ -1466,19 +1466,27 @@ def get_vselection_numpy(self, key):
14661466
return out
14671467

14681468
def get_oselection_numpy(self, key):
1469+
'''
1470+
Select independently from self along axes specified in key. Key must be same length as self shape.
1471+
See Zarr https://zarr.readthedocs.io/en/stable/user-guide/arrays.html#orthogonal-indexing.
1472+
'''
14691473
shape = tuple(len(k) for k in key) + self.shape[len(key) :]
14701474
# Create the array to store the result
14711475
arr = np.empty(shape, dtype=self.dtype)
14721476
return super().get_oindex_numpy(arr, key)
14731477

1474-
def set_oselection_numpy(self, key, arr):
1478+
def set_oselection_numpy(self, key, arr: np.ndarray):
1479+
'''
1480+
Select independently from self along axes specified in key and set to entries in arr. Key must be same length as self shape.
1481+
See Zarr https://zarr.readthedocs.io/en/stable/user-guide/arrays.html#orthogonal-indexing.
1482+
'''
14751483
return super().set_oindex_numpy(key, arr)
14761484

14771485
def __getitem__( # noqa: C901
14781486
self,
14791487
key: int | slice | Sequence[slice | int] | np.ndarray[np.bool_] | NDArray | blosc2.LazyExpr | str,
14801488
) -> np.ndarray | blosc2.LazyExpr:
1481-
"""Retrieve a (multidimensional) slice as specified by the key.
1489+
"""Retrieve a (multidimensional) slice as specified by the key. Matches NumPy fancy indexing behaviour.
14821490
14831491
Parameters
14841492
----------
@@ -1521,7 +1529,7 @@ def __getitem__( # noqa: C901
15211529
key_ = (slice(key, key + 1), *(slice(None),) * (self.ndim - 1))
15221530
elif isinstance(key, tuple):
15231531
if builtins.any(isinstance(k, (list, np.ndarray)) for k in key):
1524-
return self.get_vselection_numpy(key) # fancy index
1532+
return self.get_fselection_numpy(key) # fancy index
15251533
if builtins.sum(isinstance(k, (list, builtins.slice)) for k in key) != self.ndim:
15261534
key_, mask = process_key(key, self.shape)
15271535
elif isinstance(key, (list, np.ndarray, NDArray)):
@@ -1545,7 +1553,7 @@ def __getitem__( # noqa: C901
15451553
if key.dtype == np.int64 and key.ndim == 1 and self.ndim == 1:
15461554
return extract_values(self, key)
15471555
else:
1548-
return self.get_vselection_numpy(key) # fancy index
1556+
return self.get_fselection_numpy(key) # fancy index
15491557
else:
15501558
# The more general case (this is quite slow)
15511559
# If the key is a LazyExpr, decorate with ``where`` and return it
@@ -4399,6 +4407,9 @@ class VIndex:
43994407
def __init__(self, array: NDArray):
44004408
self.array = array
44014409

4402-
# TODO: add __setitem__
4410+
# TODO: all this
44034411
def __getitem__(self, selection) -> np.ndarray:
4404-
return self.array.get_vselection_numpy(selection)
4412+
return NotImplementedError
4413+
4414+
def __setitem__(self, selection, input) -> np.ndarray:
4415+
return NotImplementedError

0 commit comments

Comments
 (0)