|
| 1 | +####################################################################### |
| 2 | +# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org> |
| 3 | +# All rights reserved. |
| 4 | +# |
| 5 | +# This source code is licensed under a BSD-style license (found in the |
| 6 | +# LICENSE file in the root directory of this source tree) |
| 7 | +####################################################################### |
| 8 | + |
| 9 | +# Benchmark for computing a fancy index of a blosc2 array |
| 10 | + |
| 11 | +import numpy as np |
| 12 | +import ndindex |
| 13 | +import blosc2 |
| 14 | +import time |
| 15 | +import matplotlib.pyplot as plt |
| 16 | +import zarr |
| 17 | +import h5py |
| 18 | +import pickle |
| 19 | +import os |
# Matplotlib defaults for the benchmark figure; applied before the style sheet
# so that the style sheet can still override any key it defines.
plt.rcParams.update({
    'text.usetex': False,
    'font.serif': ['cm'],
    'font.size': 16,
    'figure.dpi': 300,
    'savefig.dpi': 300,
})
plt.style.use('seaborn-v0_8-paper')

# Which backends take part in the benchmark run.
NUMPY = True
BLOSC = True
ZARR = False
HDF5 = False
# When true, a fixed 1000 indices are drawn instead of a quarter of the axis length.
SPARSE = False

NDIMS = 2  # must be at least 2
| 34 | + |
def genarray(r, ndims=2, verbose=True):
    """Create an on-disk blosc2 linspace array of roughly ``r`` GB.

    Parameters
    ----------
    r : int
        Target size in GB (2**30 bytes), assuming 8-byte (float64) items.
    ndims : int
        Number of dimensions; every dimension gets the same length.
    verbose : bool
        When true, print the shape, the actual size and the creation time.

    Returns
    -------
    tuple
        ``(arr, arrsize)``: the object returned by ``blosc2.linspace`` and its
        uncompressed size in GB, computed from its shape and itemsize.
    """
    # Side length such that d**ndims float64 values take at most r GB
    # (the int() truncation is why the real size is reported separately).
    d = int((r * 2**30 / 8) ** (1 / ndims))
    shape = (d,) * ndims
    # No chunks/blocks are passed to blosc2.linspace, so none are computed here.
    start = time.perf_counter()
    arr = blosc2.linspace(0, 1000, num=np.prod(shape), shape=shape, dtype=np.float64,
                          urlpath=f'linspace{r}{ndims}D.b2nd', mode='w')
    elapsed = time.perf_counter() - start
    arrsize = np.prod(arr.shape) * arr.dtype.itemsize / 2 ** 30
    if verbose:
        print(f"Array shape: {arr.shape}")
        print(f"Array size: {arrsize:.6f} GB")
        print(f"Time to create array: {elapsed:.6f} seconds")
    return arr, arrsize
| 50 | + |
| 51 | + |
# Target array sizes in GB, one benchmark round per entry.
target_sizes = np.array([1, 2, 4, 8, 16, 24], dtype=np.int64)
#target_sizes = np.int64(np.array([1, 2, 4, 8])) # for quick testing
rng = np.random.default_rng()
# Per-backend timing accumulators (one entry appended per array size) and the
# actual array sizes; each name gets its own independent list.
blosctimes, nptimes, zarrtimes, h5pytimes, genuine_sizes = ([] for _ in range(5))
# One benchmark round per target size: build the array, draw random index
# sets, then time the same selections against every enabled backend.
for d in target_sizes:
    arr, arrsize = genarray(d, ndims=NDIMS)
    genuine_sizes.append(arrsize)
    sparseness = 1000 if SPARSE else arr.shape[0] // 4
    idx = rng.integers(low=0, high=arr.shape[0], size=(sparseness,))
    sorted_idx = np.unique(idx)  # np.unique already returns sorted values
    col = rng.integers(low=0, high=arr.shape[0], size=(sparseness,))
    col_sorted = np.unique(col)
    mask = rng.integers(low=0, high=2, size=(arr.shape[0],)) == 1

    ## Test fancy indexing for different use cases
    m, M = sorted_idx[0], sorted_idx[-1]

    def timer(target):
        """Time each fancy-indexing selection once on ``target``.

        Returns a NumPy array of elapsed seconds, one entry per selection
        that was run. Which selections run depends on the global ZARR/HDF5
        flags, so every backend in a run yields the same number of entries.
        """
        selections = []
        if not HDF5:
            selections.append(lambda: target[idx, col])
        # NOTE(review): the original indentation was lost in this view. These
        # four selections are skipped whenever ZARR *or* HDF5 is enabled;
        # presumably h5py cannot serve unsorted / broadcast index arrays --
        # confirm against the intended flag semantics.
        if not (ZARR or HDF5):
            selections += [
                lambda: target[slice(1, M // 2, 5), col],
                lambda: target[[[idx], [col]]],
                lambda: target[idx[:10, None], col[:10]],
                lambda: target[idx[:10, None], mask],
            ]
        # With HDF5 enabled the sorted, duplicate-free index sets are used.
        selections.append(lambda: target[sorted_idx] if HDF5 else target[idx])
        selections.append(lambda: target[m, col_sorted] if HDF5 else target[m, idx])

        elapsed = []
        for select in selections:
            start = time.perf_counter()
            select()
            elapsed.append(time.perf_counter() - start)
        return np.array(elapsed)

    # Only materialise the full in-memory copy when a backend needs it.
    nparr = arr[:] if (NUMPY or ZARR or HDF5) else None
    if BLOSC:
        blosctimes.append(timer(arr))
    if NUMPY:
        nptimes.append(timer(nparr))
    if ZARR:
        z_test = zarr.create_array(store='data/example.zarr', shape=arr.shape, chunks=arr.chunks,
                                   dtype=nparr.dtype, overwrite=True)
        z_test[:] = nparr
        zarrtimes.append(timer(z_test))
    if HDF5:
        with h5py.File('my_hdf5_file.h5', 'w') as f:
            dset = f.create_dataset("init", data=nparr, chunks=arr.chunks)
            # Timed inside the context manager: the dataset needs the open file.
            h5pytimes.append(timer(dset))
| 113 | + |
# Convert the per-size timing lists to arrays; a backend that did not run
# stays an empty array of shape (0,).
blosctimes = np.array(blosctimes)
nptimes = np.array(nptimes)
zarrtimes = np.array(zarrtimes)
h5pytimes = np.array(h5pytimes)
labs = ''
width = 0.2
# (label, timings, horizontal bar offset) for each backend.
result_tuple = (
    ["Numpy", nptimes, -2 * width],
    ["Blosc2", blosctimes, -width],
    ["Zarr", zarrtimes, 0],
    ["HDF5", h5pytimes, width]
)

x = np.arange(len(genuine_sizes))
# Create barplot for Numpy vs Blosc vs Zarr vs H5py
for (label, times, offset), colour in zip(result_tuple, ['b', 'r', 'g', 'm']):
    if times.size == 0:
        continue  # backend disabled, nothing to plot
    mean = times.mean(axis=1)
    # Asymmetric error bars spanning the fastest and slowest selection.
    err = (mean - times.min(axis=1), times.max(axis=1) - mean)
    plt.bar(x + offset, mean, width, color=colour, label=label, yerr=err, capsize=5, ecolor='k',
            error_kw=dict(lw=2, capthick=2, ecolor='k'))
    labs += label

filename = f"results{labs}{NDIMS}D" + ("sparse" if SPARSE else "")

with open(f"{filename}.pkl", 'wb') as f:
    pickle.dump(result_tuple, f)

# Built separately: reusing the outer quote inside an f-string replacement
# field is a SyntaxError before Python 3.12.
sparse_suffix = " sparse" if SPARSE else ""

plt.xlabel('Array size (GB)')
plt.legend()
plt.xticks(x-width, np.round(genuine_sizes, 2))
plt.ylabel("Time (s)")
plt.title(f"Fancy indexing performance comparison, {NDIMS}D{sparse_suffix}")
plt.gca().set_yscale('log')
# savefig does not create missing directories.
os.makedirs('plots', exist_ok=True)
plt.savefig(f'plots/fancyIdx{filename}.png', format="png")
plt.show()

print("Finished everything!")
0 commit comments