|
18 | 18 | import pickle |
19 | 19 | import os |
20 | 20 | plt.rcParams.update({'text.usetex':False,'font.serif': ['cm'],'font.size':16}) |
21 | | -plt.rcParams['figure.dpi'] = 1000 |
22 | | -plt.rcParams['savefig.dpi'] = 1000 |
| 21 | +plt.rcParams['figure.dpi'] = 300 |
| 22 | +plt.rcParams['savefig.dpi'] = 300 |
23 | 23 | plt.rc('text', usetex=False) |
24 | 24 | plt.rc('font',**{'serif':['cm']}) |
25 | 25 | plt.style.use('seaborn-v0_8-paper') |
26 | 26 |
|
27 | | -NUMPY_BLOSC = False # activate NUMPY and BLOSC tests |
28 | | -NUMPY_BLOSC_ZARR = False # activate NUMPY, BLOSC and Zarr tests |
29 | | -# default if both are false is to run tests for Numpy, Blosc, Zarr and HDF5 |
| 27 | +NUMPY = True |
| 28 | +BLOSC = True |
| 29 | +ZARR = True |
| 30 | +HDF5 = True |
30 | 31 |
|
31 | | -def genarray(r, ndims=1, verbose=True): |
| 32 | +NDIMS = 2 # must be at least 2 |
| 33 | + |
| 34 | +def genarray(r, ndims=2, verbose=True): |
32 | 35 | d = int((r*2**30/8)**(1/ndims)) |
33 | 36 | shape = (d,) * ndims |
34 | 37 | chunks = (d // 4,) * ndims |
35 | 38 | blocks = (max(d // 10, 1),) * ndims |
36 | 39 | t = time.time() |
37 | | - if os.path.exists(f'linspace{r}.b2nd'): |
38 | | - arr = blosc2.open(urlpath=f'linspace{r}.b2nd') |
39 | | - else: |
40 | | - arr = blosc2.linspace(0, 1000, num=np.prod(shape), shape=shape, dtype=np.float64, urlpath=f'linspace{r}.b2nd', mode='w') |
| 40 | + arr = blosc2.linspace(0, 1000, num=np.prod(shape), shape=shape, dtype=np.float64, urlpath=f'linspace{r}{ndims}D.b2nd', mode='w') |
41 | 41 | t = time.time() - t |
| 42 | + arrsize = np.prod(arr.shape) * arr.dtype.itemsize / 2 ** 30 |
42 | 43 | if verbose: |
43 | 44 | print(f"Array shape: {arr.shape}") |
44 | | - print(f"Array size: {np.prod(arr.shape) * arr.dtype.itemsize / 2 ** 30:.6f} GB") |
| 45 | + print(f"Array size: {arrsize:.6f} GB") |
45 | 46 | print(f"Time to create array: {t:.6f} seconds") |
46 | | - return arr |
| 47 | + return arr, arrsize |
47 | 48 |
|
48 | 49 |
|
49 | | -sizes = np.int64(np.array([1, 2, 4, 8, 16, 24, 32])) |
| 50 | +target_sizes = np.int64(np.array([1, 2, 4, 8, 16, 24, 32])) |
50 | 51 | rng = np.random.default_rng() |
51 | 52 | blosctimes = [] |
52 | 53 | nptimes = [] |
53 | 54 | zarrtimes = [] |
54 | 55 | h5pytimes = [] |
55 | | -x = np.arange(len(sizes)) |
56 | | -width = 0.2 |
57 | | -labs = 'NumpyBlosc2' if NUMPY_BLOSC else 'NumpyBlosc2ZarrHDF5' |
58 | | -labs = 'NumpyBlosc2Zarr' if NUMPY_BLOSC_ZARR else labs |
59 | | -try: |
60 | | - with open(f"results{labs}.pkl", 'rb') as f: |
61 | | - result_tuple = pickle.load(f) |
62 | | - labs = '' |
63 | | - for i, r in enumerate(result_tuple): |
64 | | - if r[1].shape != (0,): |
65 | | - label,times,w = r |
66 | | - c = ['b', 'r', 'g', 'm'][i] |
67 | | - mean = times.mean(axis=1) |
68 | | - err = [mean - times.min(axis=1), times.max(axis=1)-mean] |
69 | | - plt.bar(x + w, mean , width, color=c, label=label, yerr=err, capsize=5, ecolor='k', |
70 | | - error_kw=dict(lw=2, capthick=2, ecolor='k')) |
71 | | - labs+=label |
72 | | -except: |
73 | | - for d in sizes: |
74 | | - arr = genarray(d, ndims=2) |
75 | | - idx = rng.integers(low=0, high=arr.shape[0], size=(arr.shape[0]//4,)) |
76 | | - row = np.sort(np.unique(idx)) |
77 | | - col = np.sort(np.unique(rng.integers(low=0, high=arr.shape[0], size=(arr.shape[0]//4,)))) |
78 | | - mask = rng.integers(low=0, high=2, size=(arr.shape[0],)) == 1 |
| 56 | +genuine_sizes = [] |
| 57 | +for d in target_sizes: |
| 58 | + arr, arrsize = genarray(d, ndims=NDIMS) |
| 59 | + genuine_sizes += [arrsize] |
| 60 | + idx = rng.integers(low=0, high=arr.shape[0], size=(arr.shape[0]//4,)) |
| 61 | + sorted_idx = np.sort(np.unique(idx)) |
| 62 | + col = np.sort(np.unique(rng.integers(low=0, high=arr.shape[0], size=(arr.shape[0]//4,)))) |
| 63 | + mask = rng.integers(low=0, high=2, size=(arr.shape[0],)) == 1 |
79 | 64 |
|
80 | | - ## Test fancy indexing for different use cases |
81 | | - m, M = np.min(idx), np.max(idx) |
82 | | - res = arr[:][row] |
83 | | - def timer(arr, row=row, col=col): |
84 | | - time_list = [] |
85 | | - if NUMPY_BLOSC or NUMPY_BLOSC_ZARR: |
86 | | - t = time.time() |
87 | | - b = arr[row, col] |
88 | | - time_list += [time.time() - t] |
89 | | - if NUMPY_BLOSC: |
| 65 | + ## Test fancy indexing for different use cases |
| 66 | + m, M = sorted_idx[0], sorted_idx[-1] |
| 67 | + def timer(arr): |
| 68 | + time_list = [] |
| 69 | + if not HDF5: |
| 70 | + t = time.time() |
| 71 | + b = arr[idx, col] |
| 72 | + time_list += [time.time() - t] |
| 73 | + if not ZARR: |
90 | 74 | t = time.time() |
91 | 75 | b = arr[slice(1, M // 2, 5), col] |
92 | 76 | time_list += [time.time() - t] |
93 | 77 | t = time.time() |
94 | | - b = arr[[[row], [col]]] |
| 78 | + b = arr[[[idx], [col]]] |
95 | 79 | time_list += [time.time() - t] |
96 | 80 | t = time.time() |
97 | | - b = arr[row[:10, None], col[:10]] |
| 81 | + b = arr[idx[:10, None], col[:10]] |
98 | 82 | time_list += [time.time() - t] |
99 | 83 | t = time.time() |
100 | | - b = arr[row[:10, None], mask] |
| 84 | + b = arr[idx[:10, None], mask] |
101 | 85 | time_list += [time.time() - t] |
102 | | - t = time.time() |
103 | | - b = arr[row] |
104 | | - time_list += [time.time() - t] |
105 | | - t = time.time() |
106 | | - b = arr[m, col] |
107 | | - time_list += [time.time() - t] |
108 | | - return np.array(time_list) |
| 86 | + t = time.time() |
| 87 | + b = arr[idx] if not HDF5 else arr[sorted_idx] |
| 88 | + time_list += [time.time() - t] |
| 89 | + t = time.time() |
| 90 | + b = arr[m, idx] if not HDF5 else arr[m, col] |
| 91 | + time_list += [time.time() - t] |
| 92 | + return np.array(time_list) |
| 93 | + |
| 94 | + nparr = arr[:] |
| 95 | + if BLOSC: |
| 96 | + blosctimes += [timer(arr)] |
| 97 | + if NUMPY: |
| 98 | + nptimes += [timer(nparr)] |
| 99 | + if ZARR: |
| 100 | + z_test = zarr.create_array(store='data/example.zarr', shape=nparr.shape, dtype=nparr.dtype, overwrite=True) |
| 101 | + z_test[:] = nparr |
| 102 | + zarrtimes += [timer(z_test)] |
| 103 | + if HDF5: |
| 104 | + with h5py.File('my_hdf5_file.h5', 'w') as f: |
| 105 | + dset = f.create_dataset("init", data=nparr) |
| 106 | + h5pytimes += [timer(dset)] |
109 | 107 |
|
110 | | - if NUMPY_BLOSC or NUMPY_BLOSC_ZARR: |
111 | | - blosctimes += [timer(arr, row=idx, col=idx)] |
112 | | - arr=arr[:] |
113 | | - nptimes += [timer(arr, row=idx, col=idx)] |
114 | | - if NUMPY_BLOSC_ZARR: |
115 | | - z_test = zarr.create_array(store='data/example.zarr', shape=arr.shape, dtype=arr.dtype, overwrite=True) |
116 | | - z_test[:] = arr |
117 | | - zarrtimes += [timer(z_test, row=idx, col=idx)] |
118 | | - else: |
119 | | - blosctimes += [timer(arr)] |
120 | | - arr=arr[:] |
121 | | - nptimes += [timer(arr)] |
122 | | - z_test = zarr.create_array(store='data/example.zarr', shape=arr.shape, dtype=arr.dtype, overwrite=True) |
123 | | - z_test[:] = arr |
124 | | - zarrtimes += [timer(z_test)] |
125 | | - with h5py.File('my_hdf5_file.h5', 'w') as f: |
126 | | - dset = f.create_dataset("init", data=arr) |
127 | | - h5pytimes += [timer(dset)] |
| 108 | +blosctimes = np.array(blosctimes) |
| 109 | +nptimes = np.array(nptimes) |
| 110 | +zarrtimes = np.array(zarrtimes) |
| 111 | +h5pytimes = np.array(h5pytimes) |
| 112 | +labs='' |
| 113 | +result_tuple = (["Numpy",nptimes,-2*width],["Blosc2",blosctimes, -width],["Zarr",zarrtimes, 0],["HDF5",h5pytimes, width]) |
128 | 114 |
|
129 | | - blosctimes = np.array(blosctimes) |
130 | | - nptimes = np.array(nptimes) |
131 | | - zarrtimes = np.array(zarrtimes) |
132 | | - h5pytimes = np.array(h5pytimes) |
133 | | - labs='' |
134 | | - result_tuple = (["Numpy",nptimes,-2*width],["Blosc2",blosctimes, -width],["Zarr",zarrtimes, 0],["HDF5",h5pytimes, width]) |
| 115 | +x = np.arange(len(genuine_sizes)) |
| 116 | +width = 0.2 |
| 117 | +# Create barplot for Numpy vs Blosc vs Zarr vs H5py |
| 118 | +for i, r in enumerate(result_tuple): |
| 119 | + if r[1].shape != (0,): |
| 120 | + label, times, w = r |
| 121 | + c = ['b', 'r', 'g', 'm'][i] |
| 122 | + mean = times.mean(axis=1) |
| 123 | + err = (mean - times.min(axis=1), times.max(axis=1)-mean) |
| 124 | + plt.bar(x + w, mean , width, color=c, label=label, yerr=err, capsize=5, ecolor='k', |
| 125 | + error_kw=dict(lw=2, capthick=2, ecolor='k')) |
| 126 | + labs+=label |
135 | 127 |
|
136 | | - # Create barplot for Numpy vs Blosc vs Zarr vs H5py |
137 | | - for i, r in enumerate(result_tuple): |
138 | | - if r[1].shape != (0,): |
139 | | - label, times, w = r |
140 | | - c = ['b', 'r', 'g', 'm'][i] |
141 | | - mean = times.mean(axis=1) |
142 | | - err = (mean - times.min(axis=1), times.max(axis=1)-mean) |
143 | | - plt.bar(x + w, mean , width, color=c, label=label, yerr=err, capsize=5, ecolor='k', |
144 | | - error_kw=dict(lw=2, capthick=2, ecolor='k')) |
145 | | - labs+=label |
| 128 | +filename = "results{labs}{NDIMS}D" |
146 | 129 |
|
147 | | - with open(f"results{labs}.pkl", 'wb') as f: |
148 | | - pickle.dump(result_tuple, f) |
| 130 | +with open(f"{filename}.pkl", 'wb') as f: |
| 131 | + pickle.dump(result_tuple, f) |
149 | 132 |
|
150 | 133 | plt.xlabel('Array size (GB)') |
151 | 134 | plt.legend() |
152 | | -plt.xticks(x-width, np.round(sizes, 2)) |
| 135 | +plt.xticks(x-width, np.round(genuine_sizes, 2)) |
153 | 136 | plt.ylabel("Time (s)") |
154 | | -plt.title('Fancy indexing performance comparison') |
| 137 | +plt.title('Fancy indexing performance comparison, {NDIMS}D') |
155 | 138 | plt.gca().set_yscale('log') |
156 | | -plt.savefig(f'plots/fancyIdx{labs}.png', format="png") |
| 139 | +plt.savefig(f'plots/fancyIdx{filename}.png', format="png") |
157 | 140 | plt.show() |
158 | 141 |
|
159 | 142 | print("Finished everything!") |
0 commit comments