|
14 | 14 | import time |
15 | 15 | from memory_profiler import memory_usage, profile |
16 | 16 | import matplotlib.pyplot as plt |
| 17 | +import zarr |
| 18 | +import h5py |
| 19 | +plt.rcParams.update({'text.usetex':True,'font.serif': ['cm'],'font.size':16}) |
| 20 | +plt.rcParams['figure.dpi'] = 1000 |
| 21 | +plt.rcParams['savefig.dpi'] = 1000 |
| 22 | +plt.rc('text', usetex=True) |
| 23 | +plt.rc('font',**{'serif':['cm']}) |
| 24 | +plt.style.use('seaborn-v0_8-paper') |
17 | 25 |
|
18 | | -MEM_PROFILE = False |
| 26 | +NUMPY_BLOSC = False |
19 | 27 |
|
20 | | -# @profile |
21 | | -def fancyBlosc2(arr, fancyIdx): |
| 28 | +def genarray(r, ndims=1, verbose=True): |
| 29 | + d = int((r*2**30/8)**(1/ndims)) |
| 30 | + shape = (d,) * ndims |
| 31 | + chunks = (d // 4,) * ndims |
| 32 | + blocks = (max(d // 10, 1),) * ndims |
22 | 33 | t = time.time() |
23 | | - res = arr[fancyIdx] |
24 | | - dt = time.time() - t |
25 | | - return res, dt |
26 | | - |
27 | | -# @profile |
28 | | -def fancyNumpy(arr, fancyIdx): |
29 | | - t = time.time() |
30 | | - res = arr[:] |
31 | | - res = res[fancyIdx] |
32 | | - dt = time.time() - t |
33 | | - return res, dt |
34 | | - |
35 | | -def genarray(d, verbose=False): |
36 | | - shape = (d,) * 4 |
37 | | - chunks = (d // 4,) * 4 |
38 | | - blocks = (max(d // 10, 1),) * 4 |
39 | | - t = time.time() |
40 | | - arr = blosc2.ones(shape=shape, chunks=chunks, blocks=blocks) # , urlpath=file, mode="w") |
| 34 | + arr = blosc2.ones(shape=shape, chunks=chunks, blocks=blocks, dtype=np.int64) # , urlpath=file, mode="w") |
41 | 35 | t = time.time() - t |
42 | 36 | if verbose: |
43 | 37 | print(f"Array shape: {arr.shape}") |
44 | 38 | print(f"Array size: {np.prod(arr.shape) * arr.dtype.itemsize / 2 ** 30:.6f} GB") |
45 | 39 | print(f"Time to create array: {t:.6f} seconds") |
46 | 40 | return arr |
47 | 41 |
|
48 | | -if __name__ == '__main__': |
49 | | - blosc_times = [] |
50 | | - np_times = [] |
51 | | - sizes = [] |
52 | | - # dims = np.int64(np.array([2**6.76, 2**6.8, 2**6.85, 2**6.9, 2**6.95])) |
53 | | - dims = np.int64(np.array([2**4.3, 2**5.25, 2**6.25, 2**6.75, 2**7, 2**7.25, 2**7.4, 2**7.5])) |
54 | | - rng = np.random.default_rng() |
| 42 | +blosc_times = [] |
| 43 | +np_times = [] |
| 44 | +zarr_times = [] |
| 45 | +sizes = [] |
| 46 | +dims = np.int64(np.array([1, 2, 4, 6, 8])) |
| 47 | +rng = np.random.default_rng() |
| 48 | +blosctimes = [] |
| 49 | +nptimes = [] |
| 50 | +zarrtimes = [] |
| 51 | +h5pytimes = [] |
55 | 52 |
|
56 | | - for d in dims: |
57 | | - arr = genarray(d) |
58 | | - arr_size = np.prod(arr.shape) * arr.dtype.itemsize / 2 ** 30 |
59 | | - print(arr_size) |
60 | | - sizes.append(arr_size) |
61 | | - idx = rng.integers(low=0, high=d, size=(d,)) |
62 | | - if MEM_PROFILE: |
63 | | - fancyIdx = np.s_[idx, :, :d // 2, d // 2:] |
| 53 | +for d in dims: |
| 54 | + arr = genarray(d, ndims=2) |
| 55 | + sizes.append(d) |
| 56 | + idx = rng.integers(low=0, high=arr.shape[0], size=(arr.shape[0],)) |
| 57 | + row = np.arange(arr.shape[0]) |
| 58 | + col = row |
| 59 | + mask = rng.integers(low=0, high=2, size=(d,)) == 1 |
64 | 60 |
|
65 | | - interval = 0.001 |
66 | | - offset = 0 |
67 | | - for f in [fancyBlosc2, fancyNumpy]: |
68 | | - mem = memory_usage((f, (arr, fancyIdx)), interval=interval) |
69 | | - times = offset + interval * np.arange(len(mem)) |
70 | | - offset = times[-1] |
71 | | - plt.plot(times, mem) |
72 | 61 |
|
73 | | - plt.xlabel('Time (s)') |
74 | | - plt.ylabel('Memory usage (MiB)') |
75 | | - plt.title('Memory usage fancy indexing') |
76 | | - plt.legend(['blosc2', 'numpy']) |
77 | | - plt.savefig(f'plots/MemoryUsagefancyIdx_d{d}.png', format="png") |
| 62 | + ## Test fancy indexing for different use cases |
| 63 | + m, M = np.min(idx), np.max(idx) |
| 64 | + def timer(arr, skip_flag=True, row=row, col=col): |
| 65 | + time_list = [] |
| 66 | + if not skip_flag: |
| 67 | + t = time.time() |
| 68 | + b = arr[row, col] |
| 69 | + time_list += [time.time() - t] |
| 70 | + t = time.time() |
| 71 | + b = arr[slice(1, M // 2, 5), col] |
| 72 | + time_list += [time.time() - t] |
| 73 | + t = time.time() |
| 74 | + b = arr[[[m // 2, M // 2], [m // 4, M // 4]]] |
| 75 | + time_list += [time.time() - t] |
| 76 | + t = time.time() |
| 77 | + b = arr[[m, M//2, M]] |
| 78 | + time_list += [time.time() - t] |
| 79 | + t = time.time() |
| 80 | + b = arr[m, col] |
| 81 | + time_list += [time.time() - t] |
| 82 | + return np.array(time_list) |
78 | 83 |
|
79 | | - row = idx |
80 | | - col = rng.permutation(idx) |
81 | | - mask = rng.integers(low=0, high=2, size=(d,)) == 1 |
| 84 | + if NUMPY_BLOSC: |
| 85 | + blosctimes += [timer(arr, skip_flag=False, row=idx, col=idx)] |
| 86 | + arr=arr[:] |
| 87 | + nptimes += [timer(arr, skip_flag=False, row=idx, col=idx)] |
| 88 | + else: |
| 89 | + blosctimes += [timer(arr)] |
| 90 | + arr = arr[:] |
| 91 | + nptimes += [timer(arr)] |
| 92 | + z_test = zarr.zeros(shape=arr.shape, dtype=arr.dtype) |
| 93 | + z_test[:] = arr |
| 94 | + # zarr is more limited, as must provide same number of coord arrays as dims of array |
| 95 | + # also cannot mix with slices |
| 96 | + zarrtimes += [timer(z_test)] |
| 97 | + with h5py.File('my_hdf5_file.h5', 'w') as f: |
| 98 | + dset = f.create_dataset("init", data=arr) |
| 99 | + h5pytimes += [timer(dset)] |
82 | 100 |
|
83 | | - ## Test fancy indexing for different use cases |
84 | | - loc_blosc_times = [] |
85 | | - loc_np_times = [] |
86 | | - m, M = np.min(idx), np.max(idx) |
87 | | - for i, fancyIdx in enumerate([ |
88 | | - [m, M//2, M], # i) |
89 | | - [[[m//2, M//2], [m//4, M//4]]], # ii) |
90 | | - [row, col], # iii) |
91 | | - # [row[:, None], col], # iv) |
92 | | - [m, col], # v) |
93 | | - [slice(1, M//2, 5), col], # vi) |
94 | | - # [row[:, None], mask], # vii) |
95 | | - ]): |
96 | | - # print(f'\n(case {i + 1})') |
97 | | - try: |
98 | | - r, c = fancyIdx |
99 | | - idx = (r, c) |
100 | | - except: |
101 | | - r, c = fancyIdx, None |
102 | | - idx = r |
103 | | - b, blosctime = fancyBlosc2(arr,idx) |
104 | | - n, nptime = fancyNumpy(arr,idx) |
105 | | - slice_size = np.prod(b.shape) * b.dtype.itemsize / 2 ** 30 |
106 | | - np.testing.assert_allclose(b, n) |
107 | | - loc_blosc_times.append(blosctime) |
108 | | - loc_np_times.append(nptime) |
109 | | - blosc_times.append(loc_blosc_times) |
110 | | - np_times.append(loc_np_times) |
| 101 | +x = np.arange(len(sizes)) |
| 102 | +width = 0.2 |
| 103 | +blosctimes = np.array(blosctimes) |
| 104 | +nptimes = np.array(nptimes) |
| 105 | +if NUMPY_BLOSC: |
| 106 | + # Create bars for axis 0 plot |
| 107 | + for i, r in enumerate((["Numpy", nptimes, -width], ["Blosc2", blosctimes, 0])): |
| 108 | + label, times, w = r |
| 109 | + c = ['b', 'r'][i] |
| 110 | + plt.bar(x + w, times.mean(axis=1), width, color=c, alpha=0.5) |
| 111 | + plt.bar(x + w, times.max(axis=1), width, color=c, alpha=0.25) |
| 112 | + plt.bar(x + w, times.min(axis=1), width, label=label, color=c, alpha=1) |
111 | 113 |
|
112 | | - blosc_times = np.array(blosc_times) |
113 | | - np_times = np.array(np_times) |
114 | | - x = np.arange(len(sizes)) |
115 | | - width = 0.25 |
| 114 | + plt.xlabel('Array size (GB)') |
| 115 | + plt.legend() |
| 116 | + plt.xticks(x, np.round(sizes, 2)) |
| 117 | + plt.ylabel("Time (s)") |
| 118 | + plt.title('Fancy indexing NumPy vs Blosc2') |
| 119 | + plt.savefig('plots/fancyIdxNumpyVsBlosc.png', format="png") |
| 120 | + plt.show() |
| 121 | +else: |
| 122 | + zarrtimes = np.array(zarrtimes) |
| 123 | + h5pytimes = np.array(h5pytimes) |
116 | 124 |
|
117 | 125 | # Create bars for axis 0 plot |
118 | | - plt.bar(x - width, np_times.mean(axis=1), width, color='b', alpha=0.5) |
119 | | - plt.bar(x - width, np_times.max(axis=1), width, color='b', alpha=0.25) |
120 | | - plt.bar(x - width, np_times.min(axis=1), width, label='NumPy', color='b', alpha=1) |
121 | | - plt.bar(x, blosc_times.mean(axis=1), width, color='r', alpha=0.5) |
122 | | - plt.bar(x, blosc_times.max(axis=1), width, color='r', alpha=0.25) |
123 | | - plt.bar(x, blosc_times.min(axis=1), width, label='Blosc2', color='r', alpha=1) |
124 | | - plt.bar([],[], label='max', color='k', alpha=0.25) |
125 | | - plt.bar([],[], label='min', color='k', alpha=1) |
126 | | - plt.bar([],[], label='mean', color='k', alpha=0.5) |
| 126 | + for i, r in enumerate((["Numpy",nptimes,-2*width],["Blosc2",blosctimes, -width],["Zarr",zarrtimes, 0],["HDF5",h5pytimes, width])): |
| 127 | + label,times,w = r |
| 128 | + c = ['b', 'r', 'g', 'm'][i] |
| 129 | + plt.bar(x + w, times.mean(axis=1), width, color=c, alpha=0.5) |
| 130 | + plt.bar(x + w, times.max(axis=1), width, color=c, alpha=0.25) |
| 131 | + plt.bar(x + w, times.min(axis=1), width, label=label, color=c, alpha=1) |
127 | 132 |
|
128 | 133 | plt.xlabel('Array size (GB)') |
129 | 134 | plt.legend() |
130 | 135 | plt.xticks(x, np.round(sizes, 2)) |
131 | 136 | plt.ylabel("Time (s)") |
132 | | - plt.title('Fancy indexing, Blosc2 vs NumPy') |
| 137 | + plt.title('Fancy indexing performance comparison') |
133 | 138 | plt.savefig('plots/fancyIdx.png', format="png") |
134 | 139 | plt.show() |
135 | | - ## slowest cases are the ones with broadcasting |
136 | | - ## in fact it's a memory problem for blosc2 seemingly |
| 140 | + |
| 141 | +print("Finished everything!") |
0 commit comments