|
12 | 12 | import ndindex |
13 | 13 | import blosc2 |
14 | 14 | import time |
15 | | -from memory_profiler import memory_usage, profile |
16 | 15 | import matplotlib.pyplot as plt |
17 | 16 | import zarr |
18 | 17 | import h5py |
19 | | -plt.rcParams.update({'text.usetex':True,'font.serif': ['cm'],'font.size':16}) |
| 18 | +import pickle |
# Global Matplotlib appearance. LaTeX text rendering is switched off, so no
# TeX installation is needed to produce the figures.
plt.rcParams.update({
    'text.usetex': False,
    'font.serif': ['cm'],
    'font.size': 16,
    'figure.dpi': 1000,
    'savefig.dpi': 1000,
})
plt.style.use('seaborn-v0_8-paper')

# Benchmark selection flags, read by the script below:
#   NUMPY_BLOSC       -> time NumPy and Blosc2 only, with the full set of
#                        fancy-indexing cases.
#   NUMPY_BLOSC_ZARR  -> additionally time Zarr; also selects the
#                        'NumpyBlosc2Zarr' results-file label.
#   both False        -> time NumPy, Blosc2, Zarr and HDF5 on the two indexing
#                        cases that are always run.
NUMPY_BLOSC = True
NUMPY_BLOSC_ZARR = False
27 | 28 |
|
def genarray(r, ndims=1, verbose=True):
    """Create a Blosc2 array of int64 ones holding at most about ``r`` GiB.

    Parameters
    ----------
    r : int or float
        Target uncompressed size in GiB (2**30 bytes).
    ndims : int, optional
        Number of dimensions; every dimension gets the same length.
    verbose : bool, optional
        If True, print the shape, the uncompressed size and the creation time.

    Returns
    -------
    The array returned by ``blosc2.ones`` with shape ``(d,) * ndims``.
    """
    # 8 bytes per int64 element; int() truncates, so the real size is <= r GiB.
    d = int((r * 2**30 / 8) ** (1 / ndims))
    shape = (d,) * ndims
    # Chunk and block shapes are left to blosc2's defaults.
    start = time.perf_counter()
    arr = blosc2.ones(shape=shape, dtype=np.int64)
    elapsed = time.perf_counter() - start
    if verbose:
        print(f"Array shape: {arr.shape}")
        print(f"Array size: {np.prod(arr.shape) * arr.dtype.itemsize / 2 ** 30:.6f} GB")
        print(f"Time to create array: {elapsed:.6f} seconds")
    return arr
41 | 42 |
|
# Array sizes to benchmark, in GB.
sizes = np.int64(np.array([1, 2, 4, 8, 16]))
rng = np.random.default_rng()
blosctimes = []
nptimes = []
zarrtimes = []
h5pytimes = []
x = np.arange(len(sizes))
width = 0.2
# Label used to name the cached-results file for the selected configuration.
labs = 'NumpyBlosc2' if NUMPY_BLOSC else 'NumpyBlosc2ZarrHDF5'
labs = 'NumpyBlosc2Zarr' if NUMPY_BLOSC_ZARR else labs


def _elapsed(index_op):
    """Return the seconds taken by one call of the zero-argument ``index_op``."""
    start = time.perf_counter()
    index_op()
    return time.perf_counter() - start


def _plot_results(results, x, width):
    """Draw one bar group per library and return the concatenated labels.

    ``results`` is a sequence of ``[label, times, offset]`` entries, where
    ``times`` has one row per array size and one column per indexing case.
    Bars show the mean over cases; error bars span the min and max.
    Libraries that were not benchmarked (empty ``times``) are skipped.
    """
    plotted = ''
    colors = ['b', 'r', 'g', 'm']
    for color, (label, times, offset) in zip(colors, results):
        if times.shape == (0,):
            continue
        mean = times.mean(axis=1)
        err = (mean - times.min(axis=1), times.max(axis=1) - mean)
        plt.bar(x + offset, mean, width, color=color, label=label, yerr=err, capsize=5, ecolor='k',
                error_kw=dict(lw=2, capthick=2, ecolor='k'))
        plotted += label
    return plotted


# Reuse cached timings when available. Only a missing file triggers a fresh
# benchmark run; any other failure (corrupt file, plotting error) is raised.
try:
    # NOTE: pickle.load can execute arbitrary code; only load results files
    # that were written by this script.
    with open(f"results{labs}.pkl", 'rb') as f:
        result_tuple = pickle.load(f)
except FileNotFoundError:
    result_tuple = None

if result_tuple is not None:
    labs = _plot_results(result_tuple, x, width)
else:
    for d in sizes:
        arr = genarray(d, ndims=2)
        idx = rng.integers(low=0, high=arr.shape[0], size=(arr.shape[0],))
        row = np.sort(np.unique(idx))
        col = np.sort(np.unique(rng.integers(low=0, high=arr.shape[0], size=(arr.shape[0],))))
        mask = rng.integers(low=0, high=2, size=(arr.shape[0],)) == 1

        ## Test fancy indexing for different use cases
        m, M = np.min(idx), np.max(idx)

        def timer(arr, row=row, col=col):
            """Time each selected fancy-indexing case on ``arr``; return the times in seconds."""
            cases = []
            if NUMPY_BLOSC or NUMPY_BLOSC_ZARR:
                cases.append(lambda: arr[row, col])
            if NUMPY_BLOSC:
                cases += [
                    lambda: arr[slice(1, M // 2, 5), col],
                    lambda: arr[[[row], [col]]],
                    lambda: arr[row[:10, None], col[:10]],
                    lambda: arr[row[:10, None], mask],
                ]
            # These two cases are run for every configuration.
            cases += [lambda: arr[row], lambda: arr[m, col]]
            return np.array([_elapsed(case) for case in cases])

        if NUMPY_BLOSC or NUMPY_BLOSC_ZARR:
            blosctimes.append(timer(arr, row=idx, col=idx))
            arr = arr[:]  # full slice; the same data is then timed as the NumPy case
            nptimes.append(timer(arr, row=idx, col=idx))
            if NUMPY_BLOSC_ZARR:
                z_test = zarr.zeros(shape=arr.shape, dtype=arr.dtype)
                z_test[:] = arr
                zarrtimes.append(timer(z_test, row=idx, col=idx))
        else:
            # Uses timer's default row/col: the sorted, de-duplicated indices.
            blosctimes.append(timer(arr))
            arr = arr[:]
            nptimes.append(timer(arr))
            z_test = zarr.zeros(shape=arr.shape, dtype=arr.dtype)
            z_test[:] = arr
            zarrtimes.append(timer(z_test))
            with h5py.File('my_hdf5_file.h5', 'w') as f:
                dset = f.create_dataset("init", data=arr)
                h5pytimes.append(timer(dset))

    blosctimes = np.array(blosctimes)
    nptimes = np.array(nptimes)
    zarrtimes = np.array(zarrtimes)
    h5pytimes = np.array(h5pytimes)
    result_tuple = (["Numpy", nptimes, -2*width], ["Blosc2", blosctimes, -width],
                    ["Zarr", zarrtimes, 0], ["HDF5", h5pytimes, width])

    # Create barplot for Numpy vs Blosc vs Zarr vs H5py
    labs = _plot_results(result_tuple, x, width)

    # Cache the timings so the next run can plot without re-benchmarking.
    with open(f"results{labs}.pkl", 'wb') as f:
        pickle.dump(result_tuple, f)

plt.xlabel('Array size (GB)')
plt.legend()
plt.xticks(x-width, np.round(sizes, 2))
plt.ylabel("Time (s)")
plt.title('Fancy indexing performance comparison')
plt.ylim([0,10])
plt.savefig(f'plots/fancyIdx{labs}.png', format="png")
plt.show()

print("Finished everything!")
0 commit comments