55# SPDX-License-Identifier: BSD-3-Clause
66#######################################################################
77
8+ # This benchmarks BatchArray random single-item reads. It supports
9+ # msgpack or arrow, configurable codec/compression level, optional
10+ # dictionary compression, and in-memory vs persistent mode.
11+
812from __future__ import annotations
913
1014import argparse
1519import blosc2
1620
1721
# Benchmark configuration constants (post-patch reconstruction of this
# diff hunk: the pasted span interleaved old/new diff lines, which is not
# valid Python; only the "+" side is kept).
URLPATH = "bench_batch_array.b2b"  # on-disk file used in persistent mode
NBATCHES = 10_000
OBJECTS_PER_BATCH = 100
TOTAL_OBJECTS = NBATCHES * OBJECTS_PER_BATCH  # 1,000,000 items total
ITEMS_PER_BLOCK = 32  # BatchArray block granularity
N_RANDOM_READS = 1_000  # number of timed single-item reads
2428
2529
@@ -46,25 +50,25 @@ def expected_entry(batch_index: int, item_index: int) -> dict[str, int]:
4650
def build_parser() -> argparse.ArgumentParser:
    """Build the CLI parser for the BatchArray single-entry read benchmark.

    Post-patch reconstruction of this diff hunk: the pasted span mixed old
    ("BatchStore") and new ("BatchArray") lines with fused line numbers and
    tokenizer-inserted spaces, which is not valid Python; only the "+" side
    is kept here.

    Returns:
        argparse.ArgumentParser: parser exposing codec, compression level,
        serializer, dictionary compression, and in-memory options.
    """
    parser = argparse.ArgumentParser(
        description="Benchmark BatchArray single-entry reads.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    # Codec choices are derived from the blosc2.Codec enum so the CLI stays
    # in sync with whatever codecs the installed blosc2 supports.
    parser.add_argument("--codec", type=str, default="ZSTD", choices=[codec.name for codec in blosc2.Codec])
    parser.add_argument("--clevel", type=int, default=5)
    parser.add_argument("--serializer", type=str, default="msgpack", choices=["msgpack", "arrow"])
    parser.add_argument("--use-dict", action="store_true", help="Enable dictionaries for ZSTD/LZ4/LZ4HC codecs.")
    parser.add_argument("--in-mem", action="store_true", help="Keep the BatchArray purely in memory.")
    return parser
5862
5963
60- def build_store (
64+ def build_array (
6165 codec : blosc2 .Codec , clevel : int , use_dict : bool , serializer : str , in_mem : bool
62- ) -> blosc2 .BatchStore | None :
66+ ) -> blosc2 .BatchArray | None :
6367 if in_mem :
6468 storage = blosc2 .Storage (mode = "w" )
65- store = blosc2 .BatchStore (
69+ barr = blosc2 .BatchArray (
6670 storage = storage ,
67- max_blocksize = BLOCKSIZE_MAX ,
71+ items_per_block = ITEMS_PER_BLOCK ,
6872 serializer = serializer ,
6973 cparams = {
7074 "codec" : codec ,
@@ -73,8 +77,8 @@ def build_store(
7377 },
7478 )
7579 for batch_index in range (NBATCHES ):
76- store .append (make_batch (batch_index ))
77- return store
80+ barr .append (make_batch (batch_index ))
81+ return barr
7882
7983 blosc2 .remove_urlpath (URLPATH )
8084 storage = blosc2 .Storage (urlpath = URLPATH , mode = "w" , contiguous = True )
@@ -83,24 +87,24 @@ def build_store(
8387 "clevel" : clevel ,
8488 "use_dict" : use_dict and codec in (blosc2 .Codec .ZSTD , blosc2 .Codec .LZ4 , blosc2 .Codec .LZ4HC ),
8589 }
86- with blosc2 .BatchStore (
87- storage = storage , max_blocksize = BLOCKSIZE_MAX , serializer = serializer , cparams = cparams
88- ) as store :
90+ with blosc2 .BatchArray (
91+ storage = storage , items_per_block = ITEMS_PER_BLOCK , serializer = serializer , cparams = cparams
92+ ) as barr :
8993 for batch_index in range (NBATCHES ):
90- store .append (make_batch (batch_index ))
94+ barr .append (make_batch (batch_index ))
9195 return None
9296
9397
94- def measure_random_reads (store : blosc2 .BatchStore ) -> tuple [list [tuple [int , int , int , dict [str , int ]]], list [int ]]:
98+ def measure_random_reads (barr : blosc2 .BatchArray ) -> tuple [list [tuple [int , int , int , dict [str , int ]]], list [int ]]:
9599 rng = random .Random (2024 )
96100 samples : list [tuple [int , int , int , dict [str , int ]]] = []
97101 timings_ns : list [int ] = []
98102
99103 for _ in range (N_RANDOM_READS ):
100- batch_index = rng .randrange (len (store ))
104+ batch_index = rng .randrange (len (barr ))
101105 item_index = rng .randrange (OBJECTS_PER_BATCH )
102106 t0 = time .perf_counter_ns ()
103- value = store [batch_index ][item_index ]
107+ value = barr [batch_index ][item_index ]
104108 timings_ns .append (time .perf_counter_ns () - t0 )
105109 if value != expected_entry (batch_index , item_index ):
106110 raise RuntimeError (f"Value mismatch at batch={ batch_index } , item={ item_index } " )
@@ -117,39 +121,39 @@ def main() -> None:
117121
118122 mode_label = "in-memory" if args .in_mem else "persistent"
119123 article = "an" if args .in_mem else "a"
120- print (f"Building { article } { mode_label } BatchStore with 1,000,000 RGB dicts and timing 1,000 random scalar reads..." )
124+ print (f"Building { article } { mode_label } BatchArray with 1,000,000 RGB dicts and timing 1,000 random scalar reads..." )
121125 print (f" codec: { codec .name } " )
122126 print (f" clevel: { args .clevel } " )
123127 print (f" serializer: { args .serializer } " )
124128 print (f" use_dict: { use_dict } " )
125129 print (f" in_mem: { args .in_mem } " )
126130 t0 = time .perf_counter ()
127- store = build_store (
131+ barr = build_array (
128132 codec = codec , clevel = args .clevel , use_dict = use_dict , serializer = args .serializer , in_mem = args .in_mem
129133 )
130134 build_time_s = time .perf_counter () - t0
131135 if args .in_mem :
132- assert store is not None
133- read_store = store
136+ assert barr is not None
137+ read_array = barr
134138 else :
135- read_store = blosc2 .BatchStore (urlpath = URLPATH , mode = "r" , contiguous = True , max_blocksize = BLOCKSIZE_MAX )
136- samples , timings_ns = measure_random_reads (read_store )
139+ read_array = blosc2 .BatchArray (urlpath = URLPATH , mode = "r" , contiguous = True , items_per_block = ITEMS_PER_BLOCK )
140+ samples , timings_ns = measure_random_reads (read_array )
137141 t0 = time .perf_counter ()
138142 checksum = 0
139143 nitems = 0
140- for item in read_store .iter_items ():
144+ for item in read_array .iter_items ():
141145 checksum += item ["blue" ]
142146 nitems += 1
143147 iter_time_s = time .perf_counter () - t0
144148
145149 print ()
146- print ("BatchStore benchmark" )
150+ print ("BatchArray benchmark" )
147151 print (f" build time: { build_time_s :.3f} s" )
148- print (f" batches: { len (read_store )} " )
152+ print (f" batches: { len (read_array )} " )
149153 print (f" items: { TOTAL_OBJECTS } " )
150- print (f" max_blocksize : { read_store . max_blocksize } " )
154+ print (f" items_per_block : { read_array . items_per_block } " )
151155 print ()
152- print (read_store .info )
156+ print (read_array .info )
153157 print (f"Random scalar reads: { N_RANDOM_READS } " )
154158 print (f" mean: { statistics .fmean (timings_ns ) / 1_000 :.2f} us" )
155159 print (f" max: { max (timings_ns ) / 1_000 :.2f} us" )
@@ -159,11 +163,11 @@ def main() -> None:
159163 print (f" checksum: { checksum } " )
160164 print ("Sample reads:" )
161165 for timing_ns , batch_index , item_index , value in samples [:5 ]:
162- print (f" { timing_ns / 1_000 :.2f} us -> read_store [{ batch_index } ][{ item_index } ] = { value } " )
166+ print (f" { timing_ns / 1_000 :.2f} us -> read_array [{ batch_index } ][{ item_index } ] = { value } " )
163167 if args .in_mem :
164- print ("BatchStore kept in memory" )
168+ print ("BatchArray kept in memory" )
165169 else :
166- print (f"BatchStore file at: { read_store .urlpath } " )
170+ print (f"BatchArray file at: { read_array .urlpath } " )
167171
168172
169173if __name__ == "__main__" :
0 commit comments