Skip to content

Commit a65cb69

Browse files
committed
Revamp BatchStore: add Arrow as an optional serializer, and much more, including docs.
1 parent 9c177bd commit a65cb69

7 files changed

Lines changed: 945 additions & 84 deletions

File tree

bench/batch_store.py

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -51,17 +51,21 @@ def build_parser() -> argparse.ArgumentParser:
5151
)
5252
parser.add_argument("--codec", type=str, default="ZSTD", choices=[codec.name for codec in blosc2.Codec])
5353
parser.add_argument("--clevel", type=int, default=5)
54+
parser.add_argument("--serializer", type=str, default="msgpack", choices=["msgpack", "arrow"])
5455
parser.add_argument("--use-dict", action="store_true", help="Enable dictionaries for ZSTD/LZ4/LZ4HC codecs.")
5556
parser.add_argument("--in-mem", action="store_true", help="Keep the BatchStore purely in memory.")
5657
return parser
5758

5859

59-
def build_store(codec: blosc2.Codec, clevel: int, use_dict: bool, in_mem: bool) -> blosc2.BatchStore | None:
60+
def build_store(
61+
codec: blosc2.Codec, clevel: int, use_dict: bool, serializer: str, in_mem: bool
62+
) -> blosc2.BatchStore | None:
6063
if in_mem:
6164
storage = blosc2.Storage(mode="w")
6265
store = blosc2.BatchStore(
6366
storage=storage,
6467
max_blocksize=BLOCKSIZE_MAX,
68+
serializer=serializer,
6569
cparams={
6670
"codec": codec,
6771
"clevel": clevel,
@@ -79,7 +83,9 @@ def build_store(codec: blosc2.Codec, clevel: int, use_dict: bool, in_mem: bool)
7983
"clevel": clevel,
8084
"use_dict": use_dict and codec in (blosc2.Codec.ZSTD, blosc2.Codec.LZ4, blosc2.Codec.LZ4HC),
8185
}
82-
with blosc2.BatchStore(storage=storage, max_blocksize=BLOCKSIZE_MAX, cparams=cparams) as store:
86+
with blosc2.BatchStore(
87+
storage=storage, max_blocksize=BLOCKSIZE_MAX, serializer=serializer, cparams=cparams
88+
) as store:
8389
for batch_index in range(NBATCHES):
8490
store.append(make_batch(batch_index))
8591
return None
@@ -114,10 +120,13 @@ def main() -> None:
114120
print(f"Building {article} {mode_label} BatchStore with 1,000,000 RGB dicts and timing 1,000 random scalar reads...")
115121
print(f" codec: {codec.name}")
116122
print(f" clevel: {args.clevel}")
123+
print(f" serializer: {args.serializer}")
117124
print(f" use_dict: {use_dict}")
118125
print(f" in_mem: {args.in_mem}")
119126
t0 = time.perf_counter()
120-
store = build_store(codec=codec, clevel=args.clevel, use_dict=use_dict, in_mem=args.in_mem)
127+
store = build_store(
128+
codec=codec, clevel=args.clevel, use_dict=use_dict, serializer=args.serializer, in_mem=args.in_mem
129+
)
121130
build_time_s = time.perf_counter() - t0
122131
if args.in_mem:
123132
assert store is not None
@@ -127,26 +136,26 @@ def main() -> None:
127136
samples, timings_ns = measure_random_reads(read_store)
128137
t0 = time.perf_counter()
129138
checksum = 0
130-
nobjects = 0
131-
for obj in read_store.iter_objects():
132-
checksum += obj["blue"]
133-
nobjects += 1
139+
nitems = 0
140+
for item in read_store.iter_items():
141+
checksum += item["blue"]
142+
nitems += 1
134143
iter_time_s = time.perf_counter() - t0
135144

136145
print()
137146
print("BatchStore benchmark")
138147
print(f" build time: {build_time_s:.3f} s")
139148
print(f" batches: {len(read_store)}")
140-
print(f" objects: {TOTAL_OBJECTS}")
149+
print(f" items: {TOTAL_OBJECTS}")
141150
print(f" max_blocksize: {read_store.max_blocksize}")
142151
print()
143152
print(read_store.info)
144153
print(f"Random scalar reads: {N_RANDOM_READS}")
145154
print(f" mean: {statistics.fmean(timings_ns) / 1_000:.2f} us")
146155
print(f" max: {max(timings_ns) / 1_000:.2f} us")
147156
print(f" min: {min(timings_ns) / 1_000:.2f} us")
148-
print(f"Object iteration via iter_objects(): {iter_time_s:.3f} s")
149-
print(f" per object: {iter_time_s * 1_000_000 / nobjects:.2f} us")
157+
print(f"Item iteration via iter_items(): {iter_time_s:.3f} s")
158+
print(f" per item: {iter_time_s * 1_000_000 / nitems:.2f} us")
150159
print(f" checksum: {checksum}")
151160
print("Sample reads:")
152161
for timing_ns, batch_index, item_index, value in samples[:5]:

0 commit comments

Comments
 (0)