Skip to content

Commit a422d72

Browse files
committed
CTable: full feature build-out (persistency, aggregates, mutations, QoL)
Persistency:
- FileTableStorage backend: disk layout _meta.b2frame / _valid_rows.b2nd / _cols/<name>.b2nd
- CTable(Row, urlpath=..., mode="w"/"a"/"r"), CTable.open(), CTable.save(), CTable.load()
- Read-only mode blocks all writes; save() always writes compacted rows

Column aggregates: sum, min, max, mean, std, any, all (chunk-aware via iter_chunks)

Column utilities: unique(), value_counts(), assign(), boolean mask __getitem__/__setitem__

Schema mutations: add_column (fills default for existing rows), drop_column, rename_column
- All three update schema, handle disk files, and block on views

View mutability model fix:
- Views allow value writes (assign, __setitem__) — only structural mutations are blocked
- _read_only=True reserved for mode="r" disk tables; base is not None guards structural ops

QoL: __str__ pandas-style, __repr__, cbytes/nbytes, sample(n), Column.iter_chunks(size)

Tests: 258 tests, ~5s — new test_persistency.py (33), test_schema_mutations.py (41), expanded test_column.py; optimized helpers to use to_numpy() instead of row[i]
1 parent ee1d0c4 commit a422d72

8 files changed

Lines changed: 2542 additions & 97 deletions

File tree

bench/ctable/bench_persistency.py

Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
#######################################################################
2+
# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
3+
# All rights reserved.
4+
#
5+
# SPDX-License-Identifier: BSD-3-Clause
6+
#######################################################################
7+
8+
# Benchmark: persistent vs in-memory CTable
9+
#
10+
# Sections:
11+
# 1. extend() — bulk creation: in-memory vs file-backed
12+
# 2. open() — time to reopen an existing persistent table
13+
# 3. append() — single-row append: in-memory vs file-backed (after reopen)
14+
# 4. column read — materialising a full column: in-memory vs file-backed
15+
#
16+
# Each measurement is the minimum of NRUNS repetitions to reduce noise.
17+
18+
import os
19+
import shutil
20+
from dataclasses import dataclass
21+
from time import perf_counter
22+
23+
import blosc2
24+
25+
NRUNS = 3
26+
TABLE_DIR = "saved_ctable/bench"
27+
28+
29+
@dataclass
class Row:
    # Three-column schema used by every benchmark section in this script.

    # int64 column; `ge=0` is forwarded to blosc2.int64
    # (presumably a lower bound — confirm against the blosc2 schema docs).
    id: int = blosc2.field(
        blosc2.int64(ge=0),
    )
    # float64 column; `ge=0, le=100` are forwarded to blosc2.float64
    # (presumably inclusive bounds — confirm). Default value is 0.0.
    score: float = blosc2.field(
        blosc2.float64(ge=0, le=100),
        default=0.0,
    )
    # bool column whose default value is True.
    active: bool = blosc2.field(
        blosc2.bool(),
        default=True,
    )
34+
35+
36+
def sep(title: str) -> None:
    """Print a section banner for *title*.

    Output is a blank line, a 60-character horizontal rule, the title
    prefixed by one space, and a closing rule.
    """
    rule = "─" * 60
    # One print call; sep="\n" yields the same three lines the banner needs.
    print(f"\n{rule}", f" {title}", rule, sep="\n")
40+
41+
42+
def tmin(fn, n: int = NRUNS) -> float:
    """Return the minimum elapsed time (seconds) over *n* calls of *fn*.

    *fn* is called with no arguments and its return value is ignored.
    When *n* is zero or negative no call is made and ``inf`` is returned.
    """

    def timed_call() -> float:
        # Wall-clock duration of a single invocation of fn.
        started = perf_counter()
        fn()
        return perf_counter() - started

    return min((timed_call() for _ in range(n)), default=float("inf"))
50+
51+
52+
def clean() -> None:
    """Reset TABLE_DIR to an empty directory.

    Whatever currently exists at that path is removed with
    ``shutil.rmtree``; the directory is then (re)created.
    """
    leftover_present = os.path.exists(TABLE_DIR)
    if leftover_present:
        shutil.rmtree(TABLE_DIR)
    # exist_ok keeps this call harmless if the directory is already there.
    os.makedirs(TABLE_DIR, exist_ok=True)
56+
57+
58+
# ---------------------------------------------------------------------------
59+
# Section 1: bulk creation — extend()
60+
# ---------------------------------------------------------------------------
61+
62+
sep("1. extend() — bulk insert: in-memory vs file-backed")

SIZES = [1_000, 10_000, 100_000, 1_000_000]

# The heading row and its underline share one layout so the columns line up.
for _cells in (
    ("rows", "in-memory (s)", "file-backed (s)", "overhead"),
    ("----", "-------------", "---------------", "--------"),
):
    print("{:>12} {:>16} {:>16} {:>10}".format(*_cells))

for N in SIZES:
    data = [(i, float(i % 100), i % 2 == 0) for i in range(N)]

    # N and data are bound as defaults so each function keeps the values
    # of the iteration that defined it.
    def bench_mem(N=N, data=data):
        table = blosc2.CTable(Row, expected_size=N)
        table.extend(data, validate=False)

    def bench_file(N=N, data=data):
        # NOTE(review): clean() runs inside the timed call, so directory
        # removal/creation is included in the file-backed figure.
        clean()
        table = blosc2.CTable(Row, urlpath=f"{TABLE_DIR}/ext", mode="w", expected_size=N)
        table.extend(data, validate=False)

    t_mem = tmin(bench_mem)
    t_file = tmin(bench_file)
    # Guard the ratio against a zero in-memory timing.
    if t_mem > 0:
        overhead = t_file / t_mem
    else:
        overhead = float("nan")
    print(f"{N:>12,} {t_mem:>16.4f} {t_file:>16.4f} {overhead:>9.2f}x")
85+
86+
# ---------------------------------------------------------------------------
87+
# Section 2: open() — reopen an existing table
88+
# ---------------------------------------------------------------------------
89+
90+
sep("2. open() — time to reopen a persistent table")

# The heading row and its underline share one layout so the columns line up.
for _cells in (
    ("rows", "CTable.open() (s)", "CTable(..., mode=a) (s)"),
    ("----", "------------------", "------------------------"),
):
    print("{:>12} {:>20} {:>24}".format(*_cells))

for N in SIZES:
    data = [(i, float(i % 100), i % 2 == 0) for i in range(N)]
    clean()
    path = f"{TABLE_DIR}/reopen"

    # Build the on-disk table that both reopen strategies will load.
    t = blosc2.CTable(Row, urlpath=path, mode="w", expected_size=N)
    t.extend(data, validate=False)
    # Drop the writer reference before timing the reopen paths.
    del t

    def bench_open(path=path):
        reopened = blosc2.CTable.open(path, mode="r")
        # len() is evaluated only to touch the reopened table; result unused.
        len(reopened)

    def bench_ctor(path=path):
        reopened = blosc2.CTable(Row, urlpath=path, mode="a")
        len(reopened)

    t_open = tmin(bench_open)
    t_ctor = tmin(bench_ctor)
    print(f"{N:>12,} {t_open:>20.4f} {t_ctor:>24.4f}")
114+
115+
# ---------------------------------------------------------------------------
116+
# Section 3: append() — single-row inserts after reopen
117+
# ---------------------------------------------------------------------------
118+
119+
sep("3. append() — 1 000 single-row inserts: in-memory vs file-backed")

APPEND_N = 1_000
PREALLOCATE = 10_000  # avoid resize noise

# The heading row and its underline share one layout so the columns line up.
for _cells in (
    ("backend", "total (s)", "µs / row"),
    ("-------", "---------", "--------"),
):
    print("{:>14} {:>12} {:>12}".format(*_cells))


def bench_append_mem():
    """Append APPEND_N rows one at a time to a new in-memory table."""
    table = blosc2.CTable(Row, expected_size=PREALLOCATE, validate=False)
    for row_id in range(APPEND_N):
        table.append((row_id, float(row_id % 100), True))


# Create the on-disk table that bench_append_file reopens.
clean()
path = f"{TABLE_DIR}/apath"
blosc2.CTable(Row, urlpath=path, mode="w", expected_size=PREALLOCATE)


def bench_append_file():
    """Reopen the table at `path` in mode "a" and append APPEND_N rows one at a time."""
    table = blosc2.CTable(Row, urlpath=path, mode="a", validate=False)
    for row_id in range(APPEND_N):
        table.append((row_id, float(row_id % 100), True))


for label, fn in (("in-memory", bench_append_mem), ("file-backed", bench_append_file)):
    # The on-disk table is recreated once before the file-backed timing.
    # NOTE(review): tmin() calls fn several times with no reset in between,
    # so later repetitions presumably append to a table that already holds
    # the rows written by earlier ones — confirm whether that is intended.
    if label == "file-backed":
        clean()
        blosc2.CTable(Row, urlpath=path, mode="w", expected_size=PREALLOCATE)
    elapsed = tmin(fn)
    us_per_row = elapsed / APPEND_N * 1e6
    print(f"{label:>14} {elapsed:>12.4f} {us_per_row:>12.1f}")
153+
154+
# ---------------------------------------------------------------------------
155+
# Section 4: column read — to_numpy() after reopen
156+
# ---------------------------------------------------------------------------
157+
158+
sep("4. column read — to_numpy() on 'id': in-memory vs file-backed")

# The heading row and its underline share one layout so the columns line up.
for _cells in (
    ("rows", "in-memory (s)", "file-backed (s)", "ratio"),
    ("----", "-------------", "---------------", "-----"),
):
    print("{:>12} {:>16} {:>16} {:>8}".format(*_cells))

for N in SIZES:
    data = [(i, float(i % 100), i % 2 == 0) for i in range(N)]

    # In-memory table holding the N rows.
    t_mem_table = blosc2.CTable(Row, expected_size=N, validate=False)
    t_mem_table.extend(data, validate=False)

    # File-backed table holding the same rows.
    clean()
    path = f"{TABLE_DIR}/read"
    t_file_table = blosc2.CTable(Row, urlpath=path, mode="w", expected_size=N)
    t_file_table.extend(data, validate=False)
    # Reopen read-only (simulates a real read workload)
    t_ro = blosc2.CTable.open(path, mode="r")

    # Each table is bound as a default so the function keeps this
    # iteration's instance; the materialised column is discarded.
    def bench_read_mem(t=t_mem_table):
        t["id"].to_numpy()

    def bench_read_file(t=t_ro):
        t["id"].to_numpy()

    t_m = tmin(bench_read_mem)
    t_f = tmin(bench_read_file)
    # Guard the ratio against a zero in-memory timing.
    if t_m > 0:
        ratio = t_f / t_m
    else:
        ratio = float("nan")
    print(f"{N:>12,} {t_m:>16.4f} {t_f:>16.4f} {ratio:>7.2f}x")

# Cleanup
clean()
print()

0 commit comments

Comments
 (0)