Skip to content

Commit 4ce8296

Browse files
committed
Schema layer:
- Add schema.py with spec primitives: int8/16/32/64, uint8/16/32/64, float32/64, bool, complex64/128, string, bytes — sharing a _NumericSpec mixin to avoid boilerplate
- Add schema_compiler.py: compile_schema(), CompiledSchema/Column/Config, schema_to_dict() / schema_from_dict() for persistence groundwork
- Export all spec types and field() from the blosc2 namespace

Validation:
- Add schema_validation.py: Pydantic-backed row validation for append(), cached per schema, re-raised as plain ValueError
- Add schema_vectorized.py: vectorized NumPy constraint checks for extend(), using np.char.str_len() for string/bytes columns
- validate= per-call override on extend() (None inherits the table default)

CTable refactor:
- Constructor accepts dataclass schemas; legacy Pydantic adapter kept
- Schema introspection: table.schema, column_schema(), schema_dict()
- _last_pos cache eliminates the backward chunk scan on every append/extend
- _grow() shared resize helper; delete() writes back in-place without creating a new array; _n_rows updated by subtraction, not count_nonzero
- head() and tail() unified through _find_physical_index()

Tests and docs:
- 135 tests across 10 test files, all passing
- plans/ctable-implementation-log.md and ctable-user-guide.md added
- Benchmarks: bench_validation.py and bench_append_regression.py
1 parent b9e8c35 commit 4ce8296

37 files changed

Lines changed: 3701 additions & 682 deletions

bench/ctable/Prueba_iter.py

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
#######################################################################
# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause
#######################################################################

from dataclasses import dataclass

import numpy as np
from time import time

from blosc2 import CTable
import blosc2


@dataclass
class Row:
    id: int = blosc2.field(blosc2.int64(ge=0))
    score: float = blosc2.field(blosc2.float64(ge=0, le=100))
    active: bool = blosc2.field(blosc2.bool(), default=True)


N = 1_000  # start small, increase when confident

data = [(i, float(i % 100), i % 2 == 0) for i in range(N)]
table = CTable(Row, new_data=data)

print(f"Table created with {len(table)} rows\n")

# -------------------------------------------------------------------
# Test 1: iterate without accessing any column (minimum cost)
# -------------------------------------------------------------------
tic = time()
for _ in table:
    pass
elapsed = time() - tic
print(f"[Test 1] Iter without accessing columns: {elapsed*1000:.3f} ms")

# -------------------------------------------------------------------
# Test 2: iterate accessing a single column (real_pos cached once)
# -------------------------------------------------------------------
tic = time()
for row in table:
    _ = row["id"]
elapsed = time() - tic
print(f"[Test 2] Iter accessing 'id': {elapsed*1000:.3f} ms")

# -------------------------------------------------------------------
# Test 3: iterate accessing all columns (real_pos cached once per row)
# -------------------------------------------------------------------
tic = time()
for row in table:
    _ = row["id"]
    _ = row["score"]
    _ = row["active"]
elapsed = time() - tic
print(f"[Test 3] Iter accessing 3 columns: {elapsed*1000:.3f} ms")

# -------------------------------------------------------------------
# Test 4: correctness — values match expected
# -------------------------------------------------------------------
errors = 0
for row in table:
    # Data was generated as (i, float(i % 100), i % 2 == 0), so the
    # logical row number determines every expected column value.
    expected = (row._nrow, float(row._nrow % 100), row._nrow % 2 == 0)
    for column, want in zip(("id", "score", "active"), expected):
        if row[column] != want:
            errors += 1

print(f"\n[Test 4] Correctness errors: {errors} (expected: 0)")

# -------------------------------------------------------------------
# Test 5: with holes (deleted rows)
# -------------------------------------------------------------------
holey = CTable(Row, new_data=data)
holey.delete(list(range(0, N, 2)))  # delete even rows, keep odd ones

print(f"\nTable with holes: {len(holey)} rows (expected: {N // 2})")

tic = time()
ids = [row["id"] for row in holey]
elapsed = time() - tic

expected_ids = list(range(1, N, 2))
ok = ids == expected_ids
print(f"[Test 5] Iter with holes ({N//2} rows): {elapsed*1000:.3f} ms | correctness: {ok}")

# -------------------------------------------------------------------
# Test 6: real_pos is cached correctly (not recomputed)
# -------------------------------------------------------------------
first = next(iter(table))
assert first._real_pos is None, "real_pos should be None before first access"
_ = first["id"]
assert first._real_pos is not None, "real_pos should be cached after first access"
print(f"\n[Test 6] real_pos caching: OK (real_pos={first._real_pos})")
Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
#######################################################################
# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause
#######################################################################

# Benchmark: append() overhead introduced by the new schema pipeline
#
# The new append() path routes every row through:
#   _normalize_row_input → validate_row (Pydantic) → _coerce_row_to_storage
#
# This benchmark isolates how much each step costs, and shows the
# total overhead vs the raw NDArray write speed.

from dataclasses import dataclass
from time import perf_counter

import numpy as np

import blosc2
from blosc2.schema_compiler import compile_schema
from blosc2.schema_validation import validate_row, build_validator_model


@dataclass
class Row:
    id: int = blosc2.field(blosc2.int64(ge=0))
    score: float = blosc2.field(blosc2.float64(ge=0, le=100), default=0.0)
    active: bool = blosc2.field(blosc2.bool(), default=True)


N = 5_000
rng = np.random.default_rng(42)
data = [(int(i), float(rng.uniform(0, 100)), bool(i % 2)) for i in range(N)]
schema = compile_schema(Row)
# Warm up the Pydantic model cache so step 3 measures validation, not model build.
build_validator_model(schema)

print(f"append() pipeline cost breakdown | N = {N:,} rows")
print("=" * 60)


def _timed(fn, items):
    # Apply fn to every item, returning the elapsed wall-clock time.
    tic = perf_counter()
    for item in items:
        fn(item)
    return perf_counter() - tic


# ── 1. Raw NDArray writes (no CTable overhead at all) ────────────────────────
ids = np.zeros(N, dtype=np.int64)
scores = np.zeros(N, dtype=np.float64)
flags = np.zeros(N, dtype=np.bool_)
mask = np.zeros(N, dtype=np.bool_)


def _raw_write(indexed_row):
    # Baseline: write one row straight into pre-allocated NumPy columns.
    i, (id_, score, active) = indexed_row
    ids[i] = id_
    scores[i] = score
    flags[i] = active
    mask[i] = True


t_raw = _timed(_raw_write, list(enumerate(data)))
print(f"{'Raw NumPy writes (baseline)':<40} {t_raw:.4f} s")

# ── 2. _normalize_row_input only ─────────────────────────────────────────────
t_obj = blosc2.CTable(Row, expected_size=N, validate=False)
t_normalize = _timed(t_obj._normalize_row_input, data)
print(f"{'_normalize_row_input only':<40} {t_normalize:.4f} s ({t_normalize/t_raw:.1f}x baseline)")

# ── 3. Pydantic validate_row only ────────────────────────────────────────────
row_dicts = [t_obj._normalize_row_input(row) for row in data]
t_validate = _timed(lambda rd: validate_row(schema, rd), row_dicts)
print(f"{'validate_row (Pydantic) only':<40} {t_validate:.4f} s ({t_validate/t_raw:.1f}x baseline)")

# ── 4. _coerce_row_to_storage only ───────────────────────────────────────────
t_coerce = _timed(t_obj._coerce_row_to_storage, row_dicts)
print(f"{'_coerce_row_to_storage only':<40} {t_coerce:.4f} s ({t_coerce/t_raw:.1f}x baseline)")

RUNS = 3


def _best_append_time(validate):
    # Full append() into a fresh table each run; keep the fastest of RUNS runs.
    best = float("inf")
    for _ in range(RUNS):
        fresh = blosc2.CTable(Row, expected_size=N, validate=validate)
        best = min(best, _timed(fresh.append, data))
    return best


# ── 5. Full append(), validate=False (3 runs, take minimum) ─────────────────
t_append_off = _best_append_time(False)
print(f"{'Full append(), validate=False':<40} {t_append_off:.4f} s ({t_append_off/t_raw:.1f}x baseline)")

# ── 6. Full append(), validate=True (3 runs, take minimum) ──────────────────
t_append_on = _best_append_time(True)
print(f"{'Full append(), validate=True':<40} {t_append_on:.4f} s ({t_append_on/t_raw:.1f}x baseline)")

print()
print("=" * 60)
pydantic_cost = max(t_append_on - t_append_off, 0.0)
print(f"{'Pydantic overhead in append()':<40} {pydantic_cost:.4f} s")
if t_append_on > 0:
    print(f"{'Validation fraction of total':<40} {pydantic_cost/t_append_on*100:.1f}%")
print(f"{'Per-row Pydantic cost (isolated)':<40} {(t_validate/N)*1e6:.2f} µs/row")
print()
print(f"Note: append() is dominated by blosc2 I/O ({t_append_off/t_raw:.0f}x raw numpy),")
print("      not by the validation pipeline.")
print("      The main bottleneck is the last_true_pos backward scan per row.")

bench/ctable/bench_validation.py

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
#######################################################################
# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause
#######################################################################

# Benchmark: cost of constraint validation
#
# Measures the overhead of validate=True vs validate=False for:
#   1. append() — row-by-row, Pydantic path
#   2. extend() — bulk insert, vectorized NumPy path
#
# at increasing batch sizes to show how validation cost scales.
#
# The measure-and-report pattern repeated across the three sections is
# factored into small helpers so each section is a short loop.

from dataclasses import dataclass
from time import perf_counter

import numpy as np

import blosc2


@dataclass
class Row:
    id: int = blosc2.field(blosc2.int64(ge=0))
    score: float = blosc2.field(blosc2.float64(ge=0, le=100), default=0.0)
    active: bool = blosc2.field(blosc2.bool(), default=True)


def make_data(n: int):
    """Build n rows of (id, score, active) tuples with a fixed RNG seed."""
    rng = np.random.default_rng(42)
    ids = np.arange(n, dtype=np.int64)
    scores = rng.uniform(0, 100, n)
    flags = rng.integers(0, 2, n, dtype=np.bool_)
    return list(zip(ids.tolist(), scores.tolist(), flags.tolist()))


def _time_append(payload, n: int, validate: bool) -> float:
    """Time row-by-row append() of *payload* into a fresh CTable."""
    table = blosc2.CTable(Row, expected_size=n, validate=validate)
    t0 = perf_counter()
    for row in payload:
        table.append(row)
    return perf_counter() - t0


def _time_extend(payload, n: int, validate: bool) -> float:
    """Time one bulk extend() of *payload* into a fresh CTable."""
    table = blosc2.CTable(Row, expected_size=n, validate=validate)
    t0 = perf_counter()
    table.extend(payload)
    return perf_counter() - t0


def _header(title: str) -> None:
    """Print a section banner plus the shared column headings."""
    print("=" * 65)
    print(title)
    print("=" * 65)
    print(f"{'N':>10} {'validate=True':>14} {'validate=False':>15} {'overhead':>10}")
    print("-" * 65)


def _report(n: int, t_on: float, t_off: float) -> None:
    """Print one result line: both times plus the validate=True slowdown."""
    overhead = (t_on / t_off) if t_off > 0 else float("inf")
    print(f"{n:>10,} {t_on:>13.4f}s {t_off:>14.4f}s {overhead:>9.2f}x")


SIZES = [100, 1_000, 10_000, 100_000, 1_000_000]
APPEND_SIZES = [100, 1_000]  # append row-by-row is slow at large N

# ─────────────────────────────────────────────────────────────────────────────
# 1. append() — validate=True vs validate=False
# ─────────────────────────────────────────────────────────────────────────────
_header("1. append() — row-by-row (Pydantic validation per row)")
for n in APPEND_SIZES:
    data = make_data(n)
    _report(n, _time_append(data, n, True), _time_append(data, n, False))

# ─────────────────────────────────────────────────────────────────────────────
# 2. extend() — validate=True vs validate=False
# ─────────────────────────────────────────────────────────────────────────────
print()
_header("2. extend() — bulk insert (vectorized NumPy validation)")
for n in SIZES:
    data = make_data(n)
    _report(n, _time_extend(data, n, True), _time_extend(data, n, False))

# ─────────────────────────────────────────────────────────────────────────────
# 3. extend() — validate=True vs validate=False with structured NumPy array
# ─────────────────────────────────────────────────────────────────────────────
print()
_header("3. extend() with structured NumPy array")

np_dtype = np.dtype([("id", np.int64), ("score", np.float64), ("active", np.bool_)])

for n in SIZES:
    # Same seed/content as make_data, but packed into a structured array.
    rng = np.random.default_rng(42)
    arr = np.empty(n, dtype=np_dtype)
    arr["id"] = np.arange(n, dtype=np.int64)
    arr["score"] = rng.uniform(0, 100, n)
    arr["active"] = rng.integers(0, 2, n, dtype=np.bool_)
    _report(n, _time_extend(arr, n, True), _time_extend(arr, n, False))

print()
print("Note: 'overhead' = validate=True time / validate=False time.")
print("      1.00x means validation is free; 2.00x means it doubles the time.")

bench/ctable/compact.py

Lines changed: 8 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -9,25 +9,19 @@
99
# of varying fractions of the table.
1010

1111
from time import time
12-
from typing import Annotated
12+
from dataclasses import dataclass
1313

1414
import numpy as np
15-
from pydantic import BaseModel, Field
1615

1716
import blosc2
1817

1918

20-
class NumpyDtype:
21-
def __init__(self, dtype):
22-
self.dtype = dtype
23-
24-
25-
# Row model
26-
class RowModel(BaseModel):
27-
id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)
28-
c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j)
29-
score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)
30-
active: Annotated[bool, NumpyDtype(np.bool_)] = True
19+
@dataclass
class Row:
    """Table schema for the compaction benchmark.

    Replaces the old Pydantic ``RowModel``: constraints (``ge``/``le``) and
    NumPy dtypes are now declared through ``blosc2.field`` spec primitives.
    """

    # Non-negative 64-bit integer key.
    id: int = blosc2.field(blosc2.int64(ge=0))
    # Complex column, defaults to 0j.
    c_val: complex = blosc2.field(blosc2.complex128(), default=0j)
    # Bounded float in [0, 100], defaults to 0.0.
    score: float = blosc2.field(blosc2.float64(ge=0, le=100), default=0.0)
    # Boolean flag, defaults to True.
    active: bool = blosc2.field(blosc2.bool(), default=True)
3125

3226

3327
N = 1_000_000
@@ -56,7 +50,7 @@ class RowModel(BaseModel):
5650
print("-" * 75)
5751

5852
for frac in delete_fractions:
59-
ct = blosc2.CTable(RowModel, expected_size=N)
53+
ct = blosc2.CTable(Row, expected_size=N)
6054
ct.extend(DATA)
6155

6256
n_delete = int(N * frac)

0 commit comments

Comments (0)