Skip to content

Commit 4ce8296

Browse files
committed
Schema layer:
- Add schema.py with spec primitives: int8/16/32/64, uint8/16/32/64, float32/64, bool, complex64/128, string, bytes — sharing a _NumericSpec mixin to avoid boilerplate
- Add schema_compiler.py: compile_schema(), CompiledSchema/Column/Config, schema_to_dict() / schema_from_dict() for persistence groundwork
- Export all spec types and field() from the blosc2 namespace

Validation:
- Add schema_validation.py: Pydantic-backed row validation for append(), cached per schema, re-raised as plain ValueError
- Add schema_vectorized.py: vectorized NumPy constraint checks for extend(), using np.char.str_len() for string/bytes columns
- validate= per-call override on extend() (None inherits the table default)

CTable refactor:
- Constructor accepts dataclass schemas; legacy Pydantic adapter kept
- Schema introspection: table.schema, column_schema(), schema_dict()
- _last_pos cache eliminates the backward chunk scan on every append/extend
- _grow() shared resize helper; delete() writes back in-place without creating a new array; _n_rows updated by subtraction, not count_nonzero
- head() and tail() unified through _find_physical_index()

Tests and docs:
- 135 tests across 10 test files, all passing
- plans/ctable-implementation-log.md and ctable-user-guide.md added
- Benchmarks: bench_validation.py and bench_append_regression.py
1 parent b9e8c35 commit 4ce8296

37 files changed

Lines changed: 3701 additions & 682 deletions

bench/ctable/Prueba_iter.py

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
#######################################################################
# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause
#######################################################################

from dataclasses import dataclass

import numpy as np
from time import time

from blosc2 import CTable
import blosc2


@dataclass
class Row:
    id: int = blosc2.field(blosc2.int64(ge=0))
    score: float = blosc2.field(blosc2.float64(ge=0, le=100))
    active: bool = blosc2.field(blosc2.bool(), default=True)


N = 1_000  # start small, increase when confident

data = [(i, float(i % 100), i % 2 == 0) for i in range(N)]
table = CTable(Row, new_data=data)

print(f"Table created with {len(table)} rows\n")

# -------------------------------------------------------------------
# Test 1: iterate without accessing any column (minimum cost)
# -------------------------------------------------------------------
tic = time()
for _ in table:
    pass
elapsed = time() - tic
print(f"[Test 1] Iter without accessing columns: {elapsed*1000:.3f} ms")

# -------------------------------------------------------------------
# Test 2: iterate accessing a single column (real_pos cached once)
# -------------------------------------------------------------------
tic = time()
for row in table:
    _ = row["id"]
elapsed = time() - tic
print(f"[Test 2] Iter accessing 'id': {elapsed*1000:.3f} ms")

# -------------------------------------------------------------------
# Test 3: iterate accessing all columns (real_pos cached once per row)
# -------------------------------------------------------------------
tic = time()
for row in table:
    _ = row["id"]
    _ = row["score"]
    _ = row["active"]
elapsed = time() - tic
print(f"[Test 3] Iter accessing 3 columns: {elapsed*1000:.3f} ms")

# -------------------------------------------------------------------
# Test 4: correctness — values match expected
# -------------------------------------------------------------------
errors = 0
for row in table:
    # Data was generated as (i, float(i % 100), i % 2 == 0), so the
    # logical row number determines every expected column value.
    expected = (row._nrow, float(row._nrow % 100), row._nrow % 2 == 0)
    for column, want in zip(("id", "score", "active"), expected):
        if row[column] != want:
            errors += 1

print(f"\n[Test 4] Correctness errors: {errors} (expected: 0)")

# -------------------------------------------------------------------
# Test 5: with holes (deleted rows)
# -------------------------------------------------------------------
holey = CTable(Row, new_data=data)
holey.delete(list(range(0, N, 2)))  # delete even rows, keep odd ones

print(f"\nTable with holes: {len(holey)} rows (expected: {N // 2})")

tic = time()
ids = [row["id"] for row in holey]
elapsed = time() - tic

expected_ids = list(range(1, N, 2))
ok = ids == expected_ids
print(f"[Test 5] Iter with holes ({N//2} rows): {elapsed*1000:.3f} ms | correctness: {ok}")

# -------------------------------------------------------------------
# Test 6: real_pos is cached correctly (not recomputed)
# -------------------------------------------------------------------
first = next(iter(table))
assert first._real_pos is None, "real_pos should be None before first access"
_ = first["id"]
assert first._real_pos is not None, "real_pos should be cached after first access"
print(f"\n[Test 6] real_pos caching: OK (real_pos={first._real_pos})")
Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
#######################################################################
# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause
#######################################################################

# Benchmark: append() overhead introduced by the new schema pipeline
#
# The new append() path routes every row through:
#   _normalize_row_input → validate_row (Pydantic) → _coerce_row_to_storage
#
# This benchmark isolates how much each step costs, and shows the
# total overhead vs the raw NDArray write speed.

from dataclasses import dataclass
from time import perf_counter

import numpy as np

import blosc2
from blosc2.schema_compiler import compile_schema
from blosc2.schema_validation import validate_row, build_validator_model


@dataclass
class Row:
    id: int = blosc2.field(blosc2.int64(ge=0))
    score: float = blosc2.field(blosc2.float64(ge=0, le=100), default=0.0)
    active: bool = blosc2.field(blosc2.bool(), default=True)


N = 5_000
rng = np.random.default_rng(42)
data = [(int(i), float(rng.uniform(0, 100)), bool(i % 2)) for i in range(N)]
schema = compile_schema(Row)
# Warm up the Pydantic model cache so step 3 measures validation, not model build.
build_validator_model(schema)

print(f"append() pipeline cost breakdown | N = {N:,} rows")
print("=" * 60)


def _timed(fn, items):
    # Apply fn to every item, returning the elapsed wall-clock time.
    tic = perf_counter()
    for item in items:
        fn(item)
    return perf_counter() - tic


# ── 1. Raw NDArray writes (no CTable overhead at all) ────────────────────────
ids = np.zeros(N, dtype=np.int64)
scores = np.zeros(N, dtype=np.float64)
flags = np.zeros(N, dtype=np.bool_)
mask = np.zeros(N, dtype=np.bool_)


def _raw_write(indexed_row):
    # Baseline: write one row straight into pre-allocated NumPy columns.
    i, (id_, score, active) = indexed_row
    ids[i] = id_
    scores[i] = score
    flags[i] = active
    mask[i] = True


t_raw = _timed(_raw_write, list(enumerate(data)))
print(f"{'Raw NumPy writes (baseline)':<40} {t_raw:.4f} s")

# ── 2. _normalize_row_input only ─────────────────────────────────────────────
t_obj = blosc2.CTable(Row, expected_size=N, validate=False)
t_normalize = _timed(t_obj._normalize_row_input, data)
print(f"{'_normalize_row_input only':<40} {t_normalize:.4f} s ({t_normalize/t_raw:.1f}x baseline)")

# ── 3. Pydantic validate_row only ────────────────────────────────────────────
row_dicts = [t_obj._normalize_row_input(row) for row in data]
t_validate = _timed(lambda rd: validate_row(schema, rd), row_dicts)
print(f"{'validate_row (Pydantic) only':<40} {t_validate:.4f} s ({t_validate/t_raw:.1f}x baseline)")

# ── 4. _coerce_row_to_storage only ───────────────────────────────────────────
t_coerce = _timed(t_obj._coerce_row_to_storage, row_dicts)
print(f"{'_coerce_row_to_storage only':<40} {t_coerce:.4f} s ({t_coerce/t_raw:.1f}x baseline)")

RUNS = 3


def _best_append_time(validate):
    # Full append() into a fresh table each run; keep the fastest of RUNS runs.
    best = float("inf")
    for _ in range(RUNS):
        fresh = blosc2.CTable(Row, expected_size=N, validate=validate)
        best = min(best, _timed(fresh.append, data))
    return best


# ── 5. Full append(), validate=False (3 runs, take minimum) ─────────────────
t_append_off = _best_append_time(False)
print(f"{'Full append(), validate=False':<40} {t_append_off:.4f} s ({t_append_off/t_raw:.1f}x baseline)")

# ── 6. Full append(), validate=True (3 runs, take minimum) ──────────────────
t_append_on = _best_append_time(True)
print(f"{'Full append(), validate=True':<40} {t_append_on:.4f} s ({t_append_on/t_raw:.1f}x baseline)")

print()
print("=" * 60)
pydantic_cost = max(t_append_on - t_append_off, 0.0)
print(f"{'Pydantic overhead in append()':<40} {pydantic_cost:.4f} s")
if t_append_on > 0:
    print(f"{'Validation fraction of total':<40} {pydantic_cost/t_append_on*100:.1f}%")
print(f"{'Per-row Pydantic cost (isolated)':<40} {(t_validate/N)*1e6:.2f} µs/row")
print()
print(f"Note: append() is dominated by blosc2 I/O ({t_append_off/t_raw:.0f}x raw numpy),")
print("      not by the validation pipeline.")
print("      The main bottleneck is the last_true_pos backward scan per row.")

bench/ctable/bench_validation.py

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
#######################################################################
# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause
#######################################################################

# Benchmark: cost of constraint validation
#
# Measures the overhead of validate=True vs validate=False for:
#   1. append() — row-by-row, Pydantic path
#   2. extend() — bulk insert, vectorized NumPy path
#
# at increasing batch sizes to show how validation cost scales.
#
# The measure-and-report pattern repeated across the three sections is
# factored into small helpers so each section is a short loop.

from dataclasses import dataclass
from time import perf_counter

import numpy as np

import blosc2


@dataclass
class Row:
    id: int = blosc2.field(blosc2.int64(ge=0))
    score: float = blosc2.field(blosc2.float64(ge=0, le=100), default=0.0)
    active: bool = blosc2.field(blosc2.bool(), default=True)


def make_data(n: int):
    """Build n rows of (id, score, active) tuples with a fixed RNG seed."""
    rng = np.random.default_rng(42)
    ids = np.arange(n, dtype=np.int64)
    scores = rng.uniform(0, 100, n)
    flags = rng.integers(0, 2, n, dtype=np.bool_)
    return list(zip(ids.tolist(), scores.tolist(), flags.tolist()))


def _time_append(payload, n: int, validate: bool) -> float:
    """Time row-by-row append() of *payload* into a fresh CTable."""
    table = blosc2.CTable(Row, expected_size=n, validate=validate)
    t0 = perf_counter()
    for row in payload:
        table.append(row)
    return perf_counter() - t0


def _time_extend(payload, n: int, validate: bool) -> float:
    """Time one bulk extend() of *payload* into a fresh CTable."""
    table = blosc2.CTable(Row, expected_size=n, validate=validate)
    t0 = perf_counter()
    table.extend(payload)
    return perf_counter() - t0


def _header(title: str) -> None:
    """Print a section banner plus the shared column headings."""
    print("=" * 65)
    print(title)
    print("=" * 65)
    print(f"{'N':>10} {'validate=True':>14} {'validate=False':>15} {'overhead':>10}")
    print("-" * 65)


def _report(n: int, t_on: float, t_off: float) -> None:
    """Print one result line: both times plus the validate=True slowdown."""
    overhead = (t_on / t_off) if t_off > 0 else float("inf")
    print(f"{n:>10,} {t_on:>13.4f}s {t_off:>14.4f}s {overhead:>9.2f}x")


SIZES = [100, 1_000, 10_000, 100_000, 1_000_000]
APPEND_SIZES = [100, 1_000]  # append row-by-row is slow at large N

# ─────────────────────────────────────────────────────────────────────────────
# 1. append() — validate=True vs validate=False
# ─────────────────────────────────────────────────────────────────────────────
_header("1. append() — row-by-row (Pydantic validation per row)")
for n in APPEND_SIZES:
    data = make_data(n)
    _report(n, _time_append(data, n, True), _time_append(data, n, False))

# ─────────────────────────────────────────────────────────────────────────────
# 2. extend() — validate=True vs validate=False
# ─────────────────────────────────────────────────────────────────────────────
print()
_header("2. extend() — bulk insert (vectorized NumPy validation)")
for n in SIZES:
    data = make_data(n)
    _report(n, _time_extend(data, n, True), _time_extend(data, n, False))

# ─────────────────────────────────────────────────────────────────────────────
# 3. extend() — validate=True vs validate=False with structured NumPy array
# ─────────────────────────────────────────────────────────────────────────────
print()
_header("3. extend() with structured NumPy array")

np_dtype = np.dtype([("id", np.int64), ("score", np.float64), ("active", np.bool_)])

for n in SIZES:
    # Same seed/content as make_data, but packed into a structured array.
    rng = np.random.default_rng(42)
    arr = np.empty(n, dtype=np_dtype)
    arr["id"] = np.arange(n, dtype=np.int64)
    arr["score"] = rng.uniform(0, 100, n)
    arr["active"] = rng.integers(0, 2, n, dtype=np.bool_)
    _report(n, _time_extend(arr, n, True), _time_extend(arr, n, False))

print()
print("Note: 'overhead' = validate=True time / validate=False time.")
print("      1.00x means validation is free; 2.00x means it doubles the time.")

bench/ctable/compact.py

Lines changed: 8 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -9,25 +9,19 @@
99
# of varying fractions of the table.
1010

1111
from time import time
12-
from typing import Annotated
12+
from dataclasses import dataclass
1313

1414
import numpy as np
15-
from pydantic import BaseModel, Field
1615

1716
import blosc2
1817

1918

20-
class NumpyDtype:
21-
def __init__(self, dtype):
22-
self.dtype = dtype
23-
24-
25-
# Row model
26-
class RowModel(BaseModel):
27-
id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)
28-
c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j)
29-
score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)
30-
active: Annotated[bool, NumpyDtype(np.bool_)] = True
19+
@dataclass
class Row:
    """Table schema for the compaction benchmark.

    Replaces the old Pydantic ``RowModel``: constraints (``ge``/``le``) and
    NumPy dtypes are now declared through ``blosc2.field`` spec primitives.
    """

    # Non-negative 64-bit integer key.
    id: int = blosc2.field(blosc2.int64(ge=0))
    # Complex column, defaults to 0j.
    c_val: complex = blosc2.field(blosc2.complex128(), default=0j)
    # Bounded float in [0, 100], defaults to 0.0.
    score: float = blosc2.field(blosc2.float64(ge=0, le=100), default=0.0)
    # Boolean flag, defaults to True.
    active: bool = blosc2.field(blosc2.bool(), default=True)
3125

3226

3327
N = 1_000_000
@@ -56,7 +50,7 @@ class RowModel(BaseModel):
5650
print("-" * 75)
5751

5852
for frac in delete_fractions:
59-
ct = blosc2.CTable(RowModel, expected_size=N)
53+
ct = blosc2.CTable(Row, expected_size=N)
6054
ct.extend(DATA)
6155

6256
n_delete = int(N * frac)

0 commit comments

Comments (0)