Skip to content

Commit 01e47f4

Browse files
Merge pull request #604 from Jacc4224/ctable-new
Add CTable, a columnar in-memory table built on top of blosc2
2 parents e13ccaf + 6de7c30 commit 01e47f4

23 files changed

Lines changed: 3395 additions & 0 deletions

bench/ctable/compact.py

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
#######################################################################
# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause
#######################################################################

# Benchmark for measuring compact() time and memory gain after deletions
# of varying fractions of the table.

from time import time
from typing import Annotated

import numpy as np
from pydantic import BaseModel, Field

import blosc2


class NumpyDtype:
    """Annotation marker pairing a pydantic field with a NumPy dtype."""

    def __init__(self, dtype):
        self.dtype = dtype


# Row model describing one table record (consumed by blosc2.CTable).
class RowModel(BaseModel):
    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)
    c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j)
    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)
    active: Annotated[bool, NumpyDtype(np.bool_)] = True


N = 1_000_000

print(f"compact() benchmark | N = {N:,}\n")

# Build base data once.
np_dtype = np.dtype([
    ("id", np.int64),
    ("c_val", np.complex128),
    ("score", np.float64),
    ("active", np.bool_),
])
# Vectorized construction: identical values to the per-row Python loop
# `(i, complex(i * 0.1, i * 0.01), 10.0 + (i % 100) * 0.4, i % 3 == 0)`
# but built at C speed instead of materializing 1M tuples.
_ids = np.arange(N, dtype=np.int64)
DATA = np.empty(N, dtype=np_dtype)
DATA["id"] = _ids
DATA["c_val"] = _ids * 0.1 + 1j * (_ids * 0.01)
DATA["score"] = 10.0 + (_ids % 100) * 0.4
DATA["active"] = _ids % 3 == 0

delete_fractions = [0.1, 0.25, 0.5, 0.75, 0.9]

print("=" * 75)
print(f"{'DELETED':>10} {'ROWS LEFT':>10} {'TIME (s)':>12} {'CBYTES BEFORE':>15} {'CBYTES AFTER':>14}")
print("-" * 75)

for frac in delete_fractions:
    # Fresh table per fraction so every run starts from the full dataset.
    ct = blosc2.CTable(RowModel, expected_size=N)
    ct.extend(DATA)

    n_delete = int(N * frac)
    ct.delete(list(range(n_delete)))

    # NOTE(review): reads private attributes (_cols, _valid_rows) to measure
    # compressed size — keep in sync with CTable internals.
    cbytes_before = sum(col.cbytes for col in ct._cols.values()) + ct._valid_rows.cbytes

    t0 = time()
    ct.compact()
    t_compact = time() - t0

    cbytes_after = sum(col.cbytes for col in ct._cols.values()) + ct._valid_rows.cbytes

    print(
        f"{frac*100:>9.0f}%"
        f" {N - n_delete:>10,}"
        f" {t_compact:>12.4f}"
        f" {cbytes_before / 1024**2:>13.2f} MB"
        f" {cbytes_after / 1024**2:>12.2f} MB"
    )

print("-" * 75)

bench/ctable/ctable_v_panda.py

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
#######################################################################
# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause
#######################################################################

# Benchmark comparing CTable vs pandas DataFrame for:
#   1. Creation from a NumPy structured array
#   2. Column access (full column)
#   3. Filtering (where/query)
#   4. Row iteration

from time import time
from typing import Annotated

import numpy as np
import pandas as pd
from pydantic import BaseModel, Field

import blosc2


class NumpyDtype:
    """Annotation marker pairing a pydantic field with a NumPy dtype."""

    def __init__(self, dtype):
        self.dtype = dtype


# Row model describing one table record (consumed by blosc2.CTable).
class RowModel(BaseModel):
    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)
    c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j)
    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)
    active: Annotated[bool, NumpyDtype(np.bool_)] = True


N = 1_000_000
rng = np.random.default_rng(42)

print(f"CTable vs pandas benchmark | N = {N:,}\n")

# Build base data once.
np_dtype = np.dtype([
    ("id", np.int64),
    ("c_val", np.complex128),
    ("score", np.float64),
    ("active", np.bool_),
])
DATA = np.empty(N, dtype=np_dtype)
DATA["id"] = np.arange(N, dtype=np.int64)
DATA["c_val"] = rng.standard_normal(N) + 1j * rng.standard_normal(N)
DATA["score"] = rng.uniform(0, 100, N)
DATA["active"] = rng.integers(0, 2, N, dtype=np.bool_)


def _timed(op):
    """Run *op* with no arguments; return (result, elapsed seconds)."""
    start = time()
    out = op()
    return out, time() - start


def _report(label, ct_seconds, pd_seconds, prec=2):
    """Print one aligned result row with the pandas/CTable speedup ratio."""
    ratio = pd_seconds / ct_seconds
    print(f"{label:<30} {ct_seconds:>12.4f} {pd_seconds:>12.4f} {ratio:>9.{prec}f}x")


print("=" * 65)
print(f"{'OPERATION':<30} {'CTable':>12} {'pandas':>12} {'SPEEDUP':>10}")
print("-" * 65)


# 1. Creation
def _make_ct():
    table = blosc2.CTable(RowModel, expected_size=N)
    table.extend(DATA)
    return table


ct, ct_create_s = _timed(_make_ct)
df, pd_create_s = _timed(lambda: pd.DataFrame(DATA))
_report("Creation", ct_create_s, pd_create_s)

# 2. Column access (full column)
_, ct_col_s = _timed(lambda: ct["score"])
_, pd_col_s = _timed(lambda: df["score"])
_report("Column access (full)", ct_col_s, pd_col_s)

# 2.5 Column access materialized as a NumPy array
_, ct_np_s = _timed(lambda: ct["score"].to_numpy())
_, pd_np_s = _timed(lambda: df["score"].to_numpy())
_report("Column access to numpy (full)", ct_np_s, pd_np_s, prec=3)

# 3. Filtering
_, ct_filter_s = _timed(lambda: ct.where((ct["id"] > 250_000) & (ct["id"] < 750_000)))
_, pd_filter_s = _timed(lambda: df.query("250000 < id < 750000"))
_report("Filter (id 250k-750k)", ct_filter_s, pd_filter_s)

# 4. Row iteration: walk one full column value by value.
start = time()
for _val in ct["score"]:
    pass
ct_iter_s = time() - start

start = time()
for _val in df["score"]:
    pass
pd_iter_s = time() - start

_report("Row iteration", ct_iter_s, pd_iter_s)

print("-" * 65)

# Memory footprint.
# NOTE(review): CTable sizes are read from private attributes (_cols,
# _valid_rows) — keep in sync with CTable internals.
ct_cbytes = sum(col.cbytes for col in ct._cols.values()) + ct._valid_rows.cbytes
ct_nbytes = sum(col.nbytes for col in ct._cols.values()) + ct._valid_rows.nbytes
pd_nbytes = df.memory_usage(deep=True).sum()

print(f"\nMemory — CTable compressed: {ct_cbytes / 1024**2:.2f} MB")
print(f"Memory — CTable uncompressed: {ct_nbytes / 1024**2:.2f} MB")
print(f"Memory — pandas: {pd_nbytes / 1024**2:.2f} MB")
print(f"Compression ratio CTable: {ct_nbytes / ct_cbytes:.2f}x")

bench/ctable/delete.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
#######################################################################
# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause
#######################################################################

# Benchmark for measuring delete() performance with different index types:
# int, slice, and list — with varying sizes.

from time import time
from typing import Annotated

import numpy as np
from pydantic import BaseModel, Field

import blosc2


class NumpyDtype:
    """Annotation marker pairing a pydantic field with a NumPy dtype."""

    def __init__(self, dtype):
        self.dtype = dtype


# Row model describing one table record (consumed by blosc2.CTable).
class RowModel(BaseModel):
    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)
    c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j)
    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)
    active: Annotated[bool, NumpyDtype(np.bool_)] = True


N = 1_000_000

print(f"delete() benchmark | N = {N:,}\n")

# Build base data once.
np_dtype = np.dtype([
    ("id", np.int64),
    ("c_val", np.complex128),
    ("score", np.float64),
    ("active", np.bool_),
])
# Vectorized construction: identical values to the per-row Python loop
# `(i, complex(i * 0.1, i * 0.01), 10.0 + (i % 100) * 0.4, i % 3 == 0)`
# but built at C speed instead of materializing 1M tuples.
_ids = np.arange(N, dtype=np.int64)
DATA = np.empty(N, dtype=np_dtype)
DATA["id"] = _ids
DATA["c_val"] = _ids * 0.1 + 1j * (_ids * 0.01)
DATA["score"] = 10.0 + (_ids % 100) * 0.4
DATA["active"] = _ids % 3 == 0

delete_cases = [
    ("int", 0),
    ("slice small", slice(0, 100)),
    ("slice large", slice(0, 100_000)),
    ("slice full", slice(0, N)),
    ("list small", list(range(100))),
    ("list large", list(range(100_000))),
    ("list full", list(range(N))),
]

print("=" * 60)
print(f"{'CASE':<20} {'ROWS DELETED':>14} {'TIME (s)':>12}")
print("-" * 60)

for label, key in delete_cases:
    # Fresh table per case so deletions do not compound across cases.
    ct = blosc2.CTable(RowModel, expected_size=N)
    ct.extend(DATA)

    # Number of rows the key addresses, by key type.
    if isinstance(key, int):
        n_deleted = 1
    elif isinstance(key, slice):
        n_deleted = len(range(*key.indices(N)))
    else:
        n_deleted = len(key)

    t0 = time()
    ct.delete(key)
    t_delete = time() - t0
    print(f"{label:<20} {n_deleted:>14,} {t_delete:>12.6f}")

print("-" * 60)

bench/ctable/expected_size.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
#######################################################################
# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause
#######################################################################

# Benchmark for measuring the overhead of resize() when expected_size
# is too small (M rows) vs correctly sized (N rows) during extend().

from time import time
from typing import Annotated

import numpy as np
from pydantic import BaseModel, Field

import blosc2


class NumpyDtype:
    """Annotation marker pairing a pydantic field with a NumPy dtype."""

    def __init__(self, dtype):
        self.dtype = dtype


# Row model describing one table record (consumed by blosc2.CTable).
class RowModel(BaseModel):
    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)
    c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j)
    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)
    active: Annotated[bool, NumpyDtype(np.bool_)] = True


M = 779
N = 62_500
MAX_N = 1_000_000
print(f"expected_size benchmark | wrong expected_size = {M}")

# Pre-generate full dataset once.
np_dtype = np.dtype([
    ("id", np.int64),
    ("c_val", np.complex128),
    ("score", np.float64),
    ("active", np.bool_),
])
# Vectorized construction: identical values to the per-row Python loop
# `(i, complex(i * 0.1, i * 0.01), 10.0 + (i % 100) * 0.4, i % 3 == 0)`
# but built at C speed instead of materializing 1M tuples.
_ids = np.arange(MAX_N, dtype=np.int64)
DATA = np.empty(MAX_N, dtype=np_dtype)
DATA["id"] = _ids
DATA["c_val"] = _ids * 0.1 + 1j * (_ids * 0.01)
DATA["score"] = 10.0 + (_ids % 100) * 0.4
DATA["active"] = _ids % 3 == 0

# Double N each round up to MAX_N, timing both sizing strategies.
while N <= MAX_N:
    print("-" * 80)
    print(f"N = {N:,} rows")

    # 1. extend() with correct expected_size = N
    ct_correct = blosc2.CTable(RowModel, expected_size=N)
    t0 = time()
    ct_correct.extend(DATA[:N])
    t_correct = time() - t0
    print(f"extend() expected_size=N ({N:>8,}): {t_correct:.4f} s rows: {len(ct_correct):,}")

    # 2. extend() with wrong expected_size = M (forces resize)
    ct_wrong = blosc2.CTable(RowModel, expected_size=M)
    t0 = time()
    ct_wrong.extend(DATA[:N])
    t_wrong = time() - t0
    print(f"extend() expected_size=M ({M:>8,}): {t_wrong:.4f} s rows: {len(ct_wrong):,}")

    # Summary
    print(f" Slowdown from wrong expected_size: {t_wrong / t_correct:.2f}x")

    N *= 2

0 commit comments

Comments
 (0)