Blosc
diff --git a/‎bench/ctable/Prueba_iter.py‎
Lines changed: 2 additions & 4 deletions b/‎bench/ctable/Prueba_iter.py‎
Lines changed: 2 additions & 4 deletions
diff --git a/‎bench/ctable/bench_pandas_roundtrip.py‎
Lines changed: 209 additions & 0 deletions b/‎bench/ctable/bench_pandas_roundtrip.py‎
Lines changed: 209 additions & 0 deletions
diff --git a/‎bench/ctable/bench_validation.py‎
Lines changed: 1 addition & 1 deletion b/‎bench/ctable/bench_validation.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎bench/ctable/compact.py‎
Lines changed: 1 addition & 1 deletion b/‎bench/ctable/compact.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎bench/ctable/ctable_v_panda.py‎
Lines changed: 4 additions & 3 deletions b/‎bench/ctable/ctable_v_panda.py‎
Lines changed: 4 additions & 3 deletions
diff --git a/‎bench/ctable/delete.py‎
Lines changed: 1 addition & 1 deletion b/‎bench/ctable/delete.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎bench/ctable/expected_size.py‎
Lines changed: 1 addition & 1 deletion b/‎bench/ctable/expected_size.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎bench/ctable/extend.py‎
Lines changed: 1 addition & 1 deletion b/‎bench/ctable/extend.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎bench/ctable/extend_vs_apend.py‎
Lines changed: 2 additions & 4 deletions b/‎bench/ctable/extend_vs_apend.py‎
Lines changed: 2 additions & 4 deletions
diff --git a/‎bench/ctable/index.py‎
Lines changed: 1 addition & 1 deletion b/‎bench/ctable/index.py‎
Lines changed: 1 addition & 1 deletion
@@ -6,12 +6,10 @@
 #######################################################################
 
 from dataclasses import dataclass
-
-import numpy as np
 from time import time
 
-from blosc2 import CTable
 import blosc2
+from blosc2 import CTable
 
 
 @dataclass
@@ -32,7 +30,7 @@ class Row:
 # Test 1: iterate without accessing any column (minimum cost)
 # -------------------------------------------------------------------
 t0 = time()
-for row in tabla:
+for _row in tabla:
     pass
 t1 = time()
 print(f"[Test 1] Iter without accessing columns:    {(t1 - t0)*1000:.3f} ms")
 
@@ -0,0 +1,209 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#######################################################################
+
+# Benchmark: pandas ↔ CTable round-trip (with on-disk persistence)
+#
+# Pipeline measured in four isolated steps:
+#
+#   1. pandas → CTable   : pa.Table.from_pandas() + CTable.from_arrow()
+#   2. CTable.save()     : write in-memory CTable to disk
+#   3. CTable.load()     : read disk table back into RAM
+#   4. CTable → pandas   : CTable.to_arrow().to_pandas()
+#
+# The combined full round-trip (steps 1-4) is timed separately at the end.
+#
+# Each timing is the minimum of NRUNS repetitions (step 1's arrow→ctable time is a difference of two minima).
+# Schema: id (int64), score (float64), active (bool), label (string ≤16).
+
+import os
+import shutil
+from time import perf_counter
+
+import numpy as np
+import pandas as pd
+import pyarrow as pa
+
+from blosc2 import CTable
+
+NRUNS = 3
+TABLE_DIR = "saved_ctable/bench_pandas"
+SIZES = [1_000, 10_000, 100_000, 1_000_000]
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def sep(title: str) -> None:
+    print(f"\n{'─' * 60}")
+    print(f"  {title}")
+    print(f"{'─' * 60}")
+
+
+def tmin(fn, n: int = NRUNS) -> float:
+    """Minimum elapsed time (s) over *n* calls of *fn*."""
+    best = float("inf")
+    for _ in range(n):
+        t0 = perf_counter()
+        fn()
+        best = min(best, perf_counter() - t0)
+    return best
+
+
+def clean(path: str = TABLE_DIR) -> None:
+    if os.path.exists(path):
+        shutil.rmtree(path)
+    os.makedirs(path, exist_ok=True)
+
+
+def make_dataframe(n: int) -> pd.DataFrame:
+    rng = np.random.default_rng(42)
+    return pd.DataFrame({
+        "id":     np.arange(n, dtype=np.int64),
+        "score":  rng.uniform(0, 100, n).astype(np.float64),
+        "active": rng.integers(0, 2, n, dtype=bool),
+        "label":  [f"r{i % 10000:05d}" for i in range(n)],
+    })
+
+
+# ---------------------------------------------------------------------------
+# Section 1: pandas → CTable  (in-memory)
+# ---------------------------------------------------------------------------
+
+sep("1. pandas → CTable  (from_arrow, in-memory)")
+print(f"{'rows':>12}  {'pandas→arrow (s)':>18}  {'arrow→ctable (s)':>18}  {'total (s)':>12}")
+print(f"{'----':>12}  {'----------------':>18}  {'----------------':>18}  {'---------':>12}")
+
+ctables: dict[int, CTable] = {}  # keep for steps 2 & 4
+
+for N in SIZES:
+    df = make_dataframe(N)
+
+    def bench_to_arrow(df=df):
+        return pa.Table.from_pandas(df, preserve_index=False)
+
+    def bench_from_arrow(df=df):
+        at = pa.Table.from_pandas(df, preserve_index=False)
+        return CTable.from_arrow(at)
+
+    t_pa  = tmin(bench_to_arrow)
+    t_ct  = tmin(bench_from_arrow) - t_pa   # from_arrow only
+    t_tot = t_pa + t_ct
+
+    # Keep one CTable for later steps
+    at = pa.Table.from_pandas(df, preserve_index=False)
+    ctables[N] = CTable.from_arrow(at)
+
+    print(f"{N:>12,}  {t_pa:>18.4f}  {t_ct:>18.4f}  {t_tot:>12.4f}")
+
+
+# ---------------------------------------------------------------------------
+# Section 2: CTable.save()  (in-memory → disk)
+# ---------------------------------------------------------------------------
+
+sep("2. CTable.save()  (in-memory → disk)")
+print(f"{'rows':>12}  {'save (s)':>14}  {'compressed':>12}  {'ratio':>8}")
+print(f"{'----':>12}  {'--------':>14}  {'----------':>12}  {'-----':>8}")
+
+for N in SIZES:
+    t = ctables[N]
+    path = os.path.join(TABLE_DIR, f"ct_{N}")
+
+    def bench_save(t=t, path=path):
+        if os.path.exists(path):
+            shutil.rmtree(path)
+        t.save(path, overwrite=True)
+
+    elapsed = tmin(bench_save)
+    # Leave a saved copy on disk (reused by Section 3); sizes below come from the in-memory table
+    t.save(path, overwrite=True)
+    cbytes = t.cbytes
+    nbytes = t.nbytes
+    ratio  = nbytes / cbytes if cbytes > 0 else float("nan")
+
+    def _fmt(n):
+        if n < 1024**2:
+            return f"{n / 1024:.1f} KB"
+        return f"{n / 1024**2:.1f} MB"
+
+    print(f"{N:>12,}  {elapsed:>14.4f}  {_fmt(cbytes):>12}  {ratio:>7.2f}x")
+
+
+# ---------------------------------------------------------------------------
+# Section 3: CTable.load()  (disk → in-memory)
+# ---------------------------------------------------------------------------
+
+sep("3. CTable.load()  (disk → in-memory)")
+print(f"{'rows':>12}  {'load (s)':>14}")
+print(f"{'----':>12}  {'--------':>14}")
+
+for N in SIZES:
+    path = os.path.join(TABLE_DIR, f"ct_{N}")
+
+    def bench_load(path=path):
+        return CTable.load(path)
+
+    elapsed = tmin(bench_load)
+    print(f"{N:>12,}  {elapsed:>14.4f}")
+
+
+# ---------------------------------------------------------------------------
+# Section 4: CTable → pandas  (to_arrow → to_pandas)
+# ---------------------------------------------------------------------------
+
+sep("4. CTable → pandas  (to_arrow + to_pandas)")
+print(f"{'rows':>12}  {'ctable→arrow (s)':>18}  {'arrow→pandas (s)':>18}  {'total (s)':>12}")
+print(f"{'----':>12}  {'----------------':>18}  {'----------------':>18}  {'---------':>12}")
+
+for N in SIZES:
+    t = ctables[N]
+    at_cache = t.to_arrow()  # pre-convert once so we can time each step cleanly
+
+    def bench_to_arrow_ct(t=t):
+        return t.to_arrow()
+
+    def bench_to_pandas(at=at_cache):
+        return at.to_pandas()
+
+    t_arr = tmin(bench_to_arrow_ct)
+    t_pd  = tmin(bench_to_pandas)
+    t_tot = t_arr + t_pd
+
+    print(f"{N:>12,}  {t_arr:>18.4f}  {t_pd:>18.4f}  {t_tot:>12.4f}")
+
+
+# ---------------------------------------------------------------------------
+# Section 5: Full round-trip  (pandas → CTable → disk → load → pandas)
+# ---------------------------------------------------------------------------
+
+sep("5. Full round-trip  (pandas → CTable → save → load → pandas)")
+print(f"{'rows':>12}  {'round-trip (s)':>16}")
+print(f"{'----':>12}  {'---------------':>16}")
+
+for N in SIZES:
+    df = make_dataframe(N)
+    path = os.path.join(TABLE_DIR, f"rt_{N}")
+
+    def bench_roundtrip(df=df, path=path):
+        # pandas → CTable
+        at = pa.Table.from_pandas(df, preserve_index=False)
+        t = CTable.from_arrow(at)
+        # save to disk
+        t.save(path, overwrite=True)
+        # load back
+        t2 = CTable.load(path)
+        # CTable → pandas
+        return t2.to_arrow().to_pandas()
+
+    elapsed = tmin(bench_roundtrip)
+    print(f"{N:>12,}  {elapsed:>16.4f}")
+
+
+# Cleanup: wipe benchmark output (clean() recreates TABLE_DIR as an empty directory)
+clean()
+print()
@@ -33,7 +33,7 @@ def make_data(n: int):
     ids    = np.arange(n, dtype=np.int64)
     scores = rng.uniform(0, 100, n)
     flags  = rng.integers(0, 2, n, dtype=np.bool_)
-    return list(zip(ids.tolist(), scores.tolist(), flags.tolist()))
+    return list(zip(ids.tolist(), scores.tolist(), flags.tolist(), strict=False))
 
 
 SIZES = [100, 1_000, 10_000, 100_000, 1_000_000]
 
@@ -8,8 +8,8 @@
 # Benchmark for measuring compact() time and memory gain after deletions
 # of varying fractions of the table.
 
-from time import time
 from dataclasses import dataclass
+from time import time
 
 import numpy as np
 
 
@@ -11,10 +11,11 @@
 #   3. Filtering (where/query)
 #   4. Row iteration
 
-from time import time
 from dataclasses import dataclass
+from time import time
 
 import numpy as np
+import pandas as pd
 
 import blosc2
 
@@ -96,12 +97,12 @@ class Row:
 
 # 4. Row iteration
 t0 = time()
-for val in ct["score"]:
+for _val in ct["score"]:
     pass
 t_ct_iter = time() - t0
 
 t0 = time()
-for val in df["score"]:
+for _val in df["score"]:
     pass
 t_pd_iter = time() - t0
 
 
@@ -8,8 +8,8 @@
 # Benchmark for measuring delete() performance with different index types:
 # int, slice, and list — with varying sizes.
 
-from time import time
 from dataclasses import dataclass
+from time import time
 
 import numpy as np
 
 
@@ -8,8 +8,8 @@
 # Benchmark for measuring the overhead of resize() when expected_size
 # is too small (M rows) vs correctly sized (N rows) during extend().
 
-from time import time
 from dataclasses import dataclass
+from time import time
 
 import numpy as np
 
 
@@ -10,8 +10,8 @@
 #   2. NumPy structured array (1M rows) — list of named tuples
 #   3. An existing CTable (previously created from Python lists, 1M rows)
 
-from time import time
 from dataclasses import dataclass
+from time import time
 
 import numpy as np
 
 
@@ -8,10 +8,8 @@
 # Benchmark for comparing append() (row by row) vs extend() (bulk),
 # to find the crossover point where extend() becomes worth it.
 
-from time import time
 from dataclasses import dataclass
-
-import numpy as np
+from time import time
 
 import blosc2
 
@@ -26,7 +24,7 @@ class Row:
 
 # Parameter — change N to test different crossover points
 N = 2
-print(f"append() vs extend() benchmark")
+print("append() vs extend() benchmark")
 for i in range(6):
     print("\n")
     print("%" * 100)
 
@@ -8,8 +8,8 @@
 # Benchmark for measuring Column[int] access (single row by logical index),
 # which exercises _find_physical_index() traversal over chunk metadata.
 
-from time import time
 from dataclasses import dataclass
+from time import time
 
 import numpy as np