Skip to content

Commit 34f8219

Browse files
committed
CSV compatibility implementation
Arrow compatibility Examples Tutorial
1 parent 0472b3f commit 34f8219

34 files changed

Lines changed: 4212 additions & 36 deletions

bench/ctable/Prueba_iter.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,10 @@
66
#######################################################################
77

88
from dataclasses import dataclass
9-
10-
import numpy as np
119
from time import time
1210

13-
from blosc2 import CTable
1411
import blosc2
12+
from blosc2 import CTable
1513

1614

1715
@dataclass
@@ -32,7 +30,7 @@ class Row:
3230
# Test 1: iterate without accessing any column (minimum cost)
3331
# -------------------------------------------------------------------
3432
t0 = time()
35-
for row in tabla:
33+
for _row in tabla:
3634
pass
3735
t1 = time()
3836
print(f"[Test 1] Iter without accessing columns: {(t1 - t0)*1000:.3f} ms")
Lines changed: 209 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,209 @@
1+
#######################################################################
2+
# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
3+
# All rights reserved.
4+
#
5+
# SPDX-License-Identifier: BSD-3-Clause
6+
#######################################################################
7+
8+
# Benchmark: pandas ↔ CTable round-trip (with on-disk persistence)
9+
#
10+
# Pipeline measured in four isolated steps:
11+
#
12+
# 1. pandas → CTable : pa.Table.from_pandas() + CTable.from_arrow()
13+
# 2. CTable.save() : write in-memory CTable to disk
14+
# 3. CTable.load() : read disk table back into RAM
15+
# 4. CTable → pandas : CTable.to_arrow().to_pandas()
16+
#
17+
# Plus the combined full round-trip (steps 1-4) is shown at the end.
18+
#
19+
# Each measurement is the minimum of NRUNS repetitions to reduce noise.
20+
# Schema: id (int64), score (float64), active (bool), label (string ≤16).
21+
22+
import os
23+
import shutil
24+
from time import perf_counter
25+
26+
import numpy as np
27+
import pandas as pd
28+
import pyarrow as pa
29+
30+
from blosc2 import CTable
31+
32+
NRUNS = 3
33+
TABLE_DIR = "saved_ctable/bench_pandas"
34+
SIZES = [1_000, 10_000, 100_000, 1_000_000]
35+
36+
37+
# ---------------------------------------------------------------------------
38+
# Helpers
39+
# ---------------------------------------------------------------------------
40+
41+
42+
def sep(title: str) -> None:
    """Print a section banner: a blank line, a rule, *title*, and a closing rule."""
    rule = "─" * 60
    # A single print call emits all three lines; the leading "\n" keeps the
    # banner visually separated from whatever was printed before it.
    print(f"\n{rule}", f" {title}", rule, sep="\n")
46+
47+
48+
def tmin(fn, n: int = NRUNS) -> float:
    """Return the minimum elapsed time (s) over *n* calls of *fn*.

    Parameters
    ----------
    fn : callable
        Zero-argument callable to time; its return value is discarded.
    n : int, optional
        Number of repetitions (defaults to ``NRUNS``). Must be at least 1.

    Returns
    -------
    float
        The smallest ``perf_counter()`` delta observed across the calls.

    Raises
    ------
    ValueError
        If *n* is smaller than 1, because no measurement could be taken and
        the result would otherwise be a meaningless ``inf``.
    """
    if n < 1:
        raise ValueError(f"n must be >= 1, got {n}")
    best = float("inf")
    for _ in range(n):
        t0 = perf_counter()
        fn()
        # Keep the fastest run: the minimum is the least noise-affected sample.
        best = min(best, perf_counter() - t0)
    return best
56+
57+
58+
def clean(path: str = TABLE_DIR) -> None:
    """Reset *path* to an empty directory.

    Any existing tree at *path* is removed, then the directory is created
    again (parents included).
    """
    already_there = os.path.exists(path)
    if already_there:
        shutil.rmtree(path)
    # exist_ok=True keeps this a no-op if the directory is somehow present.
    os.makedirs(path, exist_ok=True)
62+
63+
64+
def make_dataframe(n: int) -> pd.DataFrame:
    """Build an *n*-row DataFrame with columns id, score, active and label.

    The random generator is seeded with 42, so calling this twice with the
    same *n* produces identical frames.
    """
    rng = np.random.default_rng(42)
    ids = np.arange(n, dtype=np.int64)
    # Draw order matters for reproducibility: scores first, then flags.
    scores = rng.uniform(0, 100, n).astype(np.float64)
    flags = rng.integers(0, 2, n, dtype=bool)
    # Labels cycle every 10000 rows: "r00000" … "r09999".
    labels = [f"r{row % 10000:05d}" for row in range(n)]
    columns = {
        "id": ids,
        "score": scores,
        "active": flags,
        "label": labels,
    }
    return pd.DataFrame(columns)
72+
73+
74+
# ---------------------------------------------------------------------------
75+
# Section 1: pandas → CTable (in-memory)
76+
# ---------------------------------------------------------------------------
77+
78+
sep("1. pandas → CTable (from_arrow, in-memory)")
print(f"{'rows':>12} {'pandas→arrow (s)':>18} {'arrow→ctable (s)':>18} {'total (s)':>12}")
print(f"{'----':>12} {'----------------':>18} {'----------------':>18} {'---------':>12}")

ctables: dict[int, CTable] = {}  # keep for steps 2 & 4

for N in SIZES:
    df = make_dataframe(N)
    # Convert once up front so from_arrow() can be timed on its own.
    # Deriving it as tmin(both steps) - tmin(first step) subtracts two
    # independent minima, which is noisy and can even come out negative.
    at = pa.Table.from_pandas(df, preserve_index=False)

    # Defaults bind the current loop values (avoids late-binding closures).
    def bench_to_arrow(df=df):
        return pa.Table.from_pandas(df, preserve_index=False)

    def bench_from_arrow(at=at):
        return CTable.from_arrow(at)

    t_pa = tmin(bench_to_arrow)
    t_ct = tmin(bench_from_arrow)  # from_arrow only, measured directly
    t_tot = t_pa + t_ct

    # Keep one CTable for later steps
    ctables[N] = CTable.from_arrow(at)

    print(f"{N:>12,} {t_pa:>18.4f} {t_ct:>18.4f} {t_tot:>12.4f}")
103+
104+
105+
# ---------------------------------------------------------------------------
106+
# Section 2: CTable.save() (in-memory → disk)
107+
# ---------------------------------------------------------------------------
108+
109+
sep("2. CTable.save() (in-memory → disk)")
print(f"{'rows':>12} {'save (s)':>14} {'compressed':>12} {'ratio':>8}")
print(f"{'----':>12} {'--------':>14} {'----------':>12} {'-----':>8}")


def _fmt(n):
    """Render a byte count as KB when below 1024**2 bytes, otherwise as MB."""
    mib = 1024**2
    return f"{n / 1024:.1f} KB" if n < mib else f"{n / mib:.1f} MB"


for N in SIZES:
    t = ctables[N]
    path = os.path.join(TABLE_DIR, f"ct_{N}")

    # Defaults bind the current loop values (avoids late-binding closures).
    def bench_save(t=t, path=path):
        # Each timed run starts from a missing target directory.
        if os.path.exists(path):
            shutil.rmtree(path)
        t.save(path, overwrite=True)

    elapsed = tmin(bench_save)

    # Save once more so the on-disk table is in its final state before the
    # size attributes are read.
    t.save(path, overwrite=True)
    cbytes = t.cbytes
    nbytes = t.nbytes
    if cbytes > 0:
        ratio = nbytes / cbytes
    else:
        ratio = float("nan")  # avoid dividing by zero for an empty table

    print(f"{N:>12,} {elapsed:>14.4f} {_fmt(cbytes):>12} {ratio:>7.2f}x")
135+
136+
137+
# ---------------------------------------------------------------------------
138+
# Section 3: CTable.load() (disk → in-memory)
139+
# ---------------------------------------------------------------------------
140+
141+
sep("3. CTable.load() (disk → in-memory)")
print(f"{'rows':>12} {'load (s)':>14}")
print(f"{'----':>12} {'--------':>14}")

for N in SIZES:
    # Same "ct_{N}" location that the save benchmark writes to.
    path = os.path.join(TABLE_DIR, f"ct_{N}")

    # The default argument pins this iteration's path inside the callable.
    elapsed = tmin(lambda path=path: CTable.load(path))
    print(f"{N:>12,} {elapsed:>14.4f}")
153+
154+
155+
# ---------------------------------------------------------------------------
156+
# Section 4: CTable → pandas (to_arrow → to_pandas)
157+
# ---------------------------------------------------------------------------
158+
159+
sep("4. CTable → pandas (to_arrow + to_pandas)")
print(f"{'rows':>12} {'ctable→arrow (s)':>18} {'arrow→pandas (s)':>18} {'total (s)':>12}")
print(f"{'----':>12} {'----------------':>18} {'----------------':>18} {'---------':>12}")

for N in SIZES:
    t = ctables[N]
    # Convert once outside the timers so the arrow → pandas step is
    # measured without the CTable → arrow cost.
    at_cache = t.to_arrow()

    # Default arguments pin this iteration's objects inside each callable.
    t_arr = tmin(lambda t=t: t.to_arrow())
    t_pd = tmin(lambda at=at_cache: at.to_pandas())
    t_tot = t_arr + t_pd

    print(f"{N:>12,} {t_arr:>18.4f} {t_pd:>18.4f} {t_tot:>12.4f}")
178+
179+
180+
# ---------------------------------------------------------------------------
181+
# Section 5: Full round-trip (pandas → CTable → disk → load → pandas)
182+
# ---------------------------------------------------------------------------
183+
184+
sep("5. Full round-trip (pandas → CTable → save → load → pandas)")
print(f"{'rows':>12} {'round-trip (s)':>16}")
print(f"{'----':>12} {'---------------':>16}")

for N in SIZES:
    df = make_dataframe(N)
    path = os.path.join(TABLE_DIR, f"rt_{N}")

    # Defaults bind the current loop values (avoids late-binding closures).
    def bench_roundtrip(df=df, path=path):
        """Run all four steps back to back and return the final DataFrame."""
        arrow_in = pa.Table.from_pandas(df, preserve_index=False)  # pandas → arrow
        in_memory = CTable.from_arrow(arrow_in)  # arrow → CTable
        in_memory.save(path, overwrite=True)  # CTable → disk
        reloaded = CTable.load(path)  # disk → CTable
        return reloaded.to_arrow().to_pandas()  # CTable → pandas

    elapsed = tmin(bench_roundtrip)
    print(f"{N:>12,} {elapsed:>16.4f}")


# Cleanup: clean() resets TABLE_DIR to an empty directory.
clean()
print()

bench/ctable/bench_validation.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def make_data(n: int):
3333
ids = np.arange(n, dtype=np.int64)
3434
scores = rng.uniform(0, 100, n)
3535
flags = rng.integers(0, 2, n, dtype=np.bool_)
36-
return list(zip(ids.tolist(), scores.tolist(), flags.tolist()))
36+
return list(zip(ids.tolist(), scores.tolist(), flags.tolist(), strict=False))
3737

3838

3939
SIZES = [100, 1_000, 10_000, 100_000, 1_000_000]

bench/ctable/compact.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@
88
# Benchmark for measuring compact() time and memory gain after deletions
99
# of varying fractions of the table.
1010

11-
from time import time
1211
from dataclasses import dataclass
12+
from time import time
1313

1414
import numpy as np
1515

bench/ctable/ctable_v_panda.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,11 @@
1111
# 3. Filtering (where/query)
1212
# 4. Row iteration
1313

14-
from time import time
1514
from dataclasses import dataclass
15+
from time import time
1616

1717
import numpy as np
18+
import pandas as pd
1819

1920
import blosc2
2021

@@ -96,12 +97,12 @@ class Row:
9697

9798
# 4. Row iteration
9899
t0 = time()
99-
for val in ct["score"]:
100+
for _val in ct["score"]:
100101
pass
101102
t_ct_iter = time() - t0
102103

103104
t0 = time()
104-
for val in df["score"]:
105+
for _val in df["score"]:
105106
pass
106107
t_pd_iter = time() - t0
107108

bench/ctable/delete.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@
88
# Benchmark for measuring delete() performance with different index types:
99
# int, slice, and list — with varying sizes.
1010

11-
from time import time
1211
from dataclasses import dataclass
12+
from time import time
1313

1414
import numpy as np
1515

bench/ctable/expected_size.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@
88
# Benchmark for measuring the overhead of resize() when expected_size
99
# is too small (M rows) vs correctly sized (N rows) during extend().
1010

11-
from time import time
1211
from dataclasses import dataclass
12+
from time import time
1313

1414
import numpy as np
1515

bench/ctable/extend.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,8 @@
1010
# 2. NumPy structured array (1M rows) — list of named tuples
1111
# 3. An existing CTable (previously created from Python lists, 1M rows)
1212

13-
from time import time
1413
from dataclasses import dataclass
14+
from time import time
1515

1616
import numpy as np
1717

bench/ctable/extend_vs_apend.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,8 @@
88
# Benchmark for comparing append() (row by row) vs extend() (bulk),
99
# to find the crossover point where extend() becomes worth it.
1010

11-
from time import time
1211
from dataclasses import dataclass
13-
14-
import numpy as np
12+
from time import time
1513

1614
import blosc2
1715

@@ -26,7 +24,7 @@ class Row:
2624

2725
# Parameter — change N to test different crossover points
2826
N = 2
29-
print(f"append() vs extend() benchmark")
27+
print("append() vs extend() benchmark")
3028
for i in range(6):
3129
print("\n")
3230
print("%" * 100)

bench/ctable/index.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@
88
# Benchmark for measuring Column[int] access (single row by logical index),
99
# which exercises _find_physical_index() traversal over chunk metadata.
1010

11-
from time import time
1211
from dataclasses import dataclass
12+
from time import time
1313

1414
import numpy as np
1515

0 commit comments

Comments
 (0)