Skip to content

Commit 01e47f4

Browse files
Merge pull request #604 from Jacc4224/ctable-new
Add CTable, a columnar in-memory table built on top of blosc2
2 parents e13ccaf + 6de7c30 commit 01e47f4

23 files changed

Lines changed: 3395 additions & 0 deletions

bench/ctable/compact.py

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
#######################################################################
# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause
#######################################################################

# Benchmark for measuring compact() time and memory gain after deletions
# of varying fractions of the table.

from time import time
from typing import Annotated

import numpy as np
from pydantic import BaseModel, Field

import blosc2


class NumpyDtype:
    """Annotation marker pairing a pydantic field with a NumPy dtype."""

    def __init__(self, dtype):
        self.dtype = dtype


# Row model describing one table record (consumed by blosc2.CTable).
class RowModel(BaseModel):
    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)
    c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j)
    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)
    active: Annotated[bool, NumpyDtype(np.bool_)] = True


N = 1_000_000

print(f"compact() benchmark | N = {N:,}\n")

# Build base data once.
np_dtype = np.dtype([
    ("id", np.int64),
    ("c_val", np.complex128),
    ("score", np.float64),
    ("active", np.bool_),
])
# Vectorized construction: identical values to the per-row Python loop
# `(i, complex(i * 0.1, i * 0.01), 10.0 + (i % 100) * 0.4, i % 3 == 0)`
# but built at C speed instead of materializing 1M tuples.
_ids = np.arange(N, dtype=np.int64)
DATA = np.empty(N, dtype=np_dtype)
DATA["id"] = _ids
DATA["c_val"] = _ids * 0.1 + 1j * (_ids * 0.01)
DATA["score"] = 10.0 + (_ids % 100) * 0.4
DATA["active"] = _ids % 3 == 0

delete_fractions = [0.1, 0.25, 0.5, 0.75, 0.9]

print("=" * 75)
print(f"{'DELETED':>10} {'ROWS LEFT':>10} {'TIME (s)':>12} {'CBYTES BEFORE':>15} {'CBYTES AFTER':>14}")
print("-" * 75)

for frac in delete_fractions:
    # Fresh table per fraction so every run starts from the full dataset.
    ct = blosc2.CTable(RowModel, expected_size=N)
    ct.extend(DATA)

    n_delete = int(N * frac)
    ct.delete(list(range(n_delete)))

    # NOTE(review): reads private attributes (_cols, _valid_rows) to measure
    # compressed size — keep in sync with CTable internals.
    cbytes_before = sum(col.cbytes for col in ct._cols.values()) + ct._valid_rows.cbytes

    t0 = time()
    ct.compact()
    t_compact = time() - t0

    cbytes_after = sum(col.cbytes for col in ct._cols.values()) + ct._valid_rows.cbytes

    print(
        f"{frac*100:>9.0f}%"
        f" {N - n_delete:>10,}"
        f" {t_compact:>12.4f}"
        f" {cbytes_before / 1024**2:>13.2f} MB"
        f" {cbytes_after / 1024**2:>12.2f} MB"
    )

print("-" * 75)

bench/ctable/ctable_v_panda.py

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
#######################################################################
# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause
#######################################################################

# Benchmark comparing CTable vs pandas DataFrame for:
#   1. Creation from a NumPy structured array
#   2. Column access (full column)
#   3. Filtering (where/query)
#   4. Row iteration

from time import time
from typing import Annotated

import numpy as np
import pandas as pd
from pydantic import BaseModel, Field

import blosc2


class NumpyDtype:
    """Annotation marker pairing a pydantic field with a NumPy dtype."""

    def __init__(self, dtype):
        self.dtype = dtype


# Row model describing one table record (consumed by blosc2.CTable).
class RowModel(BaseModel):
    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)
    c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j)
    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)
    active: Annotated[bool, NumpyDtype(np.bool_)] = True


N = 1_000_000
rng = np.random.default_rng(42)

print(f"CTable vs pandas benchmark | N = {N:,}\n")

# Build base data once.
np_dtype = np.dtype([
    ("id", np.int64),
    ("c_val", np.complex128),
    ("score", np.float64),
    ("active", np.bool_),
])
DATA = np.empty(N, dtype=np_dtype)
DATA["id"] = np.arange(N, dtype=np.int64)
DATA["c_val"] = rng.standard_normal(N) + 1j * rng.standard_normal(N)
DATA["score"] = rng.uniform(0, 100, N)
DATA["active"] = rng.integers(0, 2, N, dtype=np.bool_)


def _timed(op):
    """Run *op* with no arguments; return (result, elapsed seconds)."""
    start = time()
    out = op()
    return out, time() - start


def _report(label, ct_seconds, pd_seconds, prec=2):
    """Print one aligned result row with the pandas/CTable speedup ratio."""
    ratio = pd_seconds / ct_seconds
    print(f"{label:<30} {ct_seconds:>12.4f} {pd_seconds:>12.4f} {ratio:>9.{prec}f}x")


print("=" * 65)
print(f"{'OPERATION':<30} {'CTable':>12} {'pandas':>12} {'SPEEDUP':>10}")
print("-" * 65)


# 1. Creation
def _make_ct():
    table = blosc2.CTable(RowModel, expected_size=N)
    table.extend(DATA)
    return table


ct, ct_create_s = _timed(_make_ct)
df, pd_create_s = _timed(lambda: pd.DataFrame(DATA))
_report("Creation", ct_create_s, pd_create_s)

# 2. Column access (full column)
_, ct_col_s = _timed(lambda: ct["score"])
_, pd_col_s = _timed(lambda: df["score"])
_report("Column access (full)", ct_col_s, pd_col_s)

# 2.5 Column access materialized as a NumPy array
_, ct_np_s = _timed(lambda: ct["score"].to_numpy())
_, pd_np_s = _timed(lambda: df["score"].to_numpy())
_report("Column access to numpy (full)", ct_np_s, pd_np_s, prec=3)

# 3. Filtering
_, ct_filter_s = _timed(lambda: ct.where((ct["id"] > 250_000) & (ct["id"] < 750_000)))
_, pd_filter_s = _timed(lambda: df.query("250000 < id < 750000"))
_report("Filter (id 250k-750k)", ct_filter_s, pd_filter_s)

# 4. Row iteration: walk one full column value by value.
start = time()
for _val in ct["score"]:
    pass
ct_iter_s = time() - start

start = time()
for _val in df["score"]:
    pass
pd_iter_s = time() - start

_report("Row iteration", ct_iter_s, pd_iter_s)

print("-" * 65)

# Memory footprint.
# NOTE(review): CTable sizes are read from private attributes (_cols,
# _valid_rows) — keep in sync with CTable internals.
ct_cbytes = sum(col.cbytes for col in ct._cols.values()) + ct._valid_rows.cbytes
ct_nbytes = sum(col.nbytes for col in ct._cols.values()) + ct._valid_rows.nbytes
pd_nbytes = df.memory_usage(deep=True).sum()

print(f"\nMemory — CTable compressed: {ct_cbytes / 1024**2:.2f} MB")
print(f"Memory — CTable uncompressed: {ct_nbytes / 1024**2:.2f} MB")
print(f"Memory — pandas: {pd_nbytes / 1024**2:.2f} MB")
print(f"Compression ratio CTable: {ct_nbytes / ct_cbytes:.2f}x")

bench/ctable/delete.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
#######################################################################
# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause
#######################################################################

# Benchmark for measuring delete() performance with different index types:
# int, slice, and list — with varying sizes.

from time import time
from typing import Annotated

import numpy as np
from pydantic import BaseModel, Field

import blosc2


class NumpyDtype:
    """Annotation marker pairing a pydantic field with a NumPy dtype."""

    def __init__(self, dtype):
        self.dtype = dtype


# Row model describing one table record (consumed by blosc2.CTable).
class RowModel(BaseModel):
    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)
    c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j)
    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)
    active: Annotated[bool, NumpyDtype(np.bool_)] = True


N = 1_000_000

print(f"delete() benchmark | N = {N:,}\n")

# Build base data once.
np_dtype = np.dtype([
    ("id", np.int64),
    ("c_val", np.complex128),
    ("score", np.float64),
    ("active", np.bool_),
])
# Vectorized construction: identical values to the per-row Python loop
# `(i, complex(i * 0.1, i * 0.01), 10.0 + (i % 100) * 0.4, i % 3 == 0)`
# but built at C speed instead of materializing 1M tuples.
_ids = np.arange(N, dtype=np.int64)
DATA = np.empty(N, dtype=np_dtype)
DATA["id"] = _ids
DATA["c_val"] = _ids * 0.1 + 1j * (_ids * 0.01)
DATA["score"] = 10.0 + (_ids % 100) * 0.4
DATA["active"] = _ids % 3 == 0

delete_cases = [
    ("int", 0),
    ("slice small", slice(0, 100)),
    ("slice large", slice(0, 100_000)),
    ("slice full", slice(0, N)),
    ("list small", list(range(100))),
    ("list large", list(range(100_000))),
    ("list full", list(range(N))),
]

print("=" * 60)
print(f"{'CASE':<20} {'ROWS DELETED':>14} {'TIME (s)':>12}")
print("-" * 60)

for label, key in delete_cases:
    # Fresh table per case so deletions do not compound across cases.
    ct = blosc2.CTable(RowModel, expected_size=N)
    ct.extend(DATA)

    # Number of rows the key addresses, by key type.
    if isinstance(key, int):
        n_deleted = 1
    elif isinstance(key, slice):
        n_deleted = len(range(*key.indices(N)))
    else:
        n_deleted = len(key)

    t0 = time()
    ct.delete(key)
    t_delete = time() - t0
    print(f"{label:<20} {n_deleted:>14,} {t_delete:>12.6f}")

print("-" * 60)

bench/ctable/expected_size.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
#######################################################################
# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause
#######################################################################

# Benchmark for measuring the overhead of resize() when expected_size
# is too small (M rows) vs correctly sized (N rows) during extend().

from time import time
from typing import Annotated

import numpy as np
from pydantic import BaseModel, Field

import blosc2


class NumpyDtype:
    """Annotation marker pairing a pydantic field with a NumPy dtype."""

    def __init__(self, dtype):
        self.dtype = dtype


# Row model describing one table record (consumed by blosc2.CTable).
class RowModel(BaseModel):
    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)
    c_val: Annotated[complex, NumpyDtype(np.complex128)] = Field(default=0j)
    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)
    active: Annotated[bool, NumpyDtype(np.bool_)] = True


M = 779
N = 62_500
MAX_N = 1_000_000
print(f"expected_size benchmark | wrong expected_size = {M}")

# Pre-generate full dataset once.
np_dtype = np.dtype([
    ("id", np.int64),
    ("c_val", np.complex128),
    ("score", np.float64),
    ("active", np.bool_),
])
# Vectorized construction: identical values to the per-row Python loop
# `(i, complex(i * 0.1, i * 0.01), 10.0 + (i % 100) * 0.4, i % 3 == 0)`
# but built at C speed instead of materializing 1M tuples.
_ids = np.arange(MAX_N, dtype=np.int64)
DATA = np.empty(MAX_N, dtype=np_dtype)
DATA["id"] = _ids
DATA["c_val"] = _ids * 0.1 + 1j * (_ids * 0.01)
DATA["score"] = 10.0 + (_ids % 100) * 0.4
DATA["active"] = _ids % 3 == 0

# Double N each round up to MAX_N, timing both sizing strategies.
while N <= MAX_N:
    print("-" * 80)
    print(f"N = {N:,} rows")

    # 1. extend() with correct expected_size = N
    ct_correct = blosc2.CTable(RowModel, expected_size=N)
    t0 = time()
    ct_correct.extend(DATA[:N])
    t_correct = time() - t0
    print(f"extend() expected_size=N ({N:>8,}): {t_correct:.4f} s rows: {len(ct_correct):,}")

    # 2. extend() with wrong expected_size = M (forces resize)
    ct_wrong = blosc2.CTable(RowModel, expected_size=M)
    t0 = time()
    ct_wrong.extend(DATA[:N])
    t_wrong = time() - t0
    print(f"extend() expected_size=M ({M:>8,}): {t_wrong:.4f} s rows: {len(ct_wrong):,}")

    # Summary
    print(f" Slowdown from wrong expected_size: {t_wrong / t_correct:.2f}x")

    N *= 2

0 commit comments

Comments
 (0)