|
| 1 | +####################################################################### |
| 2 | +# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org> |
| 3 | +# All rights reserved. |
| 4 | +# |
| 5 | +# SPDX-License-Identifier: BSD-3-Clause |
| 6 | +####################################################################### |
| 7 | + |
# Benchmark comparing CTable vs pandas DataFrame for:
# 1. Creation from a NumPy structured array
# 2. Column access (full column), with and without conversion to NumPy
# 3. Filtering (where/query)
# 4. Row iteration
# 5. Memory footprint (CTable compressed/uncompressed vs pandas)
| 13 | + |
from time import perf_counter, time
from typing import Annotated

import numpy as np
import pandas as pd
from pydantic import BaseModel, Field

import blosc2
| 22 | + |
| 23 | + |
class NumpyDtype:
    """Annotation marker that records the NumPy dtype chosen for a field.

    Instances are placed in ``typing.Annotated[...]`` metadata of the row
    model's fields; the object only stores the value it is given.

    Args:
        dtype: NumPy scalar type (or other dtype-like object) to record.
            It is stored unchanged on ``self.dtype``.
    """

    def __init__(self, dtype):
        self.dtype = dtype

    def __repr__(self):
        # A readable repr keeps the marker identifiable when annotation
        # metadata appears in debug output instead of a bare object address.
        return f"{type(self).__name__}({self.dtype!r})"
| 27 | + |
| 28 | + |
# Row schema for the table under test. Each field pairs a Python type with a
# NumpyDtype marker in its Annotated metadata; `id` and `score` are required
# and range-constrained, `c_val` and `active` have defaults.
class RowModel(BaseModel):
    id: Annotated[int, NumpyDtype(np.int64)] = Field(ge=0)
    c_val: Annotated[complex, NumpyDtype(np.complex128)] = 0j
    score: Annotated[float, NumpyDtype(np.float64)] = Field(ge=0, le=100)
    active: Annotated[bool, NumpyDtype(np.bool_)] = Field(default=True)
| 35 | + |
| 36 | + |
# Problem size and a seeded generator, so every run benchmarks the same data.
N = 1_000_000
rng = np.random.default_rng(42)

print(f"CTable vs pandas benchmark | N = {N:,}\n")

# Structured dtype with one field per RowModel column (names/formats form).
np_dtype = np.dtype(
    {
        "names": ["id", "c_val", "score", "active"],
        "formats": [np.int64, np.complex128, np.float64, np.bool_],
    }
)

# Build the base data once. The generator is consumed in a fixed order
# (real part, imaginary part, score, active) so the values are reproducible.
DATA = np.empty(N, dtype=np_dtype)
DATA["id"] = np.arange(N, dtype=np.int64)
real_part = rng.standard_normal(N)
imag_part = rng.standard_normal(N)
DATA["c_val"] = real_part + 1j * imag_part
DATA["score"] = rng.uniform(0, 100, N)
DATA["active"] = rng.integers(0, 2, N, dtype=np.bool_)
| 54 | + |
# Results table header: operation name, both timings in seconds, and the
# pandas/CTable time ratio.
print("=" * 65)
print(f"{'OPERATION':<30} {'CTable':>12} {'pandas':>12} {'SPEEDUP':>10}")
print("-" * 65)

# 1. Creation
# perf_counter() replaces time(): it is monotonic and uses the highest
# resolution clock available, so short intervals are not rounded to zero
# and system clock adjustments cannot distort the measurement.
t0 = perf_counter()
ct = blosc2.CTable(RowModel, expected_size=N)
ct.extend(DATA)
t_ct_create = perf_counter() - t0

t0 = perf_counter()
df = pd.DataFrame(DATA)
t_pd_create = perf_counter() - t0

# Guard the ratio so a zero-length CTable interval cannot abort the run.
speedup = t_pd_create / t_ct_create if t_ct_create > 0 else float("inf")
print(f"{'Creation':<30} {t_ct_create:>12.4f} {t_pd_create:>12.4f} {speedup:>9.2f}x")
| 70 | + |
# 2. Column access (full column)
# Only the lookup of the "score" column object is timed on each side. The
# interval is very short, so the high-resolution perf_counter() clock is used
# and the ratio is guarded against a zero denominator.
t0 = perf_counter()
arr = ct["score"]
t_ct_col = perf_counter() - t0

t0 = perf_counter()
arr = df["score"]
t_pd_col = perf_counter() - t0

speedup = t_pd_col / t_ct_col if t_ct_col > 0 else float("inf")
print(f"{'Column access (full)':<30} {t_ct_col:>12.4f} {t_pd_col:>12.4f} {speedup:>9.2f}x")
| 81 | + |
# 2.5 Column access converted to NumPy (full column)
# Times the column lookup plus the to_numpy() conversion on each side.
# NOTE: this reuses t_ct_col / t_pd_col, overwriting the values measured for
# the plain column access.
t0 = perf_counter()
arr = ct["score"].to_numpy()
t_ct_col = perf_counter() - t0

t0 = perf_counter()
arr = df["score"].to_numpy()
t_pd_col = perf_counter() - t0

# Guard the ratio against a zero denominator. This row keeps three decimals
# for the ratio, as in the original output format.
speedup = t_pd_col / t_ct_col if t_ct_col > 0 else float("inf")
print(f"{'Column access to numpy (full)':<30} {t_ct_col:>12.4f} {t_pd_col:>12.4f} {speedup:>9.3f}x")
| 92 | + |
# 3. Filtering
# Both sides select the rows with 250_000 < id < 750_000.
# NOTE(review): whether ct.where() materializes its result eagerly is not
# visible here; confirm both timings cover comparable work.
t0 = perf_counter()
result_ct = ct.where((ct["id"] > 250_000) & (ct["id"] < 750_000))
t_ct_filter = perf_counter() - t0

t0 = perf_counter()
result_pd = df.query("250000 < id < 750000")
t_pd_filter = perf_counter() - t0

# perf_counter() gives a monotonic, high-resolution interval; the ratio is
# still guarded so a zero denominator cannot abort the run.
speedup = t_pd_filter / t_ct_filter if t_ct_filter > 0 else float("inf")
print(f"{'Filter (id 250k-750k)':<30} {t_ct_filter:>12.4f} {t_pd_filter:>12.4f} {speedup:>9.2f}x")
| 103 | + |
# 4. Row iteration
# Walks the "score" column of each container element by element; the loop
# body is empty so only the iteration itself is timed.
t0 = perf_counter()
for val in ct["score"]:
    pass
t_ct_iter = perf_counter() - t0

t0 = perf_counter()
for val in df["score"]:
    pass
t_pd_iter = perf_counter() - t0

# perf_counter() gives a monotonic, high-resolution interval; the ratio is
# still guarded so a zero denominator cannot abort the run.
speedup = t_pd_iter / t_ct_iter if t_ct_iter > 0 else float("inf")
print(f"{'Row iteration':<30} {t_ct_iter:>12.4f} {t_pd_iter:>12.4f} {speedup:>9.2f}x")
| 116 | + |
# Close the timing table.
print("-" * 65)

# Memory
# CTable sizes are summed over every column in ct._cols plus ct._valid_rows
# (both private attributes); pandas reports its own deep memory usage.
table_columns = tuple(ct._cols.values())
ct_cbytes = sum(column.cbytes for column in table_columns) + ct._valid_rows.cbytes
ct_nbytes = sum(column.nbytes for column in table_columns) + ct._valid_rows.nbytes
per_column_usage = df.memory_usage(deep=True)
pd_nbytes = per_column_usage.sum()

# Sizes are printed in units of 1024**2 bytes; the ratio is uncompressed
# bytes divided by compressed bytes.
print(f"\nMemory — CTable compressed: {ct_cbytes / 1024**2:.2f} MB")
print(f"Memory — CTable uncompressed: {ct_nbytes / 1024**2:.2f} MB")
print(f"Memory — pandas: {pd_nbytes / 1024**2:.2f} MB")
print(f"Compression ratio CTable: {ct_nbytes / ct_cbytes:.2f}x")
0 commit comments