Skip to content

Commit 0b922d6

Browse files
committed
Merge branch 'main' of github.com:Blosc/python-blosc2
2 parents ad8a441 + 0f283ec commit 0b922d6

20 files changed

Lines changed: 500 additions & 69 deletions

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ endif()
6363

6464
FetchContent_Declare(miniexpr
6565
GIT_REPOSITORY https://github.com/Blosc/miniexpr.git
66-
GIT_TAG 06ee29cdc9c5cfbf22f9a6fedcdaf46af1fec373
66+
GIT_TAG 320240a849e185c84114501200052bfeb8d66f2b
6767
# SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../miniexpr
6868
)
6969
FetchContent_MakeAvailable(miniexpr)

ROADMAP-TO-4.0.md

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,26 @@ The constructor for the `Table` object should take some parameters to specify pr
2121
* `.__iter__()` for easy and fast iteration over rows.
2222
* `.where()`: an iterator for querying with conditions that are evaluated with the internal compute engine.
2323
* `.index()` for indexing a column and getting better performance in queries (desirable, but optional for 4.0).
24+
25+
In particular, it should try to mimic much of the functionality of data-querying libraries such as ``pandas`` (see [this blog](https://datapythonista.me/blog/whats-new-in-pandas-3) for much of the following). Hence, one should be able to filter rows of the `Table` via querying on multiple columns (accessed via `.` or perhaps ``__getitem__``), with conditions to select rows implemented via `.index`, `.where` like so
26+
27+
```
28+
tbl.where((tbl.property_type == "hotel") & (tbl.country == "us"))
29+
```
30+
31+
It should also be possible to modify the filtered ``Table`` in-place, using some operation which only acts on the filtered elements (e.g. ``assign``)
32+
33+
```
34+
tbl = tbl.where((tbl.property_type == "hotel") & (tbl.country == "us")).assign(max_people = tbl.max_people + tbl.max_children)
35+
```
36+
37+
Secondly, it should be possible to write bespoke transformation functions which act row-wise and then may be applied to get results from the `Table` and/or modify the ``Table`` in-place:
38+
39+
```
40+
def myudf(row):
41+
col = row.name_of_column
42+
# do things with col
43+
return result
44+
45+
ans = tbl.apply(myudf, axis=1)
46+
```

bench/ndarray/linear-constructor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
@blosc2.dsl_kernel
1919
def kernel_ramp(start):
20-
return start + _global_linear_idx # noqa: F821 # DSL index/shape symbols resolved by miniexpr
20+
return start + _flat_idx # noqa: F821 # DSL index/shape symbols resolved by miniexpr
2121

2222
t0 = time()
2323
npa = np.arange(np.prod(shape), dtype=dtype).reshape(shape)

bench/ndarray/sum-linear-idx.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
#######################################################################
77

88
# Compare reduction performance on DSL kernels.
9-
# This uses the special _global_linear_idx var.
9+
# This uses the special _flat_idx var.
1010

1111
from time import time
1212
import numpy as np
@@ -19,7 +19,7 @@
1919
@blosc2.dsl_kernel
2020
def kernel_ramp():
2121
# return _i0 * _n1 + _i1 # noqa: F821 # DSL index/shape symbols resolved by miniexpr
22-
return _global_linear_idx # noqa: F821 # DSL index/shape symbols resolved by miniexpr
22+
return _flat_idx # noqa: F821 # DSL index/shape symbols resolved by miniexpr
2323

2424
print(kernel_ramp.dsl_source)
2525
a = blosc2.lazyudf(kernel_ramp, (), dtype=dtype, shape=shape)

doc/getting_started/tutorials/09.ucodecs-ufilters.ipynb

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -621,9 +621,7 @@
621621
],
622622
"source": [
623623
"array = blosc2.zeros((30, 30))\n",
624-
"array.schunk.cparams = blosc2.CParams(\n",
625-
" **{\"codec\": 184, \"filters\": [filter_id], \"filters_meta\": [0], \"nthreads\": 1}\n",
626-
")\n",
624+
"array.schunk.cparams = blosc2.CParams(codec=184, filters=[filter_id], filters_meta=[0], nthreads=1)\n",
627625
"array.schunk.cparams"
628626
]
629627
}

doc/getting_started/tutorials/10.prefilters.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -324,7 +324,7 @@
324324
},
325325
"outputs": [],
326326
"source": [
327-
"my_schunk.dparams = blosc2.DParams(**{\"nthreads\": 1}) # Disable parallelism for decompression\n",
327+
"my_schunk.dparams = blosc2.DParams(nthreads=1) # Disable parallelism for decompression\n",
328328
"\n",
329329
"\n",
330330
"@my_schunk.postfilter(input_dtype)\n",
Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,192 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"# Working with arrays of strings in Blosc2\n",
8+
"\n",
9+
"Blosc2 provides support for arrays in which the elements are strings, either of bytes (``np.bytes_`` equivalent to ``np.dtype('S0')``) or of unicode characters (``np.str_``, equivalent to ``np.dtype('U0')``), with the typesize determined by the longest element in the array. That is"
10+
]
11+
},
12+
{
13+
"cell_type": "code",
14+
"execution_count": 1,
15+
"metadata": {},
16+
"outputs": [
17+
{
18+
"name": "stdout",
19+
"output_type": "stream",
20+
"text": [
21+
"Bytes array - dtype: |S3, typesize: 3\n",
22+
"Unicode array - dtype: <U3, typesize: 12\n"
23+
]
24+
}
25+
],
26+
"source": [
27+
"import numpy as np\n",
28+
"\n",
29+
"arr = np.array([b\"a23\", b\"89u\"])\n",
30+
"print(f\"Bytes array - dtype: {arr.dtype}, typesize: {arr.dtype.itemsize}\")\n",
31+
"arr = np.array([\"a23\", \"89u\"])\n",
32+
"print(f\"Unicode array - dtype: {arr.dtype}, typesize: {arr.dtype.itemsize}\")"
33+
]
34+
},
35+
{
36+
"cell_type": "markdown",
37+
"metadata": {},
38+
"source": [
39+
"\n",
40+
"since each unicode character encodes 4 bytes. This carries over to the ``blosc2.NDArray`` object. Indeed, such arrays, particularly those of Unicode type, are highly compressible, since almost all of the bits encoding each item will be 0 (i.e. ``\\x00``), as can be seen by viewing the second array above as an array of bytestrings"
41+
]
42+
},
43+
{
44+
"cell_type": "code",
45+
"execution_count": 2,
46+
"metadata": {},
47+
"outputs": [
48+
{
49+
"data": {
50+
"text/plain": [
51+
"array([b'a\\x00\\x00\\x002\\x00\\x00\\x003', b'8\\x00\\x00\\x009\\x00\\x00\\x00u'],\n",
52+
" dtype='|S12')"
53+
]
54+
},
55+
"execution_count": 2,
56+
"metadata": {},
57+
"output_type": "execute_result"
58+
}
59+
],
60+
"source": [
61+
"arr.view(\"S12\")"
62+
]
63+
},
64+
{
65+
"cell_type": "markdown",
66+
"metadata": {},
67+
"source": [
68+
"(The trailing ``\\x00`` bytes are suppressed for the last character). Consequently, using Blosc2 can save you a lot of space (in memory or disk) when working with arrays of strings, if one exploits the structure of the (unicode) strings correctly. Specifically, the fundamental building block of the array should be the byte, and not the element - in this way, using the shuffle filter groups the $N$ elements having $m$ characters of bytesize 4 into $4$ streams of $Nm$ bytes, so that the corresponding bytes for all characters are grouped together. For the array above, one transforms the array from (2 elements of $3 \\times 4 = 12$ bytes)\n",
69+
"```\n",
70+
"|a\\x00\\x00\\x002\\x00\\x00\\x003\\x00\\x00\\x00|8\\x00\\x00\\x009\\x00\\x00\\x00u\\x00\\x00\\x00|\n",
71+
"```\n",
72+
"to (4 streams of $2 \\times 3 = 6$ bytes)\n",
73+
"```\n",
74+
"|a2389u|\\x00\\x00\\x00\\x00\\x00\\x00|\\x00\\x00\\x00\\x00\\x00\\x00|\\x00\\x00\\x00\\x00\\x00\\x00|\n",
75+
"```\n",
76+
"For the example above, 3 of the bytes for each character are 0, and so by grouping these zeros together, it is more likely to have chunks composed entirely or almost entirely of zeros, which may then be readily compressed.\n",
77+
"If one were to break up the array by elements, one would end up with $4m$ streams of $N$ bytes, i.e. ($4 \\times 3 = 12$ streams of $2$ bytes)\n",
78+
"```\n",
79+
"|a8|\\x00\\x00|\\x00\\x00|\\x00\\x00|29|\\x00\\x00|\\x00\\x00|\\x00\\x00|3u|\\x00\\x00|\\x00\\x00|\\x00\\x00|\n",
80+
"```\n",
81+
"which is not very compressible, since the informative bytes are fragmented by small groups of 0 bytes. This heuristic has been implemented as a default for Blosc2 string compression, so you can just compress your arrays of strings and reap the benefits without having to worry about such intricacies though! Check it out below:"
82+
]
83+
},
84+
{
85+
"cell_type": "code",
86+
"execution_count": 3,
87+
"metadata": {},
88+
"outputs": [
89+
{
90+
"name": "stdout",
91+
"output_type": "stream",
92+
"text": [
93+
"cratio forcing non-string defaults: 2417x\n",
94+
"cratio allowing blosc2 to optimise: 3902x\n"
95+
]
96+
}
97+
],
98+
"source": [
99+
"import blosc2\n",
100+
"\n",
101+
"N = int(1e5)\n",
102+
"nparr = np.repeat(np.array([\"josé\", \"pepe\", \"francisco\"]), N)\n",
103+
"cparams = blosc2.cparams_dflts\n",
104+
"arr1 = blosc2.asarray(nparr, cparams=cparams)\n",
105+
"print(f\"cratio forcing non-string defaults: {round(arr1.cratio)}x\")\n",
106+
"arr1 = blosc2.asarray(nparr)\n",
107+
"print(f\"cratio allowing blosc2 to optimise: {round(arr1.cratio)}x\")"
108+
]
109+
},
110+
{
111+
"cell_type": "markdown",
112+
"metadata": {},
113+
"source": [
114+
"## Operating on arrays of strings\n",
115+
"\n",
116+
"Blosc2 has two sides, compression and computation, which are tightly enmeshed, and for arrays of strings the same applies. We have implemented a subset of useful functions for strings:\n",
117+
"- comparison operations ``<, <=, ==, !=, >=, >``\n",
118+
"- 2-argument functions ``contains, startswith, endswith``\n",
119+
"- 1-argument functions ``lower, upper``\n",
120+
"\n",
121+
"Where possible these will be computed by the ``miniexpr`` backend, which is a highly optimised, fully compiled, multithreaded library which is the most complete expression of Blosc2's goal: fully vertically integrated decompression/computation/recompression with optimal cache hierarchy exploitation and super-fast compiled-C code for as much of the pipeline as possible. In cases where this is not possible, a more robust path which still avoids memory overload for large arrays is used.\n",
122+
"\n",
123+
"The arguments may be scalars or arrays (``blosc2.NDArray`` or other types) of strings or bytes. "
124+
]
125+
},
126+
{
127+
"cell_type": "code",
128+
"execution_count": 4,
129+
"metadata": {},
130+
"outputs": [],
131+
"source": [
132+
"for t in (\"bytes\", \"string\"):\n",
133+
" if t == \"bytes\":\n",
134+
" a1 = np.array([b\"abc\", b\"def\", b\"atErr\", b\"oot\", b\"zu\", b\"ab c\"])\n",
135+
" a2 = a2_blosc = b\"a\"\n",
136+
" else:\n",
137+
" a1 = np.array([\"abc\", \"def\", \"atErr\", \"oot\", \"zu\", \"ab c\"])\n",
138+
" a2 = a2_blosc = \"a\"\n",
139+
" a1_blosc = blosc2.asarray(a1)\n",
140+
" for func, npfunc in zip(\n",
141+
" (blosc2.startswith, blosc2.endswith, blosc2.contains),\n",
142+
" (np.char.startswith, np.char.endswith, lambda *args: np.char.find(*args) != -1),\n",
143+
" strict=True,\n",
144+
" ):\n",
145+
" expr_lazy = func(a1_blosc, a2_blosc)\n",
146+
" res_numexpr = npfunc(a1, a2)\n",
147+
" assert expr_lazy.shape == res_numexpr.shape\n",
148+
" assert expr_lazy.dtype == blosc2.bool_\n",
149+
" np.testing.assert_array_equal(expr_lazy[:], res_numexpr)\n",
150+
"\n",
151+
" np.testing.assert_array_equal((a1_blosc < a2_blosc)[:], a1 < a2)\n",
152+
" np.testing.assert_array_equal((a1_blosc <= a2_blosc)[:], a1 <= a2)\n",
153+
" np.testing.assert_array_equal((a1_blosc == a2_blosc)[:], a1 == a2)\n",
154+
" np.testing.assert_array_equal((a1_blosc != a2_blosc)[:], a1 != a2)\n",
155+
" np.testing.assert_array_equal((a1_blosc >= a2_blosc)[:], a1 >= a2)\n",
156+
" np.testing.assert_array_equal((a1_blosc > a2_blosc)[:], a1 > a2)\n",
157+
"\n",
158+
" for func, npfunc in zip((blosc2.lower, blosc2.upper), (np.char.lower, np.char.upper), strict=True):\n",
159+
" expr_lazy = func(a1_blosc)\n",
160+
" res_numexpr = npfunc(a1)\n",
161+
" assert expr_lazy.shape == res_numexpr.shape\n",
162+
" np.testing.assert_array_equal(expr_lazy[:], res_numexpr)"
163+
]
164+
},
165+
{
166+
"cell_type": "markdown",
167+
"metadata": {},
168+
"source": []
169+
}
170+
],
171+
"metadata": {
172+
"kernelspec": {
173+
"display_name": "blosc2env",
174+
"language": "python",
175+
"name": "python3"
176+
},
177+
"language_info": {
178+
"codemirror_mode": {
179+
"name": "ipython",
180+
"version": 3
181+
},
182+
"file_extension": ".py",
183+
"mimetype": "text/x-python",
184+
"name": "python",
185+
"nbconvert_exporter": "python",
186+
"pygments_lexer": "ipython3",
187+
"version": "3.13.7"
188+
}
189+
},
190+
"nbformat": 4,
191+
"nbformat_minor": 2
192+
}

src/blosc2/__init__.py

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -393,17 +393,21 @@ def _raise(exc):
393393
"max dimensions": MAX_DIM,
394394
},
395395
default_device=lambda: "cpu",
396-
default_dtypes=lambda device=None: {
397-
"real floating": DEFAULT_FLOAT,
398-
"complex floating": DEFAULT_COMPLEX,
399-
"integral": DEFAULT_INT,
400-
"indexing": DEFAULT_INDEX,
401-
}
402-
if (device == "cpu" or device is None)
403-
else _raise(ValueError("Only cpu devices allowed")),
404-
dtypes=lambda device=None, kind=None: np.__array_namespace_info__().dtypes(kind=kind, device=device)
405-
if (device == "cpu" or device is None)
406-
else _raise(ValueError("Only cpu devices allowed")),
396+
default_dtypes=lambda device=None: (
397+
{
398+
"real floating": DEFAULT_FLOAT,
399+
"complex floating": DEFAULT_COMPLEX,
400+
"integral": DEFAULT_INT,
401+
"indexing": DEFAULT_INDEX,
402+
}
403+
if (device == "cpu" or device is None)
404+
else _raise(ValueError("Only cpu devices allowed"))
405+
),
406+
dtypes=lambda device=None, kind=None: (
407+
np.__array_namespace_info__().dtypes(kind=kind, device=device)
408+
if (device == "cpu" or device is None)
409+
else _raise(ValueError("Only cpu devices allowed"))
410+
),
407411
devices=lambda: ["cpu"],
408412
name="blosc2",
409413
version=__version__,
@@ -649,6 +653,7 @@ def _raise(exc):
649653
logical_not,
650654
logical_or,
651655
logical_xor,
656+
lower,
652657
max,
653658
maximum,
654659
mean,
@@ -681,6 +686,7 @@ def _raise(exc):
681686
tan,
682687
tanh,
683688
trunc,
689+
upper,
684690
var,
685691
where,
686692
)
@@ -852,6 +858,7 @@ def _raise(exc):
852858
"logical_not",
853859
"logical_or",
854860
"logical_xor",
861+
"lower",
855862
"matmul",
856863
"matrix_transpose",
857864
"max",
@@ -923,6 +930,7 @@ def _raise(exc):
923930
"unpack_array",
924931
"unpack_array2",
925932
"unpack_tensor",
933+
"upper",
926934
"validate_expr",
927935
"var",
928936
"vecdot",

0 commit comments

Comments
 (0)