Skip to content

Commit 920a6f7

Browse files
Merge pull request #602 from Blosc/vlarray
First implementation of a VLArray store
2 parents 73fa467 + 3d03196 commit 920a6f7

20 files changed

Lines changed: 1295 additions & 45 deletions

CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,8 @@ else()
119119
include(FetchContent)
120120
FetchContent_Declare(blosc2
121121
GIT_REPOSITORY https://github.com/Blosc/c-blosc2
122-
GIT_TAG 1386ef42f58b61c876edf714a2af84bd7b59dc5d # v2.23.1
122+
GIT_TAG 25197eb96d05318c939b3252a6b373ccd6ae49fe # variable-length chunks support in schunks
123+
# SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../c-blosc2
123124
)
124125
FetchContent_MakeAvailable(blosc2)
125126
include_directories("${blosc2_SOURCE_DIR}/include")

doc/getting_started/tutorials.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,4 @@ Tutorials
1616
tutorials/08.schunk-slicing_and_beyond
1717
tutorials/09.ucodecs-ufilters
1818
tutorials/10.prefilters
19+
tutorials/11.vlarray
Lines changed: 325 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,325 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"# Working with VLArray\n",
8+
"\n",
9+
"A `VLArray` is a list-like container for variable-length Python values backed by a single `SChunk`. Each entry is stored in its own compressed chunk, and values are serialized with msgpack before reaching storage.\n",
10+
"\n",
11+
"This makes `VLArray` a good fit for heterogeneous, variable-length payloads such as small dictionaries, strings, tuples, byte blobs, or nested list/dict structures."
12+
],
13+
"id": "ceb4789a488cc07f"
14+
},
15+
{
16+
"cell_type": "code",
17+
"metadata": {
18+
"ExecuteTime": {
19+
"end_time": "2026-03-14T16:57:57.563663Z",
20+
"start_time": "2026-03-14T16:57:57.294290Z"
21+
}
22+
},
23+
"source": [
24+
"import blosc2\n",
25+
"\n",
26+
"\n",
27+
"def show(label, value):\n",
28+
" print(f\"{label}: {value}\")\n",
29+
"\n",
30+
"\n",
31+
"urlpath = \"vlarray_tutorial.b2frame\"\n",
32+
"copy_path = \"vlarray_tutorial_copy.b2frame\"\n",
33+
"blosc2.remove_urlpath(urlpath)\n",
34+
"blosc2.remove_urlpath(copy_path)"
35+
],
36+
"id": "f264f2e4bcb57029",
37+
"outputs": [],
38+
"execution_count": 1
39+
},
40+
{
41+
"cell_type": "markdown",
42+
"metadata": {},
43+
"source": [
44+
"## Creating and populating a VLArray\n",
45+
"\n",
46+
"Entries can be added one by one with `append()`, in batches with `extend()`, or at a given position with `insert()`. The container accepts the msgpack-safe Python types supported by the implementation: `bytes`, `str`, `int`, `float`, `bool`, `None`, `list`, `tuple`, and `dict`."
47+
],
48+
"id": "24ceae332dfa437"
49+
},
50+
{
51+
"cell_type": "code",
52+
"metadata": {
53+
"ExecuteTime": {
54+
"end_time": "2026-03-14T16:57:57.609603Z",
55+
"start_time": "2026-03-14T16:57:57.569987Z"
56+
}
57+
},
58+
"source": [
59+
"vla = blosc2.VLArray(urlpath=urlpath, mode=\"w\")\n",
60+
"vla.append({\"name\": \"alpha\", \"count\": 1})\n",
61+
"vla.extend([b\"bytes\", (\"a\", 2), [\"x\", \"y\"], 42, None])\n",
62+
"vla.insert(1, \"between\")\n",
63+
"\n",
64+
"show(\"Initial entries\", list(vla))\n",
65+
"show(\"Length\", len(vla))"
66+
],
67+
"id": "10e4e9ce600cda9d",
68+
"outputs": [
69+
{
70+
"name": "stdout",
71+
"output_type": "stream",
72+
"text": [
73+
"Initial entries: [{'name': 'alpha', 'count': 1}, 'between', b'bytes', ('a', 2), ['x', 'y'], 42, None]\n",
74+
"Length: 7\n"
75+
]
76+
}
77+
],
78+
"execution_count": 2
79+
},
80+
{
81+
"cell_type": "markdown",
82+
"metadata": {},
83+
"source": [
84+
"## Indexing and slicing\n",
85+
"\n",
86+
"Indexing behaves like a Python list. Negative indexes are supported, and slice reads return a plain Python list."
87+
],
88+
"id": "2f2dbe81b7653d8f"
89+
},
90+
{
91+
"cell_type": "code",
92+
"metadata": {
93+
"ExecuteTime": {
94+
"end_time": "2026-03-14T16:57:57.677796Z",
95+
"start_time": "2026-03-14T16:57:57.623048Z"
96+
}
97+
},
98+
"source": [
99+
"show(\"Last entry\", vla[-1])\n",
100+
"show(\"Slice [1:6:2]\", vla[1:6:2])\n",
101+
"show(\"Reverse slice\", vla[::-2])"
102+
],
103+
"id": "82ea38dca631efb9",
104+
"outputs": [
105+
{
106+
"name": "stdout",
107+
"output_type": "stream",
108+
"text": [
109+
"Last entry: None\n",
110+
"Slice [1:6:2]: ['between', ('a', 2), 42]\n",
111+
"Reverse slice: [None, ['x', 'y'], b'bytes', {'name': 'alpha', 'count': 1}]\n"
112+
]
113+
}
114+
],
115+
"execution_count": 3
116+
},
117+
{
118+
"cell_type": "markdown",
119+
"metadata": {},
120+
"source": [
121+
"## Updating, inserting, and deleting\n",
122+
"\n",
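123+
"Single entries can be overwritten by index. Slice assignment follows Python list rules: slices with `step == 1` may resize the container, while extended slices require matching lengths. Entries can be deleted with `del` (slices included), and `pop()` removes the last entry and returns it."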
124+
],
125+
"id": "a871bb9b21d6f36c"
126+
},
127+
{
128+
"cell_type": "code",
129+
"metadata": {
130+
"ExecuteTime": {
131+
"end_time": "2026-03-14T16:57:57.727569Z",
132+
"start_time": "2026-03-14T16:57:57.678936Z"
133+
}
134+
},
135+
"source": [
136+
"vla[2:5] = [\"replaced\", {\"nested\": True}]\n",
137+
"show(\"After slice replacement\", list(vla))\n",
138+
"\n",
139+
"vla[::2] = [\"even-0\", \"even-1\", \"even-2\"]\n",
140+
"show(\"After extended-slice update\", list(vla))\n",
141+
"\n",
142+
"del vla[1::3]\n",
143+
"show(\"After slice deletion\", list(vla))\n",
144+
"\n",
145+
"removed = vla.pop()\n",
146+
"show(\"Popped entry\", removed)\n",
147+
"show(\"After pop\", list(vla))"
148+
],
149+
"id": "e22e4f90499ae02",
150+
"outputs": [
151+
{
152+
"name": "stdout",
153+
"output_type": "stream",
154+
"text": [
155+
"After slice replacement: [{'name': 'alpha', 'count': 1}, 'between', 'replaced', {'nested': True}, 42, None]\n",
156+
"After extended-slice update: ['even-0', 'between', 'even-1', {'nested': True}, 'even-2', None]\n",
157+
"After slice deletion: ['even-0', 'even-1', {'nested': True}, None]\n",
158+
"Popped entry: None\n",
159+
"After pop: ['even-0', 'even-1', {'nested': True}]\n"
160+
]
161+
}
162+
],
163+
"execution_count": 4
164+
},
165+
{
166+
"cell_type": "markdown",
167+
"metadata": {},
168+
"source": [
169+
"## Copying with new storage or compression parameters\n",
170+
"\n",
171+
"The `copy()` method can duplicate the container into a different storage layout or with different compression settings."
172+
],
173+
"id": "f41af458cb5faa9f"
174+
},
175+
{
176+
"cell_type": "code",
177+
"metadata": {
178+
"ExecuteTime": {
179+
"end_time": "2026-03-14T16:57:57.747309Z",
180+
"start_time": "2026-03-14T16:57:57.730015Z"
181+
}
182+
},
183+
"source": [
184+
"vla_copy = vla.copy(\n",
185+
" urlpath=copy_path,\n",
186+
" contiguous=False,\n",
187+
" cparams={\"codec\": blosc2.Codec.LZ4, \"clevel\": 5},\n",
188+
")\n",
189+
"\n",
190+
"show(\"Copied entries\", list(vla_copy))\n",
191+
"show(\"Copy storage is contiguous\", vla_copy.schunk.contiguous)\n",
192+
"show(\"Copy codec\", vla_copy.cparams.codec)"
193+
],
194+
"id": "6e752260e010272e",
195+
"outputs": [
196+
{
197+
"name": "stdout",
198+
"output_type": "stream",
199+
"text": [
200+
"Copied entries: ['even-0', 'even-1', {'nested': True}]\n",
201+
"Copy storage is contiguous: False\n",
202+
"Copy codec: Codec.LZ4\n"
203+
]
204+
}
205+
],
206+
"execution_count": 5
207+
},
208+
{
209+
"cell_type": "markdown",
210+
"metadata": {},
211+
"source": [
212+
"## Round-tripping through cframes and reopening from disk\n",
213+
"\n",
214+
"A persistent store tagged as a `VLArray` is automatically reopened as a `VLArray` by `blosc2.open()`, and a serialized cframe buffer is restored as one by `blosc2.from_cframe()`."
215+
],
216+
"id": "bb576497d4b6f537"
217+
},
218+
{
219+
"cell_type": "code",
220+
"metadata": {
221+
"ExecuteTime": {
222+
"end_time": "2026-03-14T16:57:57.759998Z",
223+
"start_time": "2026-03-14T16:57:57.748296Z"
224+
}
225+
},
226+
"source": [
227+
"cframe = vla.to_cframe()\n",
228+
"restored = blosc2.from_cframe(cframe)\n",
229+
"show(\"from_cframe type\", type(restored).__name__)\n",
230+
"show(\"from_cframe entries\", list(restored))\n",
231+
"\n",
232+
"reopened = blosc2.open(urlpath, mode=\"r\", mmap_mode=\"r\")\n",
233+
"show(\"Reopened type\", type(reopened).__name__)\n",
234+
"show(\"Reopened entries\", list(reopened))"
235+
],
236+
"id": "42d59dccf6ea9c44",
237+
"outputs": [
238+
{
239+
"name": "stdout",
240+
"output_type": "stream",
241+
"text": [
242+
"from_cframe type: VLArray\n",
243+
"from_cframe entries: ['even-0', 'even-1', {'nested': True}]\n",
244+
"Reopened type: VLArray\n",
245+
"Reopened entries: ['even-0', 'even-1', {'nested': True}]\n"
246+
]
247+
}
248+
],
249+
"execution_count": 6
250+
},
251+
{
252+
"cell_type": "markdown",
253+
"metadata": {},
254+
"source": [
255+
"## Clearing and reusing a container\n",
256+
"\n",
257+
"Calling `clear()` resets the backing storage so the container remains ready for new variable-length entries."
258+
],
259+
"id": "53778312cc1a03bc"
260+
},
261+
{
262+
"cell_type": "code",
263+
"metadata": {
264+
"ExecuteTime": {
265+
"end_time": "2026-03-14T16:57:57.778160Z",
266+
"start_time": "2026-03-14T16:57:57.761236Z"
267+
}
268+
},
269+
"source": [
270+
"scratch = vla.copy()\n",
271+
"scratch.clear()\n",
272+
"scratch.extend([\"fresh\", 123, {\"done\": True}])\n",
273+
"show(\"After clear + extend on in-memory copy\", list(scratch))\n",
274+
"\n",
275+
"blosc2.remove_urlpath(urlpath)\n",
276+
"blosc2.remove_urlpath(copy_path)"
277+
],
278+
"id": "55b9ea793a41f38a",
279+
"outputs": [
280+
{
281+
"name": "stdout",
282+
"output_type": "stream",
283+
"text": [
284+
"After clear + extend on in-memory copy: ['fresh', 123, {'done': True}]\n"
285+
]
286+
}
287+
],
288+
"execution_count": 7
289+
},
290+
{
291+
"metadata": {
292+
"ExecuteTime": {
293+
"end_time": "2026-03-14T16:57:57.789994Z",
294+
"start_time": "2026-03-14T16:57:57.779434Z"
295+
}
296+
},
297+
"cell_type": "code",
298+
"source": "",
299+
"id": "34e77790ab2a0f94",
300+
"outputs": [],
301+
"execution_count": 7
302+
}
303+
],
304+
"metadata": {
305+
"kernelspec": {
306+
"display_name": "Python 3 (ipykernel)",
307+
"language": "python",
308+
"name": "python3"
309+
},
310+
"language_info": {
311+
"codemirror_mode": {
312+
"name": "ipython",
313+
"version": 3
314+
},
315+
"file_extension": ".py",
316+
"mimetype": "text/x-python",
317+
"name": "python",
318+
"nbconvert_exporter": "python",
319+
"pygments_lexer": "ipython3",
320+
"version": "3.13.5"
321+
}
322+
},
323+
"nbformat": 4,
324+
"nbformat_minor": 5
325+
}

doc/reference/classes.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ Main Classes
1616
DictStore
1717
TreeStore
1818
EmbedStore
19+
VLArray
1920
Proxy
2021
ProxySource
2122
ProxyNDSource
@@ -33,6 +34,7 @@ Main Classes
3334
dict_store
3435
tree_store
3536
embed_store
37+
vlarray
3638
proxy
3739
proxysource
3840
proxyndsource

doc/reference/misc.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,8 @@ This page documents the miscellaneous members of the ``blosc2`` module that do n
134134
TreeStore,
135135
DictStore,
136136
EmbedStore,
137+
VLArray,
138+
vlarray_from_cframe,
137139
abs,
138140
acos,
139141
acosh,

0 commit comments

Comments
 (0)