Skip to content

Commit 0ab340d

Browse files
committed
Use metadata-based DictStore discovery and warn on leaf mismatches
1 parent a65cb69 commit 0ab340d

3 files changed

Lines changed: 249 additions & 37 deletions

File tree

src/blosc2/dict_store.py

Lines changed: 122 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import os
1111
import shutil
1212
import tempfile
13+
import warnings
1314
import zipfile
1415
from typing import TYPE_CHECKING, Any
1516

@@ -94,6 +95,9 @@ class DictStore:
9495
-----
9596
- External persistence uses the following file extensions:
9697
.b2nd for NDArray, .b2f for SChunk, and .b2b for BatchStore.
98+
These suffixes are a naming convention for newly written leaves; when
99+
reopening an existing store, leaf typing is resolved from object
100+
metadata instead of trusting the suffix alone.
97101
"""
98102

99103
def __init__(
@@ -112,7 +116,7 @@ def __init__(
112116
"""
113117
See :class:`DictStore` for full documentation of parameters.
114118
"""
115-
self.localpath = localpath if isinstance(localpath, (str, bytes)) else str(localpath)
119+
self.localpath = localpath if isinstance(localpath, str | bytes) else str(localpath)
116120
if not self.localpath.endswith((".b2z", ".b2d")):
117121
raise ValueError(f"localpath must have a .b2z or .b2d extension; you passed: {self.localpath}")
118122
if mode not in ("r", "w", "a"):
@@ -182,13 +186,7 @@ def _init_read_mode(self, dparams: blosc2.DParams | None = None):
182186
mmap_mode=self.mmap_mode,
183187
dparams=dparams,
184188
)
185-
for filepath in self.offsets:
186-
if filepath.endswith((".b2nd", ".b2f", ".b2b")):
187-
if filepath.endswith(".b2nd"):
188-
key = "/" + filepath[:-5]
189-
else:
190-
key = "/" + filepath[:-4]
191-
self.map_tree[key] = filepath
189+
self._update_map_tree_from_offsets()
192190
else: # .b2d
193191
if not os.path.isdir(self.localpath):
194192
raise FileNotFoundError(f"Directory {self.localpath} does not exist for reading.")
@@ -204,6 +202,90 @@ def _init_read_mode(self, dparams: blosc2.DParams | None = None):
204202
self._estore = EmbedStore(_from_schunk=schunk)
205203
self.storage.meta = self._estore.storage.meta
206204

205+
@staticmethod
206+
def _logical_key_from_relpath(rel_path: str) -> str:
207+
"""Map an external leaf path to its logical tree key."""
208+
rel_path = rel_path.replace(os.sep, "/")
209+
key = os.path.splitext(rel_path)[0]
210+
if not key.startswith("/"):
211+
key = "/" + key
212+
return key
213+
214+
@staticmethod
215+
def _expected_ext_from_kind(kind: str) -> str:
216+
"""Return the canonical write-time suffix for a supported external leaf kind."""
217+
if kind == "ndarray":
218+
return ".b2nd"
219+
if kind == "batchstore":
220+
return ".b2b"
221+
return ".b2f"
222+
223+
@classmethod
224+
def _opened_external_kind(
225+
cls,
226+
opened: blosc2.NDArray | SChunk | blosc2.VLArray | blosc2.BatchStore | C2Array | Any,
227+
rel_path: str,
228+
) -> str | None:
229+
"""Return the supported external leaf kind for an already opened object."""
230+
processed = _process_opened_object(opened)
231+
if isinstance(processed, blosc2.BatchStore):
232+
kind = "batchstore"
233+
elif isinstance(processed, blosc2.VLArray):
234+
kind = "vlarray"
235+
elif isinstance(processed, blosc2.NDArray):
236+
kind = "ndarray"
237+
elif isinstance(processed, SChunk):
238+
kind = "schunk"
239+
else:
240+
warnings.warn(
241+
f"Ignoring unsupported Blosc2 object at '{rel_path}' during DictStore discovery: "
242+
f"{type(processed).__name__}",
243+
UserWarning,
244+
stacklevel=2,
245+
)
246+
return None
247+
248+
expected_ext = cls._expected_ext_from_kind(kind)
249+
found_ext = os.path.splitext(rel_path)[1]
250+
if found_ext != expected_ext:
251+
warnings.warn(
252+
f"External leaf '{rel_path}' uses extension '{found_ext}' but metadata resolves to "
253+
f"{type(processed).__name__}; expected '{expected_ext}'.",
254+
UserWarning,
255+
stacklevel=2,
256+
)
257+
return kind
258+
259+
def _probe_external_leaf_path(self, rel_path: str) -> bool:
260+
"""Return whether a working-dir file is a supported external leaf."""
261+
urlpath = os.path.join(self.working_dir, rel_path)
262+
try:
263+
opened = blosc2.blosc2_ext.open(
264+
urlpath,
265+
mode="r",
266+
offset=0,
267+
mmap_mode=self.mmap_mode,
268+
dparams=self.dparams,
269+
)
270+
except Exception:
271+
return False
272+
return self._opened_external_kind(opened, rel_path) is not None
273+
274+
def _probe_external_leaf_offset(self, filepath: str) -> bool:
275+
"""Return whether a zip member is a supported external leaf."""
276+
offset = self.offsets[filepath]["offset"]
277+
try:
278+
opened = blosc2.blosc2_ext.open(
279+
self.b2z_path,
280+
mode="r",
281+
offset=offset,
282+
mmap_mode=self.mmap_mode,
283+
dparams=self.dparams,
284+
)
285+
except Exception:
286+
return False
287+
return self._opened_external_kind(opened, filepath) is not None
288+
207289
def _init_write_append_mode(
208290
self,
209291
cparams: blosc2.CParams | None,
@@ -229,24 +311,23 @@ def _init_write_append_mode(
229311
self._update_map_tree()
230312

231313
def _update_map_tree(self):
232-
# Build map_tree from .b2nd and .b2f files in working dir
314+
# Build map_tree from supported external leaves in working dir.
233315
for root, _, files in os.walk(self.working_dir):
234316
for file in files:
235317
filepath = os.path.join(root, file)
236-
if filepath.endswith((".b2nd", ".b2f", ".b2b")):
237-
# Convert filename to key: remove extension and ensure starts with /
238-
rel_path = os.path.relpath(filepath, self.working_dir)
239-
# Normalize path separators to forward slashes for cross-platform consistency
240-
rel_path = rel_path.replace(os.sep, "/")
241-
if rel_path.endswith(".b2nd"):
242-
key = rel_path[:-5]
243-
elif rel_path.endswith(".b2b") or rel_path.endswith(".b2f"):
244-
key = rel_path[:-4]
245-
else:
246-
continue
247-
if not key.startswith("/"):
248-
key = "/" + key
249-
self.map_tree[key] = rel_path
318+
if os.path.abspath(filepath) == os.path.abspath(self.estore_path):
319+
continue
320+
rel_path = os.path.relpath(filepath, self.working_dir).replace(os.sep, "/")
321+
if self._probe_external_leaf_path(rel_path):
322+
self.map_tree[self._logical_key_from_relpath(rel_path)] = rel_path
323+
324+
def _update_map_tree_from_offsets(self):
325+
"""Build map_tree from supported external leaves in a zip store."""
326+
for filepath in self.offsets:
327+
if filepath == "embed.b2e":
328+
continue
329+
if self._probe_external_leaf_offset(filepath):
330+
self.map_tree[self._logical_key_from_relpath(filepath)] = filepath
250331

251332
@property
252333
def estore(self) -> EmbedStore:
@@ -255,13 +336,13 @@ def estore(self) -> EmbedStore:
255336

256337
@staticmethod
257338
def _value_nbytes(value: blosc2.Array | SChunk | blosc2.VLArray | blosc2.BatchStore) -> int:
258-
if isinstance(value, (blosc2.VLArray, blosc2.BatchStore)):
339+
if isinstance(value, blosc2.VLArray | blosc2.BatchStore):
259340
return value.schunk.nbytes
260341
return value.nbytes
261342

262343
@staticmethod
263344
def _is_external_value(value: blosc2.Array | SChunk | blosc2.VLArray | blosc2.BatchStore) -> bool:
264-
return isinstance(value, (blosc2.NDArray, SChunk, blosc2.VLArray, blosc2.BatchStore)) and bool(
345+
return isinstance(value, blosc2.NDArray | SChunk | blosc2.VLArray | blosc2.BatchStore) and bool(
265346
getattr(value, "urlpath", None)
266347
)
267348

@@ -406,12 +487,14 @@ def values(self) -> Iterator[blosc2.NDArray | SChunk | C2Array]:
406487
if self.is_zip_store:
407488
if filepath in self.offsets:
408489
offset = self.offsets[filepath]["offset"]
409-
yield blosc2.blosc2_ext.open(
410-
self.b2z_path,
411-
mode="r",
412-
offset=offset,
413-
mmap_mode=self.mmap_mode,
414-
dparams=self.dparams,
490+
yield _process_opened_object(
491+
blosc2.blosc2_ext.open(
492+
self.b2z_path,
493+
mode="r",
494+
offset=offset,
495+
mmap_mode=self.mmap_mode,
496+
dparams=self.dparams,
497+
)
415498
)
416499
else:
417500
urlpath = os.path.join(self.working_dir, filepath)
@@ -438,12 +521,14 @@ def items(self) -> Iterator[tuple[str, blosc2.NDArray | SChunk | C2Array]]:
438521
offset = self.offsets[filepath]["offset"]
439522
yield (
440523
key,
441-
blosc2.blosc2_ext.open(
442-
self.b2z_path,
443-
mode="r",
444-
offset=offset,
445-
mmap_mode=self.mmap_mode,
446-
dparams=self.dparams,
524+
_process_opened_object(
525+
blosc2.blosc2_ext.open(
526+
self.b2z_path,
527+
mode="r",
528+
offset=offset,
529+
mmap_mode=self.mmap_mode,
530+
dparams=self.dparams,
531+
)
447532
),
448533
)
449534
else:

tests/test_dict_store.py

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,22 @@
1616
from blosc2.dict_store import DictStore
1717

1818

19+
def _rename_store_member(store_path, old_name, new_name):
20+
"""Rename an external leaf inside a .b2d/.b2z store without changing its contents."""
21+
if str(store_path).endswith(".b2d"):
22+
old_path = os.path.join(store_path, old_name.replace("/", os.sep))
23+
new_path = os.path.join(store_path, new_name.replace("/", os.sep))
24+
os.rename(old_path, new_path)
25+
return
26+
27+
tmp_zip = f"{store_path}.tmp"
28+
with zipfile.ZipFile(store_path, "r") as src, zipfile.ZipFile(tmp_zip, "w", zipfile.ZIP_STORED) as dst:
29+
for info in src.infolist():
30+
arcname = new_name if info.filename == old_name else info.filename
31+
dst.writestr(arcname, src.read(info.filename), compress_type=zipfile.ZIP_STORED)
32+
os.replace(tmp_zip, store_path)
33+
34+
1935
@pytest.fixture(params=["b2d", "b2z"])
2036
def populated_dict_store(request):
2137
"""Create and populate a DictStore for tests.
@@ -266,6 +282,78 @@ def test_external_vlarray_file_and_reopen(tmp_path):
266282
assert value.vlmeta["description"] == "External VLArray"
267283

268284

285+
@pytest.mark.parametrize("storage_type", ["b2d", "b2z"])
286+
def test_metadata_discovery_reopens_renamed_external_ndarray(storage_type, tmp_path):
287+
path = tmp_path / f"test_renamed_ndarray.{storage_type}"
288+
ext_path = tmp_path / "renamed_array_source.b2nd"
289+
290+
with DictStore(str(path), mode="w", threshold=None) as dstore:
291+
arr_external = blosc2.arange(5, urlpath=str(ext_path), mode="w")
292+
arr_external.vlmeta["description"] = "Renamed NDArray"
293+
dstore["/dir1/node3"] = arr_external
294+
295+
old_name = "dir1/node3.b2nd"
296+
new_name = "dir1/node3.weird"
297+
_rename_store_member(str(path), old_name, new_name)
298+
299+
with pytest.warns(UserWarning, match=r"node3\.weird'.*NDArray.*expected '\.b2nd'"):
300+
dstore_read = DictStore(str(path), mode="r")
301+
with dstore_read:
302+
assert dstore_read.map_tree["/dir1/node3"] == new_name
303+
node3 = dstore_read["/dir1/node3"]
304+
assert isinstance(node3, blosc2.NDArray)
305+
assert np.array_equal(node3[:], np.arange(5))
306+
assert node3.vlmeta["description"] == "Renamed NDArray"
307+
308+
309+
@pytest.mark.parametrize("storage_type", ["b2d", "b2z"])
310+
def test_metadata_discovery_reopens_renamed_external_vlarray(storage_type, tmp_path):
311+
path = tmp_path / f"test_renamed_vlarray.{storage_type}"
312+
ext_path = tmp_path / "renamed_vlarray_source.b2frame"
313+
values = ["alpha", {"nested": True}, None, (1, 2, 3)]
314+
315+
vlarray = blosc2.VLArray(urlpath=str(ext_path), mode="w", contiguous=True)
316+
vlarray.extend(values)
317+
vlarray.vlmeta["description"] = "Renamed VLArray"
318+
319+
with DictStore(str(path), mode="w", threshold=None) as dstore:
320+
dstore["/dir1/vlarray_ext"] = vlarray
321+
322+
old_name = "dir1/vlarray_ext.b2f"
323+
new_name = "dir1/vlarray_ext.renamed"
324+
_rename_store_member(str(path), old_name, new_name)
325+
326+
with pytest.warns(UserWarning, match=r"vlarray_ext\.renamed'.*VLArray.*expected '\.b2f'"):
327+
dstore_read = DictStore(str(path), mode="r")
328+
with dstore_read:
329+
assert dstore_read.map_tree["/dir1/vlarray_ext"] == new_name
330+
value = dstore_read["/dir1/vlarray_ext"]
331+
assert isinstance(value, blosc2.VLArray)
332+
assert list(value) == values
333+
assert value.vlmeta["description"] == "Renamed VLArray"
334+
335+
336+
def test_metadata_discovery_warns_and_skips_unsupported_blosc2_leaf(tmp_path):
337+
path = tmp_path / "test_unsupported_lazyexpr.b2d"
338+
339+
with DictStore(str(path), mode="w") as dstore:
340+
dstore["/embedded"] = np.arange(3)
341+
342+
a = blosc2.asarray(np.arange(5), urlpath=str(tmp_path / "a.b2nd"), mode="w")
343+
b = blosc2.asarray(np.arange(5), urlpath=str(tmp_path / "b.b2nd"), mode="w")
344+
expr = a + b
345+
expr_path = path / "unsupported_lazyexpr.b2nd"
346+
expr.save(str(expr_path))
347+
348+
with pytest.warns(
349+
UserWarning, match=r"Ignoring unsupported Blosc2 object.*unsupported_lazyexpr\.b2nd.*LazyExpr"
350+
):
351+
dstore_read = DictStore(str(path), mode="r")
352+
with dstore_read:
353+
assert "/unsupported_lazyexpr" not in dstore_read
354+
assert "/embedded" in dstore_read
355+
356+
269357
def _digest_value(value):
270358
"""Return a bytes digest of a stored value."""
271359
if isinstance(value, blosc2.SChunk):

tests/test_tree_store.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
import os
99
import shutil
10+
import zipfile
1011

1112
import numpy as np
1213
import pytest
@@ -15,6 +16,22 @@
1516
from blosc2.tree_store import TreeStore
1617

1718

19+
def _rename_store_member(store_path, old_name, new_name):
20+
"""Rename an external leaf inside a .b2d/.b2z store without changing its contents."""
21+
if str(store_path).endswith(".b2d"):
22+
old_path = os.path.join(store_path, old_name.replace("/", os.sep))
23+
new_path = os.path.join(store_path, new_name.replace("/", os.sep))
24+
os.rename(old_path, new_path)
25+
return
26+
27+
tmp_zip = f"{store_path}.tmp"
28+
with zipfile.ZipFile(store_path, "r") as src, zipfile.ZipFile(tmp_zip, "w", zipfile.ZIP_STORED) as dst:
29+
for info in src.infolist():
30+
arcname = new_name if info.filename == old_name else info.filename
31+
dst.writestr(arcname, src.read(info.filename), compress_type=zipfile.ZIP_STORED)
32+
os.replace(tmp_zip, store_path)
33+
34+
1835
@pytest.fixture(params=["b2d", "b2z"])
1936
def populated_tree_store(request):
2037
"""A fixture that creates and populates a TreeStore."""
@@ -671,6 +688,28 @@ def test_external_batchstore_support(tmp_path):
671688
assert [batch[:] for batch in retrieved] == [[{"id": 1}, {"id": 2}], [{"id": 3}]]
672689

673690

691+
@pytest.mark.parametrize("storage_type", ["b2d", "b2z"])
692+
def test_metadata_discovery_reopens_renamed_batchstore_leaf(storage_type, tmp_path):
693+
store_path = tmp_path / f"test_batchstore_renamed.{storage_type}"
694+
695+
with TreeStore(str(store_path), mode="w", threshold=0) as tstore:
696+
bstore = blosc2.BatchStore(max_blocksize=2)
697+
bstore.extend([[{"id": 1}, {"id": 2}], [{"id": 3}]])
698+
tstore["/data/batchstore"] = bstore
699+
700+
old_name = "data/batchstore.b2b"
701+
new_name = "data/batchstore.odd"
702+
_rename_store_member(str(store_path), old_name, new_name)
703+
704+
with pytest.warns(UserWarning, match=r"batchstore\.odd'.*BatchStore.*expected '\.b2b'"):
705+
tstore = TreeStore(str(store_path), mode="r")
706+
with tstore:
707+
assert tstore.map_tree["/data/batchstore"] == new_name
708+
retrieved = tstore["/data/batchstore"]
709+
assert isinstance(retrieved, blosc2.BatchStore)
710+
assert [batch[:] for batch in retrieved] == [[{"id": 1}, {"id": 2}], [{"id": 3}]]
711+
712+
674713
def test_treestore_vlmeta_externalized_b2d(tmp_path):
675714
store_path = tmp_path / "test_vlmeta_externalized.b2d"
676715

0 commit comments

Comments
 (0)