Skip to content

Commit 034c0bf

Browse files
committed
Fix .b2z double-open corruption caused by GC-triggered repacking
Opening a .b2z store in append mode a second time (in a new process) was failing with "blosc2_schunk_open_offset returned NULL" because the first open had silently overwritten the archive with a near-empty ZIP file. Two root causes were identified and fixed: 1. Probe store in _open_treestore_root_object() called close() on a temporary TreeStore used only to read the manifest. This triggered to_b2z() and repacked the archive before the real open started. Fix: added DictStore.discard() (cleanup without repack) and switched the probe store to call discard() instead of close(). 2. CTable.__del__ (GC path) called storage.close() which chained into TreeStore.close() → to_b2z(). With nothing modified, the temp dir could be partially torn down, producing a corrupt archive. Fix: DictStore gains _closed and _modified flags; __del__ now calls discard() when _modified is False (no writes via __setitem__/ __delitem__), and close() otherwise. CTable.__del__ is changed to call storage.discard() directly. FileTableStorage.discard() and TableStorage.discard() added to complete the delegation chain. 3. TreeStore subtree views (created via __dict__.update from the parent) shared the parent's _temp_dir_obj. GC of a subtree could destroy the parent's temp dir. Fix: subtrees now set _closed=True immediately after copying the parent's __dict__. The contract is: - Explicit close() / with block → always repacks (user intent) - GC __del__ with no store-level writes → discard() (no repack, safe) - GC __del__ with __setitem__/__delitem__ writes → close() (repack) Add two regression tests: one at the DictStore layer and one for the full indexed-CTable-in-.b2z scenario that triggered the original report. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 6e75a47 commit 034c0bf

7 files changed

Lines changed: 125 additions & 4 deletions

File tree

src/blosc2/ctable.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1044,7 +1044,11 @@ def __exit__(self, exc_type, exc_val, exc_tb):
10441044

10451045
def __del__(self):
10461046
with contextlib.suppress(Exception):
1047-
self.close()
1047+
storage = getattr(self, "_storage", None)
1048+
if storage is not None and hasattr(storage, "discard"):
1049+
storage.discard()
1050+
elif storage is not None and hasattr(storage, "close"):
1051+
storage.close()
10481052

10491053
def _init_columns(
10501054
self, expected_size: int, default_chunks, default_blocks, storage: TableStorage

src/blosc2/ctable_storage.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,10 @@ def rename_column(self, old: str, new: str) -> blosc2.NDArray:
9292
def close(self) -> None:
9393
raise NotImplementedError
9494

95+
def discard(self) -> None:
96+
"""Clean up resources without persisting changes back to the archive."""
97+
self.close()
98+
9599
# -- Index catalog and epoch helpers -------------------------------------
96100

97101
def load_index_catalog(self) -> dict:
@@ -370,6 +374,13 @@ def close(self) -> None:
370374
self._store = None
371375
self._meta = None
372376

377+
def discard(self) -> None:
378+
"""Clean up without repacking the .b2z archive."""
379+
if self._store is not None:
380+
self._store.discard()
381+
self._store = None
382+
self._meta = None
383+
373384
# -- Index catalog and epoch helpers -------------------------------------
374385

375386
def load_index_catalog(self) -> dict:

src/blosc2/dict_store.py

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,8 @@ def __init__(
141141
self.offsets = {}
142142
self.map_tree = {}
143143
self._temp_dir_obj = None
144+
self._closed = False
145+
self._modified = False
144146

145147
self._setup_paths_and_dirs(tmpdir)
146148

@@ -315,6 +317,12 @@ def _init_write_append_mode(
315317
"""Initialize store in write/append mode."""
316318
if self.mode == "a" and os.path.exists(self.localpath):
317319
if self.is_zip_store:
320+
# When using an explicit tmpdir the directory may already contain
321+
# stale files from a previous open that was never closed. Clear
322+
# it before extracting so we always start from a clean slate.
323+
if self._temp_dir_obj is None:
324+
shutil.rmtree(self.working_dir, ignore_errors=True)
325+
os.makedirs(self.working_dir, exist_ok=True)
318326
with zipfile.ZipFile(self.localpath, "r") as zf:
319327
zf.extractall(self.working_dir)
320328
elif not os.path.isdir(self.working_dir):
@@ -387,6 +395,7 @@ def __setitem__(
387395
self, key: str, value: blosc2.Array | SChunk | blosc2.VLArray | blosc2.BatchArray
388396
) -> None:
389397
"""Add a node to the DictStore."""
398+
self._modified = True
390399
if isinstance(value, np.ndarray):
391400
value = blosc2.asarray(value, cparams=self.cparams, dparams=self.dparams)
392401
# C2Array should always go to embed store; let estore handle it directly
@@ -488,6 +497,7 @@ def get(
488497

489498
def __delitem__(self, key: str) -> None:
490499
"""Remove a node from the DictStore."""
500+
self._modified = True
491501
if key in self.map_tree:
492502
# Remove from map_tree and delete the external file
493503
filepath = self.map_tree[key]
@@ -672,6 +682,10 @@ def _get_zip_offsets(self) -> dict[str, dict[str, int]]:
672682

673683
def close(self) -> None:
674684
"""Persist changes and cleanup."""
685+
if self._closed:
686+
return
687+
self._closed = True
688+
675689
# Repack estore
676690
# TODO: for some reason this is not working
677691
# if self.mode != "r":
@@ -680,13 +694,46 @@ def close(self) -> None:
680694
# f.write(cframe)
681695

682696
if self.is_zip_store and self.mode in ("w", "a"):
683-
# Serialize to b2z file
697+
# Serialize to b2z file.
684698
self.to_b2z(overwrite=True)
685699

686700
# Clean up temporary directory if we created it
687701
if self._temp_dir_obj is not None:
688702
self._temp_dir_obj.cleanup()
689703

704+
def discard(self) -> None:
705+
"""Clean up resources *without* repacking the .b2z file.
706+
707+
Use this instead of :meth:`close` when the store was opened only for
708+
inspection and should be thrown away without persisting any changes
709+
back to the archive.
710+
"""
711+
if self._closed:
712+
return
713+
self._closed = True
714+
if self._temp_dir_obj is not None:
715+
self._temp_dir_obj.cleanup()
716+
717+
def __del__(self):
718+
"""Ensure the temporary directory is removed and, if writes were made
719+
through this store's own API, the store is flushed back to the .b2z
720+
file.
721+
722+
When no Python-level writes went through ``__setitem__`` / ``__delitem__``
723+
(``_modified`` is False), we skip ``to_b2z()`` to avoid repacking a
724+
potentially partial temp dir during garbage collection. Explicit
725+
``close()`` / ``__exit__`` always repacks regardless.
726+
"""
727+
try:
728+
if not self._closed and self.is_zip_store and self.mode in ("w", "a") and not self._modified:
729+
# Skip repacking — discard is safe and avoids corrupting the
730+
# archive when the temp dir is torn down during GC.
731+
self.discard()
732+
else:
733+
self.close()
734+
except Exception:
735+
pass
736+
690737
def __enter__(self):
691738
"""Context manager enter."""
692739
return self

src/blosc2/schunk.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1705,7 +1705,11 @@ def _open_treestore_root_object(store, urlpath, mode):
17051705
if manifest["kind"] == "ctable":
17061706
if mode not in {"r", "a"}:
17071707
return store
1708-
store.close()
1708+
# Discard the probe store without repacking — it was only opened
1709+
# to peek at the manifest. A full close() would trigger to_b2z()
1710+
# even though nothing was modified, and CTable.open() below will
1711+
# create its own store anyway.
1712+
store.discard()
17091713
return blosc2.CTable.open(urlpath, mode=mode)
17101714

17111715
return store

src/blosc2/tree_store.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,8 +154,12 @@ def __init__(self, *args, _from_parent_store=None, **kwargs):
154154
It supports the same arguments as :class:`blosc2.DictStore`.
155155
"""
156156
if _from_parent_store is not None:
157-
# This is a subtree view, copy state from parent
157+
# This is a subtree view, copy state from parent.
158+
# Mark it as closed so DictStore.__del__ does not attempt to pack
159+
# or clean up the shared backing store when this ephemeral view
160+
# is garbage-collected.
158161
self.__dict__.update(_from_parent_store.__dict__)
162+
self._closed = True
159163
else:
160164
# Call initialization and mark this storage as a b2tree object
161165
super().__init__(*args, **kwargs, _storage_meta={"b2tree": {"version": 1}})

tests/ctable/test_ctable_indexing.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -436,3 +436,32 @@ def test_indexes_multiple_columns():
436436
assert len(t.indexes) == 2
437437
col_names = {idx.col_name for idx in t.indexes}
438438
assert col_names == {"id", "category"}
439+
440+
441+
def test_indexed_ctable_b2z_double_open_append_no_corruption(tmp_path):
442+
"""Opening an indexed CTable .b2z in append mode twice must not corrupt it.
443+
444+
Regression test: GC of a CTable opened from .b2z was calling close() →
445+
to_b2z() even when nothing was modified, overwriting the archive with a
446+
near-empty ZIP that broke subsequent opens.
447+
"""
448+
path = str(tmp_path / "indexed.b2z")
449+
b2d_path = str(tmp_path / "indexed.b2d")
450+
451+
t = _make_table(50, persistent_path=b2d_path)
452+
t.create_index("id")
453+
t._storage._store.to_b2z(filename=path, overwrite=True)
454+
t._storage._store.close()
455+
shutil.rmtree(b2d_path)
456+
457+
# First open without explicit close — GC must not corrupt the archive
458+
t1 = blosc2.open(path, mode="a")
459+
assert t1.nrows == 50
460+
assert len(t1.indexes) == 1
461+
del t1 # triggers __del__; must NOT repack/corrupt
462+
463+
# Second open must succeed and see correct data
464+
t2 = blosc2.open(path, mode="a")
465+
assert t2.nrows == 50
466+
assert len(t2.indexes) == 1
467+
del t2

tests/test_dict_store.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -693,3 +693,25 @@ def test_mmap_mode_validation(tmp_path):
693693

694694
with pytest.raises(ValueError, match="mmap_mode='r' requires mode='r'"):
695695
DictStore(str(path), mode="a", mmap_mode="r")
696+
697+
698+
def test_b2z_double_open_append_no_corruption(tmp_path):
699+
"""Opening a .b2z store twice in append mode must not corrupt the archive.
700+
701+
Regression test: previously, GC of the first open's DictStore triggered
702+
``to_b2z()`` which overwrote the archive with a near-empty ZIP, causing the
703+
second open to fail with ``blosc2_schunk_open_offset`` returning NULL.
704+
"""
705+
path = str(tmp_path / "double_open.b2z")
706+
707+
with DictStore(path, mode="w") as ds:
708+
ds["/arr"] = blosc2.arange(20)
709+
710+
# First open — no explicit close (simulates the GC-triggered path)
711+
ds1 = DictStore(path, mode="a")
712+
assert np.array_equal(ds1["/arr"][:], np.arange(20))
713+
del ds1 # GC; must NOT corrupt the archive
714+
715+
# Second open — must succeed and see correct data
716+
with DictStore(path, mode="a") as ds2:
717+
assert np.array_equal(ds2["/arr"][:], np.arange(20))

0 commit comments

Comments
 (0)