Skip to content

Commit 5ae15cb

Browse files
committed
Add a tmpdir parameter to specify the directory for temporary files during full index creation
1 parent 808d993 commit 5ae15cb

3 files changed

Lines changed: 86 additions & 5 deletions

File tree

src/blosc2/indexing.py

Lines changed: 30 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -272,6 +272,18 @@ def _tmpdir_for_array(array: blosc2.NDArray) -> str | None:
272272
return None
273273

274274

275+
def _resolve_full_index_tmpdir(array: blosc2.NDArray, tmpdir: str | None) -> str | None:
276+
"""Resolve the workspace for temporary files used by OOC full index builds.
277+
278+
An explicit ``tmpdir`` wins. Otherwise, persistent arrays default to the
279+
directory that stores the array so temporary sidecars stay on the same
280+
filesystem. In-memory arrays fall back to the system temporary directory.
281+
"""
282+
if tmpdir is not None:
283+
return tmpdir
284+
return _tmpdir_for_array(array)
285+
286+
275287
def _load_store(array: blosc2.NDArray) -> dict:
276288
if _is_persistent_array(array):
277289
key = _array_key(array)
@@ -3006,8 +3018,16 @@ def create_index(
30063018
persistent: bool | None = None,
30073019
in_mem: bool = False,
30083020
name: str | None = None,
3021+
tmpdir: str | None = None,
30093022
**kwargs,
30103023
) -> dict:
3024+
"""Create an index descriptor for a 1-D array or structured field.
3025+
3026+
Parameters are equivalent to :meth:`blosc2.NDArray.create_index`.
3027+
``tmpdir`` controls where temporary files for out-of-core ``kind="full"``
3028+
builds are created. If ``None``, persistent arrays default to their own
3029+
directory and in-memory arrays use the system temporary directory.
3030+
"""
30113031
cparams = _normalize_index_cparams(kwargs.pop("cparams", None))
30123032
del kwargs
30133033
dtype = _validate_index_target(array, field)
@@ -3034,7 +3054,7 @@ def create_index(
30343054
full = None
30353055
if kind == "full":
30363056
with tempfile.TemporaryDirectory(
3037-
prefix="blosc2-index-ooc-", dir=_tmpdir_for_array(array)
3057+
prefix="blosc2-index-ooc-", dir=_resolve_full_index_tmpdir(array, tmpdir)
30383058
) as tmpdir:
30393059
full = _build_full_descriptor_ooc(
30403060
array, target, token, kind, dtype, persistent, Path(tmpdir), cparams
@@ -3201,8 +3221,15 @@ def create_expr_index(
32013221
return _copy_descriptor(descriptor)
32023222

32033223

3204-
def create_csindex(array: blosc2.NDArray, field: str | None = None, **kwargs) -> dict:
3205-
return create_index(array, field=field, kind="full", **kwargs)
3224+
def create_csindex(
3225+
array: blosc2.NDArray, field: str | None = None, tmpdir: str | None = None, **kwargs
3226+
) -> dict:
3227+
"""Create a full sorted index descriptor.
3228+
3229+
This is shorthand for :func:`create_index` with ``kind="full"``.
3230+
``tmpdir`` has the same meaning as in :func:`create_index`.
3231+
"""
3232+
return create_index(array, field=field, kind="full", tmpdir=tmpdir, **kwargs)
32063233

32073234

32083235
def _resolve_index_token(store: dict, field: str | None, name: str | None) -> str:

src/blosc2/ndarray.py

Lines changed: 12 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4763,6 +4763,7 @@ def create_index(
47634763
persistent: bool | None = None,
47644764
in_mem: bool = False,
47654765
name: str | None = None,
4766+
tmpdir: str | None = None,
47664767
**kwargs: Any,
47674768
) -> dict:
47684769
"""Create an index for a 1-D array or structured field.
@@ -4791,6 +4792,12 @@ def create_index(
47914792
Optional logical label stored in the descriptor. Index identity is
47924793
still driven by the target field, so creating another index on the
47934794
same field replaces the previous one.
4795+
tmpdir : str or None, optional
4796+
Directory to use for temporary files during out-of-core
4797+
``kind="full"`` builds. If ``None``, persistent arrays use the same
4798+
directory as the array being indexed so temporaries stay on the
4799+
same filesystem. In-memory arrays fall back to the system
4800+
temporary directory.
47944801
kwargs : dict, optional
47954802
Keyword arguments forwarded to the index builder. At the moment the
47964803
supported option is ``cparams``. Pass ``cparams`` to control the
@@ -4822,19 +4829,22 @@ def create_index(
48224829
persistent=persistent,
48234830
in_mem=in_mem,
48244831
name=name,
4832+
tmpdir=tmpdir,
48254833
**kwargs,
48264834
)
48274835

4828-
def create_csindex(self, field: str | None = None, **kwargs: Any) -> dict:
4836+
def create_csindex(self, field: str | None = None, tmpdir: str | None = None, **kwargs: Any) -> dict:
48294837
"""Create a fully sorted index for a 1-D array or structured field.
48304838
48314839
This is a convenience wrapper for ``create_index(kind="full")`` and is
48324840
the required index tier for direct ordered reuse in
48334841
``sort(order=...)``, ``indices(order=...)``, and ``itersorted(...)``.
4842+
``tmpdir`` controls where temporary files for out-of-core full builds
4843+
are created, following the same rules as :meth:`create_index`.
48344844
"""
48354845
from . import indexing
48364846

4837-
return indexing.create_csindex(self, field=field, **kwargs)
4847+
return indexing.create_csindex(self, field=field, tmpdir=tmpdir, **kwargs)
48384848

48394849
def create_expr_index(
48404850
self,

tests/ndarray/test_indexing.py

Lines changed: 44 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1123,6 +1123,50 @@ def test_forced_ooc_full_index_merge_preserves_sorted_sidecars(monkeypatch, tmp_
11231123
np.testing.assert_array_equal(values_sidecar[:], data[positions_sidecar[:]])
11241124

11251125

1126+
def test_create_index_full_ooc_defaults_tmpdir_to_array_directory(monkeypatch, tmp_path):
1127+
path = tmp_path / "default_tmpdir_full.b2nd"
1128+
data = np.arange(4096, dtype=np.int64)
1129+
arr = blosc2.asarray(data, urlpath=path, mode="w", chunks=(256,), blocks=(64,))
1130+
1131+
recorded = {}
1132+
real_temporary_directory = indexing.tempfile.TemporaryDirectory
1133+
1134+
def tracking_temporary_directory(*args, **kwargs):
1135+
recorded["dir"] = kwargs.get("dir")
1136+
return real_temporary_directory(*args, **kwargs)
1137+
1138+
monkeypatch.setattr(indexing.tempfile, "TemporaryDirectory", tracking_temporary_directory)
1139+
1140+
descriptor = arr.create_index(kind="full")
1141+
1142+
assert descriptor["ooc"] is True
1143+
assert recorded["dir"] == str(path.parent.resolve())
1144+
1145+
1146+
def test_create_csindex_full_ooc_uses_explicit_tmpdir(monkeypatch, tmp_path):
1147+
path = tmp_path / "explicit_tmpdir_full.b2nd"
1148+
custom_tmpdir = tmp_path / "custom-index-tmp"
1149+
custom_tmpdir.mkdir()
1150+
dtype = np.dtype([("a", np.int64), ("payload", np.int32)])
1151+
data = np.zeros(4096, dtype=dtype)
1152+
data["a"] = np.arange(data.shape[0], dtype=np.int64)
1153+
arr = blosc2.asarray(data, urlpath=path, mode="w", chunks=(256,), blocks=(64,))
1154+
1155+
recorded = {}
1156+
real_temporary_directory = indexing.tempfile.TemporaryDirectory
1157+
1158+
def tracking_temporary_directory(*args, **kwargs):
1159+
recorded["dir"] = kwargs.get("dir")
1160+
return real_temporary_directory(*args, **kwargs)
1161+
1162+
monkeypatch.setattr(indexing.tempfile, "TemporaryDirectory", tracking_temporary_directory)
1163+
1164+
descriptor = arr.create_csindex("a", tmpdir=str(custom_tmpdir))
1165+
1166+
assert descriptor["ooc"] is True
1167+
assert recorded["dir"] == str(custom_tmpdir)
1168+
1169+
11261170
@pytest.mark.parametrize("persistent", [False, True])
11271171
def test_compact_full_index_rebuilds_navigation_without_whole_loading(monkeypatch, tmp_path, persistent):
11281172
dtype = np.dtype([("a", np.int64), ("b", np.int64)])

0 commit comments

Comments
 (0)