diff --git a/changes/3899.bugfix.md b/changes/3899.bugfix.md new file mode 100644 index 0000000000..c24f803e37 --- /dev/null +++ b/changes/3899.bugfix.md @@ -0,0 +1,2 @@ +Make chunk normalization properly handle `-1` as a compact representation of the length of +an entire axis. \ No newline at end of file diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index f0cd5dd734..e1166df7f3 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -39,9 +39,11 @@ ) from zarr.core.buffer.cpu import buffer_prototype as cpu_buffer_prototype from zarr.core.chunk_grids import ( + SHARDED_INNER_CHUNK_MAX_BYTES, ChunkGrid, - _auto_partition, - normalize_chunks, + guess_chunks, + normalize_chunks_nd, + resolve_outer_and_inner_chunks, ) from zarr.core.chunk_key_encodings import ( ChunkKeyEncoding, @@ -120,10 +122,8 @@ ) from zarr.core.metadata.v3 import ( ChunkGridMetadata, - RectilinearChunkGridMetadata, - RegularChunkGridMetadata, + create_chunk_grid_metadata, parse_node_type_array, - resolve_chunks, ) from zarr.core.sync import sync from zarr.errors import ( @@ -412,6 +412,7 @@ async def _create( from zarr.core.chunk_grids import _is_rectilinear_chunks + # Unify the v2 (chunks) and v3 (chunk_shape) parameter names _raw_chunks = chunks if chunks is not None else chunk_shape config_parsed = parse_array_config(config) @@ -437,7 +438,11 @@ async def _create( item_size = 1 if isinstance(dtype_parsed, HasItemSize): item_size = dtype_parsed.item_size - chunk_grid = resolve_chunks(_raw_chunks, shape, item_size) + if _raw_chunks is None: + outer_chunks = guess_chunks(shape, item_size) + else: + outer_chunks = normalize_chunks_nd(_raw_chunks, shape) + chunk_grid = create_chunk_grid_metadata(outer_chunks) result = await cls._create_v3( store_path, shape=shape, @@ -468,10 +473,12 @@ async def _create( item_size = 1 if isinstance(dtype_parsed, HasItemSize): item_size = dtype_parsed.item_size - if chunks: - _chunks = normalize_chunks(chunks, shape, item_size) + _raw = chunks or chunk_shape + if _raw is None: + outer_chunks = guess_chunks(shape, item_size) else: - _chunks = normalize_chunks(chunk_shape, shape, item_size) + outer_chunks = normalize_chunks_nd(_raw, shape) + _chunks = tuple(dim[0] for dim in outer_chunks) if order is None: order_parsed = config_parsed.order @@ -4384,6 +4391,7 @@ async def init_array( zdtype = parse_dtype(dtype, zarr_format=zarr_format) shape_parsed = parse_shapelike(shape) + item_size = zdtype.item_size if isinstance(zdtype, HasItemSize) else 1 chunk_key_encoding_parsed = _parse_chunk_key_encoding( chunk_key_encoding, zarr_format=zarr_format ) @@ -4396,12 +4404,9 @@ async def init_array( else: await ensure_no_existing_node(store_path, zarr_format=zarr_format) - # Detect rectilinear (nested list) chunks or shards, e.g. [[10, 20, 30], [25, 25]] + # Validate rectilinear chunks constraints from zarr.core.chunk_grids import _is_rectilinear_chunks - rectilinear_meta: RectilinearChunkGridMetadata | None = None - rectilinear_shards = _is_rectilinear_chunks(shards) - if _is_rectilinear_chunks(chunks): if zarr_format == 2: raise ValueError("Zarr format 2 does not support rectilinear chunk grids.") @@ -4411,43 +4416,29 @@ async def init_array( "Use rectilinear shards instead: " "chunks=(inner_size, ...), shards=[[shard_sizes], ...]" ) - rectilinear_meta = RectilinearChunkGridMetadata( - chunk_shapes=tuple(tuple(dim_edges) for dim_edges in chunks) + + # Normalize the user's chunks into canonical ChunksTuple form + if chunks is None or chunks == "auto": + chunks_normalized = guess_chunks( + shape_parsed, + item_size, + max_bytes=SHARDED_INNER_CHUNK_MAX_BYTES if shards is not None else None, ) - # Use first chunk size per dim as placeholder for _auto_partition - chunks_flat: tuple[int, ...] | Literal["auto"] = tuple(dim_edges[0] for dim_edges in chunks) else: - # Normalize scalar int to per-dimension tuple (e.g. chunks=100000 for a 1D array) - if isinstance(chunks, int): - chunks = tuple(chunks for _ in shape_parsed) - chunks_flat = cast("tuple[int, ...] | Literal['auto']", chunks) - - # Handle rectilinear shards: shards=[[60, 40, 20], [50, 50]] - # means variable-sized shard boundaries with uniform inner chunks - shards_for_partition: ShardsLike | None = shards - if _is_rectilinear_chunks(shards): - if zarr_format == 2: - raise ValueError("Zarr format 2 does not support rectilinear chunk grids.") - rectilinear_meta = RectilinearChunkGridMetadata( - chunk_shapes=tuple(tuple(dim_edges) for dim_edges in shards) - ) - # Use first shard size per dim as placeholder for _auto_partition - shards_for_partition = tuple(dim_edges[0] for dim_edges in shards) + chunks_normalized = normalize_chunks_nd(chunks, shape_parsed) - item_size = 1 - if isinstance(zdtype, HasItemSize): - item_size = zdtype.item_size - - shard_shape_parsed, chunk_shape_parsed = _auto_partition( + # Resolve chunks + shards into outer_chunks (grid metadata) and + # inner (sub-chunk structure for ShardingCodec, None if no sharding) + outer_chunks, inner = resolve_outer_and_inner_chunks( array_shape=shape_parsed, - shard_shape=shards_for_partition, - chunk_shape=chunks_flat, + chunks=chunks_normalized, + shard_shape=shards, item_size=item_size, ) - chunks_out: tuple[int, ...] + meta: ArrayV2Metadata | ArrayV3Metadata if zarr_format == 2: - if shard_shape_parsed is not None: + if inner is not None: msg = ( "Zarr format 2 arrays can only be created with `shard_shape` set to `None`. " f"Got `shard_shape={shards}` instead." @@ -4471,7 +4462,7 @@ async def init_array( meta = AsyncArray._create_metadata_v2( shape=shape_parsed, dtype=zdtype, - chunks=chunk_shape_parsed, + chunks=tuple(dim[0] for dim in outer_chunks), dimension_separator=chunk_key_encoding_parsed.separator, fill_value=fill_value, order=order_parsed, @@ -4487,40 +4478,29 @@ async def init_array( dtype=zdtype, ) sub_codecs = cast("tuple[Codec, ...]", (*array_array, array_bytes, *bytes_bytes)) + grid = create_chunk_grid_metadata(outer_chunks) codecs_out: tuple[Codec, ...] - if shard_shape_parsed is not None: + if inner is not None: + inner_chunks_flat = tuple(dim[0] for dim in inner.outer_chunks) index_location = None if isinstance(shards, dict): index_location = ShardingCodecIndexLocation(shards.get("index_location", None)) if index_location is None: index_location = ShardingCodecIndexLocation.end sharding_codec = ShardingCodec( - chunk_shape=chunk_shape_parsed, codecs=sub_codecs, index_location=index_location + chunk_shape=inner_chunks_flat, codecs=sub_codecs, index_location=index_location ) - # Use rectilinear grid for validation when shards are rectilinear - if rectilinear_shards and rectilinear_meta is not None: - validation_grid: ChunkGridMetadata = rectilinear_meta - else: - validation_grid = RegularChunkGridMetadata(chunk_shape=shard_shape_parsed) sharding_codec.validate( - shape=chunk_shape_parsed, + shape=inner_chunks_flat, dtype=zdtype, - chunk_grid=validation_grid, + chunk_grid=grid, ) codecs_out = (sharding_codec,) - chunks_out = shard_shape_parsed else: - chunks_out = chunk_shape_parsed codecs_out = sub_codecs if order is not None: _warn_order_kwarg() - - grid: ChunkGridMetadata - if rectilinear_meta is not None: - grid = rectilinear_meta - else: - grid = RegularChunkGridMetadata(chunk_shape=chunks_out) meta = AsyncArray._create_metadata_v3( shape=shape_parsed, dtype=zdtype, diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index 3d7313cd5d..115e2230a5 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -8,7 +8,16 @@ import warnings from dataclasses import dataclass, field from functools import reduce -from typing import TYPE_CHECKING, Any, Literal, Protocol, TypeGuard, cast, runtime_checkable +from typing import ( + TYPE_CHECKING, + Any, + NamedTuple, + NewType, + Protocol, + TypeGuard, + cast, + runtime_checkable, +) import numpy as np import numpy.typing as npt @@ -27,6 +36,38 @@ from zarr.core.array import ShardsLike from zarr.core.metadata import ArrayMetadata +SHARDED_INNER_CHUNK_MAX_BYTES: int = 1048576 +"""Target ceiling in bytes for the auto-chunking heuristic when sharding is active (1 MiB). + +Only used when `chunks="auto"` and `shards` is not `None`. Explicit chunk sizes +are not affected by this value. +""" + +ChunksTuple = NewType("ChunksTuple", tuple[tuple[int, ...], ...]) +"""Normalized chunk specification: one tuple of chunk sizes per dimension. + +Produced exclusively by `normalize_chunks_nd` and `guess_chunks`. +Consumers should use this type to ensure they receive validated, +canonical chunk specifications rather than raw user input. +""" + + +class ResolvedChunking(NamedTuple): + """Result of resolving user `chunks`/`shards` into grid metadata inputs. + + outer_chunks + Chunk sizes for the chunk grid metadata. When sharding is active + these are the shard sizes; otherwise they are the user's chunk sizes. + inner + Recursive sub-structure inside each chunk. `None` means the chunk is + opaque (no sharding). When present, `inner.outer_chunks` gives the + sub-chunk sizes passed to `ShardingCodec`, and `inner.inner` gives + the next level of nesting (for nested sharding), or `None`. + """ + + outer_chunks: ChunksTuple + inner: ResolvedChunking | None = None + @dataclass(frozen=True) class FixedDimension: @@ -565,7 +606,7 @@ def update_shape(self, new_shape: tuple[int, ...]) -> ChunkGrid: return ChunkGrid(dimensions=dims) -def _guess_chunks( +def _guess_regular_chunks( shape: tuple[int, ...] | int, typesize: int, *, @@ -641,53 +682,99 @@ def _guess_chunks( return tuple(int(x) for x in chunks) -def normalize_chunks(chunks: Any, shape: tuple[int, ...], typesize: int) -> tuple[int, ...]: - """Convenience function to normalize the `chunks` argument for an array - with the given `shape`.""" +def normalize_chunks_1d(chunks: int | Iterable[int], span: int) -> tuple[int, ...]: + """ + Normalize a one-dimensional chunk specification into a tuple of logical + chunk sizes that cover the span. - # N.B., expect shape already normalized + `-1` means "one chunk covering the entire span." + For an integer chunk size, all chunks are uniform — the last chunk may + overhang the span. The actual data extent of each chunk is determined + by the chunk grid at runtime, not by this function. + """ + if chunks == -1: + return (span,) + if isinstance(chunks, int): + if chunks <= 0: + raise ValueError(f"Chunk size must be positive, got {chunks}") + if span == 0: + return (chunks,) + n = ceildiv(span, chunks) + return tuple(chunks for _ in range(n)) + else: + chunk_list = list(chunks) + if not chunk_list: + raise ValueError("Chunk specification must not be empty") + if any(c <= 0 for c in chunk_list): + raise ValueError(f"All chunk sizes must be positive, got {chunk_list}") + if sum(chunk_list) != span: + raise ValueError(f"Chunk sizes {chunk_list} do not sum to span {span}") + return tuple(chunk_list) + + +def normalize_chunks_nd( + chunks: Any, + shape: tuple[int, ...], +) -> ChunksTuple: + """ + Normalize a chunk specification into a `ChunksTuple`. + + This is a mechanical transformation — no heuristics, no guessing. + Handles scalar ints, -1 sentinels, None per-dimension, short specs + (padded with None), and explicit per-chunk size lists. - # handle auto-chunking - if chunks is None or chunks is True: - return _guess_chunks(shape, typesize) + For auto-chunking, use `guess_chunks` which returns a + `ChunksTuple` directly. + """ + if chunks is None: + raise ValueError( + "normalize_chunks_nd does not accept None. Use guess_chunks() for auto-chunking." + ) # handle no chunking if chunks is False: - return shape + return ChunksTuple(tuple((s,) for s in shape)) # handle 1D convenience form if isinstance(chunks, numbers.Integral): chunks = tuple(int(chunks) for _ in shape) - # handle dask-style chunks (iterable of iterables) - if all(isinstance(c, (tuple, list)) for c in chunks): - for i, c in enumerate(chunks): - if any(x != y for x, y in itertools.pairwise(c[:-1])) or (len(c) > 1 and c[-1] > c[0]): - raise ValueError( - f"Irregular chunk sizes in dimension {i}: {tuple(c)}. " - "Only uniform chunks (with an optional smaller final chunk) are supported." - ) - chunks = tuple(c[0] for c in chunks) - # handle bad dimensionality - if len(chunks) > len(shape): - raise ValueError("too many dimensions in chunks") - - # handle underspecified chunks - if len(chunks) < len(shape): - # assume chunks across remaining dimensions - chunks += shape[len(chunks) :] - - # handle None or -1 in chunks - if -1 in chunks or None in chunks: - chunks = tuple( - s if c == -1 or c is None else int(c) for s, c in zip(shape, chunks, strict=False) + if len(chunks) != len(shape): + raise ValueError( + f"chunks has {len(chunks)} dimensions but shape has {len(shape)} dimensions" ) - if not all(isinstance(c, numbers.Integral) for c in chunks): - raise TypeError("non integer value in chunks") + return ChunksTuple( + tuple(normalize_chunks_1d(c, span=s) for c, s in zip(chunks, shape, strict=True)) + ) - return tuple(int(c) for c in chunks) + +def guess_chunks( + shape: tuple[int, ...], typesize: int, *, max_bytes: int | None = None +) -> ChunksTuple: + """ + Heuristically determine chunk sizes for an array. + + This is the policy function — it makes opinionated choices about + chunk sizes based on array shape and element size, and returns a + normalized `ChunksTuple`. + + Parameters + ---------- + shape : tuple[int, ...] + Array shape. + typesize : int + Size of one element in bytes. + max_bytes : int or None + Target maximum chunk size in bytes. If None, uses the default + heuristic from `_guess_regular_chunks`. + """ + if max_bytes is not None: + flat = _guess_regular_chunks(shape, typesize, max_bytes=max_bytes) + else: + flat = _guess_regular_chunks(shape, typesize) + return normalize_chunks_nd(flat, shape) def _guess_num_chunks_per_axis_shard( @@ -727,62 +814,76 @@ def _guess_num_chunks_per_axis_shard( return chunks_per_shard -def _auto_partition( +def resolve_outer_and_inner_chunks( *, array_shape: tuple[int, ...], - chunk_shape: tuple[int, ...] | Literal["auto"], + chunks: ChunksTuple, shard_shape: ShardsLike | None, item_size: int, -) -> tuple[tuple[int, ...] | None, tuple[int, ...]]: - """ - Automatically determine the shard shape and chunk shape for an array, given the shape and dtype of the array. - If `shard_shape` is `None` and the chunk_shape is "auto", the chunks will be set heuristically based - on the dtype and shape of the array. - If `shard_shape` is "auto", then the shard shape will be set heuristically from the dtype and shape - of the array; if the `chunk_shape` is also "auto", then the chunks will be set heuristically as well, - given the dtype and shard shape. Otherwise, the chunks will be returned as-is. +) -> ResolvedChunking: + """Resolve user `chunks`/`shards` into outer and inner chunk specs. + + Parameters + ---------- + array_shape + The array shape. + chunks + Normalized chunk specification (the user's `chunks=`). + shard_shape + Raw shard specification (the user's `shards=`). + `None` means no sharding, `"auto"` triggers heuristic inference, + a nested sequence is treated as rectilinear shard boundaries, + and anything else is used as a regular shard shape. + item_size + Element size in bytes. + + Returns + ------- + ResolvedChunking + `outer_chunks` is the `ChunksTuple` for chunk grid + metadata. `inner` holds the sub-chunk structure for + `ShardingCodec`, or is `None` when sharding is not active. """ if shard_shape is None: - _shards_out: None | tuple[int, ...] = None - if chunk_shape == "auto": - _chunks_out = _guess_chunks(array_shape, item_size) - else: - _chunks_out = chunk_shape - else: - if chunk_shape == "auto": - # aim for a 1MiB chunk - _chunks_out = _guess_chunks(array_shape, item_size, max_bytes=1048576) - else: - _chunks_out = chunk_shape + return ResolvedChunking(outer_chunks=chunks) - if shard_shape == "auto": - warnings.warn( - "Automatic shard shape inference is experimental and may change without notice.", - ZarrUserWarning, - stacklevel=2, - ) - _shards_out = () - target_shard_size_bytes = zarr.config.get("array.target_shard_size_bytes", None) - num_chunks_per_shard_axis = ( - _guess_num_chunks_per_axis_shard( - chunk_shape=_chunks_out, - item_size=item_size, - max_bytes=target_shard_size_bytes, - array_shape=array_shape, - ) - if (has_auto_shard := (target_shard_size_bytes is not None)) - else 2 + # Rectilinear shards: normalize the nested sequence directly. + if _is_rectilinear_chunks(shard_shape): + outer = normalize_chunks_nd(shard_shape, array_shape) + return ResolvedChunking(outer_chunks=outer, inner=ResolvedChunking(outer_chunks=chunks)) + + # Extract the flat chunk shape (first size per dimension) for arithmetic. + chunk_shape_flat = tuple(dim[0] for dim in chunks) + + if shard_shape == "auto": + warnings.warn( + "Automatic shard shape inference is experimental and may change without notice.", + ZarrUserWarning, + stacklevel=2, + ) + _shards_out: tuple[int, ...] = () + target_shard_size_bytes = zarr.config.get("array.target_shard_size_bytes", None) + num_chunks_per_shard_axis = ( + _guess_num_chunks_per_axis_shard( + chunk_shape=chunk_shape_flat, + item_size=item_size, + max_bytes=target_shard_size_bytes, + array_shape=array_shape, ) - for a_shape, c_shape in zip(array_shape, _chunks_out, strict=True): - # The previous heuristic was `a_shape // c_shape > 8` and now, with target_shard_size_bytes, we only check that the shard size is less than the array size. - can_shard_axis = a_shape // c_shape > 8 if not has_auto_shard else True - if can_shard_axis: - _shards_out += (c_shape * num_chunks_per_shard_axis,) - else: - _shards_out += (c_shape,) - elif isinstance(shard_shape, dict): - _shards_out = tuple(shard_shape["shape"]) - else: - _shards_out = cast("tuple[int, ...]", shard_shape) + if (has_auto_shard := (target_shard_size_bytes is not None)) + else 2 + ) + for a_shape, c_shape in zip(array_shape, chunk_shape_flat, strict=True): + can_shard_axis = a_shape // c_shape > 8 if not has_auto_shard else True + if can_shard_axis: + _shards_out += (c_shape * num_chunks_per_shard_axis,) + else: + _shards_out += (c_shape,) + shard_flat = _shards_out + elif isinstance(shard_shape, dict): + shard_flat = tuple(shard_shape["shape"]) + else: + shard_flat = cast("tuple[int, ...]", shard_shape) - return _shards_out, _chunks_out + outer = normalize_chunks_nd(shard_flat, array_shape) + return ResolvedChunking(outer_chunks=outer, inner=ResolvedChunking(outer_chunks=chunks)) diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 7773e2489d..1b54df97fb 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -17,7 +17,6 @@ from zarr.core.common import ( JSON, ZARR_JSON, - ChunksLike, DimensionNamesLike, NamedConfig, NamedRequiredConfig, @@ -39,6 +38,7 @@ from typing import Self from zarr.core.buffer import Buffer, BufferPrototype + from zarr.core.chunk_grids import ChunksTuple from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar @@ -368,27 +368,53 @@ def from_dict(cls, data: RectilinearChunkGridMetadataJSON) -> Self: # type: ign ChunkGridMetadata = RegularChunkGridMetadata | RectilinearChunkGridMetadata -def resolve_chunks( - chunks: ChunksLike, - shape: tuple[int, ...], - typesize: int, +def is_regular_1d(dim_chunks: Sequence[int]) -> bool: + """Check if a single dimension's chunk sizes represent a regular grid. + + A regular dimension has either all chunks the same size, or all + but the last chunk the same size with the last chunk smaller + (boundary chunk). + """ + if len(dim_chunks) <= 1: + return True + first = dim_chunks[0] + for c in dim_chunks[1:-1]: + if c != first: + return False + # Last chunk must be the same size or a smaller boundary chunk + return dim_chunks[-1] <= first + + +def is_regular_nd(chunks: Iterable[Sequence[int]]) -> bool: + """Check if an N-dimensional chunk specification represents a regular grid.""" + return all(is_regular_1d(d) for d in chunks) + + +def create_chunk_grid_metadata( + chunks: ChunksTuple, ) -> ChunkGridMetadata: - """Construct a chunk grid from user-facing input (e.g. ``create_array(chunks=...)``). + """Construct a chunk grid metadata object from a normalized `ChunksTuple`. + + Regular chunks produce a `RegularChunkGridMetadata`. + Rectilinear chunks produce a `RectilinearChunkGridMetadata`. - Nested sequences like ``[[10, 20], [5, 5]]`` produce a ``RectilinearChunkGridMetadata``. - Flat inputs like ``(10, 10)`` or a scalar ``int`` produce a ``RegularChunkGridMetadata`` - after normalization via :func:`~zarr.core.chunk_grids.normalize_chunks`. + Parameters + ---------- + chunks : ChunksTuple + Normalized chunk specification, as returned by + `normalize_chunks_nd` or `guess_chunks`. See Also -------- parse_chunk_grid : Deserialize a chunk grid from stored JSON metadata. """ - from zarr.core.chunk_grids import _is_rectilinear_chunks, normalize_chunks - - if _is_rectilinear_chunks(chunks): - return RectilinearChunkGridMetadata(chunk_shapes=tuple(tuple(c) for c in chunks)) - - return RegularChunkGridMetadata(chunk_shape=normalize_chunks(chunks, shape, typesize)) + if is_regular_nd(chunks): + # If we know the chunks specification is regular, then we can take the first + # chunk size for each dimension as the chunk shape. + chunk_shape = tuple(dim_chunks[0] for dim_chunks in chunks) + return RegularChunkGridMetadata(chunk_shape=chunk_shape) + else: + return RectilinearChunkGridMetadata(chunk_shapes=chunks) def parse_chunk_grid( @@ -398,7 +424,7 @@ def parse_chunk_grid( See Also -------- - resolve_chunks : Construct a chunk grid from user-facing input. + create_chunk_grid_metadata : Construct a chunk grid from user-facing input. """ if isinstance(data, (RegularChunkGridMetadata, RectilinearChunkGridMetadata)): return data diff --git a/tests/conftest.py b/tests/conftest.py index a02006d6a9..8f879764a7 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -22,7 +22,12 @@ _parse_chunk_encoding_v3, _parse_chunk_key_encoding, ) -from zarr.core.chunk_grids import _auto_partition +from zarr.core.chunk_grids import ( + SHARDED_INNER_CHUNK_MAX_BYTES, + guess_chunks, + normalize_chunks_nd, + resolve_outer_and_inner_chunks, +) from zarr.core.common import ( JSON, DimensionNamesLike, @@ -37,7 +42,7 @@ ) from zarr.core.dtype.common import HasItemSize from zarr.core.metadata.v2 import ArrayV2Metadata -from zarr.core.metadata.v3 import ArrayV3Metadata, RegularChunkGridMetadata +from zarr.core.metadata.v3 import ArrayV3Metadata, create_chunk_grid_metadata from zarr.core.sync import sync from zarr.storage import FsspecStore, LocalStore, MemoryStore, StorePath, ZipStore from zarr.testing.store import LatencyStore @@ -326,10 +331,18 @@ def create_array_metadata( item_size = 1 if isinstance(dtype_parsed, HasItemSize): item_size = dtype_parsed.item_size - shard_shape_parsed, chunk_shape_parsed = _auto_partition( + if chunks == "auto": + chunks_normalized = guess_chunks( + shape_parsed, + item_size, + max_bytes=SHARDED_INNER_CHUNK_MAX_BYTES if shards is not None else None, + ) + else: + chunks_normalized = normalize_chunks_nd(chunks, shape_parsed) + outer_chunks, inner = resolve_outer_and_inner_chunks( array_shape=shape_parsed, + chunks=chunks_normalized, shard_shape=shards, - chunk_shape=chunks, item_size=item_size, ) @@ -337,7 +350,6 @@ def create_array_metadata( order_parsed = zarr_config.get("array.order") else: order_parsed = order - chunks_out: tuple[int, ...] if zarr_format == 2: filters_parsed, compressor_parsed = _parse_chunk_encoding_v2( @@ -347,7 +359,7 @@ def create_array_metadata( return ArrayV2Metadata( shape=shape_parsed, dtype=dtype_parsed, - chunks=chunk_shape_parsed, + chunks=tuple(dim[0] for dim in outer_chunks), order=order_parsed, dimension_separator=chunk_key_encoding_parsed.separator, fill_value=fill_value, @@ -365,32 +377,32 @@ def create_array_metadata( sub_codecs: tuple[Codec, ...] = (*array_array, array_bytes, *bytes_bytes) codecs_out: tuple[Codec, ...] - if shard_shape_parsed is not None: + if inner is not None: + inner_chunks_flat = tuple(dim[0] for dim in inner.outer_chunks) index_location = None if isinstance(shards, dict): index_location = ShardingCodecIndexLocation(shards.get("index_location", None)) if index_location is None: index_location = ShardingCodecIndexLocation.end sharding_codec = ShardingCodec( - chunk_shape=chunk_shape_parsed, + chunk_shape=inner_chunks_flat, codecs=sub_codecs, index_location=index_location, ) + validation_grid = create_chunk_grid_metadata(outer_chunks) sharding_codec.validate( - shape=chunk_shape_parsed, + shape=inner_chunks_flat, dtype=dtype_parsed, - chunk_grid=RegularChunkGridMetadata(chunk_shape=shard_shape_parsed), + chunk_grid=validation_grid, ) codecs_out = (sharding_codec,) - chunks_out = shard_shape_parsed else: - chunks_out = chunk_shape_parsed codecs_out = sub_codecs return ArrayV3Metadata( shape=shape_parsed, data_type=dtype_parsed, - chunk_grid={"name": "regular", "configuration": {"chunk_shape": chunks_out}}, + chunk_grid=create_chunk_grid_metadata(outer_chunks), chunk_key_encoding=chunk_key_encoding_parsed, fill_value=fill_value, codecs=codecs_out, diff --git a/tests/test_array.py b/tests/test_array.py index f7f564f30e..2f93720475 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -46,7 +46,7 @@ ) from zarr.core.array_spec import ArrayConfig, ArrayConfigParams from zarr.core.buffer import NDArrayLike, NDArrayLikeOrScalar, default_buffer_prototype -from zarr.core.chunk_grids import _auto_partition +from zarr.core.chunk_grids import normalize_chunks_nd, resolve_outer_and_inner_chunks from zarr.core.chunk_key_encodings import ChunkKeyEncodingParams from zarr.core.common import JSON, ZarrFormat, ceildiv from zarr.core.dtype import ( @@ -1073,36 +1073,55 @@ def test_auto_partition_auto_shards( where there are 8 or more chunks. """ dtype = np.dtype("uint8") + chunks_normalized = normalize_chunks_nd(chunk_shape, array_shape) with pytest.warns( ZarrUserWarning, match="Automatic shard shape inference is experimental and may change without notice.", ): with zarr.config.set({"array.target_shard_size_bytes": target_shard_size_bytes}): - auto_shards, _ = _auto_partition( + outer_chunks, _ = resolve_outer_and_inner_chunks( array_shape=array_shape, - chunk_shape=chunk_shape, + chunks=chunks_normalized, shard_shape="auto", item_size=dtype.itemsize, ) + auto_shards = tuple(dim[0] for dim in outer_chunks) assert auto_shards == expected_shards def test_auto_partition_auto_shards_with_auto_chunks_should_be_close_to_1MiB() -> None: """ - Test that automatically picking a shard size and a chunk size gives roughly 1MiB chunks. + Test that automatically picking chunk and shard sizes together produces + chunks close to 1 MiB and shards that are a multiple of the chunk size. """ + from zarr.core.chunk_grids import SHARDED_INNER_CHUNK_MAX_BYTES, guess_chunks + + array_shape = (10_000_000,) + item_size = 1 + # Auto-chunks with sharding use the default inner chunk size target + chunks_normalized = guess_chunks( + array_shape, item_size, max_bytes=SHARDED_INNER_CHUNK_MAX_BYTES + ) + chunk_shape = tuple(dim[0] for dim in chunks_normalized) + chunk_bytes = np.prod(chunk_shape) * item_size + assert chunk_bytes <= SHARDED_INNER_CHUNK_MAX_BYTES + assert chunk_bytes > SHARDED_INNER_CHUNK_MAX_BYTES // 4 # should be in the right ballpark + with pytest.warns( ZarrUserWarning, match="Automatic shard shape inference is experimental and may change without notice.", ): with zarr.config.set({"array.target_shard_size_bytes": 10_000_000}): - _, chunk_shape = _auto_partition( - array_shape=(10_000_000,), - chunk_shape="auto", + outer_chunks, inner = resolve_outer_and_inner_chunks( + array_shape=array_shape, + chunks=chunks_normalized, shard_shape="auto", - item_size=1, + item_size=item_size, ) - assert chunk_shape == (625000,) + assert inner is not None + shard_shape = tuple(dim[0] for dim in outer_chunks) + # Shard dimensions must be multiples of chunk dimensions + assert all(s % c == 0 for s, c in zip(shard_shape, chunk_shape, strict=True)) def test_chunks_and_shards() -> None: @@ -2321,3 +2340,44 @@ def test_with_config_polymorphism() -> None: arr_source_config_dict = arr.with_config(source_config_dict) assert arr_source_config.config == arr_source_config_dict.config + + +@pytest.mark.parametrize( + ("chunk_input", "expected"), + [ + (-1, ((10,),)), + ((-1,), ((10,),)), + ((10,), ((10,),)), + ((5,), ((5, 5),)), + ((3,), ((3, 3, 3, 1),)), + ], + ids=["scalar-neg1", "tuple-neg1", "exact", "half", "remainder"], +) +async def test_create_array_chunks_1d( + chunk_input: int | tuple[int, ...], + expected: tuple[tuple[int, ...], ...], +) -> None: + """Test that chunk normalization produces the expected chunk sizes for 1D arrays.""" + arr = await create_array(store={}, shape=(10,), chunks=chunk_input, dtype="uint8") + assert arr.write_chunk_sizes == expected + + +@pytest.mark.parametrize( + ("chunk_input", "expected"), + [ + (-1, ((10,), (12,), (15,))), + ((3, 4, 5), ((3, 3, 3, 1), (4, 4, 4), (5, 5, 5))), + ((-1, 4, -1), ((10,), (4, 4, 4), (15,))), + ((10, 12, 15), ((10,), (12,), (15,))), + ((7, 3, 2), ((7, 3), (3, 3, 3, 3), (2, 2, 2, 2, 2, 2, 2, 1))), + ], + ids=["all-neg1", "mixed", "neg1-middle", "exact", "remainder"], +) +async def test_create_array_chunks_3d( + chunk_input: int | tuple[int, ...], + expected: tuple[tuple[int, ...], ...], +) -> None: + """Test that chunk normalization produces the expected chunk sizes for 3D arrays.""" + shape = (10, 12, 15) + arr = await create_array(store={}, shape=shape, chunks=chunk_input, dtype="float64") + assert arr.write_chunk_sizes == expected diff --git a/tests/test_chunk_grids.py b/tests/test_chunk_grids.py index 2920b5d6f3..ce8854b413 100644 --- a/tests/test_chunk_grids.py +++ b/tests/test_chunk_grids.py @@ -3,7 +3,11 @@ import numpy as np import pytest -from zarr.core.chunk_grids import _guess_chunks, normalize_chunks +from zarr.core.chunk_grids import ( + _guess_regular_chunks, + normalize_chunks_nd, + resolve_outer_and_inner_chunks, +) @pytest.mark.parametrize( @@ -11,7 +15,7 @@ ) @pytest.mark.parametrize("itemsize", [1, 2, 4]) def test_guess_chunks(shape: tuple[int, ...], itemsize: int) -> None: - chunks = _guess_chunks(shape, itemsize) + chunks = _guess_regular_chunks(shape, itemsize) chunk_size = np.prod(chunks) * itemsize assert isinstance(chunks, tuple) assert len(chunks) == len(shape) @@ -21,43 +25,116 @@ def test_guess_chunks(shape: tuple[int, ...], itemsize: int) -> None: @pytest.mark.parametrize( - ("chunks", "shape", "typesize", "expected"), + ("chunks", "shape", "expected"), [ - ((10,), (100,), 1, (10,)), - ([10], (100,), 1, (10,)), - (10, (100,), 1, (10,)), - ((10, 10), (100, 10), 1, (10, 10)), - (10, (100, 10), 1, (10, 10)), - ((10, None), (100, 10), 1, (10, 10)), - (30, (100, 20, 10), 1, (30, 30, 30)), - ((30,), (100, 20, 10), 1, (30, 20, 10)), - ((30, None), (100, 20, 10), 1, (30, 20, 10)), - ((30, None, None), (100, 20, 10), 1, (30, 20, 10)), - ((30, 20, None), (100, 20, 10), 1, (30, 20, 10)), - ((30, 20, 10), (100, 20, 10), 1, (30, 20, 10)), - # dask-style chunks (uniform with optional smaller final chunk) - (((100, 100, 100), (50, 50)), (300, 100), 1, (100, 50)), - (((100, 100, 50),), (250,), 1, (100,)), - (((100,),), (100,), 1, (100,)), - # auto chunking - (None, (100,), 1, (100,)), - (-1, (100,), 1, (100,)), - ((30, -1, None), (100, 20, 10), 1, (30, 20, 10)), + # 1D cases + ((10,), (100,), ((10,) * 10,)), + ([10], (100,), ((10,) * 10,)), + (10, (100,), ((10,) * 10,)), + # 2D cases + ((10, 10), (100, 10), ((10,) * 10, (10,))), + (10, (100, 10), ((10,) * 10, (10,))), + ((10, -1), (100, 10), ((10,) * 10, (10,))), + # 3D cases + (30, (100, 20, 10), ((30, 30, 30, 30), (30,), (30,))), + ((30, -1, -1), (100, 20, 10), ((30, 30, 30, 30), (20,), (10,))), + ((30, 20, -1), (100, 20, 10), ((30, 30, 30, 30), (20,), (10,))), + ((30, 20, 10), (100, 20, 10), ((30, 30, 30, 30), (20,), (10,))), + # dask-style chunks (explicit per-chunk sizes) + (((100, 100, 100), (50, 50)), (300, 100), ((100, 100, 100), (50, 50))), + (((100, 100, 50),), (250,), ((100, 100, 50),)), + (((100,),), (100,), ((100,),)), + # no chunking (False means each dimension is one chunk spanning the full extent) + (False, (100,), ((100,),)), + (False, (100, 50), ((100,), (50,))), + # sentinel values + (-1, (100,), ((100,),)), + # zero-length dimensions preserve the declared chunk size + (10, (0,), ((10,),)), + ((5, 10), (0, 100), ((5,), (10,) * 10)), + ((5, 10), (20, 0), ((5, 5, 5, 5), (10,))), ], ) def test_normalize_chunks( - chunks: Any, shape: tuple[int, ...], typesize: int, expected: tuple[int, ...] + chunks: Any, shape: tuple[int, ...], expected: tuple[tuple[int, ...], ...] +) -> None: + assert expected == normalize_chunks_nd(chunks, shape) + + +@pytest.mark.parametrize( + ("array_shape", "chunks_input", "shard_shape", "expected_outer", "expected_inner_outer"), + [ + # no sharding: outer = chunks, inner = None + ((100,), (10,), None, ((10,) * 10,), None), + # explicit regular shards + ((100,), (10,), (50,), ((50, 50),), ((10,) * 10,)), + # rectilinear shards + ((100,), (10,), ((60, 40),), ((60, 40),), ((10,) * 10,)), + # dict-style shards + ((100, 100), (10, 10), {"shape": (50, 50)}, ((50, 50), (50, 50)), ((10,) * 10, (10,) * 10)), + ], +) +def test_resolve_outer_and_inner_chunks( + array_shape: tuple[int, ...], + chunks_input: tuple[int, ...], + shard_shape: Any, + expected_outer: tuple[tuple[int, ...], ...], + expected_inner_outer: tuple[tuple[int, ...], ...] | None, ) -> None: - assert expected == normalize_chunks(chunks, shape, typesize) + chunks = normalize_chunks_nd(chunks_input, array_shape) + outer_chunks, inner = resolve_outer_and_inner_chunks( + array_shape=array_shape, chunks=chunks, shard_shape=shard_shape, item_size=1 + ) + assert outer_chunks == expected_outer + if expected_inner_outer is None: + assert inner is None + else: + assert inner is not None + assert inner.outer_chunks == expected_inner_outer + assert inner.inner is None + + +def test_resolved_chunking_nested() -> None: + """Test that ResolvedChunking supports recursive nesting for nested sharding.""" + from zarr.core.chunk_grids import ResolvedChunking + + leaf = normalize_chunks_nd((5, 5), (100, 100)) + mid = ResolvedChunking( + outer_chunks=normalize_chunks_nd((25, 25), (100, 100)), + inner=ResolvedChunking(outer_chunks=leaf), + ) + top = ResolvedChunking(outer_chunks=normalize_chunks_nd((50, 50), (100, 100)), inner=mid) + + # Three levels: top -> mid -> leaf + assert top.outer_chunks == ((50, 50), (50, 50)) + assert top.inner is not None + assert top.inner.outer_chunks == ((25,) * 4, (25,) * 4) + assert top.inner.inner is not None + assert top.inner.inner.outer_chunks == ((5,) * 20, (5,) * 20) + assert top.inner.inner.inner is None + + +def test_normalize_chunks_1d_errors() -> None: + from zarr.core.chunk_grids import normalize_chunks_1d + + with pytest.raises(ValueError, match="Chunk size must be positive"): + normalize_chunks_1d(0, 100) + with pytest.raises(ValueError, match="Chunk size must be positive"): + normalize_chunks_1d(-2, 100) + with pytest.raises(ValueError, match="must not be empty"): + normalize_chunks_1d([], 100) + with pytest.raises(ValueError, match="must be positive"): + normalize_chunks_1d([10, -1, 10], 100) + with pytest.raises(ValueError, match="do not sum to span"): + normalize_chunks_1d([10, 20], 100) def test_normalize_chunks_errors() -> None: + with pytest.raises(ValueError, match="does not accept None"): + normalize_chunks_nd(None, (100,)) with pytest.raises(ValueError): - normalize_chunks("foo", (100,), 1) - with pytest.raises(ValueError): - normalize_chunks((100, 10), (100,), 1) - # dask-style irregular chunks should raise - with pytest.raises(ValueError, match="Irregular chunk sizes"): - normalize_chunks(((10, 20, 30),), (60,), 1) - with pytest.raises(ValueError, match="Irregular chunk sizes"): - normalize_chunks(((100, 100), (10, 20)), (200, 30), 1) + normalize_chunks_nd("foo", (100,)) + with pytest.raises(ValueError, match="dimensions"): + normalize_chunks_nd((100, 10), (100,)) + with pytest.raises(ValueError, match="dimensions"): + normalize_chunks_nd((10,), (100, 100)) diff --git a/tests/test_codecs/test_sharding.py b/tests/test_codecs/test_sharding.py index 43d03caf11..233cc4cb77 100644 --- a/tests/test_codecs/test_sharding.py +++ b/tests/test_codecs/test_sharding.py @@ -497,7 +497,7 @@ def test_invalid_shard_shape() -> None: {}, shape=(16, 16), shards=(16, 16), - chunks=(9,), + chunks=(9, 9), dtype=np.dtype("uint8"), fill_value=0, )