Skip to content

Commit 35537c5

Browse files
committed
refactor: define codec and data type classes upstream in a subpackage
1 parent ca9bd3e commit 35537c5

14 files changed

Lines changed: 640 additions & 27 deletions

File tree

packages/zarr-interfaces/README.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# zarr-interfaces
2+
3+
Interface definitions (ABCs and protocols) for zarr codecs and data types.
4+
5+
This package provides the abstract base classes and protocols that external
6+
codec and data type implementations should subclass or implement. It has
7+
minimal dependencies (only numpy) and does not depend on zarr-python itself.
8+
9+
## Usage
10+
11+
```python
12+
from zarr_interfaces.codec.v1 import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec
13+
from zarr_interfaces.data_type.v1 import ZDType
14+
```
15+
16+
Interfaces are versioned under a `v1` namespace to support future evolution
17+
without breaking existing implementations.
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
[build-system]
2+
requires = ["hatchling"]
3+
build-backend = "hatchling.build"
4+
5+
[project]
6+
name = "zarr-interfaces"
7+
version = "0.1.0"
8+
description = "Interface definitions (ABCs and protocols) for zarr codecs and data types"
9+
readme = "README.md"
10+
license = "BSD-3-Clause"
11+
requires-python = ">=3.12"
12+
dependencies = [
13+
"numpy>=2",
14+
]
15+
16+
[project.urls]
17+
homepage = "https://github.com/zarr-developers/zarr-python"

packages/zarr-interfaces/src/zarr_interfaces/__init__.py

Whitespace-only changes.

packages/zarr-interfaces/src/zarr_interfaces/codec/__init__.py

Whitespace-only changes.
Lines changed: 202 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,202 @@
1+
"""Codec interface definitions (v1).
2+
3+
This module defines the abstract interfaces for zarr codecs.
4+
External codec implementations should subclass ``ArrayArrayCodec``,
5+
``ArrayBytesCodec``, or ``BytesBytesCodec`` from this module.
6+
7+
The ``Buffer`` and ``NDBuffer`` types here are protocols — they define
8+
the structural interface that zarr's concrete buffer types implement.
9+
Codec authors should type against these protocols, not zarr's concrete
10+
buffer classes.
11+
"""
12+
13+
from __future__ import annotations

from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import TYPE_CHECKING, ClassVar, Protocol, Self, runtime_checkable

if TYPE_CHECKING:
    from collections.abc import Iterable

    import numpy as np
    import numpy.typing as npt

    from zarr_interfaces.data_type.v1 import JSON, TBaseDType, TBaseScalar, ZDType
24+
25+
26+
# ---------------------------------------------------------------------------
27+
# Buffer protocols
28+
# ---------------------------------------------------------------------------
29+
30+
31+
class Buffer(Protocol):
    """Structural type for a flat, contiguous, bytes-like block of memory.

    Anything that reports a length and can be sliced into another object
    of the same structural type satisfies this protocol.
    """

    def __len__(self) -> int:
        """Return the length of the buffer."""
        ...

    def __getitem__(self, key: slice) -> Buffer:
        """Return the part of the buffer selected by the slice ``key``."""
        ...
36+
37+
38+
class NDBuffer(Protocol):
    """Structural type for a buffer that holds an N-dimensional array."""

    @property
    def dtype(self) -> np.dtype[np.generic]:
        """The numpy data type of the buffer."""
        ...

    @property
    def shape(self) -> tuple[int, ...]:
        """The shape of the buffer, as a tuple of ints."""
        ...

    def as_ndarray_like(self) -> npt.NDArray[np.generic]:
        """Return the buffer's contents as an array."""
        ...

    @classmethod
    def from_ndarray_like(cls, data: npt.NDArray[np.generic]) -> NDBuffer:
        """Build a buffer from the array ``data``."""
        ...

    def transpose(self, axes: tuple[int, ...]) -> NDBuffer:
        """Return a buffer transposed according to ``axes``."""
        ...

    def __getitem__(self, key: object) -> NDBuffer:
        """Return the buffer selected by ``key``."""
        ...

    def __setitem__(self, key: object, value: object) -> None:
        """Assign ``value`` to the selection ``key``."""
        ...
57+
58+
59+
# ---------------------------------------------------------------------------
60+
# ArraySpec protocol
61+
# ---------------------------------------------------------------------------
62+
63+
64+
class ArraySpec(Protocol):
    """Structural type describing the metadata of a single chunk.

    All members are read-only properties.
    """

    @property
    def shape(self) -> tuple[int, ...]:
        """The chunk's shape, as a tuple of ints."""
        ...

    @property
    def dtype(self) -> ZDType[TBaseDType, TBaseScalar]:
        """The chunk's data type, as a ``ZDType`` instance."""
        ...

    @property
    def fill_value(self) -> object:
        """The chunk's fill value."""
        ...

    @property
    def ndim(self) -> int:
        """The chunk's number of dimensions."""
        ...
78+
79+
80+
# ---------------------------------------------------------------------------
81+
# Codec input/output type aliases
82+
# ---------------------------------------------------------------------------
83+
84+
type CodecInput = NDBuffer | Buffer
85+
type CodecOutput = NDBuffer | Buffer
86+
87+
88+
# ---------------------------------------------------------------------------
89+
# Sync codec protocol
90+
# ---------------------------------------------------------------------------
91+
92+
93+
@runtime_checkable
94+
class SupportsSyncCodec[CI: CodecInput, CO: CodecOutput](Protocol):
95+
"""Protocol for codecs that support synchronous encode/decode.
96+
97+
The type parameters mirror ``BaseCodec``: ``CI`` is the decoded type
98+
and ``CO`` is the encoded type.
99+
"""
100+
101+
def _decode_sync(self, chunk_data: CO, chunk_spec: ArraySpec) -> CI: ...
102+
103+
def _encode_sync(self, chunk_data: CI, chunk_spec: ArraySpec) -> CO | None: ...
104+
105+
106+
# ---------------------------------------------------------------------------
107+
# Codec ABCs
108+
# ---------------------------------------------------------------------------
109+
110+
111+
@dataclass(frozen=True)
112+
class BaseCodec[CI: CodecInput, CO: CodecOutput](ABC):
113+
"""Generic base class for codecs.
114+
115+
Subclass ``ArrayArrayCodec``, ``ArrayBytesCodec``, or
116+
``BytesBytesCodec`` instead of this class directly.
117+
"""
118+
119+
is_fixed_size: ClassVar[bool]
120+
121+
@classmethod
122+
def from_dict(cls, data: dict[str, JSON]) -> Self:
123+
"""Create an instance from a JSON dictionary."""
124+
return cls(**data) # type: ignore[arg-type]
125+
126+
def to_dict(self) -> dict[str, JSON]:
127+
"""Serialize this codec to a JSON dictionary."""
128+
raise NotImplementedError
129+
130+
@abstractmethod
131+
def compute_encoded_size(self, input_byte_length: int, chunk_spec: ArraySpec) -> int:
132+
"""Return the encoded byte length for a given input byte length."""
133+
...
134+
135+
def resolve_metadata(self, chunk_spec: ArraySpec) -> ArraySpec:
136+
"""Return the chunk spec after encoding by this codec.
137+
138+
Override this for codecs that change shape, dtype, or fill value.
139+
"""
140+
return chunk_spec
141+
142+
def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
143+
"""Fill in codec parameters that can be inferred from array metadata."""
144+
return self
145+
146+
def validate(
147+
self,
148+
*,
149+
shape: tuple[int, ...],
150+
dtype: ZDType[TBaseDType, TBaseScalar],
151+
chunk_grid: object,
152+
) -> None:
153+
"""Validate that this codec is compatible with the array metadata.
154+
155+
The default implementation does nothing. Override to add checks.
156+
"""
157+
158+
async def _decode_single(self, chunk_data: CO, chunk_spec: ArraySpec) -> CI:
159+
"""Decode a single chunk. Override this or ``_decode_sync``."""
160+
raise NotImplementedError
161+
162+
async def decode(
163+
self,
164+
chunks_and_specs: Iterable[tuple[CO | None, ArraySpec]],
165+
) -> Iterable[CI | None]:
166+
"""Decode a batch of chunks."""
167+
results: list[CI | None] = []
168+
for chunk_data, chunk_spec in chunks_and_specs:
169+
if chunk_data is not None:
170+
results.append(await self._decode_single(chunk_data, chunk_spec))
171+
else:
172+
results.append(None)
173+
return results
174+
175+
async def _encode_single(self, chunk_data: CI, chunk_spec: ArraySpec) -> CO | None:
176+
"""Encode a single chunk. Override this or ``_encode_sync``."""
177+
raise NotImplementedError
178+
179+
async def encode(
180+
self,
181+
chunks_and_specs: Iterable[tuple[CI | None, ArraySpec]],
182+
) -> Iterable[CO | None]:
183+
"""Encode a batch of chunks."""
184+
results: list[CO | None] = []
185+
for chunk_data, chunk_spec in chunks_and_specs:
186+
if chunk_data is not None:
187+
results.append(await self._encode_single(chunk_data, chunk_spec))
188+
else:
189+
results.append(None)
190+
return results
191+
192+
193+
class ArrayArrayCodec(BaseCodec[NDBuffer, NDBuffer]):
    """Codec base whose decoded and encoded forms are both ``NDBuffer``.

    Subclass this for array-to-array codecs (e.g. transpose, scale_offset).
    """
195+
196+
197+
class ArrayBytesCodec(BaseCodec[NDBuffer, Buffer]):
    """Codec base that decodes to ``NDBuffer`` and encodes to ``Buffer``.

    Subclass this for array-to-bytes codecs (e.g. bytes, sharding).
    """
199+
200+
201+
class BytesBytesCodec(BaseCodec[Buffer, Buffer]):
    """Codec base whose decoded and encoded forms are both ``Buffer``.

    Subclass this for bytes-to-bytes codecs (e.g. gzip, zstd).
    """

packages/zarr-interfaces/src/zarr_interfaces/data_type/__init__.py

Whitespace-only changes.
Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
"""Data type interface definitions (v1).
2+
3+
This module defines the abstract interface for zarr data types.
4+
External data type implementations should subclass ``ZDType`` from this
5+
module. The interface is intentionally minimal and stable.
6+
"""
7+
8+
from __future__ import annotations
9+
10+
from abc import ABC, abstractmethod
11+
from dataclasses import dataclass
12+
from typing import (
13+
ClassVar,
14+
Literal,
15+
Self,
16+
TypeGuard,
17+
overload,
18+
)
19+
20+
import numpy as np
21+
22+
# JSON-like type for serialization
23+
type JSON = str | int | float | bool | dict[str, JSON] | list[JSON] | None
24+
type ZarrFormat = Literal[2, 3]
25+
26+
# Bounds for the scalar and dtype type parameters
27+
type TBaseScalar = np.generic | str | bytes
28+
type TBaseDType = np.dtype[np.generic]
29+
30+
# JSON representations of data types
31+
type DTypeJSON = JSON
32+
type DTypeSpec_V2 = str | list[tuple[str, DTypeJSON]]
33+
type DTypeSpec_V3 = str | dict[str, JSON]
34+
35+
36+
@dataclass(frozen=True, kw_only=True, slots=True)
37+
class ZDType[DType: TBaseDType, Scalar: TBaseScalar](ABC):
38+
"""Abstract base class for wrapping native array data types.
39+
40+
Subclasses must implement all abstract methods to support serialization,
41+
deserialization, and scalar handling for their native data type.
42+
43+
Type Parameters
44+
---------------
45+
DType
46+
The native data type (e.g. ``np.dtype[np.float64]``).
47+
Scalar
48+
The scalar type produced by this data type (e.g. ``np.float64``).
49+
"""
50+
51+
dtype_cls: ClassVar[type[TBaseDType]]
52+
_zarr_v3_name: ClassVar[str]
53+
54+
@classmethod
55+
def _check_native_dtype(cls: type[Self], dtype: TBaseDType) -> TypeGuard[DType]:
56+
"""Check that a native data type matches ``dtype_cls``."""
57+
return type(dtype) is cls.dtype_cls
58+
59+
@classmethod
60+
@abstractmethod
61+
def from_native_dtype(cls: type[Self], dtype: TBaseDType) -> Self:
62+
"""Create an instance from a native data type."""
63+
...
64+
65+
@abstractmethod
66+
def to_native_dtype(self: Self) -> DType:
67+
"""Return the native data type wrapped by this instance."""
68+
...
69+
70+
@classmethod
71+
@abstractmethod
72+
def _from_json_v2(cls: type[Self], data: DTypeJSON) -> Self: ...
73+
74+
@classmethod
75+
@abstractmethod
76+
def _from_json_v3(cls: type[Self], data: DTypeJSON) -> Self: ...
77+
78+
@classmethod
79+
def from_json(cls: type[Self], data: DTypeJSON, *, zarr_format: ZarrFormat) -> Self:
80+
"""Create an instance from JSON metadata."""
81+
if zarr_format == 2:
82+
return cls._from_json_v2(data)
83+
if zarr_format == 3:
84+
return cls._from_json_v3(data)
85+
raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}")
86+
87+
@overload
88+
def to_json(self, zarr_format: Literal[2]) -> DTypeSpec_V2: ...
89+
90+
@overload
91+
def to_json(self, zarr_format: Literal[3]) -> DTypeSpec_V3: ...
92+
93+
@abstractmethod
94+
def to_json(self, zarr_format: ZarrFormat) -> DTypeSpec_V2 | DTypeSpec_V3:
95+
"""Serialize this data type to JSON."""
96+
...
97+
98+
@abstractmethod
99+
def _check_scalar(self, data: object) -> bool:
100+
"""Check that a python object is a valid scalar for this data type."""
101+
...
102+
103+
@abstractmethod
104+
def cast_scalar(self, data: object) -> Scalar:
105+
"""Cast a python object to the scalar type of this data type."""
106+
...
107+
108+
@abstractmethod
109+
def default_scalar(self) -> Scalar:
110+
"""Return the default scalar value for this data type."""
111+
...
112+
113+
@abstractmethod
114+
def from_json_scalar(self: Self, data: JSON, *, zarr_format: ZarrFormat) -> Scalar:
115+
"""Deserialize a JSON value to a scalar."""
116+
...
117+
118+
@abstractmethod
119+
def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> JSON:
120+
"""Serialize a scalar value to JSON."""
121+
...

packages/zarr-interfaces/src/zarr_interfaces/metadata/__init__.py

Whitespace-only changes.
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
"""Metadata protocol (v1).
2+
3+
Defines the structural interface for objects that can be serialized
4+
to and deserialized from JSON dictionaries.
5+
"""
6+
7+
from __future__ import annotations
8+
9+
from typing import Protocol, Self, runtime_checkable
10+
11+
type JSON = str | int | float | bool | dict[str, JSON] | list[JSON] | None
12+
13+
14+
@runtime_checkable
15+
class Metadata(Protocol):
16+
"""Protocol for objects that round-trip through JSON dictionaries."""
17+
18+
@classmethod
19+
def from_dict(cls, data: dict[str, JSON]) -> Self:
20+
"""Create an instance from a JSON dictionary."""
21+
...
22+
23+
def to_dict(self) -> dict[str, JSON]:
24+
"""Serialize to a JSON dictionary."""
25+
...

0 commit comments

Comments
 (0)