Skip to content

Commit 2b3eeff

Browse files
committed
Fix nullable validation, chunk sizing, print alignment, numpy mask support
1 parent 41f7a14 commit 2b3eeff

4 files changed

Lines changed: 65 additions & 55 deletions

File tree

src/blosc2/ctable.py

Lines changed: 40 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -1010,8 +1010,14 @@ def __str__(self) -> str:
10101010

10111011
sep = " ".join("─" * (w + 2) for w in widths.values())
10121012

1013+
def fmt_cell(value, width: int) -> str:
    """Render *value* left-justified in a cell of *width* characters.

    Overlong text is truncated to ``width`` characters total, with the last
    character replaced by an ellipsis so the column grid stays aligned.
    The cell is padded with one space on each side.
    """
    text = str(value)
    if len(text) > width:
        text = "".join((text[: width - 1], "…"))
    return " " + text.ljust(width) + " "

def fmt_row(values: dict) -> str:
    """Format one row dict as a single aligned line, column order per schema."""
    cells = [fmt_cell(values[name], widths[name]) for name in self.col_names]
    return " ".join(cells)
10151021

10161022
# -- batch-fetch values (one read per column, not one per cell) --
10171023
def rows_to_dicts(positions) -> list[dict]:
@@ -1162,7 +1168,10 @@ def save(self, urlpath: str, *, overwrite: bool = False) -> None:
11621168
# --- columns ---
11631169
for col in self._schema.columns:
11641170
name = col.name
1165-
col_storage = self._resolve_column_storage(col, default_chunks, default_blocks)
1171+
# Use dtype-aware defaults so large-itemsize columns (e.g. U4096) get
1172+
# sensible chunk/block sizes rather than the uint8-based defaults.
1173+
dtype_chunks, dtype_blocks = compute_chunks_blocks((capacity,), dtype=col.dtype)
1174+
col_storage = self._resolve_column_storage(col, dtype_chunks, dtype_blocks)
11661175
disk_col = file_storage.create_column(
11671176
name,
11681177
dtype=col.dtype,
@@ -1212,25 +1221,26 @@ def load(cls, urlpath: str) -> CTable:
12121221
capacity = max(phys_size, 1)
12131222

12141223
mem_storage = InMemoryTableStorage()
1215-
default_chunks, default_blocks = compute_chunks_blocks((capacity,))
1224+
bool_chunks, bool_blocks = compute_chunks_blocks((capacity,), dtype=np.dtype(np.bool_))
12161225

12171226
mem_valid = mem_storage.create_valid_rows(
12181227
shape=(capacity,),
1219-
chunks=default_chunks,
1220-
blocks=default_blocks,
1228+
chunks=bool_chunks,
1229+
blocks=bool_blocks,
12211230
)
12221231
if phys_size > 0:
12231232
mem_valid[:phys_size] = disk_valid[:]
12241233

12251234
mem_cols: dict[str, blosc2.NDArray] = {}
12261235
for col in schema.columns:
12271236
name = col.name
1237+
col_chunks, col_blocks = compute_chunks_blocks((capacity,), dtype=col.dtype)
12281238
mem_col = mem_storage.create_column(
12291239
name,
12301240
dtype=col.dtype,
12311241
shape=(capacity,),
1232-
chunks=default_chunks,
1233-
blocks=default_blocks,
1242+
chunks=col_chunks,
1243+
blocks=col_blocks,
12341244
cparams=None,
12351245
dparams=None,
12361246
)
@@ -1284,6 +1294,8 @@ def _make_view(cls, parent: CTable, new_valid_rows: blosc2.NDArray) -> CTable:
12841294
return obj
12851295

12861296
def view(self, new_valid_rows):
1297+
if isinstance(new_valid_rows, np.ndarray) and new_valid_rows.dtype == np.bool_:
1298+
new_valid_rows = blosc2.asarray(new_valid_rows)
12871299
if not (
12881300
isinstance(new_valid_rows, (blosc2.NDArray, blosc2.LazyExpr))
12891301
and (getattr(new_valid_rows, "dtype", None) == np.bool_)
@@ -1798,6 +1810,24 @@ def to_csv(self, path: str, *, header: bool = True, sep: str = ",") -> None:
17981810
for row in zip(*arrays, strict=True):
17991811
writer.writerow(row)
18001812

1813+
@staticmethod
def _csv_col_to_array(raw: list[str], col, nv) -> np.ndarray:
    """Convert a list of raw CSV cell strings to a numpy array for *col*.

    A whitespace-only cell maps to the null sentinel *nv* when one is
    configured; every other cell is cast to the column dtype.  Booleans are
    parsed explicitly because ``np.array(["False"], dtype=bool)`` would
    treat any non-empty string as True.
    """
    nullable = nv is not None

    if col.dtype == np.bool_:
        parsed = []
        for cell in raw:
            text = cell.strip()
            if nullable and text == "":
                parsed.append(nv)
            else:
                parsed.append(text in ("True", "true", "1"))
        return np.array(parsed, dtype=np.bool_)

    if col.dtype.kind == "S":
        # Bytes columns: encode non-null cells; assumes nv is already a
        # dtype-compatible value — NOTE(review): confirm sentinel type.
        cells: list = [nv if (nullable and c.strip() == "") else c.encode() for c in raw]
        return np.array(cells, dtype=col.dtype)

    # Numeric / unicode columns: numpy casts the remaining strings itself.
    cells2 = [nv if (nullable and c.strip() == "") else c for c in raw]
    return np.array(cells2, dtype=col.dtype)
1830+
18011831
@classmethod
18021832
def from_csv(
18031833
cls,
@@ -1900,25 +1930,7 @@ def from_csv(
19001930
if n > 0:
19011931
for i, col in enumerate(schema.columns):
19021932
nv = getattr(col.spec, "null_value", None)
1903-
if col.dtype == np.bool_:
1904-
# np.array(["False"], dtype=bool) treats any non-empty
1905-
# string as True. Parse "True"/"False"/"1"/"0" explicitly.
1906-
# Empty cells → null_value (or False if no null_value).
1907-
def _parse_bool(v, _nv=nv):
1908-
stripped = v.strip()
1909-
if stripped == "" and _nv is not None:
1910-
return _nv
1911-
return stripped in ("True", "true", "1")
1912-
1913-
arr = np.array([_parse_bool(v) for v in col_data[i]], dtype=np.bool_)
1914-
else:
1915-
raw_strings = col_data[i]
1916-
if nv is not None:
1917-
# Replace empty cells with the null sentinel string representation,
1918-
# then cast. For numeric types, store nv directly.
1919-
nv_str = str(nv)
1920-
raw_strings = [nv_str if v.strip() == "" else v for v in raw_strings]
1921-
arr = np.array(raw_strings, dtype=col.dtype)
1933+
arr = cls._csv_col_to_array(col_data[i], col, nv)
19221934
new_cols[col.name][:n] = arr
19231935
new_valid[:n] = True
19241936
obj._n_rows = n
@@ -2535,6 +2547,8 @@ def extend(self, data: list | CTable | Any, *, validate: bool | None = None) ->
25352547

25362548
@profile
25372549
def where(self, expr_result) -> CTable:
2550+
if isinstance(expr_result, np.ndarray) and expr_result.dtype == np.bool_:
2551+
expr_result = blosc2.asarray(expr_result)
25382552
if not (
25392553
isinstance(expr_result, (blosc2.NDArray, blosc2.LazyExpr))
25402554
and (getattr(expr_result, "dtype", None) == np.bool_)

src/blosc2/schema.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -50,6 +50,7 @@ def __init_subclass__(cls, **kwargs):
5050
if isinstance(_np_dtype, np.dtype):
5151
cls.itemsize = _np_dtype.itemsize
5252
cls.kind = _np_dtype.kind
53+
cls.type = _np_dtype.type
5354
cls.str = _np_dtype.str
5455
cls.name = _np_dtype.name
5556

src/blosc2/schema_validation.py

Lines changed: 23 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -29,17 +29,29 @@ def build_validator_model(schema: CompiledSchema) -> type[BaseModel]:
2929
The model enforces all constraints declared in each column's
3030
:class:`~blosc2.schema.SchemaSpec` (``ge``, ``le``, ``gt``, ``lt``,
3131
``max_length``, ``min_length``, ``pattern``).
32+
33+
Nullable columns (those with a ``null_value``) are typed as
34+
``Optional[T]`` with ``default=None`` so that null sentinels can be
35+
passed as ``None`` and bypass constraint validation entirely — no
36+
placeholder guessing required.
3237
"""
3338
if schema.validator_model is not None:
3439
return schema.validator_model
3540

3641
field_definitions: dict[str, Any] = {}
3742
for col in schema.columns:
3843
pydantic_kwargs = col.spec.to_pydantic_kwargs()
44+
is_nullable = getattr(col.spec, "null_value", None) is not None
45+
py_type = col.py_type | None if is_nullable else col.py_type
46+
3947
if col.default is MISSING:
40-
field_definitions[col.name] = (col.py_type, Field(**pydantic_kwargs))
48+
default = None if is_nullable else MISSING
49+
if default is MISSING:
50+
field_definitions[col.name] = (py_type, Field(**pydantic_kwargs))
51+
else:
52+
field_definitions[col.name] = (py_type, Field(default=default, **pydantic_kwargs))
4153
else:
42-
field_definitions[col.name] = (col.py_type, Field(default=col.default, **pydantic_kwargs))
54+
field_definitions[col.name] = (py_type, Field(default=col.default, **pydantic_kwargs))
4355

4456
cls_name = schema.row_cls.__name__ if schema.row_cls is not None else "Unknown"
4557
model_cls = create_model(f"_Validator_{cls_name}", **field_definitions)
@@ -61,29 +73,15 @@ def _is_null_value(val, null_value) -> bool:
6173
return val == null_value
6274

6375

64-
def _safe_placeholder(col) -> Any:
65-
"""Return a value that passes Pydantic validation for *col* (used for null bypass)."""
66-
spec = col.spec
67-
ge = getattr(spec, "ge", None)
68-
gt = getattr(spec, "gt", None)
69-
if ge is not None:
70-
return ge
71-
if gt is not None:
72-
return gt + 1
73-
# For string/bytes, use an empty string
74-
if col.dtype.kind in ("U", "S"):
75-
return ""
76-
# For bool
77-
if col.dtype.kind == "b":
78-
return False
79-
return 0
80-
81-
8276
def _mask_nulls(schema: CompiledSchema, row: dict[str, Any]) -> tuple[dict[str, Any], dict[str, Any]]:
83-
"""Replace null sentinel values with safe placeholders.
77+
"""Replace null sentinel values with ``None`` so Pydantic skips constraint checks.
78+
79+
Nullable columns are declared as ``Optional[T]`` in the validator model,
80+
so passing ``None`` is always valid regardless of ``ge``/``le``/``pattern``
81+
constraints. The original sentinel is stashed in *nulled* and restored
82+
after validation.
8483
85-
Returns (masked_row, null_positions) where null_positions maps
86-
column name → original null value for columns that were masked.
84+
Returns (masked_row, nulled) where nulled maps column name → sentinel value.
8785
"""
8886
masked = dict(row)
8987
nulled: dict[str, Any] = {}
@@ -92,9 +90,9 @@ def _mask_nulls(schema: CompiledSchema, row: dict[str, Any]) -> tuple[dict[str,
9290
if nv is None:
9391
continue
9492
val = row.get(col.name)
95-
if val is not None and _is_null_value(val, nv):
93+
if _is_null_value(val, nv):
9694
nulled[col.name] = val
97-
masked[col.name] = _safe_placeholder(col)
95+
masked[col.name] = None
9896
return masked, nulled
9997

10098

src/blosc2/schema_vectorized.py

Lines changed: 1 addition & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -79,12 +79,9 @@ def validate_column_values(col: CompiledColumn, values: Any) -> None:
7979

8080
# Compute null mask so sentinels bypass constraint checks
8181
null_mask = _null_mask_for_spec(arr, spec)
82-
# non_null is a boolean array True for positions that must be validated
8382
if null_mask is not None:
84-
non_null = ~null_mask
85-
check = arr[non_null]
83+
check = arr[~null_mask]
8684
else:
87-
non_null = None
8885
check = arr
8986

9087
# Numeric bounds

0 commit comments

Comments (0)