Skip to content

Commit 2b3eeff

Browse files
committed
Fix nullable validation, chunk sizing, print alignment, numpy mask support
1 parent 41f7a14 commit 2b3eeff

4 files changed

Lines changed: 65 additions & 55 deletions

File tree

src/blosc2/ctable.py

Lines changed: 40 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -1010,8 +1010,14 @@ def __str__(self) -> str:
10101010

10111011
sep = " ".join("─" * (w + 2) for w in widths.values())
10121012

1013+
def fmt_cell(value, width: int) -> str:
    """Render *value* left-justified in a cell of *width* characters.

    Overlong text is truncated to ``width`` characters total, with the last
    character replaced by an ellipsis so the column grid stays aligned.
    The cell is padded with one space on each side.
    """
    text = str(value)
    if len(text) > width:
        text = "".join((text[: width - 1], "…"))
    return " " + text.ljust(width) + " "

def fmt_row(values: dict) -> str:
    """Format one row dict as a single aligned line, column order per schema."""
    cells = [fmt_cell(values[name], widths[name]) for name in self.col_names]
    return " ".join(cells)
10151021

10161022
# -- batch-fetch values (one read per column, not one per cell) --
10171023
def rows_to_dicts(positions) -> list[dict]:
@@ -1162,7 +1168,10 @@ def save(self, urlpath: str, *, overwrite: bool = False) -> None:
11621168
# --- columns ---
11631169
for col in self._schema.columns:
11641170
name = col.name
1165-
col_storage = self._resolve_column_storage(col, default_chunks, default_blocks)
1171+
# Use dtype-aware defaults so large-itemsize columns (e.g. U4096) get
1172+
# sensible chunk/block sizes rather than the uint8-based defaults.
1173+
dtype_chunks, dtype_blocks = compute_chunks_blocks((capacity,), dtype=col.dtype)
1174+
col_storage = self._resolve_column_storage(col, dtype_chunks, dtype_blocks)
11661175
disk_col = file_storage.create_column(
11671176
name,
11681177
dtype=col.dtype,
@@ -1212,25 +1221,26 @@ def load(cls, urlpath: str) -> CTable:
12121221
capacity = max(phys_size, 1)
12131222

12141223
mem_storage = InMemoryTableStorage()
1215-
default_chunks, default_blocks = compute_chunks_blocks((capacity,))
1224+
bool_chunks, bool_blocks = compute_chunks_blocks((capacity,), dtype=np.dtype(np.bool_))
12161225

12171226
mem_valid = mem_storage.create_valid_rows(
12181227
shape=(capacity,),
1219-
chunks=default_chunks,
1220-
blocks=default_blocks,
1228+
chunks=bool_chunks,
1229+
blocks=bool_blocks,
12211230
)
12221231
if phys_size > 0:
12231232
mem_valid[:phys_size] = disk_valid[:]
12241233

12251234
mem_cols: dict[str, blosc2.NDArray] = {}
12261235
for col in schema.columns:
12271236
name = col.name
1237+
col_chunks, col_blocks = compute_chunks_blocks((capacity,), dtype=col.dtype)
12281238
mem_col = mem_storage.create_column(
12291239
name,
12301240
dtype=col.dtype,
12311241
shape=(capacity,),
1232-
chunks=default_chunks,
1233-
blocks=default_blocks,
1242+
chunks=col_chunks,
1243+
blocks=col_blocks,
12341244
cparams=None,
12351245
dparams=None,
12361246
)
@@ -1284,6 +1294,8 @@ def _make_view(cls, parent: CTable, new_valid_rows: blosc2.NDArray) -> CTable:
12841294
return obj
12851295

12861296
def view(self, new_valid_rows):
1297+
if isinstance(new_valid_rows, np.ndarray) and new_valid_rows.dtype == np.bool_:
1298+
new_valid_rows = blosc2.asarray(new_valid_rows)
12871299
if not (
12881300
isinstance(new_valid_rows, (blosc2.NDArray, blosc2.LazyExpr))
12891301
and (getattr(new_valid_rows, "dtype", None) == np.bool_)
@@ -1798,6 +1810,24 @@ def to_csv(self, path: str, *, header: bool = True, sep: str = ",") -> None:
17981810
for row in zip(*arrays, strict=True):
17991811
writer.writerow(row)
18001812

1813+
@staticmethod
def _csv_col_to_array(raw: list[str], col, nv) -> np.ndarray:
    """Convert a list of raw CSV cell strings to a numpy array for *col*.

    A whitespace-only cell maps to the null sentinel *nv* when one is
    configured; every other cell is cast to the column dtype.  Booleans are
    parsed explicitly because ``np.array(["False"], dtype=bool)`` would
    treat any non-empty string as True.
    """
    nullable = nv is not None

    if col.dtype == np.bool_:
        parsed = []
        for cell in raw:
            text = cell.strip()
            if nullable and text == "":
                parsed.append(nv)
            else:
                parsed.append(text in ("True", "true", "1"))
        return np.array(parsed, dtype=np.bool_)

    if col.dtype.kind == "S":
        # Bytes columns: encode non-null cells; assumes nv is already a
        # dtype-compatible value — NOTE(review): confirm sentinel type.
        cells: list = [nv if (nullable and c.strip() == "") else c.encode() for c in raw]
        return np.array(cells, dtype=col.dtype)

    # Numeric / unicode columns: numpy casts the remaining strings itself.
    cells2 = [nv if (nullable and c.strip() == "") else c for c in raw]
    return np.array(cells2, dtype=col.dtype)
1830+
18011831
@classmethod
18021832
def from_csv(
18031833
cls,
@@ -1900,25 +1930,7 @@ def from_csv(
19001930
if n > 0:
19011931
for i, col in enumerate(schema.columns):
19021932
nv = getattr(col.spec, "null_value", None)
1903-
if col.dtype == np.bool_:
1904-
# np.array(["False"], dtype=bool) treats any non-empty
1905-
# string as True. Parse "True"/"False"/"1"/"0" explicitly.
1906-
# Empty cells → null_value (or False if no null_value).
1907-
def _parse_bool(v, _nv=nv):
1908-
stripped = v.strip()
1909-
if stripped == "" and _nv is not None:
1910-
return _nv
1911-
return stripped in ("True", "true", "1")
1912-
1913-
arr = np.array([_parse_bool(v) for v in col_data[i]], dtype=np.bool_)
1914-
else:
1915-
raw_strings = col_data[i]
1916-
if nv is not None:
1917-
# Replace empty cells with the null sentinel string representation,
1918-
# then cast. For numeric types, store nv directly.
1919-
nv_str = str(nv)
1920-
raw_strings = [nv_str if v.strip() == "" else v for v in raw_strings]
1921-
arr = np.array(raw_strings, dtype=col.dtype)
1933+
arr = cls._csv_col_to_array(col_data[i], col, nv)
19221934
new_cols[col.name][:n] = arr
19231935
new_valid[:n] = True
19241936
obj._n_rows = n
@@ -2535,6 +2547,8 @@ def extend(self, data: list | CTable | Any, *, validate: bool | None = None) ->
25352547

25362548
@profile
25372549
def where(self, expr_result) -> CTable:
2550+
if isinstance(expr_result, np.ndarray) and expr_result.dtype == np.bool_:
2551+
expr_result = blosc2.asarray(expr_result)
25382552
if not (
25392553
isinstance(expr_result, (blosc2.NDArray, blosc2.LazyExpr))
25402554
and (getattr(expr_result, "dtype", None) == np.bool_)

src/blosc2/schema.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -50,6 +50,7 @@ def __init_subclass__(cls, **kwargs):
5050
if isinstance(_np_dtype, np.dtype):
5151
cls.itemsize = _np_dtype.itemsize
5252
cls.kind = _np_dtype.kind
53+
cls.type = _np_dtype.type
5354
cls.str = _np_dtype.str
5455
cls.name = _np_dtype.name
5556

src/blosc2/schema_validation.py

Lines changed: 23 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -29,17 +29,29 @@ def build_validator_model(schema: CompiledSchema) -> type[BaseModel]:
2929
The model enforces all constraints declared in each column's
3030
:class:`~blosc2.schema.SchemaSpec` (``ge``, ``le``, ``gt``, ``lt``,
3131
``max_length``, ``min_length``, ``pattern``).
32+
33+
Nullable columns (those with a ``null_value``) are typed as
34+
``Optional[T]`` with ``default=None`` so that null sentinels can be
35+
passed as ``None`` and bypass constraint validation entirely — no
36+
placeholder guessing required.
3237
"""
3338
if schema.validator_model is not None:
3439
return schema.validator_model
3540

3641
field_definitions: dict[str, Any] = {}
3742
for col in schema.columns:
3843
pydantic_kwargs = col.spec.to_pydantic_kwargs()
44+
is_nullable = getattr(col.spec, "null_value", None) is not None
45+
py_type = col.py_type | None if is_nullable else col.py_type
46+
3947
if col.default is MISSING:
40-
field_definitions[col.name] = (col.py_type, Field(**pydantic_kwargs))
48+
default = None if is_nullable else MISSING
49+
if default is MISSING:
50+
field_definitions[col.name] = (py_type, Field(**pydantic_kwargs))
51+
else:
52+
field_definitions[col.name] = (py_type, Field(default=default, **pydantic_kwargs))
4153
else:
42-
field_definitions[col.name] = (col.py_type, Field(default=col.default, **pydantic_kwargs))
54+
field_definitions[col.name] = (py_type, Field(default=col.default, **pydantic_kwargs))
4355

4456
cls_name = schema.row_cls.__name__ if schema.row_cls is not None else "Unknown"
4557
model_cls = create_model(f"_Validator_{cls_name}", **field_definitions)
@@ -61,29 +73,15 @@ def _is_null_value(val, null_value) -> bool:
6173
return val == null_value
6274

6375

64-
def _safe_placeholder(col) -> Any:
65-
"""Return a value that passes Pydantic validation for *col* (used for null bypass)."""
66-
spec = col.spec
67-
ge = getattr(spec, "ge", None)
68-
gt = getattr(spec, "gt", None)
69-
if ge is not None:
70-
return ge
71-
if gt is not None:
72-
return gt + 1
73-
# For string/bytes, use an empty string
74-
if col.dtype.kind in ("U", "S"):
75-
return ""
76-
# For bool
77-
if col.dtype.kind == "b":
78-
return False
79-
return 0
80-
81-
8276
def _mask_nulls(schema: CompiledSchema, row: dict[str, Any]) -> tuple[dict[str, Any], dict[str, Any]]:
83-
"""Replace null sentinel values with safe placeholders.
77+
"""Replace null sentinel values with ``None`` so Pydantic skips constraint checks.
78+
79+
Nullable columns are declared as ``Optional[T]`` in the validator model,
80+
so passing ``None`` is always valid regardless of ``ge``/``le``/``pattern``
81+
constraints. The original sentinel is stashed in *nulled* and restored
82+
after validation.
8483
85-
Returns (masked_row, null_positions) where null_positions maps
86-
column name → original null value for columns that were masked.
84+
Returns (masked_row, nulled) where nulled maps column name → sentinel value.
8785
"""
8886
masked = dict(row)
8987
nulled: dict[str, Any] = {}
@@ -92,9 +90,9 @@ def _mask_nulls(schema: CompiledSchema, row: dict[str, Any]) -> tuple[dict[str,
9290
if nv is None:
9391
continue
9492
val = row.get(col.name)
95-
if val is not None and _is_null_value(val, nv):
93+
if _is_null_value(val, nv):
9694
nulled[col.name] = val
97-
masked[col.name] = _safe_placeholder(col)
95+
masked[col.name] = None
9896
return masked, nulled
9997

10098

src/blosc2/schema_vectorized.py

Lines changed: 1 addition & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -79,12 +79,9 @@ def validate_column_values(col: CompiledColumn, values: Any) -> None:
7979

8080
# Compute null mask so sentinels bypass constraint checks
8181
null_mask = _null_mask_for_spec(arr, spec)
82-
# non_null is a boolean array True for positions that must be validated
8382
if null_mask is not None:
84-
non_null = ~null_mask
85-
check = arr[non_null]
83+
check = arr[~null_mask]
8684
else:
87-
non_null = None
8885
check = arr
8986

9087
# Numeric bounds

0 commit comments

Comments (0)