
Commit 38e8697 (merge of parents 11c03cc and 506c1cc)

fix: incorporate record index with blank to null handling changes

13 files changed: 333 additions & 20 deletions

.github/workflows/ci_publish.yml

Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
+name: CI Publish
+
+on: workflow_dispatch
+
+jobs:
+  build:
+    name: Build dist
+    runs-on: ubuntu-24.04
+    steps:
+      - uses: actions/checkout@v6
+
+      - name: Install extra dependencies for a python install
+        run: |
+          sudo apt-get update
+          sudo apt -y install --no-install-recommends liblzma-dev libbz2-dev libreadline-dev
+
+      - name: Install asdf cli
+        uses: asdf-vm/actions/setup@b7bcd026f18772e44fe1026d729e1611cc435d47 # v4.0.1
+
+      - name: Install software through asdf
+        uses: asdf-vm/actions/install@b7bcd026f18772e44fe1026d729e1611cc435d47 # v4.0.1
+
+      - name: reshim asdf
+        run: asdf reshim
+
+      - name: ensure poetry using desired python version
+        run: poetry env use $(asdf which python)
+
+      - name: build wheels
+        run: make dist
+
+      - name: store wheels
+        uses: actions/upload-artifact@v5
+        with:
+          name: dve-wheels
+          path: dist/
+
+  publish-to-pypi:
+    name: Publish to PyPi
+    if: startsWith(github.ref, 'refs/tags/')
+    needs:
+      - build
+    runs-on: ubuntu-24.04
+    environment:
+      name: pypi
+      url: https://pypi.org/org/data-validation-engine
+    permissions:
+      id-token: write # IMPORTANT: mandatory for trusted publishing
+    steps:
+      - name: download dist
+        uses: actions/download-artifact@v6
+        with:
+          name: dve-wheels
+          path: dist/
+
+      - name: publish
+        uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # v1.13.0

CHANGELOG.md

Lines changed: 9 additions & 0 deletions
@@ -1,3 +1,12 @@
+## v0.6.2 (2026-03-09)
+
+### Fix
+
+- fix issue where formattedtime type not triggering validate correctly (#54)
+- fix postcode type to raise error when postcode submitted without space (#53)
+- amend arrow file read to use streams
+- deal with case sensitivity on file extension derivation (#50)
+
 ## v0.6.1 (2026-02-19)

 ### Fix

poetry.lock

Lines changed: 11 additions & 4 deletions
Some generated files are not rendered by default.

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -51,7 +51,7 @@ faker = "18.11.1"
 behave = "1.3.3"
 coverage = "7.11.0"
 moto = {extras = ["s3"], version = "4.0.13"}
-Werkzeug = "3.1.5"
+Werkzeug = "3.1.6"
 pytest = "8.4.2"
 pytest-lazy-fixtures = "1.4.0" # switched from https://github.com/TvoroG/pytest-lazy-fixture as it's no longer supported
 xlsx2csv = "0.8.2"

src/dve/core_engine/backends/implementations/duckdb/readers/csv.py

Lines changed: 20 additions & 1 deletion
@@ -55,6 +55,7 @@ def __init__(
         field_check: bool = False,
         field_check_error_code: Optional[str] = "ExpectedVsActualFieldMismatch",
         field_check_error_message: Optional[str] = "The submitted header is missing fields",
+        null_empty_strings: bool = False,
         **_,
     ):
         self.header = header
@@ -64,6 +65,7 @@ def __init__(
         self.field_check = field_check
         self.field_check_error_code = field_check_error_code
         self.field_check_error_message = field_check_error_message
+        self.null_empty_strings = null_empty_strings

         super().__init__()

@@ -118,7 +120,16 @@ def read_to_relation( # pylint: disable=unused-argument
         }

         reader_options["columns"] = ddb_schema
-        return self.add_record_index(read_csv(resource, **reader_options, parallel=False))
+
+        rel = self.add_record_index(read_csv(resource, **reader_options, parallel=False))
+
+        if self.null_empty_strings:
+            cleaned_cols = ",".join(
+                [f"NULLIF(TRIM({c}), '') as {c}" for c in reader_options["columns"].keys()]
+            )
+            rel = rel.select(cleaned_cols)
+
+        return rel


 @polars_record_index
@@ -161,6 +172,14 @@ def read_to_relation( # pylint: disable=unused-argument
             )
         )

+        if self.null_empty_strings:
+            pl_exprs = [
+                pl.col(c).str.strip_chars().replace("", None)
+                for c in df.columns
+                if not c == RECORD_INDEX_COLUMN_NAME
+            ] + [pl.col(RECORD_INDEX_COLUMN_NAME)]
+            df = df.select(pl_exprs)
+
         return ddb.sql("SELECT * FROM df")

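The DuckDB-side cleanup can be exercised on its own. A minimal sketch, assuming only the duckdb package and illustrative column names (the real reader derives its column list from reader_options["columns"]):

import duckdb

# Two columns with whitespace-only and empty cells.
rel = duckdb.sql("SELECT * FROM (VALUES ('  ', 'a'), ('b', '')) t(col1, col2)")

# Mirror the reader's projection: TRIM first, then NULLIF maps the
# resulting empty string to SQL NULL.
cleaned_cols = ",".join(f"NULLIF(TRIM({c}), '') as {c}" for c in rel.columns)
print(rel.select(cleaned_cols))
# col1: NULL, 'b'; col2: 'a', NULL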
src/dve/core_engine/backends/implementations/duckdb/reference_data.py

Lines changed: 1 addition & 1 deletion
@@ -46,4 +46,4 @@ def load_parquet_file(self, uri: str) -> DuckDBPyRelation:
     @mark_refdata_file_extension("arrow")
     def load_arrow_file(self, uri: str) -> DuckDBPyRelation:
         """Load an arrow ipc file into a duckdb relation"""
-        return self.connection.from_arrow(ipc.open_file(uri).read_all()) # type:ignore
+        return self.connection.from_arrow(ipc.open_stream(uri).read_all()) # type:ignore
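The one-word change from open_file to open_stream matters because Arrow IPC has two layouts: the random-access file format and the streaming format, and a reader for one cannot open the other. A minimal sketch of the distinction, assuming pyarrow and an illustrative path:

import pyarrow as pa
import pyarrow.ipc as ipc

table = pa.table({"code": ["A1", "B2"]})

# Write stream-format IPC bytes, the layout the commit assumes the
# reference data files now use.
with pa.OSFile("refdata.arrow", "wb") as sink:
    with ipc.new_stream(sink, table.schema) as writer:
        writer.write_table(table)

# open_stream reads the stream layout; ipc.open_file would raise on
# these bytes because they lack the file format's footer.
with pa.OSFile("refdata.arrow", "rb") as source:
    print(ipc.open_stream(source).read_all())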

src/dve/core_engine/backends/implementations/spark/readers/csv.py

Lines changed: 11 additions & 1 deletion
@@ -3,6 +3,7 @@
 from collections.abc import Iterator
 from typing import Any, Optional

+import pyspark.sql.functions as psf
 from pydantic import BaseModel
 from pyspark.sql import DataFrame, SparkSession
 from pyspark.sql.types import StructType
@@ -32,6 +33,7 @@ def __init__(
         header: bool = True,
         multi_line: bool = False,
         encoding: str = "utf-8-sig",
+        null_empty_strings: bool = False,
         spark_session: Optional[SparkSession] = None,
         **_,
     ) -> None:
@@ -42,6 +44,7 @@ def __init__(
         self.quote_char = quote_char
         self.header = header
         self.multi_line = multi_line
+        self.null_empty_strings = null_empty_strings
         self.spark_session = spark_session if spark_session else SparkSession.builder.getOrCreate() # type: ignore # pylint: disable=C0301

         super().__init__()
@@ -72,8 +75,15 @@ def read_to_dataframe(
             "multiLine": self.multi_line,
         }

-        return self.add_record_index(
+        df = self.add_record_index(
             self.spark_session.read.format("csv")
             .options(**kwargs) # type: ignore
             .load(resource, schema=spark_schema)
         )
+
+        if self.null_empty_strings:
+            df = df.select(
+                *[psf.trim(psf.col(c.name)).alias(c.name) for c in spark_schema.fields]
+            ).replace("", None)
+
+        return df
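The Spark path does the same blank-to-null cleanup with DataFrame operations. A minimal standalone sketch, assuming a local pyspark install and illustrative data and column names:

import pyspark.sql.functions as psf
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("  ", "a"), ("b", "")], ["col1", "col2"])

# Trim every column, then map remaining empty strings to nulls,
# mirroring the reader change above.
cleaned = df.select(
    *[psf.trim(psf.col(c)).alias(c) for c in df.columns]
).replace("", None)
cleaned.show()  # whitespace-only and empty cells surface as NULL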

src/dve/metadata_parser/domain_types.py

Lines changed: 49 additions & 10 deletions
@@ -173,33 +173,67 @@ def permissive_nhs_number(warn_on_test_numbers: bool = False):
     return type("NHSNumber", (NHSNumber, *NHSNumber.__bases__), dict_)


-# TODO: Make the spacing configurable. Not all downstream consumers want a single space
 class Postcode(types.ConstrainedStr):
     """Postcode constrained string"""

     regex: re.Pattern = POSTCODE_REGEX
     strip_whitespace = True
+    apply_normalize = True

     @staticmethod
-    def normalize(postcode: str) -> Optional[str]:
+    def normalize(_postcode: str) -> Optional[str]:
         """Strips internal and external spaces"""
-        postcode = postcode.replace(" ", "")
-        if not postcode or postcode.lower() in NULL_POSTCODES:
+        _postcode = _postcode.replace(" ", "")
+        if not _postcode or _postcode.lower() in NULL_POSTCODES:
             return None
-        postcode = postcode.replace(" ", "")
-        return " ".join((postcode[0:-3], postcode[-3:])).upper()
+        _postcode = _postcode.replace(" ", "")
+        return " ".join((_postcode[0:-3], _postcode[-3:])).upper()

     @classmethod
     def validate(cls, value: str) -> Optional[str]: # type: ignore
         """Validates the given postcode"""
-        stripped = cls.normalize(value)
-        if not stripped:
+        if cls.apply_normalize and value:
+            value = cls.normalize(value) # type: ignore
+
+        if not value:
             return None

-        if not cls.regex.match(stripped):
+        if not cls.regex.match(value):
             raise ValueError("Invalid Postcode submitted")

-        return stripped
+        return value
+
+
+@lru_cache()
+@validate_arguments
+def postcode(
+    # pylint: disable=R0913
+    strip_whitespace: Optional[bool] = True,
+    to_upper: Optional[bool] = False,
+    to_lower: Optional[bool] = False,
+    strict: Optional[bool] = False,
+    min_length: Optional[int] = None,
+    max_length: Optional[int] = None,
+    curtail_length: Optional[int] = None,
+    regex: Optional[str] = POSTCODE_REGEX, # type: ignore
+    apply_normalize: Optional[bool] = True,
+) -> type[Postcode]:
+    """Return a configured Postcode class with the given constraints
+    and normalization treatment.
+    """
+    dict_ = Postcode.__dict__.copy()
+    dict_["strip_whitespace"] = strip_whitespace
+    dict_["to_upper"] = to_upper
+    dict_["to_lower"] = to_lower
+    dict_["strict"] = strict
+    dict_["min_length"] = min_length
+    dict_["max_length"] = max_length
+    dict_["curtail_length"] = curtail_length
+    dict_["regex"] = regex
+    dict_["apply_normalize"] = apply_normalize
+
+    return type("Postcode", (Postcode, *Postcode.__bases__), dict_)


 class OrgID(_SimpleRegexValidator):
@@ -482,6 +516,11 @@ def validate(cls, value: Union[dt.time, dt.datetime, str]) -> dt.time | None:

         return new_time

+    @classmethod
+    def __get_validators__(cls) -> Iterator[classmethod]:
+        """Gets all validators"""
+        yield cls.validate # type: ignore
+

 @lru_cache()
 @validate_arguments
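The normalization rule the new apply_normalize flag gates can be shown standalone. A minimal sketch, with an illustrative regex and sentinel set standing in for the module's POSTCODE_REGEX and NULL_POSTCODES:

import re
from typing import Optional

# Illustrative stand-ins; the real values live in the dve module.
# The sentinel is checked after spaces are stripped, hence no space here.
NULL_POSTCODES = {"zz993vz"}
POSTCODE_RE = re.compile(r"^[A-Z]{1,2}\d[A-Z\d]? \d[A-Z]{2}$")

def normalize(postcode: str) -> Optional[str]:
    """Collapse all spaces, then reinsert a single space before the
    final three characters (the inward code) and upper-case."""
    postcode = postcode.replace(" ", "")
    if not postcode or postcode.lower() in NULL_POSTCODES:
        return None
    return " ".join((postcode[:-3], postcode[-3:])).upper()

print(normalize("sw1a1aa"))  # SW1A 1AA
print(POSTCODE_RE.match(normalize("sw1a1aa") or "") is not None)  # True

With apply_normalize=False, a value such as "SW1A1AA" skips this step and fails the regex, which is the behaviour changelog entry #53 relies on.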

tests/test_core_engine/test_backends/test_implementations/test_duckdb/test_data_contract.py

Lines changed: 1 addition & 1 deletion
@@ -51,7 +51,7 @@ def test_duckdb_data_contract_csv(temp_csv_file):
                 "description": "test",
                 "callable": "formattedtime",
                 "constraints": {
-                    "time_format": "%Y-%m-%d",
+                    "time_format": "%H:%M:%S",
                     "timezone_treatment": "forbid"
                 }
             }
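The fixture fix swaps a date pattern for a time pattern. A quick standard-library illustration of why %Y-%m-%d could never validate a time-of-day value (values illustrative):

from datetime import datetime

print(datetime.strptime("13:45:00", "%H:%M:%S").time())  # 13:45:00
try:
    datetime.strptime("13:45:00", "%Y-%m-%d")
except ValueError as exc:
    print(exc)  # time data '13:45:00' does not match format '%Y-%m-%d'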
