
Commit 38e8697 (merge of parents 11c03cc and 506c1cc)

fix: incorporate record index with blank to null handling changes

13 files changed: 333 additions & 20 deletions

.github/workflows/ci_publish.yml

Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
+name: CI Publish
+
+on: workflow_dispatch
+
+jobs:
+  build:
+    name: Build dist
+    runs-on: ubuntu-24.04
+    steps:
+      - uses: actions/checkout@v6
+
+      - name: Install extra dependencies for a python install
+        run: |
+          sudo apt-get update
+          sudo apt -y install --no-install-recommends liblzma-dev libbz2-dev libreadline-dev
+
+      - name: Install asdf cli
+        uses: asdf-vm/actions/setup@b7bcd026f18772e44fe1026d729e1611cc435d47 # v4.0.1
+
+      - name: Install software through asdf
+        uses: asdf-vm/actions/install@b7bcd026f18772e44fe1026d729e1611cc435d47 # v4.0.1
+
+      - name: reshim asdf
+        run: asdf reshim
+
+      - name: ensure poetry using desired python version
+        run: poetry env use $(asdf which python)
+
+      - name: build wheels
+        run: make dist
+
+      - name: store wheels
+        uses: actions/upload-artifact@v5
+        with:
+          name: dve-wheels
+          path: dist/
+
+  publish-to-pypi:
+    name: Publish to PyPi
+    if: startsWith(github.ref, 'refs/tags/')
+    needs:
+      - build
+    runs-on: ubuntu-24.04
+    environment:
+      name: pypi
+      url: https://pypi.org/org/data-validation-engine
+    permissions:
+      id-token: write # IMPORTANT: mandatory for trusted publishing
+    steps:
+      - name: download dist
+        uses: actions/download-artifact@v6
+        with:
+          name: dve-wheels
+          path: dist/
+
+      - name: publish
+        uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # v1.13.0

CHANGELOG.md

Lines changed: 9 additions & 0 deletions
@@ -1,3 +1,12 @@
+## v0.6.2 (2026-03-09)
+
+### Fix
+
+- fix issue where formattedtime type not triggering validate correctly (#54)
+- fix postcode type to raise error when postcode submitted without space (#53)
+- amend arrow file read to use streams
+- deal with case sensitivity on file extension derivation (#50)
+
 ## v0.6.1 (2026-02-19)

 ### Fix

poetry.lock

Lines changed: 11 additions & 4 deletions
Some generated files are not rendered by default.

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -51,7 +51,7 @@ faker = "18.11.1"
 behave = "1.3.3"
 coverage = "7.11.0"
 moto = {extras = ["s3"], version = "4.0.13"}
-Werkzeug = "3.1.5"
+Werkzeug = "3.1.6"
 pytest = "8.4.2"
 pytest-lazy-fixtures = "1.4.0" # switched from https://github.com/TvoroG/pytest-lazy-fixture as it's no longer supported
 xlsx2csv = "0.8.2"

src/dve/core_engine/backends/implementations/duckdb/readers/csv.py

Lines changed: 20 additions & 1 deletion
@@ -55,6 +55,7 @@ def __init__(
         field_check: bool = False,
         field_check_error_code: Optional[str] = "ExpectedVsActualFieldMismatch",
         field_check_error_message: Optional[str] = "The submitted header is missing fields",
+        null_empty_strings: bool = False,
         **_,
     ):
         self.header = header
@@ -64,6 +65,7 @@ def __init__(
         self.field_check = field_check
         self.field_check_error_code = field_check_error_code
         self.field_check_error_message = field_check_error_message
+        self.null_empty_strings = null_empty_strings

         super().__init__()

@@ -118,7 +120,16 @@ def read_to_relation( # pylint: disable=unused-argument
         }

         reader_options["columns"] = ddb_schema
-        return self.add_record_index(read_csv(resource, **reader_options, parallel=False))
+
+        rel = self.add_record_index(read_csv(resource, **reader_options, parallel=False))
+
+        if self.null_empty_strings:
+            cleaned_cols = ",".join(
+                [f"NULLIF(TRIM({c}), '') as {c}" for c in reader_options["columns"].keys()]
+            )
+            rel = rel.select(cleaned_cols)
+
+        return rel


 @polars_record_index
@@ -161,6 +172,14 @@ def read_to_relation( # pylint: disable=unused-argument
             )
         )

+        if self.null_empty_strings:
+            pl_exprs = [
+                pl.col(c).str.strip_chars().replace("", None)
+                for c in df.columns
+                if not c == RECORD_INDEX_COLUMN_NAME
+            ] + [pl.col(RECORD_INDEX_COLUMN_NAME)]
+            df = df.select(pl_exprs)
+
         return ddb.sql("SELECT * FROM df")

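The DuckDB-side cleanup can be exercised on its own. A minimal sketch, assuming only the duckdb package and illustrative column names (the real reader derives its column list from reader_options["columns"]):

import duckdb

# Two columns with whitespace-only and empty cells.
rel = duckdb.sql("SELECT * FROM (VALUES ('  ', 'a'), ('b', '')) t(col1, col2)")

# Mirror the reader's projection: TRIM first, then NULLIF maps the
# resulting empty string to SQL NULL.
cleaned_cols = ",".join(f"NULLIF(TRIM({c}), '') as {c}" for c in rel.columns)
print(rel.select(cleaned_cols))
# col1: NULL, 'b'; col2: 'a', NULL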
src/dve/core_engine/backends/implementations/duckdb/reference_data.py

Lines changed: 1 addition & 1 deletion
@@ -46,4 +46,4 @@ def load_parquet_file(self, uri: str) -> DuckDBPyRelation:
     @mark_refdata_file_extension("arrow")
     def load_arrow_file(self, uri: str) -> DuckDBPyRelation:
         """Load an arrow ipc file into a duckdb relation"""
-        return self.connection.from_arrow(ipc.open_file(uri).read_all()) # type:ignore
+        return self.connection.from_arrow(ipc.open_stream(uri).read_all()) # type:ignore
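The one-word change from open_file to open_stream matters because Arrow IPC has two layouts: the random-access file format and the streaming format, and a reader for one cannot open the other. A minimal sketch of the distinction, assuming pyarrow and an illustrative path:

import pyarrow as pa
import pyarrow.ipc as ipc

table = pa.table({"code": ["A1", "B2"]})

# Write stream-format IPC bytes, the layout the commit assumes the
# reference data files now use.
with pa.OSFile("refdata.arrow", "wb") as sink:
    with ipc.new_stream(sink, table.schema) as writer:
        writer.write_table(table)

# open_stream reads the stream layout; ipc.open_file would raise on
# these bytes because they lack the file format's footer.
with pa.OSFile("refdata.arrow", "rb") as source:
    print(ipc.open_stream(source).read_all())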

src/dve/core_engine/backends/implementations/spark/readers/csv.py

Lines changed: 11 additions & 1 deletion
@@ -3,6 +3,7 @@
 from collections.abc import Iterator
 from typing import Any, Optional

+import pyspark.sql.functions as psf
 from pydantic import BaseModel
 from pyspark.sql import DataFrame, SparkSession
 from pyspark.sql.types import StructType
@@ -32,6 +33,7 @@ def __init__(
         header: bool = True,
         multi_line: bool = False,
         encoding: str = "utf-8-sig",
+        null_empty_strings: bool = False,
         spark_session: Optional[SparkSession] = None,
         **_,
     ) -> None:
@@ -42,6 +44,7 @@ def __init__(
         self.quote_char = quote_char
         self.header = header
         self.multi_line = multi_line
+        self.null_empty_strings = null_empty_strings
         self.spark_session = spark_session if spark_session else SparkSession.builder.getOrCreate() # type: ignore # pylint: disable=C0301

         super().__init__()
@@ -72,8 +75,15 @@ def read_to_dataframe(
             "multiLine": self.multi_line,
         }

-        return self.add_record_index(
+        df = self.add_record_index(
             self.spark_session.read.format("csv")
             .options(**kwargs) # type: ignore
             .load(resource, schema=spark_schema)
         )
+
+        if self.null_empty_strings:
+            df = df.select(
+                *[psf.trim(psf.col(c.name)).alias(c.name) for c in spark_schema.fields]
+            ).replace("", None)
+
+        return df
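The Spark path does the same blank-to-null cleanup with DataFrame operations. A minimal standalone sketch, assuming a local pyspark install and illustrative data and column names:

import pyspark.sql.functions as psf
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("  ", "a"), ("b", "")], ["col1", "col2"])

# Trim every column, then map remaining empty strings to nulls,
# mirroring the reader change above.
cleaned = df.select(
    *[psf.trim(psf.col(c)).alias(c) for c in df.columns]
).replace("", None)
cleaned.show()  # whitespace-only and empty cells surface as NULL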

src/dve/metadata_parser/domain_types.py

Lines changed: 49 additions & 10 deletions
@@ -173,33 +173,67 @@ def permissive_nhs_number(warn_on_test_numbers: bool = False):
     return type("NHSNumber", (NHSNumber, *NHSNumber.__bases__), dict_)


-# TODO: Make the spacing configurable. Not all downstream consumers want a single space
 class Postcode(types.ConstrainedStr):
     """Postcode constrained string"""

     regex: re.Pattern = POSTCODE_REGEX
     strip_whitespace = True
+    apply_normalize = True

     @staticmethod
-    def normalize(postcode: str) -> Optional[str]:
+    def normalize(_postcode: str) -> Optional[str]:
         """Strips internal and external spaces"""
-        postcode = postcode.replace(" ", "")
-        if not postcode or postcode.lower() in NULL_POSTCODES:
+        _postcode = _postcode.replace(" ", "")
+        if not _postcode or _postcode.lower() in NULL_POSTCODES:
             return None
-        postcode = postcode.replace(" ", "")
-        return " ".join((postcode[0:-3], postcode[-3:])).upper()
+        _postcode = _postcode.replace(" ", "")
+        return " ".join((_postcode[0:-3], _postcode[-3:])).upper()

     @classmethod
     def validate(cls, value: str) -> Optional[str]: # type: ignore
         """Validates the given postcode"""
-        stripped = cls.normalize(value)
-        if not stripped:
+        if cls.apply_normalize and value:
+            value = cls.normalize(value) # type: ignore
+
+        if not value:
             return None

-        if not cls.regex.match(stripped):
+        if not cls.regex.match(value):
             raise ValueError("Invalid Postcode submitted")

-        return stripped
+        return value
+
+
+@lru_cache()
+@validate_arguments
+def postcode(
+    # pylint: disable=R0913
+    strip_whitespace: Optional[bool] = True,
+    to_upper: Optional[bool] = False,
+    to_lower: Optional[bool] = False,
+    strict: Optional[bool] = False,
+    min_length: Optional[int] = None,
+    max_length: Optional[int] = None,
+    curtail_length: Optional[int] = None,
+    regex: Optional[str] = POSTCODE_REGEX, # type: ignore
+    apply_normalize: Optional[bool] = True,
+) -> type[Postcode]:
+    """Return a configured Postcode class with the given constraints
+    and normalization treatment.
+    """
+    dict_ = Postcode.__dict__.copy()
+    dict_["strip_whitespace"] = strip_whitespace
+    dict_["to_upper"] = to_upper
+    dict_["to_lower"] = to_lower
+    dict_["strict"] = strict
+    dict_["min_length"] = min_length
+    dict_["max_length"] = max_length
+    dict_["curtail_length"] = curtail_length
+    dict_["regex"] = regex
+    dict_["apply_normalize"] = apply_normalize
+
+    return type("Postcode", (Postcode, *Postcode.__bases__), dict_)


 class OrgID(_SimpleRegexValidator):
@@ -482,6 +516,11 @@ def validate(cls, value: Union[dt.time, dt.datetime, str]) -> dt.time | None:

         return new_time

+    @classmethod
+    def __get_validators__(cls) -> Iterator[classmethod]:
+        """Gets all validators"""
+        yield cls.validate # type: ignore
+

 @lru_cache()
 @validate_arguments
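The normalization rule the new apply_normalize flag gates can be shown standalone. A minimal sketch, with an illustrative regex and sentinel set standing in for the module's POSTCODE_REGEX and NULL_POSTCODES:

import re
from typing import Optional

# Illustrative stand-ins; the real values live in the dve module.
# The sentinel is checked after spaces are stripped, hence no space here.
NULL_POSTCODES = {"zz993vz"}
POSTCODE_RE = re.compile(r"^[A-Z]{1,2}\d[A-Z\d]? \d[A-Z]{2}$")

def normalize(postcode: str) -> Optional[str]:
    """Collapse all spaces, then reinsert a single space before the
    final three characters (the inward code) and upper-case."""
    postcode = postcode.replace(" ", "")
    if not postcode or postcode.lower() in NULL_POSTCODES:
        return None
    return " ".join((postcode[:-3], postcode[-3:])).upper()

print(normalize("sw1a1aa"))  # SW1A 1AA
print(POSTCODE_RE.match(normalize("sw1a1aa") or "") is not None)  # True

With apply_normalize=False, a value such as "SW1A1AA" skips this step and fails the regex, which is the behaviour changelog entry #53 relies on.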

tests/test_core_engine/test_backends/test_implementations/test_duckdb/test_data_contract.py

Lines changed: 1 addition & 1 deletion
@@ -51,7 +51,7 @@ def test_duckdb_data_contract_csv(temp_csv_file):
                 "description": "test",
                 "callable": "formattedtime",
                 "constraints": {
-                    "time_format": "%Y-%m-%d",
+                    "time_format": "%H:%M:%S",
                     "timezone_treatment": "forbid"
                 }
             }
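The fixture fix swaps a date pattern for a time pattern. A quick standard-library illustration of why %Y-%m-%d could never validate a time-of-day value (values illustrative):

from datetime import datetime

print(datetime.strptime("13:45:00", "%H:%M:%S").time())  # 13:45:00
try:
    datetime.strptime("13:45:00", "%Y-%m-%d")
except ValueError as exc:
    print(exc)  # time data '13:45:00' does not match format '%Y-%m-%d'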
