Skip to content

Commit f9f4d4f

Browse files
committed
refactor: added further spark cast work with tests, small fixes to duckdb casting
1 parent 414d3ac commit f9f4d4f

6 files changed

Lines changed: 147 additions & 59 deletions

File tree

src/dve/core_engine/backends/implementations/duckdb/contract.py

Lines changed: 7 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -28,10 +28,10 @@
2828
generate_error_casting_entity_message,
2929
)
3030
from dve.core_engine.backends.implementations.duckdb.duckdb_helpers import (
31-
get_duckdb_cast_statement_from_annotation,
3231
duckdb_read_parquet,
3332
duckdb_record_index,
3433
duckdb_write_parquet,
34+
get_duckdb_cast_statement_from_annotation,
3535
get_duckdb_type_from_annotation,
3636
relation_is_empty,
3737
)
@@ -102,7 +102,7 @@ def create_entity_from_py_iterator( # pylint: disable=unused-argument
102102
_lazy_df = pl.LazyFrame(records, polars_schema) # type: ignore # pylint: disable=unused-variable
103103
return self._connection.sql("select * from _lazy_df")
104104

105-
# pylint: disable=R0914
105+
# pylint: disable=R0914,R0915
106106
def apply_data_contract(
107107
self,
108108
working_dir: URI,
@@ -170,13 +170,16 @@ def apply_data_contract(
170170

171171
casting_statements = [
172172
(
173-
get_duckdb_cast_statement_from_annotation(column, mdl_fld.annotation) + f""" AS "{column}" """
173+
get_duckdb_cast_statement_from_annotation(column, mdl_fld.annotation)
174+
+ f""" AS "{column}" """
174175
if column in relation.columns
175176
else f"CAST(NULL AS {ddb_schema[column]}) AS {column}"
176177
)
177178
for column, mdl_fld in entity_fields.items()
178179
]
179-
casting_statements.append(f"CAST({RECORD_INDEX_COLUMN_NAME} AS {get_duckdb_type_from_annotation(int)}) AS {RECORD_INDEX_COLUMN_NAME}")
180+
casting_statements.append(
181+
f"CAST({RECORD_INDEX_COLUMN_NAME} AS {get_duckdb_type_from_annotation(int)}) AS {RECORD_INDEX_COLUMN_NAME}" # pylint: disable=C0301
182+
)
180183
try:
181184
relation = relation.project(", ".join(casting_statements))
182185
except Exception as err: # pylint: disable=broad-except

src/dve/core_engine/backends/implementations/duckdb/duckdb_helpers.py

Lines changed: 32 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -314,40 +314,49 @@ def duckdb_record_index(cls):
314314
setattr(cls, "drop_record_index", _drop_duckdb_record_index)
315315
return cls
316316

317-
def _cast_as_ddb_type(field_expr:str, type_annotation:Any) -> str:
317+
318+
def _cast_as_ddb_type(field_expr: str, type_annotation: Any) -> str:
318319
return f"""try_cast({field_expr} as {get_duckdb_type_from_annotation(type_annotation)})"""
319320

320-
def _ddb_safely_quote_name(field_name:str) -> str:
321+
322+
def _ddb_safely_quote_name(field_name: str) -> str:
321323
try:
322-
sep_idx = field_name.rindex(".")
323-
return field_name[:sep_idx + 1] + f"\"{field_name[sep_idx + 1:]}\""
324+
sep_idx = field_name.index(".")
325+
return f'"{field_name[: sep_idx]}"' + field_name[sep_idx:]
324326
except ValueError:
325-
return f"\"{field_name}\""
326-
327-
328-
def get_duckdb_cast_statement_from_annotation(element_name:str,
329-
type_annotation: Any,
330-
date_regex: str = r"^[0-9]{4}-[0-9]{2}-[0-9]{2}$",
331-
timestamp_regex:str = r"^[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}$",
332-
parent_element: bool = True) -> str:
327+
return f'"{field_name}"'
328+
329+
# pylint: disable=R0911
330+
def get_duckdb_cast_statement_from_annotation(
331+
element_name: str,
332+
type_annotation: Any,
333+
parent_element: bool = True,
334+
date_regex: str = r"^[0-9]{4}-[0-9]{2}-[0-9]{2}$",
335+
timestamp_regex: str = r"^[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}$",
336+
) -> str:
337+
"""Generate casting statements for duckdb relations from type annotations"""
333338
type_origin = get_origin(type_annotation)
334-
339+
335340
quoted_name = _ddb_safely_quote_name(element_name)
336341

337342
# An `Optional` or `Union` type, check to ensure non-heterogenity.
338343
if type_origin is Union:
339344
python_type = _get_non_heterogenous_type(get_args(type_annotation))
340-
return get_duckdb_cast_statement_from_annotation(element_name, python_type, date_regex, timestamp_regex, parent_element)
345+
return get_duckdb_cast_statement_from_annotation(
346+
element_name, python_type, date_regex, timestamp_regex, parent_element
347+
)
341348

342349
# Type hint is e.g. `List[str]`, check to ensure non-heterogenity.
343350
if type_origin is list or (isinstance(type_origin, type) and issubclass(type_origin, list)):
344351
element_type = _get_non_heterogenous_type(get_args(type_annotation))
345-
stmt = f"list_transform({quoted_name}, x -> {get_duckdb_cast_statement_from_annotation('x',element_type, date_regex, timestamp_regex, False)})"
352+
stmt = f"list_transform({quoted_name}, x -> {get_duckdb_cast_statement_from_annotation('x',element_type, False, date_regex, timestamp_regex)})" # pylint: disable=C0301
346353
return stmt if not parent_element else _cast_as_ddb_type(stmt, type_annotation)
347354

348355
if type_origin is Annotated:
349356
python_type, *other_args = get_args(type_annotation) # pylint: disable=unused-variable
350-
return get_duckdb_cast_statement_from_annotation(element_name, python_type, date_regex, timestamp_regex, parent_element) # add other expected params here
357+
return get_duckdb_cast_statement_from_annotation(
358+
element_name, python_type, date_regex, timestamp_regex, parent_element
359+
) # add other expected params here
351360
# Ensure that we have a concrete type at this point.
352361
if not isinstance(type_annotation, type):
353362
raise ValueError(f"Unsupported type annotation {type_annotation!r}")
@@ -371,17 +380,14 @@ def get_duckdb_cast_statement_from_annotation(element_name:str,
371380
continue
372381

373382
fields[field_name] = get_duckdb_cast_statement_from_annotation(
374-
f"{element_name}.{field_name}",
375-
field_annotation,
376-
date_regex,
377-
timestamp_regex,
378-
False)
383+
f"{element_name}.{field_name}", field_annotation, False, date_regex, timestamp_regex
384+
)
379385

380386
if not fields:
381387
raise ValueError(
382388
f"No type annotations in dict/dataclass type (got {type_annotation!r})"
383389
)
384-
cast_exprs = ",".join([f"\"{nme}\":= {stmt}" for nme, stmt in fields.items()])
390+
cast_exprs = ",".join([f'"{nme}":= {stmt}' for nme, stmt in fields.items()])
385391
stmt = f"struct_pack({cast_exprs})"
386392
return stmt if not parent_element else _cast_as_ddb_type(stmt, type_annotation)
387393

@@ -394,13 +400,13 @@ def get_duckdb_cast_statement_from_annotation(element_name:str,
394400

395401
for type_ in type_annotation.mro():
396402
if issubclass(type_, datetime):
397-
stmt = f"CASE WHEN REGEXP_MATCHES(TRIM({quoted_name}), '{timestamp_regex}') THEN TRY_CAST(TRIM({quoted_name}) as TIMESTAMP) ELSE NULL END"
403+
stmt = f"CASE WHEN REGEXP_MATCHES(TRIM({quoted_name}), '{timestamp_regex}') THEN TRY_CAST(TRIM({quoted_name}) as TIMESTAMP) ELSE NULL END" # pylint: disable=C0301
398404
return stmt
399405
if issubclass(type_, date):
400-
stmt = f"CASE WHEN REGEXP_MATCHES(TRIM({quoted_name}), '{date_regex}') THEN TRY_CAST(TRIM({quoted_name}) as DATE) ELSE NULL END"
406+
stmt = f"CASE WHEN REGEXP_MATCHES(TRIM({quoted_name}), '{date_regex}') THEN TRY_CAST(TRIM({quoted_name}) as DATE) ELSE NULL END" # pylint: disable=C0301
401407
return stmt
402-
duck_type = get_duckdb_type_from_annotation(type_)
408+
duck_type = get_duckdb_type_from_annotation(type_)
403409
if duck_type:
404-
stmt = f"trim({quoted_name})"
410+
stmt = f"trim({quoted_name})"
405411
return _cast_as_ddb_type(stmt, type_) if parent_element else stmt
406412
raise ValueError(f"No equivalent DuckDB type for {type_annotation!r}")

src/dve/core_engine/backends/implementations/spark/spark_helpers.py

Lines changed: 40 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -441,29 +441,41 @@ def spark_record_index(cls):
441441
return cls
442442

443443

444-
def _cast_as_spark_type(field_expr:str, field_type: st.DataType) -> Column:
445-
return sf.expr(field_expr).cast(field_type)
444+
def _cast_as_spark_type(field_expr: str, field_type: st.DataType) -> Column:
445+
return sf.expr(field_expr).cast(get_type_from_annotation(field_type))
446446

447-
448-
def get_spark_cast_statement_from_annotation(element_name:str,
449-
type_annotation: Any,
450-
include_cast: bool = True) -> st.DataType:
447+
def _spark_safely_quote_name(field_name: str) -> str:
448+
try:
449+
sep_idx = field_name.index(".")
450+
return f'`{field_name[: sep_idx]}`' + field_name[sep_idx:]
451+
except ValueError:
452+
return f'`{field_name}`'
453+
454+
def get_spark_cast_statement_from_annotation(
455+
element_name: str, type_annotation: Any, parent_element: bool = True,
456+
date_regex: str = r"^[0-9]{4}-[0-9]{2}-[0-9]{2}$",
457+
timestamp_regex: str = r"^[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}$"):
458+
"""Generate casting statements for spark dataframes based on type annotations"""
451459
type_origin = get_origin(type_annotation)
460+
461+
quoted_name = _spark_safely_quote_name(element_name)
452462

453463
# An `Optional` or `Union` type, check to ensure non-heterogenity.
454464
if type_origin is Union:
455465
python_type = _get_non_heterogenous_type(get_args(type_annotation))
456-
return get_spark_cast_statement_from_annotation(element_name, include_cast)
466+
return get_spark_cast_statement_from_annotation(element_name, python_type, parent_element, date_regex, timestamp_regex)
457467

458468
# Type hint is e.g. `List[str]`, check to ensure non-heterogenity.
459469
if type_origin is list or (isinstance(type_origin, type) and issubclass(type_origin, list)):
460470
element_type = _get_non_heterogenous_type(get_args(type_annotation))
461-
stmt = f"transform({element_name}, x -> {get_spark_cast_statement_from_annotation('x',element_type, False)})"
462-
return stmt if not include_cast else _cast_as_spark_type(stmt, type_annotation)
471+
stmt = f"transform({quoted_name}, x -> {get_spark_cast_statement_from_annotation('x',element_type, False, date_regex, timestamp_regex)})" # pylint: disable=C0301
472+
return stmt if not parent_element else _cast_as_spark_type(stmt, type_annotation)
463473

464474
if type_origin is Annotated:
465-
python_type, *other_args = get_args(type_annotation) # pylint: disable=unused-variable
466-
return get_spark_cast_statement_from_annotation(element_name, python_type, include_cast) # add other expected params here
475+
python_type, *_ = get_args(type_annotation) # pylint: disable=unused-variable
476+
return get_spark_cast_statement_from_annotation(
477+
element_name, python_type, parent_element, date_regex, timestamp_regex
478+
) # add other expected params here
467479
# Ensure that we have a concrete type at this point.
468480
if not isinstance(type_annotation, type):
469481
raise ValueError(f"Unsupported type annotation {type_annotation!r}")
@@ -487,18 +499,16 @@ def get_spark_cast_statement_from_annotation(element_name:str,
487499
continue
488500

489501
fields[field_name] = get_spark_cast_statement_from_annotation(
490-
f"{element_name}.{field_name}",
491-
field_annotation,
492-
False)
502+
f"{element_name}.{field_name}", field_annotation, False, date_regex, timestamp_regex
503+
)
493504

494505
if not fields:
495506
raise ValueError(
496507
f"No type annotations in dict/dataclass type (got {type_annotation!r})"
497508
)
498-
cast_exprs = ",".join([f'{nme}:= {stmt}' for nme, stmt in fields.items()])
499-
stmt = f"struct_pack({cast_exprs})"
500-
return stmt if not include_cast else _cast_as_spark_type(stmt, type_annotation)
501-
509+
cast_exprs = ",".join([f"{stmt} AS `{nme}`" for nme, stmt in fields.items()])
510+
stmt = f"struct({cast_exprs})"
511+
return stmt if not parent_element else _cast_as_spark_type(stmt, type_annotation)
502512
if type_annotation is list:
503513
raise ValueError(
504514
f"List must have type annotation (e.g. `List[str]`), got {type_annotation!r}"
@@ -507,8 +517,15 @@ def get_spark_cast_statement_from_annotation(element_name:str,
507517
raise ValueError(f"dict must be `typing.TypedDict` subclass, got {type_annotation!r}")
508518

509519
for type_ in type_annotation.mro():
510-
duck_type = get_type_from_annotation(type_)
511-
if duck_type:
512-
stmt = f"trim({element_name})"
513-
return _cast_as_spark_type(stmt, type_) if include_cast else stmt
514-
raise ValueError(f"No equivalent DuckDB type for {type_annotation!r}")
520+
if issubclass(type_, dt.datetime):
521+
stmt = f"CASE WHEN REGEXP(TRIM({quoted_name}), '{timestamp_regex}') THEN TRIM({quoted_name}) ELSE NULL END" # pylint: disable=C0301
522+
return _cast_as_spark_type(stmt, type_) if parent_element else stmt
523+
elif issubclass(type_, dt.date):
524+
stmt = f"CASE WHEN REGEXP(TRIM({quoted_name}), '{date_regex}') THEN TRIM({quoted_name}) ELSE NULL END" # pylint: disable=C0301
525+
return _cast_as_spark_type(stmt, type_) if parent_element else stmt
526+
else:
527+
spark_type = get_type_from_annotation(type_)
528+
if spark_type:
529+
stmt = f"trim({quoted_name})"
530+
return _cast_as_spark_type(stmt, type_) if parent_element else stmt
531+
raise ValueError(f"No equivalent Spark type for {type_annotation!r}")

src/dve/pipeline/foundry_ddb_pipeline.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -42,13 +42,13 @@ def persist_audit_records(self, submission_info: SubmissionInfo) -> URI:
4242
write_to.parent.mkdir(parents=True, exist_ok=True)
4343
write_to = write_to.as_posix()
4444
self.write_parquet( # type: ignore # pylint: disable=E1101
45-
self._audit_tables._processing_status.get_relation().filter( # pylint: disable=W0212
45+
self._audit_tables._processing_status.get_relation().filter( # pylint: disable=W0212
4646
f"submission_id = '{submission_info.submission_id}'"
4747
),
4848
fh.joinuri(write_to, "processing_status.parquet"),
4949
)
5050
self.write_parquet( # type: ignore # pylint: disable=E1101
51-
self._audit_tables._submission_statistics.get_relation().filter( # pylint: disable=W0212
51+
self._audit_tables._submission_statistics.get_relation().filter( # pylint: disable=W0212
5252
f"submission_id = '{submission_info.submission_id}'"
5353
),
5454
fh.joinuri(write_to, "submission_statistics.parquet"),

tests/test_core_engine/test_backends/test_implementations/test_duckdb/test_duckdb_helpers.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -159,8 +159,8 @@ def test_duckdb_rel_to_dictionaries(temp_ddb_conn: DuckDBPyConnection,
159159
("date_test", datetime.date,"CASE WHEN REGEXP_MATCHES(TRIM(\"date_test\"), '^[0-9]{4}-[0-9]{2}-[0-9]{2}$') THEN TRY_CAST(TRIM(\"date_test\") as DATE) ELSE NULL END"),
160160
("timestamp_test", datetime.datetime,"CASE WHEN REGEXP_MATCHES(TRIM(\"timestamp_test\"), '^[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}$') THEN TRY_CAST(TRIM(\"timestamp_test\") as TIMESTAMP) ELSE NULL END"),
161161
("list_int_field", list[int], "try_cast(list_transform(\"list_int_field\", x -> trim(\"x\")) as BIGINT[])"),
162-
("basic_model", BasicModel, "try_cast(struct_pack(\"str_field\":= trim(basic_model.\"str_field\"),\"date_field\":= CASE WHEN REGEXP_MATCHES(TRIM(basic_model.\"date_field\"), '^[0-9]{4}-[0-9]{2}-[0-9]{2}$') THEN TRY_CAST(TRIM(basic_model.\"date_field\") as DATE) ELSE NULL END) as STRUCT(str_field VARCHAR, date_field DATE))"),
163-
("another_model", AnotherModel, "try_cast(struct_pack(\"unique_id\":= trim(another_model.\"unique_id\"),\"basic_models\":= list_transform(another_model.\"basic_models\", x -> struct_pack(\"str_field\":= trim(x.\"str_field\"),\"date_field\":= CASE WHEN REGEXP_MATCHES(TRIM(x.\"date_field\"), '^[0-9]{4}-[0-9]{2}-[0-9]{2}$') THEN TRY_CAST(TRIM(x.\"date_field\") as DATE) ELSE NULL END))) as STRUCT(unique_id BIGINT, basic_models STRUCT(str_field VARCHAR, date_field DATE)[]))")])
162+
("basic_model", BasicModel, "try_cast(struct_pack(\"str_field\":= trim(\"basic_model\".str_field),\"date_field\":= CASE WHEN REGEXP_MATCHES(TRIM(\"basic_model\".date_field), '^[0-9]{4}-[0-9]{2}-[0-9]{2}$') THEN TRY_CAST(TRIM(\"basic_model\".date_field) as DATE) ELSE NULL END) as STRUCT(str_field VARCHAR, date_field DATE))"),
163+
("another_model", AnotherModel, "try_cast(struct_pack(\"unique_id\":= trim(\"another_model\".unique_id),\"basic_models\":= list_transform(\"another_model\".basic_models, x -> struct_pack(\"str_field\":= trim(\"x\".str_field),\"date_field\":= CASE WHEN REGEXP_MATCHES(TRIM(\"x\".date_field), '^[0-9]{4}-[0-9]{2}-[0-9]{2}$') THEN TRY_CAST(TRIM(\"x\".date_field) as DATE) ELSE NULL END))) as STRUCT(unique_id BIGINT, basic_models STRUCT(str_field VARCHAR, date_field DATE)[]))")])
164164
def test_get_duckdb_cast_statement_from_annotation(field_name, field_type, cast_statement):
165165
assert get_duckdb_cast_statement_from_annotation(field_name, field_type) == cast_statement
166166

0 commit comments

Comments (0)