1616 get_duckdb_type_from_annotation ,
1717)
1818from dve .core_engine .backends .implementations .duckdb .types import SQLType
19+ from dve .core_engine .backends .readers .utilities import check_csv_header_expected
1920from dve .core_engine .backends .utilities import get_polars_type_from_annotation
2021from dve .core_engine .message import FeedbackMessage
2122from dve .core_engine .type_hints import URI , EntityName
2425
2526@duckdb_write_parquet
2627class DuckDBCSVReader (BaseFileReader ):
27- """A reader for CSV files"""
28+ """A reader for CSV files including the ability to compare the passed model
29+ to the file header, if it exists.
30+
31+ field_check: flag to compare submitted file header to the accompanying pydantic model
32+ field_check_error_code: The error code to provide if the file header doesn't contain
33+ the expected fields
34+ field_check_error_message: The error message to provide if the file header doesn't contain
35+ the expected fields"""
2836
2937 # TODO - the read_to_relation should include the schema and determine whether to
3038 # TODO - stringify or not
@@ -35,15 +43,43 @@ def __init__(
3543 delim : str = "," ,
3644 quotechar : str = '"' ,
3745 connection : Optional [DuckDBPyConnection ] = None ,
46+ field_check : bool = False ,
47+ field_check_error_code : Optional [str ] = "ExpectedVsActualFieldMismatch" ,
48+ field_check_error_message : Optional [str ] = "The submitted header is missing fields" ,
3849 ** _ ,
3950 ):
4051 self .header = header
4152 self .delim = delim
4253 self .quotechar = quotechar
4354 self ._connection = connection if connection else default_connection
55+ self .field_check = field_check
56+ self .field_check_error_code = field_check_error_code
57+ self .field_check_error_message = field_check_error_message
4458
4559 super ().__init__ ()
4660
def perform_field_check(
    self, resource: URI, entity_name: str, expected_schema: type[BaseModel]
):
    """Verify the CSV header of `resource` against `expected_schema`.

    Args:
        resource: location of the CSV file whose header is inspected.
        entity_name: entity the file belongs to; recorded on the feedback message.
        expected_schema: pydantic model the header is compared with.

    Raises:
        ValueError: if this reader was configured with a falsy `header`.
        MessageBearingError: if `check_csv_header_expected` returns a truthy
            value. The error carries a single `FeedbackMessage` built from the
            configured error code and error message.
    """
    if not self.header:
        raise ValueError("Cannot perform field check without a CSV header")

    # NOTE(review): presumably the collection of field names absent from the
    # header - confirm against check_csv_header_expected.
    absent_fields = check_csv_header_expected(resource, expected_schema, self.delim)
    if not absent_fields:
        return

    detail = f"{self.field_check_error_message} - missing fields: {absent_fields}"
    header_failure = FeedbackMessage(
        entity=entity_name,
        record=None,
        failure_type="submission",
        error_location="Whole File",
        error_code=self.field_check_error_code,
        error_message=detail,
    )
    raise MessageBearingError(
        "The CSV header doesn't match what is expected",
        messages=[header_failure],
    )
82+
4783 def read_to_py_iterator (
4884 self , resource : URI , entity_name : EntityName , schema : type [BaseModel ]
4985 ) -> Iterator [dict [str , Any ]]:
@@ -58,6 +94,9 @@ def read_to_relation( # pylint: disable=unused-argument
5894 if get_content_length (resource ) == 0 :
5995 raise EmptyFileError (f"File at { resource } is empty." )
6096
97+ if self .field_check :
98+ self .perform_field_check (resource , entity_name , schema )
99+
61100 reader_options : dict [str , Any ] = {
62101 "header" : self .header ,
63102 "delimiter" : self .delim ,
@@ -89,6 +128,9 @@ def read_to_relation( # pylint: disable=unused-argument
89128 if get_content_length (resource ) == 0 :
90129 raise EmptyFileError (f"File at { resource } is empty." )
91130
131+ if self .field_check :
132+ self .perform_field_check (resource , entity_name , schema )
133+
92134 reader_options : dict [str , Any ] = {
93135 "has_header" : self .header ,
94136 "separator" : self .delim ,
@@ -132,6 +174,17 @@ class DuckDBCSVRepeatingHeaderReader(PolarsToDuckDBCSVReader):
132174 | shop1 | clothes | 2025-01-01 |
133175 """
134176
def __init__(
    self,
    *args,
    non_unique_header_error_code: Optional[str] = "NonUniqueHeader",
    non_unique_header_error_message: Optional[str] = None,
    **kwargs,
) -> None:
    """Create the reader, recording how a non-unique header is reported.

    Args:
        *args: positional arguments forwarded unchanged to the parent reader.
        non_unique_header_error_code: error code attached to the feedback
            message raised by `read_to_relation` for a non-unique header.
        non_unique_header_error_message: optional replacement for the
            generated "Found ... distinct combination of header values."
            message; the generated text is used when this is falsy.
        **kwargs: keyword arguments forwarded unchanged to the parent reader.
    """
    self._non_unique_header_code = non_unique_header_error_code
    self._non_unique_header_message = non_unique_header_error_message
    super().__init__(*args, **kwargs)
187+
135188 @read_function (DuckDBPyRelation )
136189 def read_to_relation ( # pylint: disable=unused-argument
137190 self , resource : URI , entity_name : EntityName , schema : type [BaseModel ]
@@ -156,10 +209,12 @@ def read_to_relation( # pylint: disable=unused-argument
156209 failure_type = "submission" ,
157210 error_message = (
158211 f"Found { no_records } distinct combination of header values."
212+ if not self ._non_unique_header_message
213+ else self ._non_unique_header_message
159214 ),
160215 error_location = entity_name ,
161216 category = "Bad file" ,
162- error_code = "NonUniqueHeader" ,
217+ error_code = self . _non_unique_header_code ,
163218 )
164219 ],
165220 )
0 commit comments