@@ -461,39 +461,129 @@ def nested_typecast_parquet(temp_dir) -> Iterator[Tuple[URI, List[Dict[str, Any]
461461 _df .coalesce (1 ).write .format ("parquet" ).save (output_location )
462462 yield output_location , data
463463
@pytest.fixture(scope="function")
def nested_all_string_parquet_w_errors(
    temp_dir,
    nested_parquet_custom_dc_err_details,
) -> Iterator[Tuple[URI, str, List[Dict[str, Any]]]]:
    """Write an all-string nested parquet dataset that contains bad ``id`` values.

    Every column is stored as a string (including the nested ``subfield``
    structs), while the accompanying contract declares typed fields
    (``int``, ``datetime``, ``date``).  The second record carries
    ``id="WRONG"`` at the top level and in one nested ``subfield`` entry.

    Args:
        temp_dir: Directory under which the parquet output is written.
        nested_parquet_custom_dc_err_details: Path to the JSON file of custom
            error details; referenced from the contract's ``error_details``.

    Yields:
        A ``(output_location, contract_meta, data)`` tuple: the parquet
        directory URI (with a trailing ``/``), the contract serialised as a
        JSON string, and the raw records used to build the dataframe.
    """
    contract_meta = json.dumps(
        {
            "contract": {
                # Plain POSIX path string; no surrounding whitespace so the
                # consumer can open the file as-is.
                "error_details": nested_parquet_custom_dc_err_details.as_posix(),
                "schemas": {
                    "SubField": {
                        "fields": {
                            "id": "int",
                            "substrfield": "str",
                            "subarrayfield": {"type": "date", "is_array": True},
                        },
                        "mandatory_fields": ["id"],
                    }
                },
                "datasets": {
                    "nested_model": {
                        "fields": {
                            "id": "int",
                            "strfield": "str",
                            "datetimefield": "datetime",
                            "subfield": {"model": "SubField", "is_array": True},
                        },
                        # NOTE(review): only an ".xml" reader is configured even
                        # though this fixture writes parquet - confirm intended.
                        "reader_config": {
                            ".xml": {
                                "reader": "DuckDBXMLStreamReader",
                                "parameters": {
                                    "root_tag": "root",
                                    "record_tag": "NestedModel",
                                },
                            }
                        },
                        "key_field": "id",
                    }
                },
            }
        }
    )

    _spark: SparkSession = SparkSession.builder.getOrCreate()
    data: List[Dict[str, Any]] = [
        # Record 1: every value matches the types declared in the contract.
        dict(
            id=1,
            strfield="hi",
            datetimefield=str(datetime(2020, 9, 20, 12, 34, 56)),
            subfield=[
                dict(
                    id=1,
                    substrfield="bye",
                    subarrayfield=[str(date(2020, 9, 20)), str(date(2020, 9, 21))],
                )
            ],
        ),
        # Record 2: non-numeric ids at the top level and in the second subfield.
        dict(
            id="WRONG",
            strfield="hello",
            datetimefield=str(datetime(2020, 9, 21, 12, 34, 56)),
            subfield=[
                dict(
                    id=2,
                    substrfield="bye",
                    subarrayfield=[str(date(2020, 9, 20)), str(date(2020, 9, 21))],
                ),
                dict(
                    id="WRONG",
                    substrfield="aurevoir",
                    subarrayfield=[str(date(2020, 9, 22)), str(date(2020, 9, 23))],
                ),
            ],
        ),
    ]

    output_location: URI = Path(temp_dir).joinpath("nested_parquet").as_posix() + "/"

    # All leaf columns are declared as strings so the bad ids can be stored.
    all_string_schema = StructType(
        [
            StructField("id", StringType()),
            StructField("strfield", StringType()),
            StructField("datetimefield", StringType()),
            StructField(
                "subfield",
                ArrayType(
                    StructType(
                        [
                            StructField("id", StringType()),
                            StructField("substrfield", StringType()),
                            StructField("subarrayfield", ArrayType(StringType())),
                        ]
                    )
                ),
            ),
        ]
    )
    _df: DataFrame = _spark.createDataFrame(data, schema=all_string_schema)
    # coalesce(1) collapses the dataframe to a single partition before writing.
    _df.coalesce(1).write.format("parquet").save(output_location)
    yield output_location, contract_meta, data
561+
562+
@pytest.fixture()
def nested_parquet_custom_dc_err_details(temp_dir) -> Iterator[Path]:
    """Write the custom data-contract error details to a JSON file.

    The mapping is keyed by field name (``subfield.id`` addresses the nested
    field) and then by error category (``"Blank"`` / ``"Bad value"``); each
    entry supplies an ``error_code`` and an ``error_message``.

    Args:
        temp_dir: Directory in which the JSON file is created.

    Yields:
        Path to ``nested_parquet_data_contract_codes.json`` inside ``temp_dir``.
    """
    file_path = Path(temp_dir).joinpath("nested_parquet_data_contract_codes.json")
    # The doubled braces are literal text in these plain strings; presumably
    # they are template placeholders filled in by the consumer - verify there.
    err_details = {
        "id": {
            "Blank": {
                "error_code": "TESTIDBLANK",
                "error_message": "id cannot be null",
            },
            "Bad value": {
                "error_code": "TESTIDBAD",
                "error_message": "id is invalid: id - {{id}}",
            },
        },
        "datetimefield": {
            "Bad value": {
                "error_code": "TESTDTFIELDBAD",
                "error_message": "datetimefield is invalid: id - {{id}}, datetimefield - {{datetimefield}}",
            },
        },
        "subfield.id": {
            "Blank": {
                "error_code": "SUBFIELDTESTIDBLANK",
                "error_message": "subfield id cannot be null",
            },
            "Bad value": {
                "error_code": "SUBFIELDTESTIDBAD",
                "error_message": "subfield id is invalid: subfield.id - {{__error_value}}",
            },
        },
    }
    with open(file_path, mode="w", encoding="utf-8") as fle:
        json.dump(err_details, fle)

    yield file_path
589+
0 commit comments