
Commit b75c0bc

feat: improved movies dataset test coverage. Added testing for the spark and duckdb refdata loaders when a table config is specified.
Parent: 8805b90

10 files changed: 381 additions & 9 deletions

src/dve/core_engine/backends/implementations/duckdb/reference_data.py

Lines changed: 1 addition & 1 deletion
@@ -42,7 +42,7 @@ def __init__(

     def load_table(self, config: ReferenceTable) -> DuckDBPyRelation:
         """Load reference entity from a database table"""
-        return self.connection.table(f"{config.fq_table_name}")
+        return self.connection.sql(f"select * from {config.fq_table_name}")

     def load_file(self, config: ReferenceFile) -> DuckDBPyRelation:
         "Load reference entity from a relative file path"

src/dve/core_engine/backends/implementations/spark/readers/json.py

Lines changed: 4 additions & 3 deletions
@@ -25,8 +25,8 @@ class SparkJSONReader(BaseFileReader):
     def __init__(
         self,
         *,
-        encoding: Optional[str] = "utf-8-sig",
-        multi_line: Optional[bool] = False,
+        encoding: Optional[str] = "utf-8",
+        multi_line: Optional[bool] = True,
         spark_session: Optional[SparkSession] = None
     ) -> None:

@@ -57,7 +57,8 @@ def read_to_dataframe(

         spark_schema: StructType = get_type_from_annotation(schema)
         kwargs = {
-            "multiLine": self.multi_line,
+            "encoding": self.encoding,
+            "multiline": self.multi_line,

         }
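How these kwargs reach Spark is not shown in the hunk, but they presumably end up as reader options. Spark's JSON reader options are matched case-insensitively, so "multiline" and "multiLine" name the same option. A sketch of the equivalent direct call (path chosen for illustration):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# multiLine=True lets Spark parse a JSON document that spans several lines
# (e.g. a pretty-printed array of movie records) instead of requiring
# newline-delimited JSON; encoding names the byte decoding to use.
df = (
    spark.read
    .options(encoding="utf-8", multiLine=True)
    .json("tests/testdata/movies/movies.json")  # illustrative path
)
df.printSchema()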

tests/features/movies.feature

Lines changed: 50 additions & 1 deletion
@@ -7,9 +7,45 @@ Feature: Pipeline tests using the movies dataset
     Some validation of entity attributes is performed: SQL expressions and Python filter
     functions are used, and templatable business rules feature in the transformations.

+  Scenario: Validate and filter movies (spark)
+    Given I submit the movies file movies.json for processing
+    And A spark pipeline is configured
+    And I create the following reference data tables in the database movies_refdata
+      | table_name | parquet_path                                          |
+      | sequels    | tests/testdata/movies/refdata/movies_sequels.parquet |
+    And I add initial audit entries for the submission
+    Then the latest audit record for the submission is marked with processing status file_transformation
+    When I run the file transformation phase
+    Then the movies entity is stored as a parquet after the file_transformation phase
+    And the latest audit record for the submission is marked with processing status data_contract
+    When I run the data contract phase
+    Then there are 3 record rejections from the data_contract phase
+    And there are errors with the following details and associated error_count from the data_contract phase
+      | ErrorCode | ErrorMessage                              | error_count |
+      | BLANKYEAR | year not provided                         | 1           |
+      | DODGYYEAR | year value (NOT_A_NUMBER) is invalid      | 1           |
+      | DODGYDATE | date_joined value is not valid: daft_date | 1           |
+    And the movies entity is stored as a parquet after the data_contract phase
+    And the latest audit record for the submission is marked with processing status business_rules
+    When I run the business rules phase
+    Then The rules restrict "movies" to 4 qualifying records
+    And At least one row from "movies" has generated error code "LIMITED_RATINGS"
+    And At least one row from "derived" has generated error code "RUBBISH_SEQUEL"
+    And the latest audit record for the submission is marked with processing status error_report
+    When I run the error report phase
+    Then An error report is produced
+    And The statistics entry for the submission shows the following information
+      | parameter                | value |
+      | record_count             | 5     |
+      | number_record_rejections | 4     |
+      | number_warnings          | 1     |
+
   Scenario: Validate and filter movies (duckdb)
     Given I submit the movies file movies.json for processing
-    And A duckdb pipeline is configured
+    And A duckdb pipeline is configured with schema file 'movies_ddb.dischema.json'
+    And I create the following reference data tables in the database "movies_refdata"
+      | table_name | parquet_path                                          |
+      | sequels    | tests/testdata/movies/refdata/movies_sequels.parquet |
     And I add initial audit entries for the submission
     Then the latest audit record for the submission is marked with processing status file_transformation
     When I run the file transformation phase

@@ -24,3 +60,16 @@ Feature: Pipeline tests using the movies dataset
       | DODGYDATE | date_joined value is not valid: daft_date | 1 |
     And the movies entity is stored as a parquet after the data_contract phase
     And the latest audit record for the submission is marked with processing status business_rules
+    When I run the business rules phase
+    Then The rules restrict "movies" to 4 qualifying records
+    And At least one row from "movies" has generated error code "LIMITED_RATINGS"
+    And At least one row from "derived" has generated error code "RUBBISH_SEQUEL"
+    And the latest audit record for the submission is marked with processing status error_report
+    When I run the error report phase
+    Then An error report is produced
+    And The statistics entry for the submission shows the following information
+      | parameter                | value |
+      | record_count             | 5     |
+      | number_record_rejections | 4     |
+      | number_warnings          | 1     |
+

tests/features/steps/steps_pipeline.py

Lines changed: 27 additions & 0 deletions
@@ -295,3 +295,30 @@ def check_rows_eq_to_category(context: Context, entity_name: str, category: str)
         (pl.col("Entity").eq(entity_name)) & (pl.col("Category").eq(category))
     ).shape[0]
     assert recs_with_err_code >= 1
+
+@given("I create the following reference data tables in the database {database}")
+def create_refdata_tables(context: Context, database: str):
+    table: Optional[Table] = context.table
+    refdata_tables: Dict[str, URI] = {}
+    row: Row
+    for row in table:
+        record = row.as_dict()
+        refdata_tables[record["table_name"]] = record["parquet_path"]
+    pipeline = ctxt.get_pipeline(context)
+    refdata_loader = getattr(pipeline, "_reference_data_loader")
+    if refdata_loader == SparkRefDataLoader:
+        refdata_loader.spark.sql(f"CREATE DATABASE IF NOT EXISTS {database}")
+        for tbl, source in refdata_tables.items():
+            (refdata_loader.spark.read.parquet(source)
+                .write.saveAsTable(f"{database}.{tbl}"))
+
+    if refdata_loader == DuckDBRefDataLoader:
+        ref_db_file = Path(ctxt.get_processing_location(context), f"{database}.duckdb").as_posix()
+        refdata_loader.connection.sql(f"ATTACH '{ref_db_file}' AS {database}")
+        for tbl, source in refdata_tables.items():
+            refdata_loader.connection.read_parquet(source).to_table(f"{database}.{tbl}")
+
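The DuckDB branch of this step leans on two relational-API conveniences used verbatim above, ATTACH and DuckDBPyRelation.to_table(). A standalone sketch of the same pattern (file and table names invented for illustration):

import duckdb

con = duckdb.connect()

# ATTACH creates (or opens) a separate database file and exposes it under
# an alias, so tables can later be addressed as alias.table_name.
con.sql("ATTACH 'refdata.duckdb' AS movies_refdata")

# read_parquet() yields a DuckDBPyRelation; to_table() materialises it as
# a named table, here inside the attached database.
con.read_parquet("movies_sequels.parquet").to_table("movies_refdata.sequels")

print(con.sql("select count(*) from movies_refdata.sequels").fetchall())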

tests/test_core_engine/test_backends/test_implementations/test_duckdb/test_audit_ddb.py

Lines changed: 2 additions & 2 deletions
@@ -174,7 +174,7 @@ def test_dve_audit_using_thread_pool(ddb_audit_manager_threaded: DDBAuditingMana

     aud.add_new_submissions([_sub_info])
     while not aud.queue.empty():
-        time.sleep(2)
+        time.sleep(0.2)

     at_entry = list(
         aud._processing_status.get_relation()

@@ -188,7 +188,7 @@ def test_dve_audit_using_thread_pool(ddb_audit_manager_threaded: DDBAuditingMana
     assert len(at_entry) == 1
     aud.mark_transform([_sub_info.submission_id])
     while not aud.queue.empty():
-        time.sleep(2)
+        time.sleep(0.2)

     file_trans = aud.get_all_file_transformation_submissions()
     assert [rw.get("submission_id") for rw in file_trans.pl().iter_rows(named=True)] == [
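Dropping the poll interval from 2s to 0.2s speeds the test up, but the loops still spin forever if the audit queue never drains. A timeout-guarded variant (a sketch, not part of this commit) would fail loudly instead of hanging:

import time
from queue import Queue

def wait_for_drain(queue: Queue, timeout: float = 10.0, poll: float = 0.2) -> None:
    """Poll until the queue is empty, raising instead of hanging forever."""
    deadline = time.monotonic() + timeout
    while not queue.empty():
        if time.monotonic() > deadline:
            raise TimeoutError(f"queue still holds ~{queue.qsize()} items after {timeout}s")
        time.sleep(poll)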

tests/testdata/movies/movies.dischema.json

Lines changed: 29 additions & 2 deletions
@@ -30,7 +30,7 @@
     },
     "reader_config": {
         ".json": {
-            "reader": "DuckDBJSONReader"
+            "reader": "SparkJSONReader"
         }
     },
     "mandatory_fields": [

@@ -40,7 +40,34 @@
             }
         }
     },
-    "transformations": {}
+    "transformations": {
+        "parameters": {"entity": "movies"},
+        "reference_data": {
+            "sequels": {
+                "type": "table",
+                "database": "movies_refdata",
+                "table_name": "sequels"
+            }
+        },
+        "rule_stores": [
+            {
+                "store_type": "json",
+                "filename": "movies_spark_rule_store.json"
+            }
+        ],
+        "complex_rules": [
+            {
+                "rule_name": "ratings_count"
+            },
+            {
+                "rule_name": "poor_sequel_check",
+                "parameters": {
+                    "sequel_entity": "refdata_sequels"
+                }
+            }
+        ]
+    }
 }
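The new reference_data block is what exercises the table-loading paths touched above. Presumably the "table" entry is parsed into a config model whose fully qualified name feeds load_table(); a hypothetical sketch of that shape (the real ReferenceTable model lives in the dve codebase and may differ):

from dataclasses import dataclass

# Hypothetical stand-in for the real ReferenceTable config model.
@dataclass
class ReferenceTable:
    database: str
    table_name: str

    @property
    def fq_table_name(self) -> str:
        # "movies_refdata.sequels": the dotted name that the DuckDB loader
        # now reads via connection.sql() and Spark via its catalog.
        return f"{self.database}.{self.table_name}"

sequels = ReferenceTable(database="movies_refdata", table_name="sequels")
assert sequels.fq_table_name == "movies_refdata.sequels"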

tests/testdata/movies/movies_ddb.dischema.json

Lines changed: 73 additions & 0 deletions
@@ -0,0 +1,73 @@
+{
+    "contract": {
+        "schemas": {
+            "cast": {
+                "fields": {
+                    "name": "str",
+                    "role": "str",
+                    "date_joined": "date"
+                }
+            }
+        },
+        "error_details": "movies_contract_error_details.json",
+        "datasets": {
+            "movies": {
+                "fields": {
+                    "title": "str",
+                    "year": "int",
+                    "genre": {
+                        "type": "str",
+                        "is_array": true
+                    },
+                    "duration_minutes": "int",
+                    "ratings": {
+                        "type": "NonNegativeFloat",
+                        "is_array": true
+                    },
+                    "cast": {
+                        "model": "cast",
+                        "is_array": true
+                    }
+                },
+                "reader_config": {
+                    ".json": {
+                        "reader": "DuckDBJSONReader"
+                    }
+                },
+                "mandatory_fields": [
+                    "title",
+                    "year"
+                ]
+            }
+        }
+    },
+    "transformations": {
+        "parameters": {"entity": "movies"},
+        "reference_data": {
+            "sequels": {
+                "type": "table",
+                "database": "movies_refdata",
+                "table_name": "sequels"
+            }
+        },
+        "rule_stores": [
+            {
+                "store_type": "json",
+                "filename": "movies_ddb_rule_store.json"
+            }
+        ],
+        "complex_rules": [
+            {
+                "rule_name": "ratings_count"
+            },
+            {
+                "rule_name": "poor_sequel_check",
+                "parameters": {
+                    "sequel_entity": "refdata_sequels"
+                }
+            }
+        ]
+    }
+}
tests/testdata/movies/movies_ddb_rule_store.json

Lines changed: 92 additions & 0 deletions
@@ -0,0 +1,92 @@
+{
+    "ratings_count": {
+        "description": "Ensure more than 1 rating",
+        "type": "complex_rule",
+        "parameter_descriptions": {
+            "entity": "The entity to apply the workflow to."
+        },
+        "parameter_defaults": {},
+        "rule_config": {
+            "rules": [
+                {
+                    "name": "Get count of ratings",
+                    "operation": "add",
+                    "entity": "{{entity}}",
+                    "column_name": "no_of_ratings",
+                    "expression": "length(ratings)"
+                }
+            ],
+            "filters": [
+                {
+                    "name": "filter_too_few_ratings",
+                    "entity": "{{entity}}",
+                    "expression": "no_of_ratings > 1",
+                    "error_code": "LIMITED_RATINGS",
+                    "reporting_field": "title",
+                    "failure_message": "Movie has too few ratings"
+                }
+            ],
+            "post_filter_rules": [
+                {
+                    "name": "Remove the no_of_ratings field",
+                    "operation": "remove",
+                    "entity": "{{entity}}",
+                    "column_name": "no_of_ratings"
+                }
+            ]
+        }
+    },
+    "poor_sequel_check": {
+        "description": "check if bad sequel exists",
+        "type": "complex_rule",
+        "parameter_descriptions": {
+            "entity": "The entity to apply the workflow to.",
+            "sequel_entity": "The entity containing sequel data"
+        },
+        "parameter_defaults": {},
+        "rule_config": {
+            "rules": [
+                {
+                    "name": "Join sequel data",
+                    "operation": "inner_join",
+                    "entity": "{{entity}}",
+                    "target": "{{sequel_entity}}",
+                    "join_condition": "{{entity}}.title = {{sequel_entity}}.sequel_to",
+                    "new_entity_name": "with_sequels",
+                    "new_columns": {
+                        "{{sequel_entity}}.ratings": "sequel_rating"
+                    }
+                },
+                {
+                    "name": "Get median sequel rating",
+                    "operation": "group_by",
+                    "entity": "with_sequels",
+                    "group_by": "title",
+                    "agg_columns": {
+                        "list_aggregate(sequel_rating, 'median')": "median_sequel_rating"
+                    }
+                }
+            ],
+            "filters": [
+                {
+                    "name": "filter_rubbish_sequel",
+                    "entity": "with_sequels",
+                    "expression": "median_sequel_rating > 5",
+                    "error_code": "RUBBISH_SEQUEL",
+                    "reporting_entity": "derived",
+                    "reporting_field": "title",
+                    "failure_message": "Movie has rubbish sequel",
+                    "is_informational": true
+                }
+            ],
+            "post_filter_rules": [
+                {
+                    "name": "Remove the with_sequel entity",
+                    "operation": "remove_entity",
+                    "entity": "with_sequels"
+                }
+            ]
+        }
+    }
+}
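The double-brace placeholders are what make these rules templatable: the complex_rules entries in the schemas supply values such as entity=movies and sequel_entity=refdata_sequels. A minimal sketch of that substitution (the actual dve implementation may use a proper template engine rather than a regex):

import json
import re

def render_rule(rule_json: str, parameters: dict) -> dict:
    """Replace {{name}} placeholders with parameter values, then parse."""
    rendered = re.sub(r"\{\{(\w+)\}\}", lambda m: parameters[m.group(1)], rule_json)
    return json.loads(rendered)

rule = render_rule(
    '{"entity": "{{entity}}", "target": "{{sequel_entity}}"}',
    {"entity": "movies", "sequel_entity": "refdata_sequels"},
)
assert rule == {"entity": "movies", "target": "refdata_sequels"}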
