feat: integrate record index into error report

stevenhsd · stevenhsd · commit bf170f49b725 · 2026-03-06T15:47:09.000Z
diff --git a/src/dve/core_engine/message.py b/src/dve/core_engine/message.py
@@ -116,6 +116,8 @@ class UserMessage:
     "The offending values"
     Category: ErrorCategory
     "The category of error"
+    RecordIndex: Optional[int] = None
+    "The record index where the error occurred (if applicable)"
 
     @property
     def is_informational(self) -> bool:
@@ -187,6 +189,7 @@ class FeedbackMessage:  # pylint: disable=too-many-instance-attributes
         "ErrorMessage",
         "ErrorCode",
         "ReportingField",
+        "RecordIndex",
         "Value",
         "Category",
     ]
@@ -224,15 +227,6 @@ def _validate_error_location(cls, value: Any) -> Optional[str]:
 
         return str(value)
 
-    @validator("record")
-    def _strip_rowid(  # pylint: disable=no-self-argument
-        cls, value: Optional[dict[str, Any]]
-    ) -> Optional[dict[str, Any]]:
-        """Strip the row ID column from the record, if present."""
-        if isinstance(value, dict):
-            value.pop(RECORD_INDEX_COLUMN_NAME, None)
-        return value
-
     @property
     def is_critical(self) -> bool:
         """Whether the error is unrecoverable."""
@@ -333,6 +327,7 @@ def to_row(
             error_message,
             self.error_code,
             self.reporting_field_name or reporting_field,
+            self.record.get(RECORD_INDEX_COLUMN_NAME),
             value,
             self.category,
         )
diff --git a/src/dve/pipeline/pipeline.py b/src/dve/pipeline/pipeline.py
@@ -432,7 +432,9 @@ def apply_data_contract(
 
         for path, _ in fh.iter_prefix(read_from):
             entity_locations[fh.get_file_name(path)] = path
-            entities[fh.get_file_name(path)] = self.data_contract.read_parquet(path)
+            entities[fh.get_file_name(path)] = self.data_contract.add_record_index(
+                self.data_contract.read_parquet(path)
+                )
 
         key_fields = {model: conf.reporting_fields for model, conf in model_config.items()}
 
@@ -743,6 +745,7 @@ def _get_error_dataframes(self, submission_id: str):
                     pl.col("ErrorCode").alias("Error_Code"),  # type: ignore
                     pl.col("ReportingField").alias("Data_Item"),  # type: ignore
                     pl.col("ErrorMessage").alias("Error"),  # type: ignore
+                    pl.col("RecordIndex").alias("Record_Index"),
                     pl.col("Value"),  # type: ignore
                     pl.col("Key").alias("ID"),  # type: ignore
                     pl.col("Category"),  # type: ignore
diff --git a/src/dve/reporting/error_report.py b/src/dve/reporting/error_report.py
@@ -18,6 +18,7 @@
     "Error_Code": Utf8(),
     "Data_Item": Utf8(),
     "Error": Utf8(),
+    "Record_Index": pl.UInt32(),
     "Value": Utf8(),
     "ID": Utf8(),
     "Category": Utf8(),
diff --git a/tests/features/books.feature b/tests/features/books.feature
@@ -4,33 +4,6 @@ Feature: Pipeline tests using the books dataset
     This tests submissions using nested, complex JSON datasets with arrays, and
     introduces more complex transformations that require aggregation.
 
-    Scenario: Validate complex nested XML data (spark)
-        Given I submit the books file nested_books.XML for processing
-        And A spark pipeline is configured with schema file 'nested_books.dischema.json'
-        And I add initial audit entries for the submission
-        Then the latest audit record for the submission is marked with processing status file_transformation
-        When I run the file transformation phase
-        Then the header entity is stored as a parquet after the file_transformation phase
-        And the nested_books entity is stored as a parquet after the file_transformation phase
-        And the latest audit record for the submission is marked with processing status data_contract
-        When I run the data contract phase
-        Then there is 1 record rejection from the data_contract phase
-        And the header entity is stored as a parquet after the data_contract phase
-        And the nested_books entity is stored as a parquet after the data_contract phase
-        And the latest audit record for the submission is marked with processing status business_rules
-        When I run the business rules phase
-        Then The rules restrict "nested_books" to 3 qualifying records
-        And The entity "nested_books" contains an entry for "17.85" in column "total_value_of_books"
-        And the nested_books entity is stored as a parquet after the business_rules phase
-        And the latest audit record for the submission is marked with processing status error_report
-        When I run the error report phase
-        Then An error report is produced
-        And The statistics entry for the submission shows the following information
-            | parameter                | value |
-            | record_count             | 4     |
-            | number_record_rejections | 2     |
-            | number_warnings          | 0     |
-
     Scenario: Validate complex nested XML data (duckdb)
         Given I submit the books file nested_books.XML for processing
         And A duckdb pipeline is configured with schema file 'nested_books_ddb.dischema.json'
diff --git a/tests/features/movies.feature b/tests/features/movies.feature
@@ -21,18 +21,18 @@ Feature: Pipeline tests using the movies dataset
         When I run the data contract phase
         Then there are 3 record rejections from the data_contract phase
         And there are errors with the following details and associated error_count from the data_contract phase
-            | Entity             | ErrorCode | ErrorMessage                              | error_count |
-            | movies             | BLANKYEAR | year not provided                         | 1           |
-            | movies_rename_test | DODGYYEAR | year value (NOT_A_NUMBER) is invalid      | 1           |
-            | movies             | DODGYDATE | date_joined value is not valid: daft_date | 1           |
+            | Entity             | ErrorCode | ErrorMessage                              | RecordIndex | error_count |
+            | movies             | BLANKYEAR | year not provided                         | 2           | 1           |
+            | movies_rename_test | DODGYYEAR | year value (NOT_A_NUMBER) is invalid      | 1           | 1           |
+            | movies             | DODGYDATE | date_joined value is not valid: daft_date | 1           | 1           |
         And the movies entity is stored as a parquet after the data_contract phase
         And the latest audit record for the submission is marked with processing status business_rules
         When I run the business rules phase
         Then The rules restrict "movies" to 4 qualifying records
         And there are errors with the following details and associated error_count from the business_rules phase
-            | ErrorCode       | ErrorMessage                                           | error_count |
-            | LIMITED_RATINGS | Movie has too few ratings ([6.5])                      | 1           |
-            | RUBBISH_SEQUEL  | The movie The Greatest Movie Ever has a rubbish sequel | 1           |
+            | ErrorCode       | ErrorMessage                                           | RecordIndex | error_count |
+            | LIMITED_RATINGS | Movie has too few ratings ([6.5])                      | 4           | 1           |
+            | RUBBISH_SEQUEL  | The movie The Greatest Movie Ever has a rubbish sequel | 1           | 1           |
         And the latest audit record for the submission is marked with processing status error_report
         When I run the error report phase
         Then An error report is produced
@@ -57,18 +57,18 @@ Feature: Pipeline tests using the movies dataset
         When I run the data contract phase
         Then there are 3 record rejections from the data_contract phase
         And there are errors with the following details and associated error_count from the data_contract phase
-            | Entity             | ErrorCode | ErrorMessage                              | error_count |
-            | movies             | BLANKYEAR | year not provided                         | 1           |
-            | movies_rename_test | DODGYYEAR | year value (NOT_A_NUMBER) is invalid      | 1           |
-            | movies             | DODGYDATE | date_joined value is not valid: daft_date | 1           |
+            | Entity             | ErrorCode | ErrorMessage                              | RecordIndex | error_count |
+            | movies             | BLANKYEAR | year not provided                         | 2           | 1           |
+            | movies_rename_test | DODGYYEAR | year value (NOT_A_NUMBER) is invalid      | 1           | 1           |
+            | movies             | DODGYDATE | date_joined value is not valid: daft_date | 1           | 1           |
         And the movies entity is stored as a parquet after the data_contract phase
         And the latest audit record for the submission is marked with processing status business_rules
         When I run the business rules phase
         Then The rules restrict "movies" to 4 qualifying records
         And there are errors with the following details and associated error_count from the business_rules phase
-            | ErrorCode       | ErrorMessage                                           | error_count |
-            | LIMITED_RATINGS | Movie has too few ratings ([6.5])                      | 1           |
-            | RUBBISH_SEQUEL  | The movie The Greatest Movie Ever has a rubbish sequel | 1           |
+            | ErrorCode       | ErrorMessage                                           | RecordIndex | error_count |
+            | LIMITED_RATINGS | Movie has too few ratings ([6.5])                      | 4           | 1           |
+            | RUBBISH_SEQUEL  | The movie The Greatest Movie Ever has a rubbish sequel | 1           | 1           |
         And the latest audit record for the submission is marked with processing status error_report
         When I run the error report phase
         Then An error report is produced
diff --git a/tests/features/steps/utilities.py b/tests/features/steps/utilities.py
@@ -23,6 +23,7 @@
     "ErrorType",
     "ErrorLocation",
     "ErrorMessage",
+    "RecordIndex",
     "ReportingField",
     "Category",
 ]
diff --git a/tests/test_core_engine/test_message.py b/tests/test_core_engine/test_message.py
@@ -8,20 +8,8 @@
 from pydantic import BaseModel, ValidationError
 import pytest
 
-from dve.core_engine.constants import RECORD_INDEX_COLUMN_NAME
 from dve.core_engine.message import DEFAULT_ERROR_DETAIL, DataContractErrorDetail, FeedbackMessage
 
-
-def test_rowid_column_stripped():
-    """Ensure that the rowID column is stripped from FeedbackMessages."""
-
-    message = FeedbackMessage(
-        entity="entity", record={"key": "value", RECORD_INDEX_COLUMN_NAME: "some identifier"}
-    )
-
-    assert message.record.get(RECORD_INDEX_COLUMN_NAME) is None
-
-
 @pytest.mark.parametrize(
     ("derived_column", "expected"),
     [
diff --git a/tests/test_pipeline/pipeline_helpers.py b/tests/test_pipeline/pipeline_helpers.py
@@ -152,6 +152,7 @@ def dodgy_planet_data_after_file_transformation() -> Iterator[Tuple[SubmissionIn
             "numberOfMoons": "-1",
             "hasRingSystem": "false",
             "hasGlobalMagneticField": "sometimes",
+            "__record_index__": "1"
         }
         planet_contract_df = pl.DataFrame(
             planet_contract_data, {k: pl.Utf8() for k in planet_contract_data}
@@ -381,7 +382,8 @@ def error_data_after_business_rules() -> Iterator[Tuple[SubmissionInfo, str]]:
                 "ErrorCode": "LONG_ORBIT",
                 "ReportingField": "orbitalPeriod",
                 "Value": "365.20001220703125",
-                "Category": "Bad value"
+                "Category": "Bad value",
+                "RecordIndex": "1"
             },
             {
                 "Entity": "planets",
@@ -394,7 +396,8 @@ def error_data_after_business_rules() -> Iterator[Tuple[SubmissionInfo, str]]:
                 "ErrorCode": "STRONG_GRAVITY",
                 "ReportingField": "gravity",
                 "Value": "9.800000190734863",
-                "Category": "Bad value"
+                "Category": "Bad value",
+                "RecordIndex": "1"
             }
         ]"""
         )
diff --git a/tests/test_pipeline/test_spark_pipeline.py b/tests/test_pipeline/test_spark_pipeline.py
@@ -175,6 +175,7 @@ def test_apply_data_contract_failed(  # pylint: disable=redefined-outer-name
             "ErrorMessage": "is invalid",
             "ErrorCode": "BadValue",
             "ReportingField": "planet",
+            "RecordIndex": "1",
             "Value": "EarthEarthEarthEarthEarthEarthEarthEarthEarth",
             "Category": "Bad value",
         },
@@ -188,6 +189,7 @@ def test_apply_data_contract_failed(  # pylint: disable=redefined-outer-name
             "ErrorMessage": "is invalid",
             "ErrorCode": "BadValue",
             "ReportingField": "numberOfMoons",
+            "RecordIndex": "1",
             "Value": "-1",
             "Category": "Bad value",
         },
@@ -201,6 +203,7 @@ def test_apply_data_contract_failed(  # pylint: disable=redefined-outer-name
             "ErrorMessage": "is invalid",
             "ErrorCode": "BadValue",
             "ReportingField": "hasGlobalMagneticField",
+            "RecordIndex": "1",
             "Value": "sometimes",
             "Category": "Bad value",
         },
@@ -347,6 +350,7 @@ def test_apply_business_rules_with_data_errors(  # pylint: disable=redefined-out
             "ReportingField": "orbitalPeriod",
             "Value": "365.20001220703125",
             "Category": "Bad value",
+            "RecordIndex": "1"
         },
         {
             "Entity": "planets",
@@ -360,6 +364,7 @@ def test_apply_business_rules_with_data_errors(  # pylint: disable=redefined-out
             "ReportingField": "gravity",
             "Value": "9.800000190734863",
             "Category": "Bad value",
+            "RecordIndex": "1"
         },
     ]
     
@@ -504,6 +509,7 @@ def test_error_report_where_report_is_expected(  # pylint: disable=redefined-out
                 "Error Code": "LONG_ORBIT",
                 "Data Item Submission Name": "orbitalPeriod",
                 "Errors and Warnings": "Planet has long orbital period",
+                "Record Index": 1,
                 "Value": 365.20001220703125,
                 "ID": None,
                 "Category": "Bad value",
@@ -516,6 +522,7 @@ def test_error_report_where_report_is_expected(  # pylint: disable=redefined-out
                 "Error Code": "STRONG_GRAVITY",
                 "Data Item Submission Name": "gravity",
                 "Errors and Warnings": "Planet has too strong gravity",
+                "Record Index": 1,
                 "Value": 9.800000190734863,
                 "ID": None,
                 "Category": "Bad value",
diff --git a/tests/testdata/movies/movies_ddb_rule_store.json b/tests/testdata/movies/movies_ddb_rule_store.json
@@ -61,7 +61,7 @@
                     "name": "Get median sequel rating",
                     "operation": "group_by",
                     "entity": "with_sequels",
-                    "group_by": "title",
+                    "group_by": ["__record_index__", "title"],
                     "agg_columns": {
                         "list_aggregate(sequel_rating, 'median')": "median_sequel_rating"
                     }
diff --git a/tests/testdata/movies/movies_spark_rule_store.json b/tests/testdata/movies/movies_spark_rule_store.json
@@ -63,14 +63,15 @@
                     "entity": "with_sequels",
                     "columns": {
                         "title": "title",
+                        "__record_index__": "__record_index__",
                         "explode(sequel_rating)": "sequel_rating"
                     }
                 },
                 {
                     "name": "Get median sequel rating",
                     "operation": "group_by",
                     "entity": "with_sequels",
-                    "group_by": "title",
+                    "group_by": ["__record_index__","title"],
                     "agg_columns": {
                         "percentile_approx(sequel_rating, 0.5)": "median_sequel_rating"
                     }

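Rendered diff for `tests/features/steps/utilities.py`:

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -23,6 +23,7 @@` |
| `23` | `23` | `"ErrorType",` |
| `24` | `24` | `"ErrorLocation",` |
| `25` | `25` | `"ErrorMessage",` |
| | `26` | `+ "RecordIndex",` |
| `26` | `27` | `"ReportingField",` |
| `27` | `28` | `"Category",` |
| `28` | `29` | `]` |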

Rendered diff for `tests/testdata/movies/movies_ddb_rule_store.json`:

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -61,7 +61,7 @@` |
| `61` | `61` | `"name": "Get median sequel rating",` |
| `62` | `62` | `"operation": "group_by",` |
| `63` | `63` | `"entity": "with_sequels",` |
| `64` | | `- "group_by": "title",` |
| | `64` | `+ "group_by": ["__record_index__", "title"],` |
| `65` | `65` | `"agg_columns": {` |
| `66` | `66` | `"list_aggregate(sequel_rating, 'median')": "median_sequel_rating"` |
| `67` | `67` | `}` |