
Commit efe0fa0

feat: Change how error messages are generated (by writing in batches). DuckDB no longer relies on pandas; uses pyarrow, multiprocessing, and background-thread batch writing to avoid memory pressure
1 parent 5b5a2eb commit efe0fa0

28 files changed

Lines changed: 655 additions & 482 deletions
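The core pattern behind this change, sketched minimally below: error records are enqueued in batches and a background thread appends each batch to a JSON Lines file, so the full error set never has to sit in memory. The sketch uses the stdlib `queue.Queue` for brevity, whereas the committed `BackgroundMessageWriter` pairs a `multiprocessing.Queue` with a `threading.Thread`; the file name and record shape here are illustrative only.

```python
import json
from queue import Queue
from threading import Thread

def writer_loop(write_queue: Queue, path: str) -> None:
    """Append each queued batch to a JSON Lines file; a None sentinel stops the loop."""
    while True:
        batch = write_queue.get()
        if batch is None:  # sentinel: producer is finished
            break
        with open(path, "a", encoding="utf-8") as f:
            f.write("\n".join(json.dumps(rec, default=str) for rec in batch) + "\n")

write_queue: Queue = Queue()
thread = Thread(target=writer_loop, args=(write_queue, "errors.jsonl"))
thread.start()
write_queue.put([{"error_level": "integrity", "error_message": "example"}])  # one batch
write_queue.put(None)  # flush and stop
thread.join()
```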

File tree

docs/README.md

Lines changed: 2 additions & 2 deletions
@@ -165,8 +165,8 @@ for entity in data_contract_config.schemas:
 
     # Data contract step here
     data_contract = SparkDataContract(spark_session=spark)
-    entities, validation_messages, success = data_contract.apply_data_contract(
-        entities, data_contract_config
+    entities, feedback_errors_uri, success = data_contract.apply_data_contract(
+        entities, None, data_contract_config
     )
 ```
 

src/dve/common/__init__.py

Whitespace-only changes.

src/dve/common/error_utils.py

Lines changed: 173 additions & 0 deletions
@@ -0,0 +1,173 @@
+"""Utilities to support reporting"""
+
+import datetime as dt
+from itertools import chain
+import json
+import logging
+from multiprocessing import Queue
+from threading import Thread
+from typing import Iterable, Optional, Union
+
+from dve.core_engine.message import UserMessage
+from dve.core_engine.loggers import get_logger
+import dve.parser.file_handling as fh
+from dve.core_engine.exceptions import CriticalProcessingError
+from dve.core_engine.type_hints import URI, DVEStage, Messages
+
+
+def get_feedback_errors_uri(working_folder: URI, step_name: DVEStage) -> URI:
+    """Determine the location of the JSON Lines file containing all errors generated in a step."""
+    return fh.joinuri(working_folder, "errors", f"{step_name}_errors.jsonl")
+
+def get_processing_errors_uri(working_folder: URI) -> URI:
+    """Determine the location of the JSON Lines file containing all processing
+    errors generated by a DVE run."""
+    return fh.joinuri(working_folder, "errors", "processing_errors.jsonl")
+
+
+def dump_feedback_errors(
+    working_folder: URI,
+    step_name: DVEStage,
+    messages: Messages,
+    key_fields: Optional[dict[str, list[str]]] = None,
+) -> URI:
+    """Write out captured feedback error messages."""
+    if not working_folder:
+        raise AttributeError("processed files path not passed")
+
+    if not key_fields:
+        key_fields = {}
+
+    error_file = get_feedback_errors_uri(working_folder, step_name)
+    processed = []
+
+    for message in messages:
+        if message.original_entity is not None:
+            primary_keys = key_fields.get(message.original_entity, [])
+        elif message.entity is not None:
+            primary_keys = key_fields.get(message.entity, [])
+        else:
+            primary_keys = []
+
+        error = message.to_dict(
+            key_field=primary_keys,
+            value_separator=" -- ",
+            max_number_of_values=10,
+            record_converter=None,
+        )
+        error["Key"] = conditional_cast(error["Key"], primary_keys, value_separator=" -- ")
+        processed.append(error)
+
+    with fh.open_stream(error_file, "a") as f:
+        f.write("\n".join([json.dumps(rec, default=str) for rec in processed]) + "\n")
+    return error_file
+
+
+def dump_processing_errors(
+    working_folder: URI, step_name: DVEStage, errors: list[CriticalProcessingError]
+) -> URI:
+    """Write out critical processing errors"""
+    if not working_folder:
+        raise AttributeError("processed files path not passed")
+    if not step_name:
+        raise AttributeError("step name not passed")
+    if not errors:
+        raise AttributeError("errors list not passed")
+
+    error_file: URI = get_processing_errors_uri(working_folder)
+    processed = []
+
+    for error in errors:
+        processed.append(
+            {
+                "step_name": step_name,
+                "error_location": "processing",
+                "error_level": "integrity",
+                "error_message": error.error_message,
+            }
+        )
+
+    with fh.open_stream(error_file, "a") as f:
+        f.write("\n".join([json.dumps(rec, default=str) for rec in processed]) + "\n")
+
+    return error_file
+
+def load_feedback_messages(feedback_messages_uri: URI) -> Iterable[UserMessage]:
+    """Lazily load feedback messages from a JSON Lines error file, if it exists."""
+    if not fh.get_resource_exists(feedback_messages_uri):
+        return
+    with fh.open_stream(feedback_messages_uri) as errs:
+        yield from (UserMessage(**json.loads(err)) for err in errs.readlines())
+
+def load_all_error_messages(error_directory_uri: URI) -> Iterable[UserMessage]:
+    """Load feedback messages from every .jsonl error file under a directory."""
+    return chain.from_iterable(
+        load_feedback_messages(err_file)
+        for err_file, _ in fh.iter_prefix(error_directory_uri)
+        if err_file.endswith(".jsonl")
+    )
+
+class BackgroundMessageWriter:
+    """Drains batches of messages from a queue on a background thread and
+    appends them to the step's JSON Lines error file."""
+
+    def __init__(self,
+                 working_directory: URI,
+                 dve_stage: DVEStage,
+                 key_fields: Optional[dict[str, list[str]]] = None,
+                 logger: Optional[logging.Logger] = None):
+        self._working_directory = working_directory
+        self._dve_stage = dve_stage
+        self._feedback_message_uri = get_feedback_errors_uri(self._working_directory, self._dve_stage)
+        self._key_fields = key_fields
+        self.logger = logger or get_logger(type(self).__name__)
+        self._write_thread = None
+        self._queue = Queue()
+
+    @property
+    def write_queue(self):
+        return self._queue
+
+    @property
+    def write_thread(self):
+        if not self._write_thread:
+            self._write_thread = Thread(target=self._write_process_wrapper)
+        return self._write_thread
+
+    def _write_process_wrapper(self):
+        """Wrapper for dump_feedback_errors to run in a background thread"""
+        while True:
+            if msgs := self.write_queue.get():
+                dump_feedback_errors(self._working_directory, self._dve_stage, msgs, self._key_fields)
+            else:
+                break
+
+    def __enter__(self) -> "BackgroundMessageWriter":
+        self.write_thread.start()
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        if exc_type:
+            self.logger.exception(
+                "Issue occurred during background write process:",
+                exc_info=(exc_type, exc_value, traceback)
+            )
+        self.write_queue.put(None)  # sentinel: stop the writer thread
+        self.write_thread.join()
+
+
+def write_process_wrapper(working_directory: URI, *, queue: Queue, key_fields: Optional[dict[str, list[str]]] = None):
+    """Wrapper for dump_feedback_errors to run in a background process"""
+    while True:
+        if msgs := queue.get():
+            dump_feedback_errors(working_directory, "data_contract", msgs, key_fields)
+        else:
+            break
+
+def conditional_cast(value, primary_keys: list[str], value_separator: str) -> Union[list[str], str]:
+    """Determines what to do with a value coming back from the error list"""
+    if isinstance(value, list):
+        casts = [
+            conditional_cast(val, primary_keys, value_separator) for val in value
+        ]  # type: ignore
+        return value_separator.join(
+            [f"{pk}: {id}" if pk else "" for pk, id in zip(primary_keys, casts)]
+        )
+    if isinstance(value, dt.date):
+        return value.isoformat()
+    if isinstance(value, dict):
+        return ""
+    return str(value)
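A hedged usage sketch for the new writer: the context manager starts the drain thread on entry and, on exit, sends the `None` sentinel and joins. The working directory, stage name, and batch producer below are placeholders; real batches would be DVE message objects accepted by `dump_feedback_errors`.

```python
from dve.common.error_utils import (
    BackgroundMessageWriter,
    get_feedback_errors_uri,
    load_feedback_messages,
)

working_dir = "/tmp/dve-run"          # placeholder working directory
message_batches: list[list] = []      # placeholder: batches of DVE message objects

with BackgroundMessageWriter(working_dir, "data_contract") as writer:
    for batch in message_batches:
        writer.write_queue.put(batch)  # each batch is appended to the .jsonl file
# __exit__ put the None sentinel and joined the writer thread

# Read the messages back lazily from errors/data_contract_errors.jsonl
for msg in load_feedback_messages(get_feedback_errors_uri(working_dir, "data_contract")):
    print(msg)
```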

src/dve/core_engine/backends/base/backend.py

Lines changed: 22 additions & 20 deletions
@@ -24,6 +24,7 @@
     EntityParquetLocations,
     Messages,
 )
+from dve.parser.file_handling.service import get_parent, joinuri
 
 
 class BaseBackend(Generic[EntityType], ABC):
@@ -148,65 +149,66 @@ def convert_entities_to_spark(
 
     def apply(
         self,
+        working_dir: URI,
         entity_locations: EntityLocations,
         contract_metadata: DataContractMetadata,
         rule_metadata: RuleMetadata,
         submission_info: Optional[SubmissionInfo] = None,
-    ) -> tuple[Entities, Messages, StageSuccessful]:
+    ) -> tuple[Entities, URI, StageSuccessful, URI]:
         """Apply the data contract and the rules, returning the entities and all
         generated messages.
 
         """
         reference_data = self.load_reference_data(
             rule_metadata.reference_data_config, submission_info
         )
-        entities, messages, successful = self.contract.apply(entity_locations, contract_metadata)
+        entities, dc_feedback_errors_uri, successful, processing_errors_uri = self.contract.apply(working_dir, entity_locations, contract_metadata)
         if not successful:
-            return entities, messages, successful
+            return entities, dc_feedback_errors_uri, successful, processing_errors_uri
 
         for entity_name, entity in entities.items():
             entities[entity_name] = self.step_implementations.add_row_id(entity)
 
         # TODO: Handle entity manager creation errors.
         entity_manager = EntityManager(entities, reference_data)
         # TODO: Add stage success to 'apply_rules'
-        rule_messages = self.step_implementations.apply_rules(entity_manager, rule_metadata)
-        messages.extend(rule_messages)
+        # TODO: In case of large errors in business rules, write messages to jsonl file and return uri to errors
+        _ = self.step_implementations.apply_rules(entity_manager, rule_metadata)
 
         for entity_name, entity in entity_manager.entities.items():
             entity_manager.entities[entity_name] = self.step_implementations.drop_row_id(entity)
 
-        return entity_manager.entities, messages, True
+        return entity_manager.entities, get_parent(dc_feedback_errors_uri), True, processing_errors_uri
 
     def process(
         self,
+        working_dir: URI,
         entity_locations: EntityLocations,
         contract_metadata: DataContractMetadata,
         rule_metadata: RuleMetadata,
-        cache_prefix: URI,
         submission_info: Optional[SubmissionInfo] = None,
-    ) -> tuple[MutableMapping[EntityName, URI], Messages]:
+    ) -> tuple[MutableMapping[EntityName, URI], URI, URI]:
         """Apply the data contract and the rules, write the entities out to parquet
         and returning the entity locations and all generated messages.
 
         """
-        entities, messages, successful = self.apply(
-            entity_locations, contract_metadata, rule_metadata, submission_info
+        entities, feedback_errors_uri, successful, processing_errors_uri = self.apply(
+            working_dir, entity_locations, contract_metadata, rule_metadata, submission_info
         )
         if successful:
-            parquet_locations = self.write_entities_to_parquet(entities, cache_prefix)
+            parquet_locations = self.write_entities_to_parquet(entities, joinuri(working_dir, "outputs"))
         else:
            parquet_locations = {}
-        return parquet_locations, messages
+        return parquet_locations, feedback_errors_uri, processing_errors_uri
 
     def process_legacy(
         self,
+        working_dir: URI,
         entity_locations: EntityLocations,
         contract_metadata: DataContractMetadata,
         rule_metadata: RuleMetadata,
-        cache_prefix: URI,
         submission_info: Optional[SubmissionInfo] = None,
-    ) -> tuple[MutableMapping[EntityName, DataFrame], Messages]:
+    ) -> tuple[MutableMapping[EntityName, DataFrame], URI]:
         """Apply the data contract and the rules, create Spark `DataFrame`s from the
         entities and return the Spark entities and all generated messages.
 
@@ -221,17 +223,17 @@ def process_legacy(
             category=DeprecationWarning,
         )
 
-        entities, messages, successful = self.apply(
-            entity_locations, contract_metadata, rule_metadata, submission_info
+        entities, errors_uri, successful, _ = self.apply(
+            working_dir, entity_locations, contract_metadata, rule_metadata, submission_info
         )
 
         if not successful:
-            return {}, messages
+            return {}, errors_uri
 
         if self.__entity_type__ == DataFrame:
-            return entities, messages  # type: ignore
+            return entities, errors_uri  # type: ignore
 
         return (
-            self.convert_entities_to_spark(entities, cache_prefix, _emit_deprecation_warning=False),
-            messages,
+            self.convert_entities_to_spark(entities, joinuri(working_dir, "outputs"), _emit_deprecation_warning=False),
+            errors_uri,
         )
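For callers, the signature change looks roughly like the sketch below: `working_dir` moves to the first argument (replacing `cache_prefix`), and the step now returns URIs to the error files instead of in-memory `Messages`. The `backend` and metadata arguments are placeholders assumed from context; on the success path the feedback URI is the parent directory of the step's error file (via `get_parent`), which is the directory form that `load_all_error_messages` expects.

```python
from dve.common.error_utils import load_all_error_messages

def run_validation(backend, working_dir, entity_locations, contract_meta, rule_meta):
    # New call shape: working_dir first, URIs back instead of Messages.
    parquet_locations, feedback_errors_uri, processing_errors_uri = backend.process(
        working_dir, entity_locations, contract_meta, rule_meta
    )
    # Feedback errors are streamed back from JSONL rather than accumulated in memory.
    feedback = list(load_all_error_messages(feedback_errors_uri))
    return parquet_locations, feedback, processing_errors_uri
```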

src/dve/core_engine/backends/base/contract.py

Lines changed: 15 additions & 11 deletions
@@ -27,8 +27,10 @@
     Messages,
     WrapDecorator,
 )
-from dve.parser.file_handling import get_file_suffix, get_resource_exists
+from dve.parser.file_handling import get_file_suffix, get_resource_exists, get_parent
+from dve.parser.file_handling.service import joinuri
 from dve.parser.type_hints import Extension
+from dve.common.error_utils import dump_processing_errors, get_feedback_errors_uri, get_processing_errors_uri
 
 T = TypeVar("T")
 ExtensionConfig = dict[Extension, "ReaderConfig"]
@@ -360,8 +362,8 @@ def read_raw_entities(
 
     @abstractmethod
     def apply_data_contract(
-        self, entities: Entities, contract_metadata: DataContractMetadata
-    ) -> tuple[Entities, Messages, StageSuccessful]:
+        self, working_dir: URI, entities: Entities, entity_locations: EntityLocations, contract_metadata: DataContractMetadata, key_fields: Optional[dict[str, list[str]]] = None
+    ) -> tuple[Entities, URI, StageSuccessful]:
         """Apply the data contract to the raw entities, returning the validated entities
         and any messages.
 
@@ -371,35 +373,37 @@ def apply_data_contract(
         raise NotImplementedError()
 
     def apply(
-        self, entity_locations: EntityLocations, contract_metadata: DataContractMetadata
-    ) -> tuple[Entities, Messages, StageSuccessful]:
+        self, working_dir: URI, entity_locations: EntityLocations, contract_metadata: DataContractMetadata, key_fields: Optional[dict[str, list[str]]] = None
+    ) -> tuple[Entities, URI, StageSuccessful, URI]:
         """Read the entities from the provided locations according to the data contract,
         and return the validated entities and any messages.
 
         """
+        feedback_errors_uri = get_feedback_errors_uri(working_dir, "data_contract")
+        processing_errors_uri = get_processing_errors_uri(working_dir)
         entities, messages, successful = self.read_raw_entities(entity_locations, contract_metadata)
         if not successful:
-            return {}, messages, successful
+            dump_processing_errors(working_dir, "data_contract", messages)
+            return {}, feedback_errors_uri, successful, processing_errors_uri
 
         try:
-            entities, contract_messages, successful = self.apply_data_contract(
-                entities, contract_metadata
+            entities, feedback_errors_uri, successful = self.apply_data_contract(
+                working_dir, entities, entity_locations, contract_metadata, key_fields
             )
-            messages.extend(contract_messages)
         except Exception as err:  # pylint: disable=broad-except
             successful = False
             new_messages = render_error(
                 err,
                 "data contract",
                 self.logger,
             )
-            messages.extend(new_messages)
+            dump_processing_errors(working_dir, "data_contract", new_messages)
 
         if contract_metadata.cache_originals:
             for entity_name in list(entities):
                 entities[f"Original{entity_name}"] = entities[entity_name]
 
-        return entities, messages, successful
+        return entities, feedback_errors_uri, successful, processing_errors_uri
 
     def read_parquet(self, path: URI, **kwargs) -> EntityType:
         """Method to read parquet files from stringified parquet output
src/dve/core_engine/backends/base/utilities.py

Lines changed: 12 additions & 0 deletions
@@ -5,8 +5,12 @@
 from collections.abc import Sequence
 from typing import Optional
 
+import pyarrow
+import pyarrow.parquet as pq
+
 from dve.core_engine.message import FeedbackMessage
 from dve.core_engine.type_hints import ExpressionArray, MultiExpression
+from dve.parser.type_hints import URI
 
 BRACKETS = {"(": ")", "{": "}", "[": "]", "<": ">"}
 """A mapping of opening brackets to their closing counterpart."""
@@ -131,3 +135,11 @@ def _get_non_heterogenous_type(types: Sequence[type]) -> type:
         + f"union types (got {type_list!r}) but nullable types are okay"
     )
     return type_list[0]
+
+def check_if_parquet_file(file_location: URI) -> bool:
+    """Check if a file path is valid parquet"""
+    try:
+        pq.ParquetFile(file_location)
+        return True
+    except (pyarrow.ArrowInvalid, pyarrow.ArrowIOError):
+        return False
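A small, self-contained usage sketch for the new helper (file paths are illustrative): `pq.ParquetFile` attempts to read the file's footer, so a non-parquet file raises `ArrowInvalid` and the helper returns `False`.

```python
import pyarrow as pa
import pyarrow.parquet as pq

from dve.core_engine.backends.base.utilities import check_if_parquet_file

pq.write_table(pa.table({"id": [1, 2, 3]}), "/tmp/sample.parquet")
assert check_if_parquet_file("/tmp/sample.parquet") is True

with open("/tmp/not_parquet.txt", "w", encoding="utf-8") as f:
    f.write("plain text, no parquet magic bytes")
assert check_if_parquet_file("/tmp/not_parquet.txt") is False
```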
