|
16 | 16 | from polars import DataFrame |
17 | 17 | from polars.exceptions import ColumnNotFoundError |
18 | 18 |
|
| 19 | +from dve.pipeline.utils import SubmissionStatus |
| 20 | + |
19 | 21 |
|
20 | 22 | @dataclass |
21 | 23 | class SummaryItems: |
22 | 24 | """Items to go into the Summary sheet""" |
23 | 25 |
|
| 26 | + submission_status: SubmissionStatus = field(default_factory=SubmissionStatus) |
| 27 | + """The status of the submission""" |
24 | 28 | summary_dict: dict[str, Any] = field(default_factory=dict) |
25 | 29 | """Dictionary of items to show in the front sheet key is put into Column B |
26 | 30 | and value in column C""" |
@@ -84,9 +88,12 @@ def create_summary_sheet( |
84 | 88 |
|
85 | 89 | return summary |
86 | 90 |
|
87 | | - @staticmethod |
88 | | - def get_submission_status(aggregates: DataFrame) -> str: |
| 91 | + def get_submission_status(self, aggregates: DataFrame) -> str: |
89 | 92 | """Returns the status of the submission based on the error data""" |
| 93 | + if self.submission_status.processing_failed: |
| 94 | + return "There was an issue processing the submission. This will be investigated." |
| 95 | + if self.submission_status.validation_failed: |
| 96 | + return "File has been rejected" |
90 | 97 | if aggregates.is_empty(): |
91 | 98 | return "File has been accepted, no issues to report" |
92 | 99 | failures = aggregates["Type"].unique() |
@@ -134,149 +141,6 @@ def _add_submission_info(self, status: str, summary: Worksheet): |
134 | 141 | summary.append(["", ""]) |
135 | 142 |
|
136 | 143 |
|
@dataclass
class CombinedSummary(SummaryItems):
    """Writes the combined report summary tables

    These get split out over multiple lines based on the partition key of the dataset.

    Each of these sub tables has rows, with the row being defined by row_field
    and columns, with each one being filtered by column field.

    An example would look like this...

    {Current partition} Table heading
    partition_key   column_field_n  column_field_m  additional_column_1 additional_column_2 etc.
    first_partition 0               2               10                  14
    2nd_partition   3               4               11                  15

    {next partition} Table heading
    partition_key   column_field_n  column_field_m  additional_column_1 additional_column_2 etc.
    first_partition 0               5               10                  14
    2nd_partition   3               4               12                  15

    ...by default the value in the first_partition x column_field_n cell will be the "Count" field
    so it's the number of times that a particular column has occurred within a partition.

    or more concretely, in a dataset where the columns are `Submission_error` and `warning`, and the
    partition key is `file name` - the result would be the number of times a submission error or
    warning has occurred within a file.

    In the parent class there is an aggregations property, which allows custom aggregations to
    be added. If an aggregation is added to a field not in the column field
    (e.g. an additional column) then an aggregation and column mapping needs to be added for it.

    """

    column_field: str = "Type"
    """Field to display across the top of the table"""
    row_field: str = "file_name"
    """Field to display along the side of the table"""
    partition_key: str = "FeedType"
    """Key to split the data into multiple tables"""
    table_heading: str = "Files processed"
    """Heading for each partitioned table"""
    table_mapping: dict = field(default_factory=dict)
    """Mapping of a given column to a column in the dataframe, defaults to using Count"""

    def create_summary_sheet(
        self,
        summary: Worksheet,
        aggregates: DataFrame,
        status: str,
    ):
        """Creates a summary sheet for a combined error report

        Writes the submission info header, then one table per unique value of
        ``partition_key``, each filtered to that partition's rows.
        """
        self._add_submission_info(status, summary)

        # Columns come from the configured table_columns, falling back to
        # whatever column_field values appear in the data.
        try:
            agg_tables = aggregates[self.column_field].unique().to_list()
        except ColumnNotFoundError:
            agg_tables = []
        tables = self.table_columns or agg_tables
        tables = tables.copy()  # make sure not to mutate the original
        # Append any column values present in the data but missing from the
        # configured list, so no data is silently dropped.
        difference = set(agg_tables).difference(tables)
        if difference:
            tables.extend(difference)

        if self.additional_columns:
            tables.extend(self.additional_columns)

        if aggregates.is_empty():
            error_summary = aggregates
        else:
            groups = [self.column_field, self.row_field, self.partition_key]

            error_summary = (
                # chaining methods on dataframes seems to confuse mypy
                aggregates.group_by(groups).agg(*self.aggregations)  # type: ignore
            )
        tables = [table for table in tables if table is not None]
        column = self.partition_key
        keys = error_summary[column].unique()
        # One sub-table per partition value, in sorted (string) order.
        for item in sorted(str(key) for key in keys if key is not None):
            summary.append(["", f"{item} {self.table_heading}"])
            self._write_combined_table(
                summary,
                tables,
                error_summary.filter(pl.col(column) == pl.lit(item)),
            )
            summary.append([""])  # blank spacer row between tables
        return summary

    @staticmethod
    def get_submission_status(aggregates: DataFrame) -> str:
        """Returns the status of the submission based on the error data"""
        if aggregates.is_empty():
            return "Overall submission has been accepted, no issues to report"
        failures = aggregates["Type"].unique()
        if "Submission Failure" in failures:
            status = "Submission Failures found, overall submission has been rejected"
        elif "Warning" in failures:
            status = "Overall submission has been accepted, warnings found"
        else:
            status = "Overall submission has been accepted, no issues to report"
        return status

    def _write_combined_table(
        self,
        summary: Worksheet,
        tables: list[str],
        error_summary: DataFrame,
    ):
        """Writes one partition's table: a header row then one row per row_field value.

        Cells hold the aggregated count (or the mapped field from
        ``table_mapping``) for each (row heading, table column) pair, 0 when
        no matching rows exist.
        """
        try:
            agg_types = error_summary[self.row_field].unique().to_list()
        except ColumnNotFoundError:
            agg_types = []

        # Copy so we never mutate self.row_headings in place (mirrors the
        # defensive copy in create_summary_sheet).
        row_headings = list(self.row_headings or agg_types)
        # Append row values present in the data but missing from the configured
        # headings (set(agg_types) - row_headings, NOT the reverse, which would
        # duplicate already-listed headings).
        difference = set(agg_types).difference(row_headings)
        if difference:
            row_headings.extend(difference)

        row_headings = filter(bool, row_headings)

        summary.append(["", self.row_field.capitalize(), *map(str.capitalize, tables)])
        for row_type in sorted(row_headings):
            row: list[Any] = ["", row_type]
            for table in tables:
                count_field = self.table_mapping.get(table, "Count")
                if table in self.table_columns:
                    column_filter = pl.col(self.column_field) == pl.lit(table)
                else:
                    # Additional columns aren't filtered by column_field; True
                    # combines with the row filter via polars' reflected __rand__.
                    column_filter = True
                if error_summary.is_empty():
                    counts = error_summary
                else:
                    counts = error_summary.filter(  # type: ignore
                        column_filter & (pl.col(self.row_field) == pl.lit(row_type))
                    )[count_field]
                if counts.is_empty():
                    row.append(0)
                else:
                    row.append(counts[0])
            summary.append(row)
279 | | - |
280 | 144 | class ExcelFormat: |
281 | 145 | """Formats error data into an excel file""" |
282 | 146 |
|
|
0 commit comments