Skip to content

Commit 2cf087a

Browse files
Merge pull request #48 from NHSDigital/bug/ndit-949_process_fail_check_in_err_rpts
fix: included submission status (with additional processing failure check) in error report population to reduce chance of incorrect status
2 parents fb30b1f + 4dc0923 commit 2cf087a

3 files changed

Lines changed: 38 additions & 181 deletions

File tree

src/dve/pipeline/pipeline.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
dump_feedback_errors,
2121
dump_processing_errors,
2222
get_feedback_errors_uri,
23+
get_processing_errors_uri,
2324
load_feedback_messages,
2425
)
2526
from dve.core_engine.backends.base.auditing import BaseAuditingManager
@@ -769,6 +770,13 @@ def error_report(
769770
"error_report", submission_info.submission_id
770771
)
771772

773+
if not submission_status.processing_failed:
774+
submission_status.processing_failed = fh.get_resource_exists(
775+
get_processing_errors_uri(
776+
fh.joinuri(self.processed_files_path, submission_info.submission_id)
777+
)
778+
)
779+
772780
if not self.processed_files_path:
773781
raise AttributeError("processed files path not provided")
774782

@@ -797,6 +805,7 @@ def error_report(
797805
if value is not None and not key.endswith("_updated")
798806
}
799807
summary_items = er.SummaryItems(
808+
submission_status=submission_status,
800809
summary_dict=summary_dict,
801810
row_headings=["Submission Failure", "Warning"],
802811
)

src/dve/reporting/excel_report.py

Lines changed: 9 additions & 145 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,15 @@
1616
from polars import DataFrame
1717
from polars.exceptions import ColumnNotFoundError
1818

19+
from dve.pipeline.utils import SubmissionStatus
20+
1921

2022
@dataclass
2123
class SummaryItems:
2224
"""Items to go into the Summary sheet"""
2325

26+
submission_status: SubmissionStatus = field(default_factory=SubmissionStatus)
27+
"""The status of the submission"""
2428
summary_dict: dict[str, Any] = field(default_factory=dict)
2529
"""Dictionary of items to show in the front sheet key is put into Column B
2630
and value in column C"""
@@ -84,9 +88,12 @@ def create_summary_sheet(
8488

8589
return summary
8690

87-
@staticmethod
88-
def get_submission_status(aggregates: DataFrame) -> str:
91+
def get_submission_status(self, aggregates: DataFrame) -> str:
8992
"""Returns the status of the submission based on the error data"""
93+
if self.submission_status.processing_failed:
94+
return "There was an issue processing the submission. Please contact support."
95+
if self.submission_status.validation_failed:
96+
return "File has been rejected"
9097
if aggregates.is_empty():
9198
return "File has been accepted, no issues to report"
9299
failures = aggregates["Type"].unique()
@@ -134,149 +141,6 @@ def _add_submission_info(self, status: str, summary: Worksheet):
134141
summary.append(["", ""])
135142

136143

137-
@dataclass
138-
class CombinedSummary(SummaryItems):
139-
"""Writes the combined report summary tables
140-
141-
These get split out of multiple lines based on the partition key of the dataset.
142-
143-
Each of these sub tables has rows, with the row being defined by row_field
144-
and columns, with the each one being filtered by column field.
145-
146-
An example would look like this...
147-
148-
{Current partition} Table heading
149-
partition_key column_field_n column_field_m additional_column_1 addition_column_2 etc.
150-
first_partition 0 2 10 14
151-
2nd_partition 3 4 11 15
152-
153-
{next partition} Table heading
154-
partition_key column_field_n column_field_m additional_column_1 addition_column_2 etc.
155-
first_partition 0 5 10 14
156-
2nd_partition 3 4 12 15
157-
158-
...by default the value in the first_partition x column_field_n cell will be the "Count" field
159-
so it's the number of times that a partiticular column has occured within a partition.
160-
161-
or more concretly, in a dataset where the columns are `Submission_error` and `warning`, and the
162-
partition key is `file name` - the result would be the number of times a submission error or
163-
warning has occured within a file.
164-
165-
In the parent class there is an aggregations property, which allows custom aggregations to
166-
be added. If an aggregation is added to a field not in the column field
167-
(e.g. an additional column) then an aggregation and column mapping needs to be added for it.
168-
169-
"""
170-
171-
column_field: str = "Type"
172-
"""Field to display across the top of the table"""
173-
row_field: str = "file_name"
174-
"""Field to display along the side of the table"""
175-
partition_key: str = "FeedType"
176-
"""Key to split the data into multiple tables"""
177-
table_heading: str = "Files processed"
178-
"""Heading for each partitioned table"""
179-
table_mapping: dict = field(default_factory=dict)
180-
"""Mapping of a given column to a column in the dataframe, defaults to using Count"""
181-
182-
def create_summary_sheet(
183-
self,
184-
summary: Worksheet,
185-
aggregates: DataFrame,
186-
status: str,
187-
):
188-
"""Creates a summary sheet for a combined error report"""
189-
self._add_submission_info(status, summary)
190-
191-
try:
192-
agg_tables = aggregates[self.column_field].unique().to_list()
193-
except ColumnNotFoundError:
194-
agg_tables = []
195-
tables = self.table_columns or agg_tables
196-
tables = tables.copy() # make sure not to mutate the original
197-
difference = set(agg_tables).difference(tables)
198-
if difference:
199-
tables.extend(difference)
200-
201-
if self.additional_columns:
202-
tables.extend(self.additional_columns)
203-
204-
if aggregates.is_empty():
205-
error_summary = aggregates
206-
else:
207-
groups = [self.column_field, self.row_field, self.partition_key]
208-
209-
error_summary = (
210-
# chaining methods on dataframes seems to confuse mypy
211-
aggregates.group_by(groups).agg(*self.aggregations) # type: ignore
212-
)
213-
tables = [table for table in tables if table is not None]
214-
column = self.partition_key
215-
keys = error_summary[column].unique()
216-
for item in sorted(str(key) for key in keys if key is not None):
217-
summary.append(["", f"{item} {self.table_heading}"])
218-
self._write_combined_table(
219-
summary,
220-
tables,
221-
error_summary.filter(pl.col(column) == pl.lit(item)),
222-
)
223-
summary.append([""])
224-
return summary
225-
226-
@staticmethod
227-
def get_submission_status(aggregates: DataFrame) -> str:
228-
"""Returns the status of the submission based on the error data"""
229-
if aggregates.is_empty():
230-
return "Overall submission has been accepted, no issues to report"
231-
failures = aggregates["Type"].unique()
232-
if "Submission Failure" in failures:
233-
status = "Submission Failures found, overall submission has been rejected"
234-
elif "Warning" in failures:
235-
status = "Overall submission has been accepted, warnings found"
236-
else:
237-
status = "Overall submission has been accepted, no issues to report"
238-
return status
239-
240-
def _write_combined_table(
241-
self,
242-
summary: Worksheet,
243-
tables: list[str],
244-
error_summary: DataFrame,
245-
):
246-
try:
247-
agg_types = error_summary[self.row_field].unique().to_list()
248-
except ColumnNotFoundError:
249-
agg_types = []
250-
251-
row_headings = self.row_headings or agg_types
252-
difference = set(row_headings).difference(agg_types)
253-
if difference:
254-
row_headings.extend(difference)
255-
256-
row_headings = filter(bool, row_headings)
257-
258-
summary.append(["", self.row_field.capitalize(), *map(str.capitalize, tables)])
259-
for row_type in sorted(row_headings):
260-
row: list[Any] = ["", row_type]
261-
for table in tables:
262-
count_field = self.table_mapping.get(table, "Count")
263-
if table in self.table_columns:
264-
column_filter = pl.col(self.column_field) == pl.lit(table)
265-
else:
266-
column_filter = True
267-
if error_summary.is_empty():
268-
counts = error_summary
269-
else:
270-
counts = error_summary.filter( # type: ignore
271-
column_filter & (pl.col(self.row_field) == pl.lit(row_type))
272-
)[count_field]
273-
if counts.is_empty():
274-
row.append(0)
275-
else:
276-
row.append(counts[0])
277-
summary.append(row)
278-
279-
280144
class ExcelFormat:
281145
"""Formats error data into an excel file"""
282146

tests/test_reporting/test_excel_report.py

Lines changed: 20 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,14 @@
55
import pytest
66

77
from dve.core_engine.message import FeedbackMessage
8+
from dve.pipeline.utils import SubmissionStatus
89
from dve.reporting.error_report import (
910
create_error_dataframe,
1011
generate_report_dataframes,
1112
get_error_codes,
1213
populate_error_codes,
1314
)
14-
from dve.reporting.excel_report import CombinedSummary, ExcelFormat, SummaryItems
15+
from dve.reporting.excel_report import ExcelFormat, SummaryItems
1516

1617
from ..conftest import get_test_file_path
1718
from ..fixtures import temp_dir
@@ -137,41 +138,6 @@ def test_excel_report(report_dfs):
137138
]
138139

139140

140-
def test_excel_combined_report(report_dfs):
141-
error_df, aggregate_df = report_dfs
142-
error_dfs = {
143-
"MilkyWay": error_df,
144-
"Andromeda": error_df,
145-
"BlackEye": error_df,
146-
"Cartwheel": error_df,
147-
}
148-
summary_df = aggregate_df.with_columns(file_name=pl.lit("filename"), Galaxy=pl.lit("galaxy"))
149-
report = ExcelFormat(error_dfs, aggregate_df, summary_aggregates=summary_df)
150-
summary_items = CombinedSummary(
151-
summary_dict={
152-
"Sender": "X26",
153-
"Datetime_sent": datetime.datetime.now(),
154-
"Datetime_processed": datetime.datetime.now(),
155-
},
156-
row_field="file_name",
157-
column_field="Type",
158-
table_columns=["Planet", "Derived"],
159-
partition_key="Galaxy",
160-
aggregations=[pl.sum("Count")],
161-
)
162-
workbook = report.excel_format(
163-
summary_items=summary_items,
164-
)
165-
assert workbook.sheetnames == [
166-
"Summary",
167-
"Error Summary",
168-
"MilkyWay",
169-
"Andromeda",
170-
"BlackEye",
171-
"Cartwheel",
172-
]
173-
174-
175141
def test_excel_report_overflow(big_report_dfs):
176142
error_df, aggregate_df = big_report_dfs
177143
error_dfs = {"MilkyWay": error_df}
@@ -215,3 +181,21 @@ def test_excel_report_empty_dfs():
215181
assert workbook.sheetnames == ["Summary", "Error Summary", "Error Data"]
216182
assert not all(cell.value for cell in workbook["Error Data"]["2"]) # no errors
217183
assert not all(cell.value for cell in workbook["Error Summary"]["2"]) # no aggregates
184+
185+
def test_sub_status_failed_processing():
186+
    """Check that the submission status is used to determine the summary status message"""
187+
188+
summary_items = SummaryItems(
189+
submission_status=SubmissionStatus(processing_failed=True),
190+
summary_dict={
191+
"Sender": "X26",
192+
"Datetime_sent": datetime.datetime.now(),
193+
"Datetime_processed": datetime.datetime.now(),
194+
},
195+
row_headings=["Submission Failure", "Warning"],
196+
table_columns=["Planet", "Derived"],
197+
)
198+
    assert summary_items.get_submission_status(pl.DataFrame()) == "There was an issue processing the submission. Please contact support."
199+
summary_items.submission_status = SubmissionStatus(validation_failed=True)
200+
assert summary_items.get_submission_status(pl.DataFrame()) == "File has been rejected"
201+

0 commit comments

Comments
 (0)