Skip to content

Commit dc161e4

Browse files
committed
🐛 Include item_id in deduplication
1 parent ebaad74 commit dc161e4

1 file changed

Lines changed: 8 additions & 3 deletions

File tree

src/mindlogger_data_export/processors.py

Lines changed: 8 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -108,7 +108,7 @@ class DeduplicateResponsesProcessor(ReportProcessor):
108108
"""Deduplicate responses, keeping latest by "activity_end_time".
109109
110110
This processor removes duplicate item responses for the same
111-
user/activity/submission combination, keeping only the most recent entry.
111+
user/activity/submission/ITEM combination, keeping only the most recent entry.
112112
"""
113113

114114
NAME = "DeduplicateResponses"
@@ -117,8 +117,13 @@ class DeduplicateResponsesProcessor(ReportProcessor):
117117

118118
def _run(self, report: pl.DataFrame) -> pl.DataFrame:
119119
"""Deduplicate report by keeping latest activity_end_time."""
120-
# Define the columns that should be unique
121-
unique_cols = ["target_user_secret_id", "source_user_secret_id", "activity_id"]
120+
# Define the columns that should be unique PER ITEM
121+
unique_cols = [
122+
"target_user_secret_id",
123+
"source_user_secret_id",
124+
"activity_id",
125+
"item_id",
126+
]
122127

123128
# Check which columns actually exist in the report
124129
existing_unique_cols = [col for col in unique_cols if col in report.columns]

0 commit comments

Comments
 (0)