👔 Add start_date column to REDCap outputs

shnizzedy · shnizzedy · commit 9cdf46a3f735 · 2026-04-02T19:07:21.000-04:00
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -0,0 +1,12 @@
+{
+    "chat.agentSkillsLocations": {
+        ".github/skills": true,
+        ".agents/skills": true,
+        ".claude/skills": true,
+        "~/.copilot/skills": true,
+        "~/.agents/skills": true,
+        "~/.claude/skills": true,
+        "~/.vscode/extensions/synapsevscode.synapse-1.21.0/copilot/skills": true,
+        "~/.vscode/extensions/synapsevscode.synapse-1.22.0/copilot/skills": true
+    }
+}
diff --git a/src/mindlogger_data_export/outputs.py b/src/mindlogger_data_export/outputs.py
@@ -415,168 +415,201 @@ def _normalize_column_name(col: str) -> str:
         """Replace chains of underscores with single underscore."""
         return re.sub(r"_+", "_", col).lower()
 
-    def _prepare_activity_columns(
-        self, df: pl.DataFrame, activity_prefix: str
-    ) -> pl.DataFrame:
-        """Rename and transform columns for a single activity."""
-        # Prepend activity prefix and normalize underscores
-        df = df.rename(
-            {
-                col: self._normalize_column_name(f"{activity_prefix}_{col}")
-                for col in df.columns
-            }
-        )
-
-        # Clean up common suffixes
-        df = df.rename({col: col.replace("_user", "") for col in df.columns}).rename(
-            {
-                col: col[:-5]
-                for col in df.columns
-                if col.endswith(("_start_time", "_end_time"))
-            }
-        )
+    @staticmethod
+    def _get_column_bases(df: pl.DataFrame, suffix: str) -> set[str]:
+        """Return names of columns ending in ``suffix``, with that ending stripped."""
+        return {col.removesuffix(suffix) for col in df.columns if col.endswith(suffix)}
 
-        # Stringify `_options` columns
+    def _stringify_options_columns(self, df: pl.DataFrame) -> pl.DataFrame:
+        """Convert list-type option columns to JSON string format."""
         options_cols = [
             col
             for col in df.columns
             if col.endswith("_options") or "_response_options_" in col
         ]
-        for col in options_cols:
-            df = df.with_columns(
-                [
-                    pl.format(
-                        "[{}]",
-                        pl.col(col)
-                        .list.eval(pl.element().struct.json_encode())
-                        .list.join(", "),
-                    ).alias(col)
-                ]
-            )
+        if not options_cols:
+            return df
+
+        return df.with_columns(
+            [
+                pl.format(
+                    "[{}]",
+                    pl.col(col)
+                    .list.eval(pl.element().struct.json_encode())
+                    .list.join(", "),
+                ).alias(col)
+                for col in options_cols
+            ]
+        )
 
-        # Handle text items uniquely
-        response_cols = [col for col in df.columns if col.endswith("_response")]
+    def _create_score_from_index(self, df: pl.DataFrame) -> pl.DataFrame:
+        """For items with _index but no _score, create _score from _index."""
         index_cols = [col for col in df.columns if col.endswith("_index")]
-        index_bases = {col.replace("_index", "") for col in index_cols}
-        text_item_response_cols = [
-            col
-            for col in response_cols
-            if col.replace("_response", "") not in index_bases
+        score_bases = self._get_column_bases(df, "_score")
+
+        new_score_cols = [
+            pl.col(col).alias(f"{col.replace('_index', '')}_score")
+            for col in index_cols
+            if col.replace("_index", "") not in score_bases
         ]
 
-        # For items with `_index` but no `_score`, create `_score` from `_index`
-        score_cols = [col for col in df.columns if col.endswith("_score")]
-        score_bases = {col.replace("_score", "") for col in score_cols}
-        for col in index_cols:
-            base_name = col.replace("_index", "")
-            if base_name not in score_bases:
-                score_col = f"{base_name}_score"
-                df = df.with_columns([pl.col(col).alias(score_col)])
+        return df.with_columns(new_score_cols) if new_score_cols else df
 
-        # Drop multiselect response_options columns (they're redundant)
-        df = df.select(
-            [
-                col
-                for col in df.columns
-                if not (
-                    "_response_options_" in col
-                    and col.split("_response_options_")[-1].split("_")[-1].isdigit()
-                )
-            ]
-        )
+    def _create_redcap_response_columns(self, df: pl.DataFrame) -> pl.DataFrame:
+        """Create REDCap _response columns from _index columns.
+
+        If the original response value starts with a number, use that number;
+        otherwise use index + 1.
+        """
+        index_cols = [col for col in df.columns if col.endswith("_index")]
+        index_bases = self._get_column_bases(df, "_index")
+        response_bases = self._get_column_bases(df, "_response")
+
+        # Skip text items (those with _response but no _index)
+        text_item_response_cols = response_bases - index_bases
+
+        response_exprs = []
+        for col in index_cols:
+            # Skip text items. NOTE(review): bases come from _index cols; never matches
+            if col.replace("_index", "") in text_item_response_cols:
+                continue
 
-        # Create REDCap `_response` columns
-        # If the original response value starts with a number, use that number; otherwise use index + 1
-        for col in [_ for _ in index_cols if _ not in text_item_response_cols]:
             response_col = col.replace("_index", "_response")
             base_name = col.replace("_index", "")
-            # Check if there's an existing response column with values that start with numbers
             original_response_col = f"{base_name}_response"
+
             if original_response_col in df.columns:
                 # Try to extract leading number from response value, fall back to index + 1
-                df = df.with_columns(
-                    [
-                        pl.when(
-                            pl.col(original_response_col)
-                            .cast(pl.Utf8)
-                            .str.extract(r"^(\d+)", 1)
-                            .is_not_null()
-                        )
-                        .then(
-                            pl.col(original_response_col)
-                            .cast(pl.Utf8)
-                            .str.extract(r"^(\d+)", 1)
-                            .cast(pl.Int64)
-                        )
-                        .otherwise(pl.col(col) + 1)
-                        .alias(response_col)
-                    ]
+                response_exprs.append(
+                    pl.when(
+                        pl.col(original_response_col)
+                        .cast(pl.Utf8)
+                        .str.extract(r"^(\d+)", 1)
+                        .is_not_null()
+                    )
+                    .then(
+                        pl.col(original_response_col)
+                        .cast(pl.Utf8)
+                        .str.extract(r"^(\d+)", 1)
+                        .cast(pl.Int64)
+                    )
+                    .otherwise(pl.col(col) + 1)
+                    .alias(response_col)
                 )
             else:
                 # No original response column, use index + 1
-                df = df.with_columns([(pl.col(col) + 1).alias(response_col)])
+                response_exprs.append((pl.col(col) + 1).alias(response_col))
 
-        # Drop bare item columns that have _response, _score, or _index versions
-        response_bases = {
-            col.replace("_response", "")
+        return df.with_columns(response_exprs) if response_exprs else df
+
+    def _drop_multiselect_response_options(self, df: pl.DataFrame) -> pl.DataFrame:
+        """Drop redundant multiselect response_options columns."""
+        cols_to_keep = [
+            col
             for col in df.columns
-            if col.endswith("_response")
-        }
-        score_bases = {
-            col.replace("_score", "") for col in df.columns if col.endswith("_score")
-        }
-        index_bases = {
-            col.replace("_index", "") for col in df.columns if col.endswith("_index")
-        }
+            if not (
+                "_response_options_" in col
+                and col.split("_response_options_")[-1].split("_")[-1].isdigit()
+            )
+        ]
+        return df.select(cols_to_keep)
 
+    def _cleanup_response_column_names(
+        self, df: pl.DataFrame, activity_prefix: str
+    ) -> pl.DataFrame:
+        """Flatten nested response/value column names.
+
+        Columns containing ``<prefix>_response_response_`` lose that nesting
+        and gain a ``_response`` suffix; otherwise, columns containing
+        ``<prefix>_response_value_`` lose it and gain a ``_score`` suffix.
+        """
+        # (nested marker, suffix) pairs, checked in order; first match wins.
+        rules = (("_response_response_", "_response"), ("_response_value_", "_score"))
+        head = f"{activity_prefix}_"
+        renames: dict[str, str] = {}
+        for old in df.columns:
+            for marker, suffix in rules:
+                prefixed = f"{activity_prefix}{marker}"
+                if prefixed not in old:
+                    continue
+                # Collapse the prefixed occurrence first, then any leftovers.
+                renames[old] = old.replace(prefixed, head).replace(marker, "_") + suffix
+                break
+
+        if not renames:
+            return df
+        return df.rename(renames)
+
+    def _drop_redundant_base_columns(self, df: pl.DataFrame) -> pl.DataFrame:
+        """Remove bare item columns shadowed by suffixed variants.
+
+        A column is dropped when its exact name is the base of some other
+        column ending in ``_response``, ``_score``, or ``_index``.
+        """
+        shadowed: set[str] = set()
+        for suffix in ("_response", "_score", "_index"):
+            shadowed |= self._get_column_bases(df, suffix)
+        return df.select([name for name in df.columns if name not in shadowed])
+
+    def _prepare_activity_columns(
+        self, df: pl.DataFrame, activity_prefix: str
+    ) -> pl.DataFrame:
+        """Rename and transform columns for a single activity."""
+        # Prepend activity prefix and normalize underscores
         df = df.rename(
             {
-                col: col.replace(
-                    f"{activity_prefix}_response_response_", f"{activity_prefix}_"
-                ).replace("_response_response_", "_")
-                + "_response"
+                col: self._normalize_column_name(f"{activity_prefix}_{col}")
                 for col in df.columns
-                if f"{activity_prefix}_response_response_" in col
             }
-        ).rename(
+        )
+
+        # Clean up common suffixes
+        df = df.rename({col: col.replace("_user", "") for col in df.columns}).rename(
             {
-                col: col.replace(
-                    f"{activity_prefix}_response_value_", f"{activity_prefix}_"
-                ).replace("_response_value_", "_")
-                + "_score"
+                col: col[:-5]
                 for col in df.columns
-                if f"{activity_prefix}_response_value_" in col
+                if col.endswith(("_start_time", "_end_time"))
             }
         )
-        return df.select(
-            [
-                col
-                for col in df.columns
-                if col not in (response_bases | score_bases | index_bases)
-            ]
-        )
+
+        # Apply transformations in sequence
+        df = self._stringify_options_columns(df)
+        df = self._create_score_from_index(df)
+        df = self._drop_multiselect_response_options(df)
+        df = self._create_redcap_response_columns(df)
+        df = self._cleanup_response_column_names(df, activity_prefix)
+        return self._drop_redundant_base_columns(df)
 
     def _format_activity(self, df: pl.DataFrame, activity_name: str) -> pl.DataFrame:
         """Format a single activity's data for REDCap import."""
         activity_prefix = activity_name.lower()
 
-        # Extract record_id BEFORE column transformations
+        # Extract metadata BEFORE column transformations
         record_id = df.select("target_user_secret_id")
+        start_date = df.select(
+            pl.col("activity_time_start_time")
+            .dt.strftime("%m-%d-%Y")
+            .alias("start_date")
+        )
 
         df = self._prepare_activity_columns(df, activity_prefix)
-        df = self._add_redcap_metadata(df, activity_prefix, record_id)
+        df = self._add_redcap_metadata(df, activity_prefix, record_id, start_date)
 
         # Track row count for this instrument
         self._instrument_row_count[activity_name] = df.shape[0]
 
         return df
 
     def _add_redcap_metadata(
-        self, df: pl.DataFrame, activity_prefix: str, record_id: pl.DataFrame
+        self,
+        df: pl.DataFrame,
+        activity_prefix: str,
+        record_id: pl.DataFrame,
+        start_date: pl.DataFrame,
     ) -> pl.DataFrame:
         """Add REDCap-required columns and form completion status."""
-        # Add required REDCap columns using pre-extracted record_id
-        _project = (
+        # Add required REDCap columns
+        project = (
             self._project.get(activity_prefix, "")
             if isinstance(self._project, dict)
             else self._project or ""
@@ -585,14 +618,15 @@ def _add_redcap_metadata(
         df = df.with_columns(
             [
                 record_id.to_series().alias("record_id"),
-                pl.lit(_project).alias("redcap_event_name"),
+                pl.lit(project).alias("redcap_event_name"),
+                start_date.to_series().alias(f"{activity_prefix}_start_date"),
             ]
         )
 
         # Remove all-null columns
         df = df.select([s for s in df if s.null_count() != len(s)])
 
-        # Reorder: required columns first, then data columns
+        # Reorder columns: required first, then data, exclude account columns
         required_cols = ["record_id", "redcap_event_name"]
         account_cols = [
             col
@@ -604,7 +638,6 @@ def _add_redcap_metadata(
             for col in df.columns
             if col not in required_cols and col not in account_cols
         ]
-
         df = df.select(required_cols + data_cols)
 
         # Add form completion status (2 = Complete)
@@ -625,8 +658,7 @@ def _format(self, data: MindloggerData) -> list[NamedOutput]:
         # Format each activity for REDCap
         outputs = []
         for wide_output in wide_outputs:
-            activity_name = wide_output.name.translate({32: None, 45: None, 43: None})
-            """Output name without spaces, minuses, or pluses."""
+            activity_name = wide_output.name.translate({32: 95, 45: None, 43: None})
             formatted_df = self._format_activity(wide_output.output, activity_name)
             outputs.append(NamedOutput(f"{activity_name}_redcap", formatted_df))
 
diff --git a/uv.lock b/uv.lock