SingleSelect fields represented as score or name in single column

gsch-cmi · gsch-cmi · commit a69332c06f2d · 2025-07-17T13:25:22.000-04:00
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "mindlogger-data-export"
-version = "0.1.5"
+version = "0.1.6"
 description = "Add your description here"
 readme = "README.md"
 authors = [
diff --git a/src/mindlogger_data_export/outputs.py b/src/mindlogger_data_export/outputs.py
@@ -4,7 +4,7 @@
 
 import logging
 from abc import ABC
-from collections.abc import Callable
+from collections.abc import Callable, Generator
 from dataclasses import dataclass
 from pathlib import Path
 
@@ -87,7 +87,7 @@ def _pivot_multiselect(
                 # response_index=pl.col("item_option").struct.field("value"),
                 # response_name=pl.col("item_option").struct.field("name"),
             )
-            .drop("response_value", "response_raw_score")
+            .drop("response_value")
             # Generate pivot column.
             .with_columns(
                 item_option_pivot=pl.concat_str(
@@ -107,6 +107,16 @@ def _map_response_column_names(cname: str) -> str:
         parts = cname.split("__", 1)
         return "_".join([parts[1], parts[0].removeprefix("response")])
 
+    @staticmethod
+    def _fill_item_response(*response_columns: str) -> Generator[pl.Expr, None, None]:
+        for response_col in response_columns:  # Yield one expression per column.
+            yield (
+                pl.when(pl.col(response_col).is_null())  # No value in this column:
+                .then(pl.col(f"{response_col}__name"))  # take the "<col>__name" value,
+                .otherwise(pl.col(response_col))  # else keep the existing value;
+                .alias(response_col)  # output keeps the column's name.
+            )
+
     @staticmethod
     def _pivot_singleselect(
         df: pl.DataFrame, option_scores: pl.DataFrame
@@ -118,27 +128,25 @@ def _pivot_singleselect(
             pl.col("item_option_name").alias("response_name"),
         ).drop("item_option_score", "item_option_value", "item_option_name")
 
-        return (
-            (
-                df.with_columns(
-                    response_index=pl.col("response_value").struct.field("single_value")
-                )
-                .drop("response_value")
-                .join(
-                    response_options,
-                    on=[
-                        "applet_version",
-                        "activity_flow",
-                        "activity",
-                        "item",
-                        "response_index",
-                    ],
-                    how="left",
-                    validate="m:1",
-                )
-                .with_columns(item_name=pl.col("item").struct.field("name"))
-                .drop("item")
+        df = (
+            df.with_columns(
+                response_index=pl.col("response_value").struct.field("single_value")
             )
+            .drop("response_value")
+            .join(
+                response_options,
+                on=[
+                    "applet_version",
+                    "activity_flow",
+                    "activity",
+                    "item",
+                    "response_index",
+                ],
+                how="left",
+                validate="m:1",
+            )
+            .with_columns(item_name=pl.col("item").struct.field("name"))
+            .drop("item")
             .pivot(on="item_name", values=cs.starts_with("response"), separator="__")
             .with_columns(
                 cs.starts_with("response").name.map(
@@ -148,6 +156,18 @@ def _pivot_singleselect(
             .drop(cs.starts_with("response"))
         )
 
+        response_columns = {  # Maps "<QUESTION>__score" -> "<QUESTION>".
+            s: s.removesuffix("__score")  # Strip only the suffix; names may contain "__".
+            for s in cs.expand_selector(df, cs.ends_with("__score"))
+        }
+        return (
+            df.rename(response_columns)  # Rename <QUESTION>__score to <QUESTION>.
+            .with_columns(
+                WideFormat._fill_item_response(*response_columns.values())
+            )  # Use value of __name if __score is null.
+            .drop(cs.ends_with("__index", "__name"))
+        )
+
     @staticmethod
     def _pivot_text(df: pl.DataFrame, option_scores: pl.DataFrame) -> pl.DataFrame:
         del option_scores
@@ -156,7 +176,7 @@ def _pivot_text(df: pl.DataFrame, option_scores: pl.DataFrame) -> pl.DataFrame:
                 response_value=pl.col("response_value").struct.field("text"),
                 item_name=pl.col("item").struct.field("name"),
             )
-            .drop("response_raw_score", "item")
+            .drop("item")
             .pivot(on="item_name", values="response_value")
         )
 
@@ -168,7 +188,7 @@ def _pivot_subscale(df: pl.DataFrame, option_scores: pl.DataFrame) -> pl.DataFra
                 response_value=pl.col("response_value").struct.field("subscale"),
                 item_name=pl.col("item").struct.field("name"),
             )
-            .drop("response_raw_score", "item")
+            .drop("item")
             .pivot(on="item_name", values="response_value")
         )
 
@@ -188,10 +208,7 @@ def _typed_pivot(
         self, df: pl.DataFrame, option_scores: pl.DataFrame
     ) -> pl.DataFrame:
         df = (
-            df.with_columns(
-                response_value=pl.col("response").struct.field("value"),
-                response_raw_score=pl.col("response").struct.field("raw_score"),
-            )
+            df.with_columns(response_value=pl.col("response").struct.field("value"))
             .drop("response")
             .with_columns(
                 response_value=pl.struct(
@@ -219,8 +236,7 @@ def _typed_pivot(
             self._get_pivot_fn(partition_type)(partition_df, option_scores)
             for partition_type, partition_df in typed_partitions.items()
         ]
-        metadata_columns = ["legacy_user_id", "applet_version"]
-        index_struct_columns = [
+        struct_idx_columns = [
             "target_user",
             "source_user",
             "input_user",
@@ -231,17 +247,15 @@ def _typed_pivot(
             "activity_time",
             "activity_schedule",
         ]
-        return (
-            pl.concat(pivoted_dfs, how="diagonal_relaxed")
-            .with_columns(util.unnest_structs(*index_struct_columns))
-            .drop(index_struct_columns)
-            .select(
-                pl.col("legacy_user_id"),
-                pl.col("applet_version"),
-                cs.starts_with(*index_struct_columns),
-                ~cs.starts_with(*(index_struct_columns + metadata_columns)),
-            )
+
+        df = (
+            pl.concat(pivoted_dfs, how="align")
+            .with_columns(util.unnest_structs(*struct_idx_columns))
+            .drop(struct_idx_columns)
         )
+        idx_columns = cs.starts_with(*(["applet_version"] + struct_idx_columns))
+        response_columns = cs.by_name(sorted(cs.expand_selector(df, ~idx_columns)))
+        return df.select(idx_columns, response_columns)
 
     def _format(self, data: MindloggerData) -> list[NamedOutput]:
         if (
diff --git a/src/mindlogger_data_export/processors.py b/src/mindlogger_data_export/processors.py
@@ -66,6 +66,7 @@ class DropLegacyUserIdProcessor(ReportProcessor):
 
     NAME = "DropLegacyUserId"
     PRIORITY = 0
+    ENABLE = True
 
     def _run(self, report: pl.DataFrame) -> pl.DataFrame:
         return (
diff --git a/uv.lock b/uv.lock