|
3 | 3 | from __future__ import annotations |
4 | 4 |
|
5 | 5 | import logging |
| 6 | +import re |
6 | 7 | from abc import ABC |
7 | 8 | from collections.abc import Callable, Generator |
8 | 9 | from dataclasses import dataclass |
@@ -313,6 +314,149 @@ def _format(self, data: MindloggerData) -> list[NamedOutput]: |
313 | 314 | ] |
314 | 315 |
|
315 | 316 |
|
class RedcapImportFormat(WideFormat):
    """Wide format specific for importing to REDCap.

    The parent wide format is run with ``split_activities`` forced on, and
    each resulting per-activity frame is then reshaped: columns are prefixed
    with the lower-cased activity name, selection indices are shifted to be
    1-based, and the ``record_id``, ``redcap_event_name`` and
    ``<activity>_complete`` columns are added.
    """

    NAME = "redcap"

    def __init__(self, project: str = "curious_parent_arm_1", *args, **kwargs):
        """Create the formatter.

        Args:
            project: Value written to the ``redcap_event_name`` column of
                every output row.
            *args: Forwarded unchanged to the parent constructor.
            **kwargs: Forwarded unchanged to the parent constructor.
        """
        super().__init__(*args, **kwargs)
        self._project = project
        # Rows produced per activity, keyed by the original activity name.
        self._instrument_row_count: dict[str, int | None] = {}

    @staticmethod
    def _normalize_column_name(col: str) -> str:
        """Collapse runs of underscores into one and lower-case the name."""
        return re.sub(r"_+", "_", col).lower()

    @staticmethod
    def _strip_suffix(name: str, suffix: str) -> str:
        """Return ``name`` without a trailing ``suffix`` (unchanged if absent)."""
        if suffix and name.endswith(suffix):
            return name[: -len(suffix)]
        return name

    def _prepare_activity_columns(
        self, df: pl.DataFrame, activity_prefix: str
    ) -> pl.DataFrame:
        """Rename and transform columns for a single activity.

        Args:
            df: Wide-format frame holding one activity.
            activity_prefix: Prefix prepended to every column name.

        Returns:
            A frame whose ``*_index`` columns have been replaced by 1-based
            ``*_response`` columns, and whose bare item columns that have a
            matching ``*_response`` column have been dropped.
        """
        # Prepend activity prefix and normalize underscores
        df = df.rename(
            {
                col: self._normalize_column_name(f"{activity_prefix}_{col}")
                for col in df.columns
            }
        )

        # Every "_user" fragment is removed, not only a trailing one: e.g.
        # "..._account_user_secret_id" becomes "..._account_secret_id", which
        # _add_redcap_metadata later matches on.
        df = df.rename({col: col.replace("_user", "") for col in df.columns})

        # "_start_time" / "_end_time" -> "_start" / "_end".  This is a separate
        # statement so the mapping is built from the already-renamed columns,
        # and the suffixes are passed as a tuple (a second positional argument
        # to str.endswith is the start index, not another suffix).
        df = df.rename(
            {
                col: col[: -len("_time")]
                for col in df.columns
                if col.endswith(("_start_time", "_end_time"))
            }
        )

        # Handle response columns
        # Text remains the same, selections = index + 1
        strip = self._strip_suffix
        index_cols = [col for col in df.columns if col.endswith("_index")]
        index_bases = {strip(col, "_index") for col in index_cols}

        # Keep non-response columns, plus responses of items that have no
        # "_index" companion (text items).  Responses of indexed items are
        # dropped here and rebuilt from the index below.
        df = df.select(
            [
                col
                for col in df.columns
                if not col.endswith("_response")
                or strip(col, "_response") not in index_bases
            ]
        )
        df = df.with_columns(
            [
                (pl.col(col) + 1).alias(f"{strip(col, '_index')}_response")
                for col in index_cols
            ]
        )
        df = df.select([col for col in df.columns if not col.endswith("_index")])

        # Drop bare item columns that have a corresponding _response column
        # These are score columns that we don't need
        response_bases = {
            strip(col, "_response")
            for col in df.columns
            if col.endswith("_response")
        }
        return df.select([col for col in df.columns if col not in response_bases])

    def _format_activity(self, df: pl.DataFrame, activity_name: str) -> pl.DataFrame:
        """Format a single activity's data for REDCap import.

        Also records the resulting row count under ``activity_name`` so it can
        be read back through ``get_instrument_row_counts``.
        """
        activity_prefix = activity_name.lower()

        # Extract record_id BEFORE column transformations, which rename the
        # source column.
        record_id = df.select("target_user_secret_id")

        df = self._prepare_activity_columns(df, activity_prefix)
        df = self._add_redcap_metadata(df, activity_prefix, record_id)

        # Track row count for this instrument
        self._instrument_row_count[activity_name] = df.shape[0]

        return df

    def _add_redcap_metadata(
        self, df: pl.DataFrame, activity_prefix: str, record_id: pl.DataFrame
    ) -> pl.DataFrame:
        """Add REDCap-required columns and form completion status.

        Args:
            df: Activity frame after column preparation.
            activity_prefix: Prefix used to name the ``_complete`` column.
            record_id: Single-column frame whose values become ``record_id``.

        Returns:
            A frame ordered as ``record_id``, ``redcap_event_name``, the
            remaining data columns, then ``<activity_prefix>_complete``.
            All-null data columns and account id columns are omitted.
        """
        required_cols = ["record_id", "redcap_event_name"]

        # Add required REDCap columns using pre-extracted record_id
        df = df.with_columns(
            [
                record_id.to_series().alias("record_id"),
                pl.lit(self._project).alias("redcap_event_name"),
            ]
        )

        # Remove all-null data columns.  The required columns are always kept:
        # on a zero-row frame every column counts as all-null, and dropping
        # them would make the reordering select below fail.
        df = df.select(
            [
                series.name
                for series in df
                if series.name in required_cols
                or series.null_count() != len(series)
            ]
        )

        # Reorder: required columns first, then data columns.  Account id
        # columns are left out of the output entirely.
        data_cols = [
            col
            for col in df.columns
            if col not in required_cols
            and "account_id" not in col
            and "account_secret_id" not in col
        ]
        df = df.select(required_cols + data_cols)

        # Add form completion status (2 = Complete)
        return df.with_columns([pl.lit(2).alias(f"{activity_prefix}_complete")])

    def _format(self, data: MindloggerData) -> list[NamedOutput]:
        """Format data for REDCap import, split by activity.

        Returns:
            One ``NamedOutput`` per activity, named ``<activity>_redcap``.
        """
        # Force split_activities to be True for REDCap format
        original_extra = self._extra.copy() if hasattr(self, "_extra") else {}
        self._extra = {**original_extra, "split_activities": "true"}
        try:
            # Get wide format outputs (one per activity)
            wide_outputs = super()._format(data)
        finally:
            # Restore the caller's options even if the parent formatter fails
            self._extra = original_extra

        # Format each activity for REDCap
        outputs = []
        for wide_output in wide_outputs:
            activity_name = wide_output.name
            formatted_df = self._format_activity(wide_output.output, activity_name)
            outputs.append(NamedOutput(f"{activity_name}_redcap", formatted_df))

        return outputs

    def get_instrument_row_counts(self) -> dict[str, int | None]:
        """Return a copy of the row count recorded for each instrument."""
        return self._instrument_row_count.copy()
| 459 | + |
316 | 460 | class LongDataFormat(Output): |
317 | 461 | """Long data format with all parsed nested types unnested / exploded.""" |
318 | 462 |
|
|
0 commit comments