Skip to content

Commit 0c7b569

Browse files
committed
✨ Add RedcapImportFormat
1 parent 33eb875 commit 0c7b569

3 files changed

Lines changed: 1392 additions & 1248 deletions

File tree

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
[project]
22
name = "mindlogger-data-export"
3-
version = "0.1.10"
3+
version = "0.1.11"
44
description = "Add your description here"
55
readme = "README.md"
66
authors = [
77
{ name = "Gabriel Schubiner", email = "gabriel.schubiner@childmind.org" },
8+
{ name = "Jon Clucas", email = "jon.clucas@childmind.org"}
89
]
910
requires-python = ">=3.11"
1011
dependencies = [

src/mindlogger_data_export/outputs.py

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from __future__ import annotations
44

55
import logging
6+
import re
67
from abc import ABC
78
from collections.abc import Callable, Generator
89
from dataclasses import dataclass
@@ -313,6 +314,149 @@ def _format(self, data: MindloggerData) -> list[NamedOutput]:
313314
]
314315

315316

317+
class RedcapImportFormat(WideFormat):
    """Wide format specific for importing to REDCap.

    Produces one output per activity. Each output has ``record_id`` and
    ``redcap_event_name`` as its first two columns, followed by the
    activity's data columns (prefixed with the lowercased activity name)
    and a trailing ``<activity>_complete`` status column.
    """

    NAME = "redcap"

    def __init__(self, project: str = "curious_parent_arm_1", *args, **kwargs):
        """Create the formatter.

        Args:
            project: Value written to the ``redcap_event_name`` column of
                every row.
            *args: Forwarded unchanged to the parent initializer.
            **kwargs: Forwarded unchanged to the parent initializer.
        """
        super().__init__(*args, **kwargs)
        self._project = project
        # Row count of the most recently formatted output, keyed by activity name.
        self._instrument_row_count: dict[str, int | None] = {}

    @staticmethod
    def _normalize_column_name(col: str) -> str:
        """Collapse runs of underscores into one and lowercase the name."""
        return re.sub(r"_+", "_", col).lower()

    def _prepare_activity_columns(
        self, df: pl.DataFrame, activity_prefix: str
    ) -> pl.DataFrame:
        """Rename and transform columns for a single activity.

        Args:
            df: Wide-format frame for one activity.
            activity_prefix: Prefix prepended to every column name.

        Returns:
            A frame whose columns are prefixed and normalized, whose
            ``*_index`` columns have been converted to 1-based
            ``*_response`` columns, and whose bare item columns have been
            dropped wherever a matching ``*_response`` column exists.
        """
        # Prepend activity prefix and normalize underscores.
        df = df.rename(
            {
                col: self._normalize_column_name(f"{activity_prefix}_{col}")
                for col in df.columns
            }
        )

        # Strip every "_user" occurrence from the column names.
        df = df.rename({col: col.replace("_user", "") for col in df.columns})

        # Shorten "*_start_time" / "*_end_time" to "*_start" / "*_end".
        # str.endswith needs the alternatives as ONE tuple argument; passing
        # them as two positional arguments raises TypeError because the
        # second positional parameter is the integer start index.
        df = df.rename(
            {
                col: col.removesuffix("_time")
                for col in df.columns
                if col.endswith(("_start_time", "_end_time"))
            }
        )

        # Handle response columns.
        # Text remains the same, selections = index + 1.
        # removesuffix (not replace) so that an item whose own name contains
        # "_index" or "_response" in the middle is left intact.
        index_cols = [col for col in df.columns if col.endswith("_index")]
        index_bases = {col.removesuffix("_index") for col in index_cols}

        # Drop "*_response" columns that are about to be regenerated from the
        # matching "*_index" column; text responses (no index) are kept.
        df = df.select(
            [
                col
                for col in df.columns
                if not (
                    col.endswith("_response")
                    and col.removesuffix("_response") in index_bases
                )
            ]
        )
        for col in index_cols:
            response_col = f"{col.removesuffix('_index')}_response"
            df = df.with_columns([(pl.col(col) + 1).alias(response_col)])
        df = df.select([col for col in df.columns if not col.endswith("_index")])

        # Drop bare item columns that have a corresponding _response column.
        # These are score columns that we don't need.
        response_bases = {
            col.removesuffix("_response")
            for col in df.columns
            if col.endswith("_response")
        }
        return df.select([col for col in df.columns if col not in response_bases])

    def _format_activity(self, df: pl.DataFrame, activity_name: str) -> pl.DataFrame:
        """Format a single activity's data for REDCap import.

        Also records the resulting row count under ``activity_name``.
        """
        activity_prefix = activity_name.lower()

        # Extract record_id BEFORE column transformations rename the source column.
        record_id = df.select("target_user_secret_id")

        df = self._prepare_activity_columns(df, activity_prefix)
        df = self._add_redcap_metadata(df, activity_prefix, record_id)

        # Track row count for this instrument.
        self._instrument_row_count[activity_name] = df.shape[0]

        return df

    def _add_redcap_metadata(
        self, df: pl.DataFrame, activity_prefix: str, record_id: pl.DataFrame
    ) -> pl.DataFrame:
        """Add REDCap-required columns and form completion status.

        Args:
            df: Prepared activity frame.
            activity_prefix: Prefix used to name the ``*_complete`` column.
            record_id: Single-column frame whose values become ``record_id``.

        Returns:
            Frame ordered as ``record_id``, ``redcap_event_name``, the
            remaining non-null data columns (columns whose name contains
            ``account_id`` or ``account_secret_id`` are left out), then
            ``<activity_prefix>_complete``.
        """
        required_cols = ["record_id", "redcap_event_name"]

        # Add required REDCap columns using pre-extracted record_id.
        df = df.with_columns(
            [
                record_id.to_series().alias("record_id"),
                pl.lit(self._project).alias("redcap_event_name"),
            ]
        )

        # Remove all-null columns, but never the required ones: they are
        # selected unconditionally below, so dropping them here (all-null
        # ids, or a frame with zero rows) would make that select fail.
        kept_cols = [
            s.name
            for s in df
            if s.name in required_cols or s.null_count() != len(s)
        ]

        # Reorder: required columns first, then data columns. Account
        # columns are excluded from the output.
        data_cols = [
            col
            for col in kept_cols
            if col not in required_cols
            and "account_id" not in col
            and "account_secret_id" not in col
        ]
        df = df.select(required_cols + data_cols)

        # Add form completion status (2 = Complete).
        return df.with_columns([pl.lit(2).alias(f"{activity_prefix}_complete")])

    def _format(self, data: MindloggerData) -> list[NamedOutput]:
        """Format data for REDCap import, split by activity.

        Returns:
            One ``NamedOutput`` per parent output, named
            ``<activity_name>_redcap``.
        """
        # Force split_activities to be True for REDCap format.
        original_extra = self._extra.copy() if hasattr(self, "_extra") else {}
        self._extra = {**original_extra, "split_activities": "true"}
        try:
            # Get wide format outputs (one per activity).
            wide_outputs = super()._format(data)
        finally:
            # Restore the original extra even if the parent formatter raises,
            # so the forced setting never leaks into later use of this object.
            self._extra = original_extra

        # Format each activity for REDCap.
        return [
            NamedOutput(
                f"{wide_output.name}_redcap",
                self._format_activity(wide_output.output, wide_output.name),
            )
            for wide_output in wide_outputs
        ]

    def get_instrument_row_counts(self) -> dict[str, int | None]:
        """Return a copy of the row count recorded for each instrument."""
        return self._instrument_row_count.copy()
458+
459+
316460
class LongDataFormat(Output):
317461
"""Long data format with all parsed nested types unnested / exploded."""
318462

0 commit comments

Comments
 (0)