Skip to content

Commit 9cdf46a

Browse files
committed
👔 Add start_date column to REDCap outputs
1 parent ef884d3 commit 9cdf46a

3 files changed

Lines changed: 1637 additions & 1463 deletions

File tree

.vscode/settings.json

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,12 @@
1+
{
2+
"chat.agentSkillsLocations": {
3+
".github/skills": true,
4+
".agents/skills": true,
5+
".claude/skills": true,
6+
"~/.copilot/skills": true,
7+
"~/.agents/skills": true,
8+
"~/.claude/skills": true,
9+
"~/.vscode/extensions/synapsevscode.synapse-1.21.0/copilot/skills": true,
10+
"~/.vscode/extensions/synapsevscode.synapse-1.22.0/copilot/skills": true
11+
}
12+
}

src/mindlogger_data_export/outputs.py

Lines changed: 151 additions & 119 deletions
Original file line number | Diff line number | Diff line change
@@ -415,168 +415,201 @@ def _normalize_column_name(col: str) -> str:
415415
"""Replace chains of underscores with single underscore."""
416416
return re.sub(r"_+", "_", col).lower()
417417

418-
def _prepare_activity_columns(
419-
self, df: pl.DataFrame, activity_prefix: str
420-
) -> pl.DataFrame:
421-
"""Rename and transform columns for a single activity."""
422-
# Prepend activity prefix and normalize underscores
423-
df = df.rename(
424-
{
425-
col: self._normalize_column_name(f"{activity_prefix}_{col}")
426-
for col in df.columns
427-
}
428-
)
429-
430-
# Clean up common suffixes
431-
df = df.rename({col: col.replace("_user", "") for col in df.columns}).rename(
432-
{
433-
col: col[:-5]
434-
for col in df.columns
435-
if col.endswith(("_start_time", "_end_time"))
436-
}
437-
)
418+
@staticmethod
419+
def _get_column_bases(df: pl.DataFrame, suffix: str) -> set[str]:
420+
"""Extract base names for columns with a given suffix."""
421+
return {col.replace(suffix, "") for col in df.columns if col.endswith(suffix)}
438422

439-
# Stringify `_options` columns
423+
def _stringify_options_columns(self, df: pl.DataFrame) -> pl.DataFrame:
424+
"""Convert list-type option columns to JSON string format."""
440425
options_cols = [
441426
col
442427
for col in df.columns
443428
if col.endswith("_options") or "_response_options_" in col
444429
]
445-
for col in options_cols:
446-
df = df.with_columns(
447-
[
448-
pl.format(
449-
"[{}]",
450-
pl.col(col)
451-
.list.eval(pl.element().struct.json_encode())
452-
.list.join(", "),
453-
).alias(col)
454-
]
455-
)
430+
if not options_cols:
431+
return df
432+
433+
return df.with_columns(
434+
[
435+
pl.format(
436+
"[{}]",
437+
pl.col(col)
438+
.list.eval(pl.element().struct.json_encode())
439+
.list.join(", "),
440+
).alias(col)
441+
for col in options_cols
442+
]
443+
)
456444

457-
# Handle text items uniquely
458-
response_cols = [col for col in df.columns if col.endswith("_response")]
445+
def _create_score_from_index(self, df: pl.DataFrame) -> pl.DataFrame:
446+
"""For items with _index but no _score, create _score from _index."""
459447
index_cols = [col for col in df.columns if col.endswith("_index")]
460-
index_bases = {col.replace("_index", "") for col in index_cols}
461-
text_item_response_cols = [
462-
col
463-
for col in response_cols
464-
if col.replace("_response", "") not in index_bases
448+
score_bases = self._get_column_bases(df, "_score")
449+
450+
new_score_cols = [
451+
pl.col(col).alias(f"{col.replace('_index', '')}_score")
452+
for col in index_cols
453+
if col.replace("_index", "") not in score_bases
465454
]
466455

467-
# For items with `_index` but no `_score`, create `_score` from `_index`
468-
score_cols = [col for col in df.columns if col.endswith("_score")]
469-
score_bases = {col.replace("_score", "") for col in score_cols}
470-
for col in index_cols:
471-
base_name = col.replace("_index", "")
472-
if base_name not in score_bases:
473-
score_col = f"{base_name}_score"
474-
df = df.with_columns([pl.col(col).alias(score_col)])
456+
return df.with_columns(new_score_cols) if new_score_cols else df
475457

476-
# Drop multiselect response_options columns (they're redundant)
477-
df = df.select(
478-
[
479-
col
480-
for col in df.columns
481-
if not (
482-
"_response_options_" in col
483-
and col.split("_response_options_")[-1].split("_")[-1].isdigit()
484-
)
485-
]
486-
)
458+
def _create_redcap_response_columns(self, df: pl.DataFrame) -> pl.DataFrame:
459+
"""Create REDCap _response columns from _index columns.
460+
461+
If the original response value starts with a number, use that number;
462+
otherwise use index + 1.
463+
"""
464+
index_cols = [col for col in df.columns if col.endswith("_index")]
465+
index_bases = self._get_column_bases(df, "_index")
466+
response_bases = self._get_column_bases(df, "_response")
467+
468+
# Skip text items (those with _response but no _index)
469+
text_item_response_cols = response_bases - index_bases
470+
471+
response_exprs = []
472+
for col in index_cols:
473+
# Skip if this is a text item
474+
if col.replace("_index", "") in text_item_response_cols:
475+
continue
487476

488-
# Create REDCap `_response` columns
489-
# If the original response value starts with a number, use that number; otherwise use index + 1
490-
for col in [_ for _ in index_cols if _ not in text_item_response_cols]:
491477
response_col = col.replace("_index", "_response")
492478
base_name = col.replace("_index", "")
493-
# Check if there's an existing response column with values that start with numbers
494479
original_response_col = f"{base_name}_response"
480+
495481
if original_response_col in df.columns:
496482
# Try to extract leading number from response value, fall back to index + 1
497-
df = df.with_columns(
498-
[
499-
pl.when(
500-
pl.col(original_response_col)
501-
.cast(pl.Utf8)
502-
.str.extract(r"^(\d+)", 1)
503-
.is_not_null()
504-
)
505-
.then(
506-
pl.col(original_response_col)
507-
.cast(pl.Utf8)
508-
.str.extract(r"^(\d+)", 1)
509-
.cast(pl.Int64)
510-
)
511-
.otherwise(pl.col(col) + 1)
512-
.alias(response_col)
513-
]
483+
response_exprs.append(
484+
pl.when(
485+
pl.col(original_response_col)
486+
.cast(pl.Utf8)
487+
.str.extract(r"^(\d+)", 1)
488+
.is_not_null()
489+
)
490+
.then(
491+
pl.col(original_response_col)
492+
.cast(pl.Utf8)
493+
.str.extract(r"^(\d+)", 1)
494+
.cast(pl.Int64)
495+
)
496+
.otherwise(pl.col(col) + 1)
497+
.alias(response_col)
514498
)
515499
else:
516500
# No original response column, use index + 1
517-
df = df.with_columns([(pl.col(col) + 1).alias(response_col)])
501+
response_exprs.append((pl.col(col) + 1).alias(response_col))
518502

519-
# Drop bare item columns that have _response, _score, or _index versions
520-
response_bases = {
521-
col.replace("_response", "")
503+
return df.with_columns(response_exprs) if response_exprs else df
504+
505+
def _drop_multiselect_response_options(self, df: pl.DataFrame) -> pl.DataFrame:
506+
"""Drop redundant multiselect response_options columns."""
507+
cols_to_keep = [
508+
col
522509
for col in df.columns
523-
if col.endswith("_response")
524-
}
525-
score_bases = {
526-
col.replace("_score", "") for col in df.columns if col.endswith("_score")
527-
}
528-
index_bases = {
529-
col.replace("_index", "") for col in df.columns if col.endswith("_index")
530-
}
510+
if not (
511+
"_response_options_" in col
512+
and col.split("_response_options_")[-1].split("_")[-1].isdigit()
513+
)
514+
]
515+
return df.select(cols_to_keep)
531516

517+
def _cleanup_response_column_names(
518+
self, df: pl.DataFrame, activity_prefix: str
519+
) -> pl.DataFrame:
520+
"""Rename nested response columns to cleaner names."""
521+
rename_map = {}
522+
523+
for col in df.columns:
524+
if f"{activity_prefix}_response_response_" in col:
525+
new_name = (
526+
col.replace(
527+
f"{activity_prefix}_response_response_", f"{activity_prefix}_"
528+
).replace("_response_response_", "_")
529+
+ "_response"
530+
)
531+
rename_map[col] = new_name
532+
elif f"{activity_prefix}_response_value_" in col:
533+
new_name = (
534+
col.replace(
535+
f"{activity_prefix}_response_value_", f"{activity_prefix}_"
536+
).replace("_response_value_", "_")
537+
+ "_score"
538+
)
539+
rename_map[col] = new_name
540+
541+
return df.rename(rename_map) if rename_map else df
542+
543+
def _drop_redundant_base_columns(self, df: pl.DataFrame) -> pl.DataFrame:
544+
"""Drop bare item columns that have _response, _score, or _index versions."""
545+
response_bases = self._get_column_bases(df, "_response")
546+
score_bases = self._get_column_bases(df, "_score")
547+
index_bases = self._get_column_bases(df, "_index")
548+
549+
redundant_bases = response_bases | score_bases | index_bases
550+
cols_to_keep = [col for col in df.columns if col not in redundant_bases]
551+
552+
return df.select(cols_to_keep)
553+
554+
def _prepare_activity_columns(
555+
self, df: pl.DataFrame, activity_prefix: str
556+
) -> pl.DataFrame:
557+
"""Rename and transform columns for a single activity."""
558+
# Prepend activity prefix and normalize underscores
532559
df = df.rename(
533560
{
534-
col: col.replace(
535-
f"{activity_prefix}_response_response_", f"{activity_prefix}_"
536-
).replace("_response_response_", "_")
537-
+ "_response"
561+
col: self._normalize_column_name(f"{activity_prefix}_{col}")
538562
for col in df.columns
539-
if f"{activity_prefix}_response_response_" in col
540563
}
541-
).rename(
564+
)
565+
566+
# Clean up common suffixes
567+
df = df.rename({col: col.replace("_user", "") for col in df.columns}).rename(
542568
{
543-
col: col.replace(
544-
f"{activity_prefix}_response_value_", f"{activity_prefix}_"
545-
).replace("_response_value_", "_")
546-
+ "_score"
569+
col: col[:-5]
547570
for col in df.columns
548-
if f"{activity_prefix}_response_value_" in col
571+
if col.endswith(("_start_time", "_end_time"))
549572
}
550573
)
551-
return df.select(
552-
[
553-
col
554-
for col in df.columns
555-
if col not in (response_bases | score_bases | index_bases)
556-
]
557-
)
574+
575+
# Apply transformations in sequence
576+
df = self._stringify_options_columns(df)
577+
df = self._create_score_from_index(df)
578+
df = self._drop_multiselect_response_options(df)
579+
df = self._create_redcap_response_columns(df)
580+
df = self._cleanup_response_column_names(df, activity_prefix)
581+
return self._drop_redundant_base_columns(df)
558582

559583
def _format_activity(self, df: pl.DataFrame, activity_name: str) -> pl.DataFrame:
560584
"""Format a single activity's data for REDCap import."""
561585
activity_prefix = activity_name.lower()
562586

563-
# Extract record_id BEFORE column transformations
587+
# Extract metadata BEFORE column transformations
564588
record_id = df.select("target_user_secret_id")
589+
start_date = df.select(
590+
pl.col("activity_time_start_time")
591+
.dt.strftime("%m-%d-%Y")
592+
.alias("start_date")
593+
)
565594

566595
df = self._prepare_activity_columns(df, activity_prefix)
567-
df = self._add_redcap_metadata(df, activity_prefix, record_id)
596+
df = self._add_redcap_metadata(df, activity_prefix, record_id, start_date)
568597

569598
# Track row count for this instrument
570599
self._instrument_row_count[activity_name] = df.shape[0]
571600

572601
return df
573602

574603
def _add_redcap_metadata(
575-
self, df: pl.DataFrame, activity_prefix: str, record_id: pl.DataFrame
604+
self,
605+
df: pl.DataFrame,
606+
activity_prefix: str,
607+
record_id: pl.DataFrame,
608+
start_date: pl.DataFrame,
576609
) -> pl.DataFrame:
577610
"""Add REDCap-required columns and form completion status."""
578-
# Add required REDCap columns using pre-extracted record_id
579-
_project = (
611+
# Add required REDCap columns
612+
project = (
580613
self._project.get(activity_prefix, "")
581614
if isinstance(self._project, dict)
582615
else self._project or ""
@@ -585,14 +618,15 @@ def _add_redcap_metadata(
585618
df = df.with_columns(
586619
[
587620
record_id.to_series().alias("record_id"),
588-
pl.lit(_project).alias("redcap_event_name"),
621+
pl.lit(project).alias("redcap_event_name"),
622+
start_date.to_series().alias(f"{activity_prefix}_start_date"),
589623
]
590624
)
591625

592626
# Remove all-null columns
593627
df = df.select([s for s in df if s.null_count() != len(s)])
594628

595-
# Reorder: required columns first, then data columns
629+
# Reorder columns: required first, then data, exclude account columns
596630
required_cols = ["record_id", "redcap_event_name"]
597631
account_cols = [
598632
col
@@ -604,7 +638,6 @@ def _add_redcap_metadata(
604638
for col in df.columns
605639
if col not in required_cols and col not in account_cols
606640
]
607-
608641
df = df.select(required_cols + data_cols)
609642

610643
# Add form completion status (2 = Complete)
@@ -625,8 +658,7 @@ def _format(self, data: MindloggerData) -> list[NamedOutput]:
625658
# Format each activity for REDCap
626659
outputs = []
627660
for wide_output in wide_outputs:
628-
activity_name = wide_output.name.translate({32: None, 45: None, 43: None})
629-
"""Output name without spaces, minuses, or pluses."""
661+
activity_name = wide_output.name.translate({32: 95, 45: None, 43: None})
630662
formatted_df = self._format_activity(wide_output.output, activity_name)
631663
outputs.append(NamedOutput(f"{activity_name}_redcap", formatted_df))
632664

0 commit comments

Comments
 (0)