@@ -415,168 +415,201 @@ def _normalize_column_name(col: str) -> str:
415415 """Replace chains of underscores with single underscore."""
416416 return re .sub (r"_+" , "_" , col ).lower ()
417417
418- def _prepare_activity_columns (
419- self , df : pl .DataFrame , activity_prefix : str
420- ) -> pl .DataFrame :
421- """Rename and transform columns for a single activity."""
422- # Prepend activity prefix and normalize underscores
423- df = df .rename (
424- {
425- col : self ._normalize_column_name (f"{ activity_prefix } _{ col } " )
426- for col in df .columns
427- }
428- )
429-
430- # Clean up common suffixes
431- df = df .rename ({col : col .replace ("_user" , "" ) for col in df .columns }).rename (
432- {
433- col : col [:- 5 ]
434- for col in df .columns
435- if col .endswith (("_start_time" , "_end_time" ))
436- }
437- )
418+ @staticmethod
419+ def _get_column_bases (df : pl .DataFrame , suffix : str ) -> set [str ]:
420+ """Extract base names for columns with a given suffix."""
421+ return {col .replace (suffix , "" ) for col in df .columns if col .endswith (suffix )}
438422
439- # Stringify `_options` columns
423+ def _stringify_options_columns (self , df : pl .DataFrame ) -> pl .DataFrame :
424+ """Convert list-type option columns to JSON string format."""
440425 options_cols = [
441426 col
442427 for col in df .columns
443428 if col .endswith ("_options" ) or "_response_options_" in col
444429 ]
445- for col in options_cols :
446- df = df .with_columns (
447- [
448- pl .format (
449- "[{}]" ,
450- pl .col (col )
451- .list .eval (pl .element ().struct .json_encode ())
452- .list .join (", " ),
453- ).alias (col )
454- ]
455- )
430+ if not options_cols :
431+ return df
432+
433+ return df .with_columns (
434+ [
435+ pl .format (
436+ "[{}]" ,
437+ pl .col (col )
438+ .list .eval (pl .element ().struct .json_encode ())
439+ .list .join (", " ),
440+ ).alias (col )
441+ for col in options_cols
442+ ]
443+ )
456444
457- # Handle text items uniquely
458- response_cols = [ col for col in df . columns if col . endswith ( "_response" )]
def _create_score_from_index(self, df: pl.DataFrame) -> pl.DataFrame:
    """Add a ``_score`` column for every ``_index`` column that lacks one.

    For each column named ``<base>_index`` whose ``<base>`` is not among the
    bases of the existing ``_score`` columns, a ``<base>_score`` column is
    added that carries the same values as the ``_index`` column.

    Args:
        df: Frame to extend.

    Returns:
        The frame with the new ``_score`` columns, or ``df`` itself when
        nothing has to be added.
    """
    suffix = "_index"
    score_bases = self._get_column_bases(df, "_score")

    new_score_cols = []
    for col in df.columns:
        if not col.endswith(suffix):
            continue
        # Strip only the trailing suffix: ``str.replace`` would also remove
        # an ``_index`` that is part of the activity or item name and so
        # produce a score column under the wrong name.
        base_name = col[: -len(suffix)]
        if base_name not in score_bases:
            new_score_cols.append(pl.col(col).alias(f"{base_name}_score"))

    return df.with_columns(new_score_cols) if new_score_cols else df
475457
476- # Drop multiselect response_options columns (they're redundant)
477- df = df .select (
478- [
479- col
480- for col in df .columns
481- if not (
482- "_response_options_" in col
483- and col .split ("_response_options_" )[- 1 ].split ("_" )[- 1 ].isdigit ()
484- )
485- ]
486- )
458+ def _create_redcap_response_columns (self , df : pl .DataFrame ) -> pl .DataFrame :
459+ """Create REDCap _response columns from _index columns.
460+
461+ If the original response value starts with a number, use that number;
462+ otherwise use index + 1.
463+ """
464+ index_cols = [col for col in df .columns if col .endswith ("_index" )]
465+ index_bases = self ._get_column_bases (df , "_index" )
466+ response_bases = self ._get_column_bases (df , "_response" )
467+
468+ # Skip text items (those with _response but no _index)
469+ text_item_response_cols = response_bases - index_bases
470+
471+ response_exprs = []
472+ for col in index_cols :
473+ # Skip if this is a text item
474+ if col .replace ("_index" , "" ) in text_item_response_cols :
475+ continue
487476
488- # Create REDCap `_response` columns
489- # If the original response value starts with a number, use that number; otherwise use index + 1
490- for col in [_ for _ in index_cols if _ not in text_item_response_cols ]:
491477 response_col = col .replace ("_index" , "_response" )
492478 base_name = col .replace ("_index" , "" )
493- # Check if there's an existing response column with values that start with numbers
494479 original_response_col = f"{ base_name } _response"
480+
495481 if original_response_col in df .columns :
496482 # Try to extract leading number from response value, fall back to index + 1
497- df = df .with_columns (
498- [
499- pl .when (
500- pl .col (original_response_col )
501- .cast (pl .Utf8 )
502- .str .extract (r"^(\d+)" , 1 )
503- .is_not_null ()
504- )
505- .then (
506- pl .col (original_response_col )
507- .cast (pl .Utf8 )
508- .str .extract (r"^(\d+)" , 1 )
509- .cast (pl .Int64 )
510- )
511- .otherwise (pl .col (col ) + 1 )
512- .alias (response_col )
513- ]
483+ response_exprs .append (
484+ pl .when (
485+ pl .col (original_response_col )
486+ .cast (pl .Utf8 )
487+ .str .extract (r"^(\d+)" , 1 )
488+ .is_not_null ()
489+ )
490+ .then (
491+ pl .col (original_response_col )
492+ .cast (pl .Utf8 )
493+ .str .extract (r"^(\d+)" , 1 )
494+ .cast (pl .Int64 )
495+ )
496+ .otherwise (pl .col (col ) + 1 )
497+ .alias (response_col )
514498 )
515499 else :
516500 # No original response column, use index + 1
517- df = df . with_columns ([ (pl .col (col ) + 1 ).alias (response_col )] )
501+ response_exprs . append ( (pl .col (col ) + 1 ).alias (response_col ))
518502
519- # Drop bare item columns that have _response, _score, or _index versions
520- response_bases = {
521- col .replace ("_response" , "" )
503+ return df .with_columns (response_exprs ) if response_exprs else df
504+
505+ def _drop_multiselect_response_options (self , df : pl .DataFrame ) -> pl .DataFrame :
506+ """Drop redundant multiselect response_options columns."""
507+ cols_to_keep = [
508+ col
522509 for col in df .columns
523- if col .endswith ("_response" )
524- }
525- score_bases = {
526- col .replace ("_score" , "" ) for col in df .columns if col .endswith ("_score" )
527- }
528- index_bases = {
529- col .replace ("_index" , "" ) for col in df .columns if col .endswith ("_index" )
530- }
510+ if not (
511+ "_response_options_" in col
512+ and col .split ("_response_options_" )[- 1 ].split ("_" )[- 1 ].isdigit ()
513+ )
514+ ]
515+ return df .select (cols_to_keep )
531516
517+ def _cleanup_response_column_names (
518+ self , df : pl .DataFrame , activity_prefix : str
519+ ) -> pl .DataFrame :
520+ """Rename nested response columns to cleaner names."""
521+ rename_map = {}
522+
523+ for col in df .columns :
524+ if f"{ activity_prefix } _response_response_" in col :
525+ new_name = (
526+ col .replace (
527+ f"{ activity_prefix } _response_response_" , f"{ activity_prefix } _"
528+ ).replace ("_response_response_" , "_" )
529+ + "_response"
530+ )
531+ rename_map [col ] = new_name
532+ elif f"{ activity_prefix } _response_value_" in col :
533+ new_name = (
534+ col .replace (
535+ f"{ activity_prefix } _response_value_" , f"{ activity_prefix } _"
536+ ).replace ("_response_value_" , "_" )
537+ + "_score"
538+ )
539+ rename_map [col ] = new_name
540+
541+ return df .rename (rename_map ) if rename_map else df
542+
def _drop_redundant_base_columns(self, df: pl.DataFrame) -> pl.DataFrame:
    """Drop bare item columns that also exist in a suffixed form.

    A column is removed when its exact name is the base of some
    ``_response``, ``_score`` or ``_index`` column, as reported by
    ``_get_column_bases``. The remaining columns keep their order.
    """
    shadowed = set()
    for suffix in ("_response", "_score", "_index"):
        shadowed |= self._get_column_bases(df, suffix)

    return df.select([name for name in df.columns if name not in shadowed])
553+
def _prepare_activity_columns(
    self, df: pl.DataFrame, activity_prefix: str
) -> pl.DataFrame:
    """Rename and transform columns for a single activity.

    Steps, in order:

    1. Prefix every column with ``activity_prefix`` and normalize the name
       with ``_normalize_column_name``.
    2. Remove every ``_user`` fragment from the column names.
    3. Trim the trailing ``_time`` from names ending in ``_start_time`` or
       ``_end_time``.
    4. Run the column transformations: stringify option columns, derive
       ``_score`` from ``_index``, drop numbered multiselect option columns,
       build the REDCap ``_response`` columns, shorten nested response
       names, and drop bare columns that have a suffixed version.

    Args:
        df: Raw frame for one activity.
        activity_prefix: Prefix prepended to every column name.

    Returns:
        The transformed frame.
    """
    # Prepend activity prefix and normalize underscores
    df = df.rename(
        {
            col: self._normalize_column_name(f"{activity_prefix}_{col}")
            for col in df.columns
        }
    )

    # Clean up common suffixes. Each mapping is built from the frame's
    # current columns: a chained rename would build the second mapping from
    # the names as they were before "_user" was removed, so its keys could
    # point at columns that no longer exist.
    df = df.rename({col: col.replace("_user", "") for col in df.columns})
    df = df.rename(
        {
            col: col[:-5]
            for col in df.columns
            if col.endswith(("_start_time", "_end_time"))
        }
    )

    # Apply transformations in sequence
    df = self._stringify_options_columns(df)
    df = self._create_score_from_index(df)
    df = self._drop_multiselect_response_options(df)
    df = self._create_redcap_response_columns(df)
    df = self._cleanup_response_column_names(df, activity_prefix)
    return self._drop_redundant_base_columns(df)
558582
def _format_activity(self, df: pl.DataFrame, activity_name: str) -> pl.DataFrame:
    """Format one activity's frame for REDCap import.

    Reads the record id and the formatted start date from the raw frame,
    runs the column preparation, attaches the REDCap metadata, and stores
    the resulting row count in ``_instrument_row_count`` under
    ``activity_name``. The lower-cased activity name is used as the column
    prefix.
    """
    prefix = activity_name.lower()
    start_expr = (
        pl.col("activity_time_start_time")
        .dt.strftime("%m-%d-%Y")
        .alias("start_date")
    )

    # Both selections read the raw frame: the preparation step below
    # renames every column, so the source names are only valid here.
    record_id = df.select("target_user_secret_id")
    start_date = df.select(start_expr)

    formatted = self._add_redcap_metadata(
        self._prepare_activity_columns(df, prefix),
        prefix,
        record_id,
        start_date,
    )

    # Track row count for this instrument
    self._instrument_row_count[activity_name] = formatted.shape[0]
    return formatted
573602
574603 def _add_redcap_metadata (
575- self , df : pl .DataFrame , activity_prefix : str , record_id : pl .DataFrame
604+ self ,
605+ df : pl .DataFrame ,
606+ activity_prefix : str ,
607+ record_id : pl .DataFrame ,
608+ start_date : pl .DataFrame ,
576609 ) -> pl .DataFrame :
577610 """Add REDCap-required columns and form completion status."""
578- # Add required REDCap columns using pre-extracted record_id
579- _project = (
611+ # Add required REDCap columns
612+ project = (
580613 self ._project .get (activity_prefix , "" )
581614 if isinstance (self ._project , dict )
582615 else self ._project or ""
@@ -585,14 +618,15 @@ def _add_redcap_metadata(
585618 df = df .with_columns (
586619 [
587620 record_id .to_series ().alias ("record_id" ),
588- pl .lit (_project ).alias ("redcap_event_name" ),
621+ pl .lit (project ).alias ("redcap_event_name" ),
622+ start_date .to_series ().alias (f"{ activity_prefix } _start_date" ),
589623 ]
590624 )
591625
592626 # Remove all-null columns
593627 df = df .select ([s for s in df if s .null_count () != len (s )])
594628
595- # Reorder: required columns first, then data columns
629+ # Reorder columns : required first, then data, exclude account columns
596630 required_cols = ["record_id" , "redcap_event_name" ]
597631 account_cols = [
598632 col
@@ -604,7 +638,6 @@ def _add_redcap_metadata(
604638 for col in df .columns
605639 if col not in required_cols and col not in account_cols
606640 ]
607-
608641 df = df .select (required_cols + data_cols )
609642
610643 # Add form completion status (2 = Complete)
@@ -625,8 +658,7 @@ def _format(self, data: MindloggerData) -> list[NamedOutput]:
625658 # Format each activity for REDCap
626659 outputs = []
627660 for wide_output in wide_outputs :
628- activity_name = wide_output .name .translate ({32 : None , 45 : None , 43 : None })
629- """Output name without spaces, minuses, or pluses."""
661+ activity_name = wide_output .name .translate ({32 : 95 , 45 : None , 43 : None })
630662 formatted_df = self ._format_activity (wide_output .output , activity_name )
631663 outputs .append (NamedOutput (f"{ activity_name } _redcap" , formatted_df ))
632664
0 commit comments