Skip to content

Commit 078802b

Browse files
committed
Use input typing in response parsing and wide format pivot. Add subscale typing.
1 parent 73ffc27 commit 078802b

9 files changed

Lines changed: 1374 additions & 1203 deletions

File tree

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "mindlogger-data-export"
3-
version = "0.1.2"
3+
version = "0.1.3"
44
description = "Add your description here"
55
readme = "README.md"
66
authors = [
@@ -27,6 +27,7 @@ dev = [
2727
"jupyter>=1.1.1",
2828
"marimo[lsp]>=0.13.15",
2929
"mypy>=1.15.0",
30+
"pandas>=2.3.1",
3031
"pyarrow>=19.0.0",
3132
"pytest>=8.3.4",
3233
"pytest-datafiles>=3.0.0",

scripts/ml_db_export.py

Lines changed: 162 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -15,55 +15,114 @@ def _():
1515
return Path, cs, mo, os, pl
1616

1717

18+
@app.cell
19+
def _():
20+
COMPLETION_COLUMNS = {
21+
"YMHA Cohort 3 Student Applet IN PERSON": [
22+
"AMHLQ",
23+
"Healthcare Access",
24+
"Impact and Professional Importance",
25+
"MAP + MH Interest",
26+
"MEIM-6",
27+
"Mental Health Attitudes and Help-Seeking",
28+
"Mental Healthcare",
29+
"PCRB",
30+
"PEARLS + CRISIS",
31+
"PSC",
32+
"SCCS",
33+
"SCCT",
34+
"SEHS-HE",
35+
"Student Week 1 Assessment",
36+
"Student Week 2 Assessment",
37+
"Student Week 3 Assessment",
38+
],
39+
"YMHA Cohort 3 Student Applet VIRTUAL": [
40+
"Adolescent Mental Health Literacy Questionnaire (AMHLQ)",
41+
"Healthcare Access",
42+
"Impact and Professional Importance",
43+
"MAP + Mental Health Interest",
44+
"Multigroup Ethnic Identity Measure–Revised (MEIM-6)",
45+
"Mental Health Attitudes and Help-Seeking",
46+
"Mental Healthcare",
47+
"Perceived Campus Resources and Barriers (PCRB)",
48+
"PEARLS + CRISIS",
49+
"Pediatric Symptom Checklist (PSC)",
50+
"Student Connectedness to Community Scale (SCCS)",
51+
"Social Cognitive Career Theory (SCCT)",
52+
"Social Emotional Health Survey – Higher Education (SEHS-HE)",
53+
"Student Week 1 Assessment",
54+
"Student Week 2 Assessment",
55+
"Student Week 3 Assessment",
56+
],
57+
"YMHA Cohort 3 Teacher Applet IN PERSON": ["Week 1", "Week 2", "Week 3"],
58+
"YMHA Cohort 3 Teacher Applet VIRTUAL": ["Week 1", "Week 2", "Week 3"],
59+
"YMHA Cohort 3 Mentor Applet": [
60+
"Brief Version of the Big Five Personality Inventory (BFI)",
61+
"Check-In",
62+
"Experience",
63+
"Multigroup Ethnic Identity Measure–Revised (MEIM-6)",
64+
"Mentor Experience",
65+
"MinT Mentoring Styles Questionnaire",
66+
"Program-Developed",
67+
"The MAP",
68+
],
69+
"YMHA Cohort 3 Parent Applet": [
70+
"Confidence",
71+
"DSM-5 Cross-Cutting Symptom Measure",
72+
"Demographic",
73+
"Inventory of Family Protective Factors (IFPF)",
74+
"Pediatric Symptom Checklist (PSC)",
75+
"Stigma and Self-Stigma Scales (SASS)",
76+
],
77+
}
78+
return (COMPLETION_COLUMNS,)
79+
80+
1881
@app.cell(hide_code=True)
19-
def _(mo, os):
82+
def _(COMPLETION_COLUMNS, mo, os):
2083
participants_file = mo.ui.file()
2184
data_file = mo.ui.file()
2285
run_button = mo.ui.run_button()
23-
output_dir = mo.ui.text()
24-
filter_to_applet_name = mo.ui.text(label="applet_name")
86+
output_dir = mo.ui.text(label="Output Dir", value="output/")
87+
applet_name = mo.ui.dropdown(
88+
label="Applet Name", options=COMPLETION_COLUMNS.keys()
89+
)
90+
date_format = mo.ui.text(label="Date Format", value="%F %T%.f")
2591
mo.vstack(
2692
[
2793
mo.md(f"Current dir: {os.getcwd()}"),
2894
mo.hstack(
29-
[mo.md("### Upload participants file"), participants_file],
95+
[mo.md("#### Upload participants file"), participants_file],
3096
justify="start",
3197
),
32-
mo.hstack([mo.md("### Upload data file"), data_file], justify="start"),
3398
mo.hstack(
34-
[
35-
mo.md("### Output dir"),
36-
output_dir,
37-
],
38-
justify="start",
39-
),
40-
mo.vstack(
41-
[mo.md("### Filter data by: "), filter_to_applet_name],
42-
justify="start",
99+
[mo.md("#### Upload data file"), data_file], justify="start"
43100
),
101+
output_dir,
102+
applet_name,
103+
date_format,
44104
run_button,
45105
]
46106
)
47107
return (
108+
applet_name,
48109
data_file,
49-
filter_to_applet_name,
110+
date_format,
50111
output_dir,
51112
participants_file,
52113
run_button,
53114
)
54115

55116

56117
@app.cell
57-
def _(OutputGenerationError, cs, mo, participants_file, pl, run_button):
118+
def _(cs, mo, participants_file, pl, run_button):
58119
def load_participants(data) -> pl.DataFrame:
59120
"""Load participants from file path in extra args."""
60121
participants = pl.read_csv(data)
61122
if "site" not in participants.columns:
62-
raise OutputGenerationError(
63-
"'site' column not found in YMHA participants file"
64-
)
123+
raise Exception("'site' column not found in YMHA participants file")
65124
if "secretUserId" not in participants.columns:
66-
raise OutputGenerationError(
125+
raise Exception(
67126
"'secretUserId' column not found in YMHA participants file"
68127
)
69128
return participants.select(
@@ -82,24 +141,22 @@ def load_participants(data) -> pl.DataFrame:
82141

83142

84143
@app.cell
85-
def _(data_file, filter_to_applet_name, mo, pl, run_button):
144+
def _(applet_name, data_file, date_format, mo, pl, run_button):
86145
def load_data(mindlogger_data) -> pl.DataFrame:
87146
"""Load data."""
88147
ml_data = pl.read_csv(
89148
mindlogger_data,
90149
# try_parse_dates=True,
91150
# schema_overrides={"response_start_time": pl.Datetime()},
92151
)
93-
if filter_to_applet_name.value:
94-
ml_data = ml_data.filter(
95-
pl.col("applet_name") == filter_to_applet_name.value
96-
)
152+
if applet_name.value:
153+
ml_data = ml_data.filter(pl.col("applet_name") == applet_name.value)
97154
return (
98155
ml_data.select(
99-
pl.col("activity_name"),
156+
pl.col("activity_name").str.strip_chars(),
100157
pl.col("secret_user_id").alias("secret_id"),
101158
pl.col("response_start_time")
102-
.str.to_datetime("%D %k:%M")
159+
.str.to_datetime(date_format.value)
103160
.dt.date()
104161
.alias("activity_date"),
105162
)
@@ -113,7 +170,7 @@ def load_data(mindlogger_data) -> pl.DataFrame:
113170
return (data,)
114171

115172

116-
@app.cell(hide_code=True)
173+
@app.cell
117174
def _(cs, pl):
118175
def calc_attendance(
119176
df: pl.DataFrame, participants: pl.DataFrame
@@ -141,10 +198,14 @@ def calc_attendance(
141198
((f"site_{part[0]}", f"date_{part[1]}", f"ymha_attendance"), df)
142199
for part, df in part_dfs.items()
143200
]
201+
# Returns list[tuple[tuple[str, ...], pl.DataFrame]].
202+
# The result is a list of outputs. Each output is a (path, dataframe) tuple, where path is itself a tuple of path segments, e.g. ("site_<site>", "date_<date>", "ymha_attendance").
144203

145204

146205
def calc_completion(
147-
df: pl.DataFrame, participants: pl.DataFrame
206+
df: pl.DataFrame,
207+
participants: pl.DataFrame,
208+
completion_columns: list[str],
148209
) -> list[tuple[str, pl.DataFrame]]:
149210
completion = df.drop("activity_date").pivot(
150211
on="activity_name",
@@ -153,32 +214,34 @@ def calc_completion(
153214
maintain_order=True,
154215
sort_columns=True,
155216
)
156-
activity_col_selector = cs.exclude(
157-
[
158-
"secret_id",
159-
"nickname",
160-
"first_name",
161-
"last_name",
162-
"site",
163-
cs.matches("^room$"),
164-
]
165-
)
166217
identifier_col_selector = cs.by_name(
167218
"secret_id",
168219
"nickname",
169220
"first_name",
170221
"last_name",
171222
"site",
172223
) | cs.matches(r"^room$")
224+
activity_col_selector = ~identifier_col_selector
225+
all_completion = participants.join(
226+
completion, on="secret_id", how="left"
227+
).select(
228+
identifier_col_selector,
229+
activity_col_selector.fill_null(False), # noqa: FBT003
230+
)
231+
print(all_completion.columns)
173232
all_completion = (
174-
participants.join(completion, on="secret_id", how="left")
175-
.select(
176-
identifier_col_selector,
177-
activity_col_selector.fill_null(False), # noqa: FBT003
233+
all_completion.with_columns(
234+
complete=pl.concat_list(completion_columns).list.all(),
235+
partially_complete=pl.concat_list(completion_columns).list.any(),
178236
)
179237
.with_columns(
180-
complete=pl.concat_list(activity_col_selector).list.all(),
238+
complete=pl.when(pl.col("complete"))
239+
.then(pl.lit("TRUE"))
240+
.when(pl.col("partially_complete"))
241+
.then(pl.lit("PARTIALLY TRUE"))
242+
.otherwise(pl.lit("FALSE"))
181243
)
244+
.drop("partially_complete")
182245
)
183246
site_completion = all_completion.partition_by("site", as_dict=True)
184247
return (
@@ -190,12 +253,15 @@ def calc_completion(
190253
),
191254
]
192255
+ [
193-
(("site_{part[0]}", "ymha_completion"), df)
256+
((f"site_{part[0]}", "ymha_completion"), df)
194257
for part, df in site_completion.items()
195258
]
196259
+ [
197260
(
198-
("site_{part[0]}", f"ymha_completion_summary"),
261+
(
262+
f"site_{part[0]}",
263+
f"ymha_completion_summary",
264+
),
199265
df.select(identifier_col_selector, "complete"),
200266
)
201267
for part, df in site_completion.items()
@@ -206,6 +272,8 @@ def calc_completion(
206272

207273
@app.cell
208274
def _(
275+
COMPLETION_COLUMNS,
276+
applet_name,
209277
calc_attendance,
210278
calc_completion,
211279
data,
@@ -224,7 +292,11 @@ def _(
224292
if (True,) in _partitioned_activities
225293
else []
226294
) + (
227-
calc_completion(_partitioned_activities[(False,)], participants_data)
295+
calc_completion(
296+
_partitioned_activities[(False,)],
297+
participants_data,
298+
COMPLETION_COLUMNS[applet_name.value],
299+
)
228300
if (False,) in _partitioned_activities
229301
else []
230302
)
@@ -238,30 +310,54 @@ def _(Path, cs, mo, output_dir, outputs, run_button):
238310

239311
_output_dir = Path(output_dir.value)
240312
if not _output_dir.is_dir():
241-
disp = mo.md(
242-
f"OutputDir ({_output_dir}) does not exist or is not a directory. Please update output dir input above."
313+
_output_dir.mkdir(parents=True, exist_ok=True)
314+
print(f"Created output directory: {_output_dir}")
315+
316+
for _path_segments, _df in outputs:
317+
_output_path = (
318+
_output_dir.joinpath(*_path_segments).with_suffix(".xlsx").resolve()
243319
)
244-
else:
245-
for _path_segments, _df in outputs:
246-
_output_path = (
247-
_output_dir.joinpath(*_path_segments)
248-
.with_suffix(".xlsx")
249-
.resolve()
250-
)
251-
_output_path.parent.mkdir(parents=True, exist_ok=True)
252-
_df.write_excel(
253-
_output_path,
254-
conditional_formats={
255-
cs.all(): {
320+
_output_path.parent.mkdir(parents=True, exist_ok=True)
321+
_df.write_excel(
322+
_output_path,
323+
conditional_formats={
324+
cs.all(): [
325+
{
256326
"type": "cell",
257327
"criteria": "==",
258328
"value": False,
259329
"format": {"bg_color": "#FFC7CE"},
260-
}
261-
},
262-
)
263-
print(f"{len(outputs)} outputs written.")
264-
disp = mo.md(f"{len(outputs)} outputs written.")
330+
},
331+
{
332+
"type": "cell",
333+
"criteria": "==",
334+
"value": True,
335+
"format": {"bg_color": "#97bfa2"},
336+
},
337+
{
338+
"type": "text",
339+
"criteria": "begins with",
340+
"value": "FALS",
341+
"format": {"bg_color": "#FFC7C1"},
342+
},
343+
{
344+
"type": "text",
345+
"criteria": "begins with",
346+
"value": "TRU",
347+
"format": {"bg_color": "#97bfa1"},
348+
},
349+
{
350+
"type": "text",
351+
"criteria": "begins with",
352+
"value": "PARTIALLY TRUE",
353+
"format": {"bg_color": "#e6e887"},
354+
},
355+
],
356+
},
357+
)
358+
359+
print(f"{len(outputs)} outputs written.")
360+
disp = mo.md(f"{len(outputs)} outputs written.")
265361
disp
266362
return
267363

0 commit comments

Comments
 (0)