Skip to content

Commit 9bf070f

Browse files
committed
🐛 Correct handling of repeated instances vis-à-vis multiselect
1 parent 73165af commit 9bf070f

3 files changed

Lines changed: 202 additions & 8 deletions

File tree

python_jobs/src/hbnmigration/from_redcap/config.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,7 @@ class export_247(metaclass=_ForCuriousMeta):
142142
"mrn",
143143
"par_rel",
144144
"par_rel_2",
145+
"parent_involvement",
145146
"parent_last_name",
146147
"parent_last_name_1821",
147148
"parent_last_name_2",
@@ -185,6 +186,9 @@ class export_247(metaclass=_ForCuriousMeta):
185186
"zipcode_1821",
186187
"zipcode_2",
187188
"zipcode_2_1821",
189+
"adult_phone",
190+
"permission_audiovideo_1821",
191+
"permission_collab_1821",
188192
]
189193
)
190194
"""Fields to export from REDCap PID 247 for import into REDCap PID 625."""
@@ -319,6 +323,8 @@ class import_curious:
319323
"adult_address",
320324
"adult_apt",
321325
"adult_city",
326+
"adult_email",
327+
"adult_phone",
322328
"adult_state",
323329
"adult_zip",
324330
"aptnumber1",
@@ -349,11 +355,11 @@ class import_curious:
349355
"last_name",
350356
"middlename_y",
351357
"mrn",
358+
"parent_involvement",
352359
"parent_last_name_2",
353360
"parentfirstname",
354361
"parentfirstname_2",
355362
"parentlastname",
356-
"parent_involvement",
357363
"permission_audiovideo",
358364
"permission_audiovideo_participant",
359365
"permission_collab",
@@ -379,6 +385,14 @@ class import_curious:
379385
)
380386
"""Fields to import into REDCap PID 625."""
381387

388+
class duplicate:
389+
"""One-to-many mappings from one DataFrame to another."""
390+
391+
redcap_consent_to_redcap_operations: Final[dict[str, list[str]]] = {
392+
# "adult_email": ["adult_email", "email"],
393+
}
394+
"""Columns to fan out from REDCap PID 247 to REDCap PID 625."""
395+
382396
class rename:
383397
"""Mappings to rename from one DataFrame to another."""
384398

@@ -491,7 +505,6 @@ class redcap_consent_to_redcap_responder_tracking(ColumnRenameMapping):
491505
"parent_second_guardian_"
492506
"consent_complete": "complete_parent_second_guardian_consent",
493507
"dob_1821": "dob",
494-
"adult_email": "email",
495508
"email_1821": "email",
496509
"email_2_1821": "email_2",
497510
"consent1": "first_name",
@@ -515,6 +528,8 @@ class redcap_consent_to_redcap_responder_tracking(ColumnRenameMapping):
515528
"parent_last_name_1821": "parentlastname",
516529
"permission_audiovideo_1113": "permission_audiovideo_participant",
517530
"permission_audiovideo_1417": "permission_audiovideo_participant",
531+
"permission_audiovideo_1821": "permission_audiovideo_participant",
532+
"permission_collab_1821": "permission_collab",
518533
"phone_1821": "phone",
519534
"phone_2_1821": "phone_2",
520535
"prefname_1821": "prefname",

python_jobs/src/hbnmigration/from_redcap/to_redcap.py

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -210,17 +210,34 @@ def format_data_for_redcap_operations(redcap_data: pd.DataFrame) -> pd.DataFrame
210210
df["field_name"] == "record_id", "record"
211211
]
212212

213-
# Step 8: Deduplicate by keeping most recent repeat instance
213+
# Step 8: For repeated instruments,
214+
# keep only the latest instance's full set of rows.
215+
# Then deduplicate non-repeated rows by record + field_name, keeping the first occurrence.
216+
217+
has_instance = df["redcap_repeat_instance"].notna()
218+
repeated_df = df[has_instance].copy()
219+
non_repeated_df = df[~has_instance].copy()
220+
221+
if not repeated_df.empty:
222+
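# Keep only rows from the highest instance per record + instrument.
# NOTE(review): groupby drops NaN keys by default, so rows that have an instance number but no instrument (e.g. repeating events) would presumably be filtered out below; confirm this is intended or pass dropna=False.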
223+
max_instance = repeated_df.groupby(["record", "redcap_repeat_instrument"])[
224+
"redcap_repeat_instance"
225+
].transform("max")
226+
repeated_df = repeated_df[repeated_df["redcap_repeat_instance"] == max_instance]
227+
228+
# For non-repeated rows, deduplicate on record + field_name
229+
non_repeated_df = non_repeated_df.drop_duplicates(
230+
subset=["record", "field_name"], keep="first"
231+
)
232+
214233
df = (
215-
df.sort_values("redcap_repeat_instance", ascending=False, na_position="last")
216-
.drop_duplicates(subset=["record", "field_name"], keep="first")
234+
pd.concat([non_repeated_df, repeated_df], ignore_index=True)
217235
.drop(
218236
columns=["redcap_repeat_instrument", "redcap_repeat_instance"],
219237
errors="ignore",
220238
)
221239
.reset_index(drop=True)
222240
)
223-
224241
# Step 9: Decrement permission_collab values by 1
225242
decrement_mask = df["field_name"] == "permission_collab"
226243
if decrement_mask.any():
@@ -447,7 +464,6 @@ def process_record_for_redcap_operations(record_id: str) -> dict[str, Any]:
447464
"record_id": record_id,
448465
"message": f"No data found in Intake (PID {_SOURCE_PID})",
449466
}
450-
451467
# Extract event name for intake_ready field BEFORE formatting
452468
event_name = None
453469
intake_ready_rows = source_data[source_data["field_name"] == "intake_ready"]
@@ -615,7 +631,6 @@ def main() -> None:
615631
_TARGET_PID,
616632
)
617633
return
618-
619634
# Build mapping of MRN -> source record_id BEFORE formatting
620635
mrn_to_source_record = (
621636
data_operations[data_operations["field_name"] == "mrn"]

python_jobs/src/tests/test_redcap.py

Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1678,3 +1678,167 @@ def test_send_continues_after_single_failure(
16781678
assert len(failures) == 1
16791679
assert failures[0] == "2"
16801680
assert mocks["new_account"].call_count == 3
1681+
1682+
1683+
class TestFormatDataForRedcapOperations:
1684+
"""Tests for format_data_for_redcap_operations."""
1685+
1686+
class TestDeduplication:
1687+
"""Tests for Step 8: repeat instance deduplication logic."""
1688+
1689+
def test_keeps_all_values_from_latest_repeat_instance(self):
1690+
"""Checkbox fields with multiple values in the latest instance are kept."""
1691+
df = pd.DataFrame(
1692+
{
1693+
"record": ["1465"] * 6,
1694+
"field_name": [
1695+
"parent_involvement",
1696+
"parent_involvement",
1697+
"email",
1698+
"parent_involvement",
1699+
"parent_involvement",
1700+
"email",
1701+
],
1702+
"value": [
1703+
"1",
1704+
"2",
1705+
"old@example.com",
1706+
"1",
1707+
"2",
1708+
"new@example.com",
1709+
],
1710+
"redcap_event_name": ["event_1"] * 6,
1711+
"redcap_repeat_instrument": ["adult_consent"] * 6,
1712+
"redcap_repeat_instance": [1, 1, 1, 2, 2, 2],
1713+
}
1714+
)
1715+
1716+
result = format_data_for_redcap_operations(df)
1717+
1718+
# Should keep both parent_involvement rows from instance 2
1719+
pi_rows = result[result["field_name"] == "parent_involvement"]
1720+
assert len(pi_rows) == 2
1721+
assert set(pi_rows["value"]) == {"1", "2"}
1722+
1723+
# Should keep only the email from instance 2
1724+
email_rows = result[result["field_name"] == "email"]
1725+
assert len(email_rows) == 1
1726+
assert email_rows.iloc[0]["value"] == "new@example.com"
1727+
1728+
def test_discards_older_repeat_instances(self):
1729+
"""Only the highest repeat instance per record+instrument is kept."""
1730+
df = pd.DataFrame(
1731+
{
1732+
"record": ["100"] * 5,
1733+
"field_name": [
1734+
"parent_involvement",
1735+
"parent_involvement",
1736+
"parent_involvement",
1737+
"parent_involvement",
1738+
"parent_involvement",
1739+
],
1740+
"value": ["0", "1", "2", "1", "3"],
1741+
"redcap_event_name": ["event_1"] * 5,
1742+
"redcap_repeat_instrument": ["adult_consent"] * 5,
1743+
"redcap_repeat_instance": [1, 1, 1, 2, 2],
1744+
}
1745+
)
1746+
1747+
result = format_data_for_redcap_operations(df)
1748+
1749+
pi_rows = result[result["field_name"] == "parent_involvement"]
1750+
assert len(pi_rows) == 2
1751+
assert set(pi_rows["value"]) == {"1", "3"}
1752+
1753+
def test_non_repeated_rows_deduplicate_by_record_and_field(self):
1754+
"""Non-repeated rows (NaN instance) deduplicate on record + field_name."""
1755+
df = pd.DataFrame(
1756+
{
1757+
"record": ["200", "200"],
1758+
"field_name": ["email", "email"],
1759+
"value": ["first@example.com", "second@example.com"],
1760+
"redcap_event_name": ["event_1", "event_1"],
1761+
"redcap_repeat_instrument": [None, None],
1762+
"redcap_repeat_instance": [None, None],
1763+
}
1764+
)
1765+
1766+
result = format_data_for_redcap_operations(df)
1767+
1768+
email_rows = result[result["field_name"] == "email"]
1769+
assert len(email_rows) == 1
1770+
1771+
def test_mixed_repeated_and_non_repeated(self):
1772+
"""Records with both repeated and non-repeated fields are handled."""
1773+
df = pd.DataFrame(
1774+
{
1775+
"record": ["300"] * 5,
1776+
"field_name": [
1777+
"dob",
1778+
"parent_involvement",
1779+
"parent_involvement",
1780+
"parent_involvement",
1781+
"email",
1782+
],
1783+
"value": ["2010-01-01", "0", "1", "2", "test@example.com"],
1784+
"redcap_event_name": ["event_1"] * 5,
1785+
"redcap_repeat_instrument": [
1786+
None,
1787+
"adult_consent",
1788+
"adult_consent",
1789+
"adult_consent",
1790+
None,
1791+
],
1792+
"redcap_repeat_instance": [None, 1, 1, 1, None],
1793+
}
1794+
)
1795+
1796+
result = format_data_for_redcap_operations(df)
1797+
1798+
# Only one instance exists, so all 3 checkbox values are kept
1799+
pi_rows = result[result["field_name"] == "parent_involvement"]
1800+
assert len(pi_rows) == 3
1801+
1802+
# Non-repeated fields kept as single rows
1803+
assert len(result[result["field_name"] == "dob"]) == 1
1804+
assert len(result[result["field_name"] == "email"]) == 1
1805+
1806+
class TestFieldRenaming:
1807+
"""Tests for field name renaming and the follow-on `permission_collab` decrement."""
1808+
1809+
def test_permission_audiovideo_1821_renamed(self):
1810+
"""Test that `permission_audiovideo_1821` is renamed to `permission_audiovideo_participant`."""
1811+
df = pd.DataFrame(
1812+
{
1813+
"record": ["400"],
1814+
"field_name": ["permission_audiovideo_1821"],
1815+
"value": ["1"],
1816+
"redcap_event_name": ["event_1"],
1817+
"redcap_repeat_instrument": [None],
1818+
"redcap_repeat_instance": [None],
1819+
}
1820+
)
1821+
1822+
result = format_data_for_redcap_operations(df)
1823+
1824+
assert "permission_audiovideo_1821" not in result["field_name"].values
1825+
assert "permission_audiovideo_participant" in result["field_name"].values
1826+
1827+
def test_permission_collab_1821_renamed_and_decremented(self):
1828+
"""Test that `permission_collab_1821` is renamed and decremented."""
1829+
df = pd.DataFrame(
1830+
{
1831+
"record": ["500"],
1832+
"field_name": ["permission_collab_1821"],
1833+
"value": ["1"],
1834+
"redcap_event_name": ["event_1"],
1835+
"redcap_repeat_instrument": [None],
1836+
"redcap_repeat_instance": [None],
1837+
}
1838+
)
1839+
1840+
result = format_data_for_redcap_operations(df)
1841+
1842+
collab_rows = result[result["field_name"] == "permission_collab"]
1843+
assert len(collab_rows) == 1
1844+
assert collab_rows.iloc[0]["value"] == "0" # decremented from 1

0 commit comments

Comments
 (0)