🐛 Fix column misalignment in Ripple-to-REDCap

shnizzedy · shnizzedy · commit a01f97d1149b · 2026-04-22T19:09:10.000-04:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## 1.10.6
+
+### Fixed
+
+- Column misalignment in Ripple-to-REDCap.
+
 ## 1.10.5
 
 ### Fixed
diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-1.10.5
+1.10.6
diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "hbnmigration",
-  "version": "1.10.5",
+  "version": "1.10.6",
   "private": true,
   "description": "HBN data migration monitoring infrastructure with Python and Node.js services",
   "workspaces": [
diff --git a/python_jobs/VERSION b/python_jobs/VERSION
@@ -1 +1 @@
-1.10.5
+1.10.6
diff --git a/python_jobs/src/hbnmigration/from_ripple/to_redcap.py b/python_jobs/src/hbnmigration/from_ripple/to_redcap.py
@@ -191,7 +191,7 @@ def set_redcap_columns(
         lambda row: (
             row[first_match[row.name].replace(".contactType", ".information")]
             if is_email.loc[row.name].any()
-            else pd.NA
+            else float("nan")
         ),
         axis=1,
     )
@@ -258,26 +258,26 @@ def get_redcap_subjects_to_update(
 
 def prepare_redcap_data(df: pd.DataFrame, cache: DataCache | None = None) -> None:
     """Prepare Ripple API returned data to be imported into REDCap."""
-    copy_selected_redcap_df = set_redcap_columns(df)
-
-    # Add last_modified column for cache keying
+    # Inject lastModified into the source df BEFORE transforming,
+    # so it flows through set_redcap_columns naturally.
     if "lastModified" not in df.columns:
-        last_modified = extract_last_modified(df)
-        copy_selected_redcap_df = copy_selected_redcap_df.assign(
-            lastModified=last_modified.values
-        )
+        df = df.copy()
+        df["lastModified"] = extract_last_modified(df)
+
+    copy_selected_redcap_df = set_redcap_columns(
+        df, columns_to_keep=["mrn", "email_consent", "lastModified"]
+    )
 
     # Filter out records already processed by cache
     if cache:
-        # Create cache keys for each record
-        cache_keys = []
-        for _, row in copy_selected_redcap_df.iterrows():
-            cache_key = create_ripple_record_cache_key(
+        cache_keys = [
+            create_ripple_record_cache_key(
                 str(row["mrn"]),
                 str(row.get("email_consent", "")),
                 str(row.get("lastModified", "")),
             )
-            cache_keys.append(cache_key)
+            for _, row in copy_selected_redcap_df.iterrows()
+        ]
 
         copy_selected_redcap_df["cache_key"] = cache_keys
 
@@ -296,18 +296,22 @@ def prepare_redcap_data(df: pd.DataFrame, cache: DataCache | None = None) -> Non
         logger.info("No new records to prepare for REDCap")
         return
 
+    # Drop helper columns before downstream processing
+    cols_to_drop = [
+        c for c in ["cache_key", "lastModified"] if c in copy_selected_redcap_df.columns
+    ]
+    working_df = copy_selected_redcap_df.drop(columns=cols_to_drop)
+
     # Split into update and new
-    to_update, new_subjects = get_redcap_subjects_to_update(
-        copy_selected_redcap_df.drop(columns=["cache_key"], errors="ignore")
-    )
+    to_update, new_subjects = get_redcap_subjects_to_update(working_df)
 
     # Save the new dataframes to CSV files
     if not to_update.empty:
         to_update.to_csv(redcap_variables.redcap_update_file, index=False)
     if not new_subjects.empty:
         new_subjects.to_csv(redcap_variables.redcap_import_file, index=False)
 
-    # Mark as processed in cache (use the original df with cache_key)
+    # Mark as processed in cache
     if cache and "cache_key" in copy_selected_redcap_df.columns:
         processed_keys = copy_selected_redcap_df["cache_key"].tolist()
         cache.bulk_mark_processed(
diff --git a/python_jobs/src/tests/test_ripple.py b/python_jobs/src/tests/test_ripple.py

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -1,6 +1,6 @@` |
| `1` | `1` | `{` |
| `2` | `2` | `"name": "hbnmigration",` |
| `3` |  | `- "version": "1.10.5",` |
|  | `3` | `+ "version": "1.10.6",` |
| `4` | `4` | `"private": true,` |
| `5` | `5` | `"description": "HBN data migration monitoring infrastructure with Python and Node.js services",` |
| `6` | `6` | `"workspaces": [` |