Skip to content

Commit a01f97d

Browse files
committed
🐛 Fix column misalignment in Ripple-to-REDCap
1 parent adc57df commit a01f97d

6 files changed

Lines changed: 502 additions & 27 deletions

File tree

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

8+
## 1.10.6
9+
10+
### Fixed
11+
12+
- Column misalignment in Ripple-to-REDCap.
13+
814
## 1.10.5
915

1016
### Fixed

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
1.10.5
1+
1.10.6

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "hbnmigration",
3-
"version": "1.10.5",
3+
"version": "1.10.6",
44
"private": true,
55
"description": "HBN data migration monitoring infrastructure with Python and Node.js services",
66
"workspaces": [

python_jobs/VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
1.10.5
1+
1.10.6

python_jobs/src/hbnmigration/from_ripple/to_redcap.py

Lines changed: 21 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -191,7 +191,7 @@ def set_redcap_columns(
191191
lambda row: (
192192
row[first_match[row.name].replace(".contactType", ".information")]
193193
if is_email.loc[row.name].any()
194-
else pd.NA
194+
else float("nan")
195195
),
196196
axis=1,
197197
)
@@ -258,26 +258,26 @@ def get_redcap_subjects_to_update(
258258

259259
def prepare_redcap_data(df: pd.DataFrame, cache: DataCache | None = None) -> None:
260260
"""Prepare Ripple API returned data to be imported into REDCap."""
261-
copy_selected_redcap_df = set_redcap_columns(df)
262-
263-
# Add last_modified column for cache keying
261+
# Inject lastModified into the source df BEFORE transforming,
262+
# so it flows through set_redcap_columns naturally.
264263
if "lastModified" not in df.columns:
265-
last_modified = extract_last_modified(df)
266-
copy_selected_redcap_df = copy_selected_redcap_df.assign(
267-
lastModified=last_modified.values
268-
)
264+
df = df.copy()
265+
df["lastModified"] = extract_last_modified(df)
266+
267+
copy_selected_redcap_df = set_redcap_columns(
268+
df, columns_to_keep=["mrn", "email_consent", "lastModified"]
269+
)
269270

270271
# Filter out records already processed by cache
271272
if cache:
272-
# Create cache keys for each record
273-
cache_keys = []
274-
for _, row in copy_selected_redcap_df.iterrows():
275-
cache_key = create_ripple_record_cache_key(
273+
cache_keys = [
274+
create_ripple_record_cache_key(
276275
str(row["mrn"]),
277276
str(row.get("email_consent", "")),
278277
str(row.get("lastModified", "")),
279278
)
280-
cache_keys.append(cache_key)
279+
for _, row in copy_selected_redcap_df.iterrows()
280+
]
281281

282282
copy_selected_redcap_df["cache_key"] = cache_keys
283283

@@ -296,18 +296,22 @@ def prepare_redcap_data(df: pd.DataFrame, cache: DataCache | None = None) -> Non
296296
logger.info("No new records to prepare for REDCap")
297297
return
298298

299+
# Drop helper columns before downstream processing
300+
cols_to_drop = [
301+
c for c in ["cache_key", "lastModified"] if c in copy_selected_redcap_df.columns
302+
]
303+
working_df = copy_selected_redcap_df.drop(columns=cols_to_drop)
304+
299305
# Split into update and new
300-
to_update, new_subjects = get_redcap_subjects_to_update(
301-
copy_selected_redcap_df.drop(columns=["cache_key"], errors="ignore")
302-
)
306+
to_update, new_subjects = get_redcap_subjects_to_update(working_df)
303307

304308
# Save the new dataframes to CSV files
305309
if not to_update.empty:
306310
to_update.to_csv(redcap_variables.redcap_update_file, index=False)
307311
if not new_subjects.empty:
308312
new_subjects.to_csv(redcap_variables.redcap_import_file, index=False)
309313

310-
# Mark as processed in cache (use the original df with cache_key)
314+
# Mark as processed in cache
311315
if cache and "cache_key" in copy_selected_redcap_df.columns:
312316
processed_keys = copy_selected_redcap_df["cache_key"].tolist()
313317
cache.bulk_mark_processed(

0 commit comments

Comments
 (0)