@@ -191,7 +191,7 @@ def set_redcap_columns(
191191 lambda row : (
192192 row [first_match [row .name ].replace (".contactType" , ".information" )]
193193 if is_email .loc [row .name ].any ()
194- else pd . NA
194+ else float ( "nan" )
195195 ),
196196 axis = 1 ,
197197 )
@@ -258,26 +258,26 @@ def get_redcap_subjects_to_update(
258258
259259def prepare_redcap_data (df : pd .DataFrame , cache : DataCache | None = None ) -> None :
260260 """Prepare Ripple API returned data to be imported into REDCap."""
261- copy_selected_redcap_df = set_redcap_columns (df )
262-
263- # Add last_modified column for cache keying
261+ # Inject lastModified into the source df BEFORE transforming,
262+ # so it flows through set_redcap_columns naturally.
264263 if "lastModified" not in df .columns :
265- last_modified = extract_last_modified (df )
266- copy_selected_redcap_df = copy_selected_redcap_df .assign (
267- lastModified = last_modified .values
268- )
264+ df = df .copy ()
265+ df ["lastModified" ] = extract_last_modified (df )
266+
267+ copy_selected_redcap_df = set_redcap_columns (
268+ df , columns_to_keep = ["mrn" , "email_consent" , "lastModified" ]
269+ )
269270
270271 # Filter out records already processed by cache
271272 if cache :
272- # Create cache keys for each record
273- cache_keys = []
274- for _ , row in copy_selected_redcap_df .iterrows ():
275- cache_key = create_ripple_record_cache_key (
273+ cache_keys = [
274+ create_ripple_record_cache_key (
276275 str (row ["mrn" ]),
277276 str (row .get ("email_consent" , "" )),
278277 str (row .get ("lastModified" , "" )),
279278 )
280- cache_keys .append (cache_key )
279+ for _ , row in copy_selected_redcap_df .iterrows ()
280+ ]
281281
282282 copy_selected_redcap_df ["cache_key" ] = cache_keys
283283
@@ -296,18 +296,22 @@ def prepare_redcap_data(df: pd.DataFrame, cache: DataCache | None = None) -> Non
296296 logger .info ("No new records to prepare for REDCap" )
297297 return
298298
299+ # Drop helper columns before downstream processing
300+ cols_to_drop = [
301+ c for c in ["cache_key" , "lastModified" ] if c in copy_selected_redcap_df .columns
302+ ]
303+ working_df = copy_selected_redcap_df .drop (columns = cols_to_drop )
304+
299305 # Split into update and new
300- to_update , new_subjects = get_redcap_subjects_to_update (
301- copy_selected_redcap_df .drop (columns = ["cache_key" ], errors = "ignore" )
302- )
306+ to_update , new_subjects = get_redcap_subjects_to_update (working_df )
303307
304308 # Save the new dataframes to CSV files
305309 if not to_update .empty :
306310 to_update .to_csv (redcap_variables .redcap_update_file , index = False )
307311 if not new_subjects .empty :
308312 new_subjects .to_csv (redcap_variables .redcap_import_file , index = False )
309313
310- # Mark as processed in cache (use the original df with cache_key)
314+ # Mark as processed in cache
311315 if cache and "cache_key" in copy_selected_redcap_df .columns :
312316 processed_keys = copy_selected_redcap_df ["cache_key" ].tolist ()
313317 cache .bulk_mark_processed (
0 commit comments