import pandas as pd
import os
import requests
import sys
from io import StringIO
from datetime import datetime, timedelta


'''
This script creates a sleep log file for the group analysis of accelerometer data.
It reads the individual participant files by first matching lab IDs with study (BOOST) IDs, then builds the path to each participant's file on the RDSS.
It aggregates the sleep data by participant and session (with an _accel suffix) and saves the result to a CSV file.
'''

token = 'DE4E2DB72778DACA9B8848574107D2F5'
INT_DIR = '/Volumes/vosslabhpc/Projects/BOOST/InterventionStudy/3-experiment/data/act-int-test'
OBS_DIR = '/Volumes/vosslabhpc/Projects/BOOST/ObservationalStudy/3-experiment/data/act-obs-test'
rdss_dir = '/Volumes/VossLab/Repositories/Accelerometer_Data/Sleep'
def compare_ids(rdss_dir, token, daysago=350):
    """
    Pulls the list of files from the RDSS.
    Pulls the ID report from REDCap.
    Compares IDs and returns a dictionary with two keys:
    - 'matches': normal matches mapping boost_id to a list of dicts (filename, labID, date)
    - 'duplicates': a list of dictionaries, each with lab_id, boost_id, filenames (list), and dates (list)
    """
    # Retrieve the REDCap report and the duplicate rows found in it
    report, report_duplicates = _return_report(token)
    # Retrieve the full RDSS file list and the duplicate files merged with the report duplicates
    rdss, file_duplicates = _rdss_file_list(report_duplicates, daysago=daysago)

    # Initialize the result dictionary for normal (non-duplicate) matches
    result = {}

    # Iterate over the rows in the cleaned REDCap report
    for _, row in report.iterrows():
        boost_id = str(row['boost_id'])
        lab_id = str(row['lab_id'])

        # Find matching files in the RDSS list
        rdss_matches = rdss[rdss['ID'] == lab_id]
        if not rdss_matches.empty:
            if boost_id not in result:
                result[boost_id] = []
            for _, match_row in rdss_matches.iterrows():
                result[boost_id].append({
                    'filename': match_row['filename'],
                    'labID': lab_id,
                    'date': match_row['Date']
                })

    # Process duplicates into the desired structure.
    duplicates_dict = []
    if not file_duplicates.empty:
        # Group by lab_id and boost_id; each group represents one duplicate combination.
        grouped = file_duplicates.groupby(['lab_id', 'boost_id'])
        for (lab_id, boost_id), group in grouped:
            duplicates_dict.append({
                'lab_id': lab_id,
                'boost_id': boost_id,
                'filenames': group['filename'].tolist(),
                'dates': group['Date'].tolist()
            })
    else:
        print("Found no duplicates.")

    return {'matches': result, 'duplicates': duplicates_dict}

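# Illustrative shape of the value returned by compare_ids(); the IDs, filenames and
# dates below are hypothetical and only document the structure:
#
#     {'matches': {'8001': [{'filename': '1288_4-26-2025_Sleep.csv',
#                            'labID': '1288',
#                            'date': Timestamp('2025-04-26 00:00:00')}]},
#      'duplicates': [{'lab_id': '1301', 'boost_id': '8002',
#                      'filenames': ['1301_5-01-2025_Sleep.csv', '1301_5-02-2025_Sleep.csv'],
#                      'dates': [Timestamp('2025-05-01 00:00:00'), Timestamp('2025-05-02 00:00:00')]}]}
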
def _return_report(token):
    """
    Pulls the ID report from REDCap via the API.
    Reads the report into a DataFrame.
    Checks for boost_ids that are associated with multiple lab_ids, prints a warning,
    and removes these rows from the DataFrame.
    Separates duplicate rows (identical across all columns) from the cleaned data.

    Returns:
        df_cleaned: DataFrame with duplicates removed and problematic boost_ids excluded
        duplicate_rows: DataFrame of the duplicate rows
    """
    url = 'https://redcap.icts.uiowa.edu/redcap/api/'
    data = {
        'token': token,
        'content': 'report',
        'report_id': 43327,
        'format': 'csv'
    }
    r = requests.post(url, data=data)
    if r.status_code != 200:
        print(f"Error! Status code is {r.status_code}")
        sys.exit(1)

    df = pd.read_csv(StringIO(r.text))

    # Identify boost_ids associated with multiple lab_ids.
    boost_id_counts = df.groupby('boost_id')['lab_id'].nunique()
    problematic_boost_ids = boost_id_counts[boost_id_counts > 1].index.tolist()

    if problematic_boost_ids:
        print(f"Found boost_id(s) with multiple lab_ids: {', '.join(map(str, problematic_boost_ids))}. "
              "These entries will be removed from processing.")
        df = df[~df['boost_id'].isin(problematic_boost_ids)]

    # Identify and separate rows that are duplicated across all columns.
    duplicate_rows = df[df.duplicated(keep=False)]
    df_cleaned = df.drop_duplicates(keep=False)

    if not duplicate_rows.empty:
        print(f"Duplicate rows found:\n{duplicate_rows}")

    return df_cleaned, duplicate_rows

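# The REDCap report (report_id 43327) is assumed to include at least the two ID
# columns used above; a hypothetical cleaned report could look like:
#
#     boost_id  lab_id
#     8001      1288
#     7012      1301
#
# (the values shown are made up for illustration)
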
def _rdss_file_list(duplicates, daysago=None):
    """
    Extracts the lab ID and the date from filenames ending in .csv in the rdss_dir
    folder (handling both the old space-delimited and the new underscore-delimited
    filename formats) and stores them in a DataFrame.

    Also merges the file list with duplicate report entries based on lab_id.

    Returns:
        df: DataFrame of all file entries
        merged_df: DataFrame of file entries that match duplicate lab_ids from the report
    """
    extracted_data = []

    # Loop through all files in the rdss_dir folder.
    for filename in os.listdir(rdss_dir):
        if filename.endswith('.csv'):
            try:
                # Handle both old and new filename formats
                if '_' in filename:
                    # New format: 1288_4-26-2025_Sleep.csv
                    parts = filename.replace('.csv', '').split('_')
                    if len(parts) >= 3:
                        base_name = parts[0]  # lab_id
                        date_part = parts[1]  # date
                        extracted_data.append({'ID': base_name, 'Date': date_part, 'filename': filename})
                    else:
                        print(f"Skipping file with unexpected format: {filename}")
                else:
                    # Old format: lab_id before the first space, date in parentheses
                    base_name = filename.split(' ')[0]
                    date_part = filename.split('(')[1].split(')')[0]
                    extracted_data.append({'ID': base_name, 'Date': date_part, 'filename': filename})
            except IndexError:
                print(f"Skipping file with unexpected format: {filename}")

    df = pd.DataFrame(extracted_data)

    if not df.empty:
        df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

        if daysago:
            cutoff_date = datetime.today() - timedelta(days=daysago)
            df = df[df['Date'] >= cutoff_date]  # Keep only files from the last `daysago` days
        else:
            df = df[df['Date'] >= '2024-08-05']  # Filter out rows before the threshold date

    # Filter the file list to only include rows where ID is in the duplicate report (if any)
    if not duplicates.empty:
        matched_df = df[df['ID'].isin(duplicates['lab_id'])]
        # Merge with the duplicates to bring in boost_id information from the report
        merged_df = matched_df.merge(duplicates, left_on='ID', right_on='lab_id')
    else:
        merged_df = pd.DataFrame()

    return df, merged_df

matches = compare_ids(rdss_dir, token, daysago=None)

# Print the matches and duplicates for verification
print("Matches:")
for boost_id, files in matches['matches'].items():
    print(f"Boost ID: {boost_id}")
    for file_info in files:
        print(f"  - {file_info['filename']} (Lab ID: {file_info['labID']}, Date: {file_info['date']})")

print("\nDuplicates:")
for dup in matches['duplicates']:
    print(f"Lab ID: {dup['lab_id']}, Boost ID: {dup['boost_id']}")
    print(f"  Filenames: {', '.join(dup['filenames'])}")
    print(f"  Dates: {', '.join(map(str, dup['dates']))}")

'''
Below we create the sessions: if the same subject ID has multiple files, a session is created for each file, ordered by date.
The result is stored as a DataFrame with the columns:
    'subject_id', 'session_id', 'filename', 'date'
where subject_id is 'sub-<subject_id>', session_id is 'ses-<session_number>', filename is the file name with full path, and date is the date of the file.
A hypothetical example of the resulting rows is sketched in the comment below.
'''
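
# Hypothetical example of the sessions DataFrame produced below (IDs, dates and
# filenames are made up; filenames are joined onto rdss_dir):
#
#     subject_id  session_id  filename                                               date
#     sub-8001    ses-1       .../Accelerometer_Data/Sleep/1288_4-26-2025_Sleep.csv  2025-04-26
#     sub-8001    ses-2       .../Accelerometer_Data/Sleep/1288_6-10-2025_Sleep.csv  2025-06-10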
def create_sessions(matches):
    """
    Create sessions from the matches dictionary.

    Args:
        matches (dict): Dictionary of matches with boost_id as keys and lists of file info as values.

    Returns:
        pd.DataFrame: DataFrame with columns 'subject_id', 'session_id', 'filename', 'date'.
    """
    sessions = []

    for boost_id, files in matches['matches'].items():
        subject_id = f'sub-{boost_id}'
        # Order the files by date so that session numbers follow chronological order
        files_sorted = sorted(files, key=lambda f: f['date'])
        for i, file_info in enumerate(files_sorted):
            session_id = f'ses-{i + 1}'  # Session numbering starts at 1
            sessions.append({
                'subject_id': subject_id,
                'session_id': session_id,
                'filename': os.path.join(rdss_dir, file_info['filename']),
                'date': file_info['date']
            })

    return pd.DataFrame(sessions)

# Create sessions from the matches
sessions_df = create_sessions(matches)
# Print the sessions DataFrame for verification
print("\nSessions DataFrame:")
print(sessions_df)


'''
Iterate through the files in the sessions file list and build the sleep log file. The input files have the format:

Sleep Algorithm,In Bed Date,In Bed Time,Out Bed Date,Out Bed Time,Onset Date,Onset Time,Latency,Total Counts,Efficiency,Total Minutes in Bed,Total Sleep Time (TST),Wake After Sleep Onset (WASO),Number of Awakenings,Average Awakening Length,Movement Index,Fragmentation Index,Sleep Fragmentation Index
Cole-Kripke,4/12/2023,12:00 AM,4/12/2023,6:00 AM,4/12/2023,12:00 AM,0,34862,84.44,360,304,56,10,5.6,8.889,10,18.889
Cole-Kripke,4/12/2023,11:09 PM,4/13/2023,7:00 AM,4/12/2023,11:19 PM,10,54263,87.05,471,410,51,13,3.92,12.951,7.692,20.643

They need to be converted to the following format:

ID   D1_date     D1_wakeup  D1_inbed  D1_nap_start  D1_nap_end  D1_nonwear1_off  D1_nonwear1_on  D2_date     …
123  2015-03-30  09:00:00   22:00:00  11:15:00      11:45:00    13:35:00         14:10:00        31/03/2015  …
567  2015-04-20  08:30:00   23:15:00

where ID should be sub-{subject_id}_ses-{session_id}_accel, D1_date is the first date of the file, D1_wakeup is the first wakeup time, and D1_inbed is the second in-bed time (skipping the first in-bed time). Naps and non-wear periods are skipped for now.
This is repeated for all dates in the file.
A worked example based on the sample rows above is sketched in the comment below.
'''
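
# Worked example (using the two sample Cole-Kripke rows above): both rows share the
# In Bed Date 4/12/2023, and create_sleep_log() below keeps only the last row per
# date, so a file containing just those two rows would yield roughly:
#
#     ID: sub-<subject_id>_ses-<n>_accel
#     D1_date:   2023-04-12
#     D1_inbed:  23:09:00   (from the "11:09 PM" In Bed Time)
#     D1_wakeup: 07:00:00   (from the "7:00 AM" Out Bed Time)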

def create_sleep_log(sessions_df):
    """
    Create a sleep log file from the sessions DataFrame.

    Args:
        sessions_df (pd.DataFrame): DataFrame with columns 'subject_id', 'session_id', 'filename', 'date'.

    Returns:
        pd.DataFrame: DataFrame with sleep log entries in wide format.
    """
    # List to store the final entries (one per subject/session)
    final_entries = []

    for _, row in sessions_df.iterrows():
        subject_id = row['subject_id']
        session_id = row['session_id']
        filename = row['filename']

        try:
            # Skip the 5 header rows and drop the unnamed first column, if present
            sleep_data = pd.read_csv(filename, skiprows=5, usecols=lambda x: x != 'Unnamed: 0')
            if sleep_data.empty:
                print(f"No data found in file {filename}. Skipping.")
                continue

            # Create a base entry for this subject/session
            entry = {'ID': f'{subject_id}_{session_id}_accel'}

            # Convert In Bed Date to datetime for sorting
            sleep_data['In_Bed_Date_DT'] = pd.to_datetime(sleep_data['In Bed Date'], format='%m/%d/%Y', errors='coerce')
            sleep_data = sleep_data.sort_values('In_Bed_Date_DT')  # Sort by date

            # Group by date and keep only the last entry for each date
            date_groups = {}
            for _, data in sleep_data.iterrows():
                in_bed_date_str = data['In Bed Date']
                date_key = pd.to_datetime(in_bed_date_str, format='%m/%d/%Y').strftime('%Y-%m-%d')
                date_groups[date_key] = data

            # Process each day's data
            for day_num, (date_key, data) in enumerate(sorted(date_groups.items()), 1):
                day_prefix = f'D{day_num}_'

                # Extract date and time information
                in_bed_date = pd.to_datetime(data['In Bed Date'], format='%m/%d/%Y').strftime('%Y-%m-%d')
                in_bed_time = pd.to_datetime(data['In Bed Time'], format='%I:%M %p').strftime('%H:%M:%S')
                out_bed_date = pd.to_datetime(data['Out Bed Date'], format='%m/%d/%Y').strftime('%Y-%m-%d')
                out_bed_time = pd.to_datetime(data['Out Bed Time'], format='%I:%M %p').strftime('%H:%M:%S')

                # Store only date and time information for this day
                entry[f'{day_prefix}date'] = in_bed_date
                entry[f'{day_prefix}inbed'] = in_bed_time
                entry[f'{day_prefix}wakeup'] = out_bed_time

            final_entries.append(entry)

        except Exception as e:
            print(f"Error processing file {filename}: {e}")

    return pd.DataFrame(final_entries)


# Create the sleep log DataFrame
sleep_log_df = create_sleep_log(sessions_df)
# Print the sleep log DataFrame for verification
print("\nSleep Log DataFrame:")
print(sleep_log_df)


'''
Split the sleep log DataFrame into two parts, intervention and observational:
subject IDs starting with sub-7 belong to the observational study, IDs starting with sub-6 are excluded, and all other IDs belong to the intervention study.
Then clean up the DataFrames by removing any unused extra columns.
A hypothetical example of how IDs are routed is sketched in the comment below.
'''
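
# Hypothetical examples of how IDs are routed (IDs made up for illustration):
#   sub-7012_ses-1_accel -> observational
#   sub-6005_ses-1_accel -> excluded from both outputs
#   sub-8001_ses-1_accel -> intervention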

def split_and_clean_dataframes(sleep_log_df):
    """
    Split the sleep log DataFrame into intervention and observational study DataFrames,
    and clean up the results.

    Args:
        sleep_log_df (pd.DataFrame): DataFrame with sleep log entries.

    Returns:
        tuple: Two DataFrames, one for the intervention study and one for the observational study.
    """
    # Observational study: subject IDs starting with sub-7
    obs_df = sleep_log_df[sleep_log_df['ID'].str.startswith('sub-7')]
    # Intervention study: all subjects whose IDs start with neither sub-7 nor sub-6
    int_df = sleep_log_df[~sleep_log_df['ID'].str.startswith('sub-7')]
    int_df = int_df[~int_df['ID'].str.startswith('sub-6')]
    # Reset the indices; no extra columns need to be dropped at this point
    int_df = int_df.reset_index(drop=True)
    obs_df = obs_df.reset_index(drop=True)

    return int_df, obs_df

# Split and clean the DataFrames
int_df, obs_df = split_and_clean_dataframes(sleep_log_df)
# Print the intervention and observational DataFrames for verification
print("\nIntervention Study DataFrame:")
print(int_df.head())
print("\nObservational Study DataFrame:")
print(obs_df.head())

# Save the DataFrames to CSV files
int_df.to_csv(os.path.join(INT_DIR, 'sleep_log_intervention.csv'), index=False)
obs_df.to_csv(os.path.join(OBS_DIR, 'sleep_log_observational.csv'), index=False)