
Commit e1f9be6

pushing to pull
1 parent cf383d9 commit e1f9be6

File tree

11 files changed: +519 -11 lines

998_4-12-2023_Sleep.csv

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
Sleep Report for: C:\Users\hbclab.IOWA\Documents\ActiGraph\ActiLife\Downloads\998 (2023-04-12)60sec.agd
Subject Name: 998
Serial Number: TAS1H05190290
Sleep Algorithm: Cole-Kripke

Sleep Algorithm,In Bed Date,In Bed Time,Out Bed Date,Out Bed Time,Onset Date,Onset Time,Latency,Total Counts,Efficiency,Total Minutes in Bed,Total Sleep Time (TST),Wake After Sleep Onset (WASO),Number of Awakenings,Average Awakening Length,Movement Index,Fragmentation Index,Sleep Fragmentation Index
Cole-Kripke,4/12/2023,12:00 AM,4/12/2023,6:00 AM,4/12/2023,12:00 AM,0,34862,84.44,360,304,56,10,5.6,8.889,10,18.889
Cole-Kripke,4/12/2023,11:09 PM,4/13/2023,7:00 AM,4/12/2023,11:19 PM,10,54263,87.05,471,410,51,13,3.92,12.951,7.692,20.643
Cole-Kripke,4/13/2023,10:44 PM,4/14/2023,6:44 AM,4/13/2023,11:05 PM,21,84929,75.21,480,361,98,21,4.67,20.833,0,20.833
Cole-Kripke,4/14/2023,10:35 PM,4/15/2023,6:10 AM,4/14/2023,10:45 PM,10,69046,83.74,455,381,64,13,4.92,17.363,7.692,25.055
Cole-Kripke,4/15/2023,10:30 PM,4/16/2023,6:55 AM,4/15/2023,10:36 PM,6,93473,75.84,505,383,116,22,5.27,20.99,13.636,34.626
Cole-Kripke,4/16/2023,10:53 PM,4/17/2023,6:24 AM,4/16/2023,10:59 PM,6,38964,83.59,451,377,68,19,3.58,12.86,31.579,44.439
Cole-Kripke,4/17/2023,10:58 PM,4/18/2023,6:15 AM,4/17/2023,11:06 PM,8,59953,77.12,437,337,92,20,4.6,24.256,15,39.256
Cole-Kripke,4/18/2023,11:01 PM,4/19/2023,7:04 AM,4/18/2023,11:07 PM,6,73641,83.44,483,403,74,15,4.93,14.907,6.667,21.574
Cole-Kripke,4/19/2023,11:05 PM,4/20/2023,12:16 AM,4/19/2023,11:14 PM,9,16881,83.1,71,59,3,3,1,18.31,0,18.31
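Note: these ActiGraph sleep exports are what code/tests/sleep/log_creation.py (added below) parses; the first five lines are a preamble and the sixth line is the CSV header. A minimal pandas sketch, illustrative only and not part of the commit, with the filename and date formats taken from this example file:

import pandas as pd

# Skip the 4 preamble lines plus the blank line so the CSV header becomes the column row
sleep = pd.read_csv("998_4-12-2023_Sleep.csv", skiprows=5)

# Combine the date/time columns into datetimes for later aggregation
sleep["in_bed"] = pd.to_datetime(sleep["In Bed Date"] + " " + sleep["In Bed Time"],
                                 format="%m/%d/%Y %I:%M %p")
sleep["out_bed"] = pd.to_datetime(sleep["Out Bed Date"] + " " + sleep["Out Bed Time"],
                                  format="%m/%d/%Y %I:%M %p")
print(sleep[["in_bed", "out_bed", "Efficiency", "Total Sleep Time (TST)"]])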

code/core/acc_new.R

Lines changed: 2 additions & 3 deletions
@@ -1,4 +1,3 @@
-
 #!/usr/bin/env Rscript

 # Usage: Rscript new_gg.R --project_dir "/Shared/vosslabhpc/Projects/BOOST/InterventionStudy/3-experiment/data/act-int-test/" --deriv_dir "derivatives/GGIR-3.2.6-test/"
@@ -104,9 +103,9 @@ main <- function() {
     # ==== Part 2: Non-wear detection ====
     ignorenonwear = TRUE,

-    # ==== Part 3: Sleep detection (optional if using external file) ====
+    # ==== Part 3: Sleep detection ====
     # Uncomment the below if using external sleep log:
-    # loglocation = "/Shared/vosslabhpc/Projects/BOOST/InterventionStudy/3-experiment/data/act-int-test/sleep.csv",
+    # loglocation = "/mnt/nfs/lss/vosslabhpc/Projects/BOOST/InterventionStudy/3-experiment/data/act-int-test/sleep.csv",
     # colid = 1,
     # coln1 = 2,
     # sleepwindowType = "SPT",
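The commented-out block above points GGIR at an external sleep log. A small pre-flight check like the sketch below can catch a missing or misformatted log before the run. Illustrative only: the path is copied from the comment above, and the expectation that the participant ID sits in the first column mirrors the colid = 1 setting.

import pandas as pd

loglocation = "/mnt/nfs/lss/vosslabhpc/Projects/BOOST/InterventionStudy/3-experiment/data/act-int-test/sleep.csv"
log = pd.read_csv(loglocation)

# colid = 1 in the config points GGIR at the first column for the participant ID
assert log.columns[0] == "ID", f"expected ID in column 1, found {log.columns[0]}"
print(f"{len(log)} rows, columns: {log.columns[:6].tolist()}")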

code/core/gg.py

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@ def __init__(self, matched, intdir, obsdir):
         self.matched = matched
         self.INTDIR = intdir.rstrip('/') + '/'
         self.OBSDIR = obsdir.rstrip('/') + '/'
-        self.DERIVATIVES = "/derivatives/GGIR-3.2.6-test"  # Defined within the class
+        self.DERIVATIVES = "derivatives/GGIR-3.2.6-test/"  # Defined within the class

     def run_gg(self):
         """
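This change removes the leading slash from DERIVATIVES and adds a trailing one. The hunk does not show how run_gg combines it with INTDIR/OBSDIR, but assuming it is appended to those directories (an assumption), the old value misbehaves either way, as this quick illustration shows:

import os

INTDIR = "/Shared/vosslabhpc/Projects/BOOST/InterventionStudy/3-experiment/data/act-int-test/"

# Plain concatenation (the rstrip('/') + '/' style used above):
print(INTDIR + "/derivatives/GGIR-3.2.6-test")   # old value -> double slash, no trailing '/'
print(INTDIR + "derivatives/GGIR-3.2.6-test/")   # new value -> clean path ending in '/'

# os.path.join: an absolute second argument discards the first one entirely
print(os.path.join(INTDIR, "/derivatives/GGIR-3.2.6-test"))  # -> '/derivatives/GGIR-3.2.6-test'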

code/tests/sleep/log_creation.py

Lines changed: 352 additions & 0 deletions
@@ -0,0 +1,352 @@
import pandas as pd
import os
import requests
import sys
from io import StringIO
from datetime import datetime, timedelta


'''
This script creates a sleep log file for the group analysis of accelerometer data.
It reads the individual participant files by first matching labID with studyID, then builds the path to each individual file on the RDSS.
It aggregates the sleep data by participant and session (with an _accel suffix) and saves the result to a CSV file.
'''

token = 'DE4E2DB72778DACA9B8848574107D2F5'
INT_DIR = '/Volumes/vosslabhpc/Projects/BOOST/InterventionStudy/3-experiment/data/act-int-test'
OBS_DIR = '/Volumes/vosslabhpc/Projects/BOOST/ObservationalStudy/3-experiment/data/act-obs-test'
rdss_dir = '/Volumes/VossLab/Repositories/Accelerometer_Data/Sleep'

def compare_ids(rdss_dir, token, daysago=350):
    """
    Pulls all files from RDSS
    Pulls the list from RedCap
    Compares IDs and returns a dictionary with two keys:
    - 'matches': normal matches mapping boost_id to a list of dicts (filename, labID, date)
    - 'duplicates': a list of dictionaries each with lab_id, boost_id, filenames (list), and dates (list)
    """
    # Retrieve the RedCap report and duplicates from report
    report, report_duplicates = _return_report(token)
    # Retrieve the full RDSS file list and duplicate files merged with duplicates from report
    rdss, file_duplicates = _rdss_file_list(report_duplicates, daysago=daysago)

    # Initialize the result dictionary for normal (non-duplicate) matches
    result = {}

    # Iterate over the rows in the cleaned RedCap report
    for _, row in report.iterrows():
        boost_id = str(row['boost_id'])
        lab_id = str(row['lab_id'])

        # Find matching files in the RDSS list
        rdss_matches = rdss[rdss['ID'] == lab_id]
        if not rdss_matches.empty:
            if boost_id not in result:
                result[boost_id] = []
            for _, match_row in rdss_matches.iterrows():
                result[boost_id].append({
                    'filename': match_row['filename'],
                    'labID': lab_id,
                    'date': match_row['Date']
                })

    # Process duplicates into the desired structure.
    duplicates_dict = []
    if not file_duplicates.empty:
        # Group by lab_id and boost_id; each group represents one duplicate combination.
        grouped = file_duplicates.groupby(['lab_id', 'boost_id'])
        for (lab_id, boost_id), group in grouped:
            duplicates_dict.append({
                'lab_id': lab_id,
                'boost_id': boost_id,
                'filenames': group['filename'].tolist(),
                'dates': group['Date'].tolist()
            })
    else:
        print("Found no duplicates.")

    return {'matches': result, 'duplicates': duplicates_dict}

def _return_report(token):
    """
    Pulls the ID report from REDCap via the API and reads it as a dataframe.
    Checks for boost_ids that are associated with multiple lab_ids, prints a warning,
    and removes these rows from the dataframe.
    Separates duplicate rows (based on any column) from the cleaned data.

    Returns:
        df_cleaned: dataframe with duplicates removed and problematic boost_ids excluded
        duplicate_rows: dataframe of duplicate rows
    """
    url = 'https://redcap.icts.uiowa.edu/redcap/api/'
    data = {
        'token': token,
        'content': 'report',
        'report_id': 43327,
        'format': 'csv'
    }
    r = requests.post(url, data=data)
    if r.status_code != 200:
        print(f"error! status code is {r.status_code}")
        sys.exit(1)

    df = pd.read_csv(StringIO(r.text))

    # identify boost_ids associated with multiple lab_ids.
    boost_id_counts = df.groupby('boost_id')['lab_id'].nunique()
    problematic_boost_ids = boost_id_counts[boost_id_counts > 1].index.tolist()

    if problematic_boost_ids:
        print(f"found boost_id(s) with multiple lab_ids: {', '.join(map(str, problematic_boost_ids))}. "
              "these entries will be removed from processing.")
        df = df[~df['boost_id'].isin(problematic_boost_ids)]

    # identify and separate duplicate rows based on any column.
    duplicate_rows = df[df.duplicated(keep=False)]
    df_cleaned = df.drop_duplicates(keep=False)

    if not duplicate_rows.empty:
        print(f"duplicate rows found:\n{duplicate_rows}")

    return df_cleaned, duplicate_rows

def _rdss_file_list(duplicates, daysago=None):
    """
    Extracts the lab ID and the date from filenames ending with .csv
    in the rdss_dir folder and stores them in a dataframe.

    Also merges the file list with duplicate report entries based on lab_id.

    Returns:
        df: DataFrame of all file entries
        merged_df: DataFrame of file entries that match duplicate lab_ids from the report
    """
    extracted_data = []

    # Loop through all files in the rdss_dir folder.
    for filename in os.listdir(rdss_dir):
        if filename.endswith('.csv'):
            try:
                # Handle both old and new filename formats
                if '_' in filename:
                    # New format: 1288_4-26-2025_Sleep.csv
                    parts = filename.replace('.csv', '').split('_')
                    if len(parts) >= 3:
                        base_name = parts[0]  # lab_id
                        date_part = parts[1]  # date
                        extracted_data.append({'ID': base_name, 'Date': date_part, 'filename': filename})
                    else:
                        print(f"Skipping file with unexpected format: {filename}")
                else:
                    base_name = filename.split(' ')[0]  # Extract lab_id (old format)
                    date_part = filename.split('(')[1].split(')')[0]  # Extract date (old format)
                    extracted_data.append({'ID': base_name, 'Date': date_part, 'filename': filename})
            except IndexError:
                print(f"Skipping file with unexpected format: {filename}")

    df = pd.DataFrame(extracted_data)

    if not df.empty:
        df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

        if daysago:
            cutoff_date = datetime.today() - timedelta(days=daysago)
            df = df[df['Date'] >= cutoff_date]  # Filter files within the last `daysago` days
        else:
            df = df[df['Date'] >= '2024-08-05']  # Filter out rows before the threshold date

    # Filter the file list to only include rows where ID is in the duplicate report (if any)
    if not duplicates.empty:
        matched_df = df[df['ID'].isin(duplicates['lab_id'])]
        # Merge with the duplicates to bring in boost_id information from the report
        merged_df = matched_df.merge(duplicates, left_on='ID', right_on='lab_id')
    else:
        merged_df = pd.DataFrame()

    return df, merged_df

matches = compare_ids(rdss_dir, token, daysago=None)
# Print the matches and duplicates for verification
print("Matches:")
for boost_id, files in matches['matches'].items():
    print(f"Boost ID: {boost_id}")
    for file_info in files:
        print(f"  - {file_info['filename']} (Lab ID: {file_info['labID']}, Date: {file_info['date']})")
print("\nDuplicates:")
for dup in matches['duplicates']:
    print(f"Lab ID: {dup['lab_id']}, Boost ID: {dup['boost_id']}")
    print(f"  Filenames: {', '.join(dup['filenames'])}")
    print(f"  Dates: {', '.join(map(str, dup['dates']))}")
# The above code is a complete script that compares IDs from RDSS and RedCap, identifies matches and duplicates, and prints the results.

'''
Below we create the sessions: if the same subject ID has multiple files, we create a session for each file, ordered by date.
The result is stored as a dataframe with the columns
'subject_id', 'session_id', 'filename', 'date',
where subject_id is 'sub-<subject_id>', session_id is 'ses-<session_number>', filename is the file name with full path, and date is the date of the file.
'''
def create_sessions(matches):
    """
    Create sessions from the matches dictionary.

    Args:
        matches (dict): Dictionary containing matches with boost_id as keys and list of file info as values.

    Returns:
        pd.DataFrame: DataFrame with columns 'subject_id', 'session_id', 'filename', 'date'.
    """
    sessions = []

    for boost_id, files in matches['matches'].items():
        subject_id = f'sub-{boost_id}'
        # Order files by date so session numbers follow chronology
        for i, file_info in enumerate(sorted(files, key=lambda f: f['date'])):
            session_id = f'ses-{i + 1}'  # Session number starts from 1
            sessions.append({
                'subject_id': subject_id,
                'session_id': session_id,
                'filename': os.path.join(rdss_dir, file_info['filename']),
                'date': file_info['date']
            })

    return pd.DataFrame(sessions)

# Create sessions from the matches
sessions_df = create_sessions(matches)
# Print the sessions DataFrame for verification
print("\nSessions DataFrame:")
print(sessions_df)


'''
Iterate through the files in the sessions file list and build the sleep log file. The files have the format:

Sleep Algorithm,In Bed Date,In Bed Time,Out Bed Date,Out Bed Time,Onset Date,Onset Time,Latency,Total Counts,Efficiency,Total Minutes in Bed,Total Sleep Time (TST),Wake After Sleep Onset (WASO),Number of Awakenings,Average Awakening Length,Movement Index,Fragmentation Index,Sleep Fragmentation Index
Cole-Kripke,4/12/2023,12:00 AM,4/12/2023,6:00 AM,4/12/2023,12:00 AM,0,34862,84.44,360,304,56,10,5.6,8.889,10,18.889
Cole-Kripke,4/12/2023,11:09 PM,4/13/2023,7:00 AM,4/12/2023,11:19 PM,10,54263,87.05,471,410,51,13,3.92,12.951,7.692,20.643

They need to be converted to the following format:
ID   D1_date     D1_wakeup  D1_inbed  D1_nap_start  D1_nap_end  D1_nonwear1_off  D1_nonwear1_on  D2_date …
123  2015-03-30  09:00:00   22:00:00  11:15:00      11:45:00    13:35:00         14:10:00        31/03/2015 …
567  2015-04-20  08:30:00   23:15:00

where ID should be sub-{subject_id}_ses-{session_id}_accel, D1_date is the first date in the file, D1_wakeup is the first wakeup time, and D1_inbed is the second in-bed time (skipping the first in-bed time). Naps and non-wear are skipped for now.
Repeat for all dates in the file.

'''

def create_sleep_log(sessions_df):
    """
    Create a sleep log file from the sessions DataFrame.

    Args:
        sessions_df (pd.DataFrame): DataFrame with columns 'subject_id', 'session_id', 'filename', 'date'.

    Returns:
        pd.DataFrame: DataFrame with sleep log entries in wide format.
    """
    # List to store the final entries (one per subject/session)
    final_entries = []

    for _, row in sessions_df.iterrows():
        subject_id = row['subject_id']
        session_id = row['session_id']
        filename = row['filename']

        try:
            # Skip first 5 rows and first column
            sleep_data = pd.read_csv(filename, skiprows=5, usecols=lambda x: x != 'Unnamed: 0')
            if sleep_data.empty:
                print(f"No data found in file {filename}. Skipping.")
                continue

            # Create a base entry for this subject/session
            entry = {'ID': f'{subject_id}_{session_id}_accel'}

            # Convert In Bed Date to datetime for sorting
            sleep_data['In_Bed_Date_DT'] = pd.to_datetime(sleep_data['In Bed Date'], format='%m/%d/%Y', errors='coerce')
            sleep_data = sleep_data.sort_values('In_Bed_Date_DT')  # Sort by date

            # Group by date and keep only the last entry for each date
            date_groups = {}
            for _, data in sleep_data.iterrows():
                in_bed_date_str = data['In Bed Date']
                date_key = pd.to_datetime(in_bed_date_str, format='%m/%d/%Y').strftime('%Y-%m-%d')
                date_groups[date_key] = data

            # Process each day's data
            for day_num, (date_key, data) in enumerate(sorted(date_groups.items()), 1):
                day_prefix = f'D{day_num}_'

                # Extract date and time information
                in_bed_date = pd.to_datetime(data['In Bed Date'], format='%m/%d/%Y').strftime('%Y-%m-%d')
                in_bed_time = pd.to_datetime(data['In Bed Time'], format='%I:%M %p').strftime('%H:%M:%S')
                out_bed_date = pd.to_datetime(data['Out Bed Date'], format='%m/%d/%Y').strftime('%Y-%m-%d')
                out_bed_time = pd.to_datetime(data['Out Bed Time'], format='%I:%M %p').strftime('%H:%M:%S')

                # Store only date and time information for this day
                entry[f'{day_prefix}date'] = in_bed_date
                entry[f'{day_prefix}inbed'] = in_bed_time
                entry[f'{day_prefix}wakeup'] = out_bed_time

            final_entries.append(entry)

        except Exception as e:
            print(f"Error processing file {filename}: {e}")

    return pd.DataFrame(final_entries)


# Create the sleep log dataframe
sleep_log_df = create_sleep_log(sessions_df)
# Print the sleep log DataFrame for verification
print("\nSleep Log DataFrame:")
print(sleep_log_df)


'''
Split the dataframe into two parts, intervention and observational:
if the subject ID starts with sub-7*, it is an observational study subject; otherwise it is an intervention study subject.
Then clean up the dataframes by removing any extra columns that are unused.
'''

def split_and_clean_dataframes(sleep_log_df):
    """
    Split the sleep log DataFrame into intervention and observational study DataFrames,
    and clean up by removing unused columns.

    Args:
        sleep_log_df (pd.DataFrame): DataFrame with sleep log entries.

    Returns:
        tuple: Two DataFrames, one for intervention study and one for observational study.
    """
    # Split the DataFrame based on subject ID
    obs_df = sleep_log_df[sleep_log_df['ID'].str.startswith('sub-7')]
    # Intervention study DataFrame contains all other subjects (doesn't start with sub-7 or sub-6)
    int_df = sleep_log_df[~sleep_log_df['ID'].str.startswith('sub-7')]
    int_df = int_df[~int_df['ID'].str.startswith('sub-6')]
    # Clean up by removing unused columns (if any)
    int_df = int_df.reset_index(drop=True)
    obs_df = obs_df.reset_index(drop=True)

    return int_df, obs_df

# Split and clean the DataFrames
int_df, obs_df = split_and_clean_dataframes(sleep_log_df)
# Print the intervention and observational DataFrames for verification
print("\nIntervention Study DataFrame:")
print(int_df.head())
print("\nObservational Study DataFrame:")
print(obs_df.head())

# Save the DataFrames to CSV files
int_df.to_csv(os.path.join(INT_DIR, 'sleep_log_intervention.csv'), index=False)
obs_df.to_csv(os.path.join(OBS_DIR, 'sleep_log_observational.csv'), index=False)
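A quick way to verify that the generated logs match the wide format described in the docstring above; this is an illustrative check, not part of the commit, and the path simply reuses INT_DIR plus the output filename from the script:

import pandas as pd

log = pd.read_csv("/Volumes/vosslabhpc/Projects/BOOST/InterventionStudy/3-experiment/data/act-int-test/sleep_log_intervention.csv")

# IDs should look like sub-<boost_id>_ses-<n>_accel, with D<n>_date / D<n>_inbed / D<n>_wakeup columns
assert log["ID"].str.match(r"sub-.+_ses-\d+_accel").all()
print([c for c in log.columns if c.startswith("D1_")])   # expect ['D1_date', 'D1_inbed', 'D1_wakeup']
print(log.head())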
