Skip to content

Commit 1427704

Browse files
committed
Set up behavioral postprocessing structure
- Reorganize postprocessing: move EEG scripts to postprocessing-eeg/ folder
- Create postprocessing-behavior/ R project for behavioral data processing
- Add config/paths.R reading from existing preprocessed link file
- Add config/settings.R with parameters matching matlab (codes, thresholds, RT trimming)
- Add functions/load_behavioral_data.R with validation
- Test data loading successfully for behavioral CSVs
1 parent 57c2bed commit 1427704

18 files changed

+562
-1
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,5 @@
22
.DS_Store
33

44
# ignore data specific files
5-
derivatives/*
5+
derivatives/*
6+
.Rproj.user
34.3 KB
Binary file not shown.
Lines changed: 346 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,346 @@
1+
# paths.R - path configuration for behavioral analysis
# author: marlene buch

library(here)
library(stringr)

# read link to preprocessed data (same link file used by the matlab pipeline)
# here() gives the R project root; climb three levels up to the repo root
repo_root <- file.path(here(), "..", "..", "..")
preprocessed_link <- file.path(repo_root, "input", "preprocessed")

# NOTE(review): the original piped with %>%, but neither here nor stringr
# attaches the magrittr pipe, so %>% was undefined in a fresh session;
# call str_trim() directly instead. (Also removed a duplicated, stale
# paste of this header that read the link relative to the wrong root.)
preprocessed_path <- str_trim(readLines(preprocessed_link, warn = FALSE))

# construct paths to the preprocessed data
behavioral_dir <- file.path(preprocessed_path, "s1_r1", "behavior")
eeg_dir <- file.path(preprocessed_path, "s1_r1", "eeg")

# output directory (timestamped per run)
output_dir <- file.path(repo_root, "derivatives",
                        paste0(Sys.Date(), "_behavioral-analysis"))

# output subdirectories
cleaned_data_dir <- file.path(output_dir, "cleaned_data")
descriptives_dir <- file.path(output_dir, "descriptives")
statistics_dir <- file.path(output_dir, "statistics")
logs_dir <- file.path(output_dir, "logs")

# matlab outputs for validation (if needed)
matlab_erp_dir <- file.path(repo_root, "derivatives")

# create all output directories (idempotent; recursive = TRUE creates parents)
create_output_dirs <- function() {
  for (dir in c(output_dir, cleaned_data_dir, descriptives_dir,
                statistics_dir, logs_dir)) {
    dir.create(dir, showWarnings = FALSE, recursive = TRUE)
  }
  message("output directories created:")
  message("  ", output_dir)
}

# validate that the link file and behavioral data directory exist;
# stops with an informative error otherwise
validate_paths <- function() {
  if (!file.exists(preprocessed_link)) {
    stop("link file not found: ", preprocessed_link)
  }
  if (!dir.exists(behavioral_dir)) {
    stop("behavioral data directory not found: ", behavioral_dir)
  }
  message("paths validated")
  message("  behavioral data: ", behavioral_dir)
}

validate_paths()
52+
# settings.R - analysis parameters matching matlab postprocessing
# CRITICAL: these must match batch_eeg_postprocessing.m exactly
# author: marlene buch

# === BEHAVIORAL CODES ===
# tier 1: primary hypothesis codes (all-or-nothing for dataset inclusion)
PRIMARY_CODES <- c(102, 104, 202, 204)
PRIMARY_CODE_NAMES <- c("social-invis-FE", "social-invis-NFG",
                        "nonsoc-invis-FE", "nonsoc-invis-NFG")

# tier 2: secondary analysis codes (condition-specific inclusion)
SECONDARY_CODES <- c(111, 112, 113, 211, 212, 213)
SECONDARY_CODE_NAMES <- c("social-vis-corr", "social-vis-FE", "social-vis-NFE",
                          "nonsoc-vis-corr", "nonsoc-vis-FE", "nonsoc-vis-NFE")

# all codes combined; names vector is keyed by the (character-coerced) code
ALL_CODES <- c(PRIMARY_CODES, SECONDARY_CODES)
ALL_CODE_NAMES <- setNames(
  c(PRIMARY_CODE_NAMES, SECONDARY_CODE_NAMES),
  ALL_CODES
)

# === INCLUSION THRESHOLDS (MUST MATCH MATLAB) ===
# minimum trials per condition for inclusion
MIN_EPOCHS_PER_CODE <- 10
# minimum overall accuracy (calculated on visible target trials only)
MIN_ACCURACY <- 0.60

# === RT TRIMMING PARAMETERS (MUST MATCH MATLAB) ===
# rt lower bound (trials < 150ms excluded)
RT_LOWER_BOUND <- 150  # milliseconds
# rt outlier threshold (per condition)
RT_OUTLIER_THRESHOLD <- 3  # standard deviations

# === CONDITION GROUPINGS FOR ANALYSES ===
# visibility conditions
VISIBLE_CODES <- SECONDARY_CODES
INVISIBLE_CODES <- PRIMARY_CODES
# social conditions
SOCIAL_CODES <- c(111, 112, 113, 102, 104)
NONSOCIAL_CODES <- c(211, 212, 213, 202, 204)
# response types
CORRECT_CODES <- c(111, 211)
FLANKER_ERROR_CODES <- c(112, 212, 102, 202)
NONFLANKER_CODES <- c(113, 213, 104, 204)  # nfe in visible, nfg in invisible

# fix(review): removed trailing console residue — a self-referential
# source("config/settings.R") (would recurse if left in the file) and
# two interactive print() checks that do not belong in the config script.
94+
# paths.R - path configuration for behavioral analysis
# author: marlene buch

library(here)
library(stringr)

# read link to preprocessed data (same link file used by the matlab pipeline)
# here() gives the R project root; climb three levels up to the repo root
repo_root <- file.path(here(), "..", "..", "..")
preprocessed_link <- file.path(repo_root, "input", "preprocessed")

# NOTE(review): the original piped with %>%, but neither here nor stringr
# attaches the magrittr pipe, so %>% was undefined in a fresh session;
# call str_trim() directly instead. (Also removed a duplicated, stale
# paste of this header whose repo_root was one level too shallow.)
preprocessed_path <- str_trim(readLines(preprocessed_link, warn = FALSE))

# construct paths to the preprocessed data
behavioral_dir <- file.path(preprocessed_path, "s1_r1", "behavior")
eeg_dir <- file.path(preprocessed_path, "s1_r1", "eeg")

# output directory (timestamped per run)
output_dir <- file.path(repo_root, "derivatives",
                        paste0(Sys.Date(), "_postprocessing-behavior"))

# output subdirectories
cleaned_data_dir <- file.path(output_dir, "cleaned_data")
descriptives_dir <- file.path(output_dir, "descriptives")
statistics_dir <- file.path(output_dir, "statistics")
logs_dir <- file.path(output_dir, "logs")

# matlab outputs for validation (if needed)
matlab_erp_dir <- file.path(repo_root, "derivatives")

# create all output directories (idempotent; recursive = TRUE creates parents)
create_output_dirs <- function() {
  for (dir in c(output_dir, cleaned_data_dir, descriptives_dir,
                statistics_dir, logs_dir)) {
    dir.create(dir, showWarnings = FALSE, recursive = TRUE)
  }
  message("output directories created:")
  message("  ", output_dir)
}

# validate that the link file and behavioral data directory exist;
# stops with an informative error otherwise
validate_paths <- function() {
  if (!file.exists(preprocessed_link)) {
    stop("link file not found: ", preprocessed_link)
  }
  if (!dir.exists(behavioral_dir)) {
    stop("behavioral data directory not found: ", behavioral_dir)
  }
  message("paths validated")
  message("  behavioral data: ", behavioral_dir)
}

# fix(review): removed a stray source("config/paths.R") that immediately
# preceded this call — sourcing the file from within itself would recurse.
validate_paths()
148+
# settings.R - postprocessing parameters matching matlab postprocessing
# CRITICAL: these must match batch_eeg_postprocessing.m exactly
# author: marlene buch

# === BEHAVIORAL CODES ===
# tier 1: primary hypothesis codes (all-or-nothing for dataset inclusion)
PRIMARY_CODES <- c(102, 104, 202, 204)
PRIMARY_CODE_NAMES <- c("social-invis-FE", "social-invis-NFG",
                        "nonsoc-invis-FE", "nonsoc-invis-NFG")

# tier 2: secondary analysis codes (condition-specific inclusion)
SECONDARY_CODES <- c(111, 112, 113, 211, 212, 213)
SECONDARY_CODE_NAMES <- c("social-vis-corr", "social-vis-FE", "social-vis-NFE",
                          "nonsoc-vis-corr", "nonsoc-vis-FE", "nonsoc-vis-NFE")

# all codes combined; names vector is keyed by the (character-coerced) code
ALL_CODES <- c(PRIMARY_CODES, SECONDARY_CODES)
ALL_CODE_NAMES <- setNames(
  c(PRIMARY_CODE_NAMES, SECONDARY_CODE_NAMES),
  ALL_CODES
)

# === INCLUSION THRESHOLDS (MUST MATCH MATLAB) ===
# minimum trials per condition for inclusion
MIN_EPOCHS_PER_CODE <- 10
# minimum overall accuracy (calculated on visible target trials only)
MIN_ACCURACY <- 0.60

# === RT TRIMMING PARAMETERS (MUST MATCH MATLAB) ===
# rt lower bound (trials < 150ms excluded)
RT_LOWER_BOUND <- 150  # milliseconds
# rt outlier threshold (per condition)
RT_OUTLIER_THRESHOLD <- 3  # standard deviations

# === CONDITION GROUPINGS FOR ANALYSES ===
# visibility conditions
VISIBLE_CODES <- SECONDARY_CODES
INVISIBLE_CODES <- PRIMARY_CODES
# social conditions
SOCIAL_CODES <- c(111, 112, 113, 102, 104)
NONSOCIAL_CODES <- c(211, 212, 213, 202, 204)
# response types
CORRECT_CODES <- c(111, 211)
FLANKER_ERROR_CODES <- c(112, 212, 102, 202)
NONFLANKER_CODES <- c(113, 213, 104, 204)  # nfe in visible, nfg in invisible

# fix(review): removed trailing console residue — a self-referential
# source("config/settings.R") and an interactive print(PRIMARY_CODES).
189+
# load_behavioral_data.r - load cleaned behavioral csvs from preprocessing
190+
# author: marlene buch
191+
library(tidyverse)
192+
load_behavioral_data <- function(behavioral_dir, subjects = NULL) {
  # load all cleaned behavioral csvs from soccer-dataset preprocessing
  #
  # inputs:
  #   behavioral_dir - path to preprocessed behavior folder
  #   subjects - optional vector of subject ids to load (e.g., c("390001", "390002"))
  #
  # outputs:
  #   tibble with all subjects' behavioral data, one row per trial,
  #   with a leading "subject" column; stops if no subject dirs exist
  message("loading behavioral data from: ", behavioral_dir)

  # find all subject directories
  subject_dirs <- list.dirs(behavioral_dir, recursive = FALSE, full.names = TRUE)
  if (length(subject_dirs) == 0) {
    stop("no subject directories found in: ", behavioral_dir)
  }

  # filter to requested subjects if specified
  if (!is.null(subjects)) {
    subject_pattern <- paste0("sub-", subjects, collapse = "|")
    subject_dirs <- subject_dirs[str_detect(basename(subject_dirs), subject_pattern)]
  }
  message("found ", length(subject_dirs), " subject directories")

  # load all csvs; per-subject NULLs (missing csv) are dropped by map_dfr
  all_data <- map_dfr(subject_dirs, function(subject_dir) {
    # extract numeric subject id from the directory name
    subject_id <- str_extract(basename(subject_dir), "\\d+")

    # find csv file (should be exactly one per subject)
    # fix: list.files(pattern =) takes a regex, not a glob; the original
    # "*_clean\\.csv$" had a dangling leading '*' quantifier
    csv_files <- list.files(subject_dir, pattern = "_clean\\.csv$", full.names = TRUE)
    if (length(csv_files) == 0) {
      warning("no clean csv found for subject ", subject_id)
      return(NULL)
    }
    if (length(csv_files) > 1) {
      warning("multiple csvs found for subject ", subject_id, ", using first")
    }

    # read csv and tag every row with its subject id
    data <- read_csv(csv_files[1], show_col_types = FALSE) %>%
      mutate(subject = subject_id) %>%
      relocate(subject)
    return(data)
  })

  message("loaded data for ", n_distinct(all_data$subject), " subjects")
  message("total trials: ", nrow(all_data))
  return(all_data)
}
236+
# helper function to get list of available subjects
237+
# helper: list the numeric subject ids available under behavioral_dir,
# sorted; directories without a numeric id are dropped
get_available_subjects <- function(behavioral_dir) {
  dir_names <- list.dirs(behavioral_dir, recursive = FALSE, full.names = FALSE)
  ids <- str_extract(dir_names, "\\d+")
  ids <- ids[!is.na(ids)]
  sort(ids)
}
242+
# validate loaded data structure
243+
# validate loaded data structure: stop on missing required columns,
# warn on behavioral codes outside the configured ALL_CODES set
validate_behavioral_data <- function(data) {
  # columns every cleaned behavioral csv must provide
  required_cols <- c(
    "subject", "code", "flankerResponse_rt", "flankerResponse_keys",
    "confidenceRating", "responseType", "visInvis", "block_condition",
    "target", "flanker", "correctKey", "flankerKey"
  )

  missing_cols <- setdiff(required_cols, names(data))
  if (length(missing_cols) > 0) {
    stop("missing required columns: ", paste(missing_cols, collapse = ", "))
  }

  # unexpected codes are recoverable, so warn rather than stop
  unexpected_codes <- setdiff(unique(data$code), ALL_CODES)
  if (length(unexpected_codes) > 0) {
    warning("unexpected behavioral codes found: ", paste(unexpected_codes, collapse = ", "))
  }

  message("behavioral data structure validated")
  invisible(TRUE)
}
262+
# --- smoke test: load configuration, then load and validate one subject ---
source("config/paths.R")
source("config/settings.R")
source("functions/load_behavioral_data.R")

# test with one subject
test_data <- load_behavioral_data(behavioral_dir, subjects = "390001")
head(test_data)
validate_behavioral_data(test_data)
269+
# load_behavioral_data.r - load cleaned behavioral csvs from preprocessing
270+
# author: marlene buch
271+
library(tidyverse)
272+
load_behavioral_data <- function(behavioral_dir, subjects = NULL) {
  # load all cleaned behavioral csvs from soccer-dataset preprocessing
  #
  # inputs:
  #   behavioral_dir - path to preprocessed behavior folder
  #   subjects - optional vector of subject ids to load (e.g., c("390001", "390002"))
  #
  # outputs:
  #   tibble with all subjects' behavioral data, one row per trial,
  #   with a leading "subject" column; stops if no subject dirs exist
  message("loading behavioral data from: ", behavioral_dir)

  # find all subject directories
  subject_dirs <- list.dirs(behavioral_dir, recursive = FALSE, full.names = TRUE)
  if (length(subject_dirs) == 0) {
    stop("no subject directories found in: ", behavioral_dir)
  }

  # filter to requested subjects if specified
  if (!is.null(subjects)) {
    subject_pattern <- paste0("sub-", subjects, collapse = "|")
    subject_dirs <- subject_dirs[str_detect(basename(subject_dirs), subject_pattern)]
  }
  message("found ", length(subject_dirs), " subject directories")

  # load all csvs; per-subject NULLs (missing csv) are dropped by map_dfr
  all_data <- map_dfr(subject_dirs, function(subject_dir) {
    # extract numeric subject id from the directory name
    subject_id <- str_extract(basename(subject_dir), "\\d+")

    # find csv file (should be exactly one per subject)
    # fix: list.files(pattern =) takes a regex, not a glob; the original
    # "*_clean\\.csv$" had a dangling leading '*' quantifier
    csv_files <- list.files(subject_dir, pattern = "_clean\\.csv$", full.names = TRUE)
    if (length(csv_files) == 0) {
      warning("no clean csv found for subject ", subject_id)
      return(NULL)
    }
    if (length(csv_files) > 1) {
      warning("multiple csvs found for subject ", subject_id, ", using first")
    }

    # read csv and tag every row with its subject id
    data <- read_csv(csv_files[1], show_col_types = FALSE) %>%
      mutate(subject = subject_id) %>%
      relocate(subject)
    return(data)
  })

  message("loaded data for ", n_distinct(all_data$subject), " subjects")
  message("total trials: ", nrow(all_data))
  return(all_data)
}
316+
# helper function to get list of available subjects
317+
# helper: list the numeric subject ids available under behavioral_dir,
# sorted; directories without a numeric id are dropped
get_available_subjects <- function(behavioral_dir) {
  dir_names <- list.dirs(behavioral_dir, recursive = FALSE, full.names = FALSE)
  ids <- str_extract(dir_names, "\\d+")
  ids <- ids[!is.na(ids)]
  sort(ids)
}
322+
# validate loaded data structure
323+
# validate loaded data structure: stop on missing required columns,
# warn on behavioral codes outside the configured ALL_CODES set
validate_behavioral_data <- function(data) {
  # columns every cleaned behavioral csv must provide
  # (dot-separated names match the psychopy-style csv headers)
  required_cols <- c(
    "subject", "code", "flankerResponse.rt", "flankerResponse.keys",
    "confidenceRating", "responseType", "visInvis", "block_condition",
    "target", "flanker", "correctKey", "flankerKey"
  )

  missing_cols <- setdiff(required_cols, names(data))
  if (length(missing_cols) > 0) {
    stop("missing required columns: ", paste(missing_cols, collapse = ", "))
  }

  # unexpected codes are recoverable, so warn rather than stop
  unexpected_codes <- setdiff(unique(data$code), ALL_CODES)
  if (length(unexpected_codes) > 0) {
    warning("unexpected behavioral codes found: ", paste(unexpected_codes, collapse = ", "))
  }

  message("behavioral data structure validated")
  invisible(TRUE)
}
342+
# --- re-run validation and inspect trial counts after updating the loader ---
source("functions/load_behavioral_data.R")
validate_behavioral_data(test_data)

# trial counts per code x response type
# fix(review): this statement was pasted three times in the history; once is enough
test_data %>% count(code, responseType) %>% arrange(code)

0 commit comments

Comments
 (0)