2 changes: 1 addition & 1 deletion .python-version
@@ -1 +1 @@
3.9
3.13
12 changes: 6 additions & 6 deletions README.md
@@ -39,7 +39,7 @@ This project uses [uv](https://github.com/astral-sh/uv) for fast, reliable Pytho
curl -LsSf https://astral.sh/uv/install.sh | sh

# Install the package in development mode with all dependencies
uv pip install -e ".[analysis,dev]"
uv sync --all-extras
```

### Running Scripts
@@ -57,7 +57,7 @@ uv run scripts/fsstats_extraction.py --help

```bash
# Start JupyterLab with all analysis dependencies
uv run --with "jupyterlab" jupyter lab
uv run jupyter lab
```

## Development Workflow
@@ -113,10 +113,10 @@ git config diff.ipynb.textconv 'nbstripout -t'

## Analysis Pipeline

1. **Data Processing**: Use `scripts/fsstats_extraction.py` to extract FreeSurfer statistics
2. **Exploration**: Run notebooks in `notebooks/` directory for analysis
3. **Figure Generation**: Use `notebooks/figures.ipynb` to create publication figures
4. **Outputs**: All results saved to `output/` directory, figures to `output/images/`
1. **Stats Extraction**: Run `scripts/fsstats_extraction.py` to extract FreeSurfer statistics (note: this step has tricky dependencies)
2. **Exploration/Figures**: Run the notebooks in `notebooks/` for further analysis and figure generation
3. **Outputs**: Most results are saved to the `output/` directory, figures to `output/images/` (see the sketch below for loading merged results programmatically)
4. **Swarm output**: Larger datasets (FreeSurfer derivatives, swarm files, etc.) are stored in the appropriate cluster directories
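
A minimal sketch of loading the merged outputs from step 3, assuming the helpers in `dl_morphometrics_helpers` are importable and step 1 has already produced the per-pipeline stats (the output filename below is illustrative):

```python
from pathlib import Path

from dl_morphometrics_helpers.data_processing import load_and_merge_brain_confounds

# Merge BrainConfounds.tsv across all configured pipelines
merged = load_and_merge_brain_confounds()

# Illustrative output location and filename; adjust to your setup
out_dir = Path("output")
out_dir.mkdir(exist_ok=True)
merged.to_csv(out_dir / "brain_confounds_merged.csv", index=False)
```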

## Contributing

19 changes: 17 additions & 2 deletions dl_morphometrics_helpers/constants.py
@@ -32,11 +32,13 @@
pipelines = [
("recon-all", "_ra", "inner"),
# ('recon-all-8', '_ra8', 'inner'),
("recon-all-8_1", "_ra81", "inner"),
("recon-all_clinical_t1", "_ract1", "inner"),
("recon-all_clinical_t2", "_ract2", "inner"),
("recon-all_clinical_t1_resample-3", "_ract1r3", "inner"),
("recon-all_clinical_t1_resample-5", "_ract1r5", "inner"),
# ('recon-all_not2', '_ranot2', 'inner'),
("synthsr-rac-with-skullstrip", "_raseed", "inner"),
("recon-any_t2", "_ranyt2", "inner"),
("recon-any_t1", "_ranyt1", "inner"),
("recon-any_t1_resample-3", "_ranyt1r3", "inner"),
@@ -47,6 +49,7 @@
diff_specs = {
# key: (base_suffix, comp_suffix)
# 'all8': ('_ra', '_ra8'),
"all81": ("_ra", "_ra81"),
"ct1": ("_ra", "_ract1"),
"ct2": ("_ra", "_ract2"),
"ct1r3": ("_ra", "_ract1r3"),
@@ -58,11 +61,14 @@
"anyt1r3": ("_ra", "_ranyt1r3"),
"anyt1r5": ("_ra", "_ranyt1r5"),
"any1any2": ("_ranyt1", "_ranyt2"),
# SynthSR-seeded recon-all vs 8.1 baseline
"raseed81": ("_ra81", "_raseed"),
}

comparison_labels = {
# key: LABEL
"all8": "RA_7-vs-8",
"all81": "RA_7-vs-8.1",
"ct1": "RAC_T1",
"ct2": "RAC_T2",
"ct1r3": "RAC_T1-R3",
@@ -74,12 +80,17 @@
"anyt1r3": "RANY_T1-R3",
"anyt1r5": "RANY_T1-R5",
"any1any2": "RANY_T1-vs-T2",
# SynthSR-seeded recon-all label
"raseed81": "RA_seeded_RAC_SynthSR",
}

# RAC / RANY split
rac_groups = ["ct1", "ct1r3", "ct1r5", "ct2", "ct1ct2"]
rany_groups = ["anyt1", "anyt1r3", "anyt1r5", "anyt2", "any1any2"]

# Optional RA variant comparisons (legacy vs 8.1, and 8.1 vs SynthSR-seeded)
ra_variant_groups = ["all81", "raseed81"]

# ============================================================================
# PATH CONFIGURATION
# ============================================================================
@@ -88,10 +99,14 @@
# Update these as needed for your local setup
project_dir = Path("/data/ABCD_MBDU/ohbm2024/")
drv_dir = project_dir / "data/derivatives/"
abcd_data = Path("/data/ABCD_DSST/ABCD")
stats_dir = drv_dir / "leej3/fs_stats_multi"
tpl_root = project_dir / "templateflow" / "tpl-fsaverage"
age_tsv = "/data/ABCD_DSST/ABCD/imaging_data/fast_track/sessions.tsv"
fastages_tsv = "/data/ABCD_DSST/ABCD/imaging_data/fast_track/code/abcd_fastqc01_history/2024-05-01/abcd_fastqc01_ages_innerjoin_sessions_without_age.tsv"
age_tsv = abcd_data / "imaging_data/fast_track/sessions.tsv"
fastages_tsv = (
abcd_data
/ "imaging_data/fast_track/code/abcd_fastqc01_history/2024-05-01/abcd_fastqc01_ages_innerjoin_sessions_without_age.tsv"
)

path_inputs = [drv_dir / "freesurfer" / p[0] for p in pipelines]
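
As a hedged illustration of how these constants fit together (the loop below is not part of the module, just an example of consuming `diff_specs` and `comparison_labels`):

```python
from dl_morphometrics_helpers import constants as cfg

# Each comparison key maps to a (base_suffix, comp_suffix) pair and a display label
for key, (base_suffix, comp_suffix) in cfg.diff_specs.items():
    label = cfg.comparison_labels.get(key, key)
    print(f"{label}: columns ending in '{base_suffix}' vs '{comp_suffix}'")
```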

243 changes: 241 additions & 2 deletions dl_morphometrics_helpers/data_processing.py
@@ -5,12 +5,16 @@
particularly for handling hemisphere-specific data and region name extraction.
"""

from typing import Optional
from __future__ import annotations

from pathlib import Path

import pandas as pd

from . import constants as cfg


def strip_to_hemi(col: str) -> Optional[str]:
def strip_to_hemi(col: str) -> str | None:
"""
If `col` contains '_lh' or '_rh', return everything
up through that hemisphere tag. Otherwise return None.
@@ -55,3 +59,238 @@ def get_region_names(df: pd.DataFrame) -> set[str]:
stripped for col in df.columns if (stripped := strip_to_hemi(col)) is not None
}
return regions
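
A quick, hypothetical illustration of the hemisphere-stripping behaviour described in the docstring above (the column names are made up):

```python
from dl_morphometrics_helpers.data_processing import strip_to_hemi

# Hypothetical column names with hemisphere tags and pipeline suffixes
cols = ["bankssts_lh_ra", "bankssts_rh_ract1", "age_years"]
print([strip_to_hemi(c) for c in cols])
# expected: ['bankssts_lh', 'bankssts_rh', None]
```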


def parse_bids_ids(
df: pd.DataFrame,
participant_col: str = "participant_id",
session_col: str = "session_id",
) -> pd.DataFrame:
"""
Parse BIDS participant and session IDs into subject/session columns.

Args:
df: DataFrame with BIDS-style participant_id and session_id columns
participant_col: Name of the participant ID column
session_col: Name of the session ID column

Returns:
DataFrame with added 'subject' and 'session' columns
"""
df = df.copy()
df["subject"] = df[participant_col].str.split("-").str[1]
df["session"] = df[session_col].str.split("-").str[1]
return df
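
For example, with made-up BIDS identifiers:

```python
import pandas as pd

from dl_morphometrics_helpers.data_processing import parse_bids_ids

# Hypothetical BIDS-style identifiers
ids = pd.DataFrame(
    {"participant_id": ["sub-NDAR01"], "session_id": ["ses-baselineYear1Arm1"]}
)
print(parse_bids_ids(ids)[["subject", "session"]])
# subject: 'NDAR01', session: 'baselineYear1Arm1'
```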


def load_balanced_scans(bids_dir: Path, project_dir: Path) -> pd.DataFrame:
"""
Load and prepare balanced scans DataFrame with full paths.

Args:
bids_dir: Path to BIDS raw data directory
project_dir: Path to project directory containing balanced_scans.csv

Returns:
DataFrame with balanced scans and full file paths
"""
balanced_scans = pd.read_csv(project_dir / "code/balanced_scans.csv")
balanced_scans["path"] = bids_dir / balanced_scans.filename
return balanced_scans


def prepare_scan_pairs(
balanced_scans: pd.DataFrame, max_per_session: int = 250
) -> pd.DataFrame:
"""
Prepare T1w/T2w scan pairs for processing.

Args:
balanced_scans: DataFrame from load_balanced_scans()
max_per_session: Maximum scans per session

Returns:
DataFrame with paired T1w/T2w scans
"""
t1w_scans = (
balanced_scans.query("modality == 'T1w'")
.loc[:, ["participant_id", "session_id", "path"]]
.rename(columns={"path": "t1_path"})
)

t2w_scans = (
balanced_scans.query("modality == 'T2w'")
.loc[:, ["participant_id", "session_id", "path"]]
.rename(columns={"path": "t2_path"})
)

scans_to_run = t1w_scans.merge(
t2w_scans, how="left", on=["participant_id", "session_id"]
)
scans_to_run = parse_bids_ids(scans_to_run)
scans_to_run = scans_to_run.groupby("session").head(max_per_session)

return scans_to_run
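
A sketch of the intended call chain, assuming `balanced_scans.csv` exists under `<project_dir>/code/` (the BIDS directory below is illustrative):

```python
from pathlib import Path

from dl_morphometrics_helpers import constants as cfg
from dl_morphometrics_helpers.data_processing import load_balanced_scans, prepare_scan_pairs

bids_dir = Path("/data/ABCD_MBDU/ohbm2024/data/bids")  # illustrative path
balanced = load_balanced_scans(bids_dir, cfg.project_dir)
pairs = prepare_scan_pairs(balanced, max_per_session=250)
print(pairs[["subject", "session", "t1_path", "t2_path"]].head())
```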


def load_session_info() -> pd.DataFrame:
"""
Load and process session info with age conversion.

Returns:
DataFrame with processed session information including age_years
"""
ses_info = pd.read_csv(cfg.age_tsv, sep="\t")
ses_info = parse_bids_ids(ses_info)
ses_info["age_years"] = ses_info.age / 12
return ses_info


def load_fastages_info() -> pd.DataFrame:
"""
Load and process fast ages info.

Returns:
DataFrame with processed fast ages information
"""
fastages = pd.read_csv(cfg.fastages_tsv, sep="\t")
fastages = parse_bids_ids(fastages)
fastages["age_years"] = fastages.age / 12
return fastages


def merge_age_data(df: pd.DataFrame) -> pd.DataFrame:
"""
Merge DataFrame with age data from both session sources.

Args:
df: DataFrame with subject/session columns to merge age data into

Returns:
DataFrame with age_years column populated
"""
ses_info = load_session_info()
fastages = load_fastages_info()

# Merge with main session info
with_age = df.merge(
ses_info.loc[:, cfg.metadata_cols], how="left", on=["subject", "session"]
)

# Fill missing ages with fastages data
with_all_ages = with_age.merge(
fastages.loc[:, cfg.metadata_cols],
how="left",
on=["subject", "session"],
suffixes=("", "_fa"),
)
with_all_ages["age_years"] = with_all_ages.age_years.fillna(
with_all_ages.age_years_fa
)

# Clean up temporary columns
with_all_ages = with_all_ages.drop(
columns=[col for col in with_all_ages.columns if col.endswith("_fa")]
)

return with_all_ages


def parse_synthsr_path(path: Path) -> dict[str, str | None]:
"""
Parse a SynthSR file path to extract subject, session, and pipeline information.

Args:
path: Path to SynthSR file

Returns:
Dictionary with 'subject', 'session', 'pipeline_src', and 't1_path' keys
"""
parts = path.parts

subj_index = parts.index("fs_out") - 2
sess_index = parts.index("fs_out") - 1
subject_id = parts[subj_index] # e.g. 'sub-XXXX'
session_id = parts[sess_index] if parts[sess_index].startswith("ses-") else None
pipeline_name = parts[parts.index("freesurfer") + 1] # pipeline directory name

return {
"subject": subject_id,
"session": session_id,
"pipeline_src": pipeline_name,
"t1_path": str(path),
}


def get_synthsr_for_pipeline(
fs_deriv_dir: Path, pipeline_name: str, glob_pattern: str = "*/*/fs_out/*/mri/SynthSR.mgz"
) -> pd.DataFrame:
"""
Find and catalog SynthSR files across FreeSurfer pipeline outputs.

Args:
fs_deriv_dir: Path to FreeSurfer derivatives directory
pipeline_name: Pipeline name to filter for (e.g., 'recon-any_t1')
glob_pattern: Glob pattern, relative to the pipeline directory, used to locate SynthSR.mgz files

Returns:
DataFrame with SynthSR file information
"""
synth_files = []
pipeline_dir = fs_deriv_dir / pipeline_name
assert pipeline_dir.exists()
synth_files.extend(pipeline_dir.glob(glob_pattern))

synth_records = []
for path in synth_files:
record = parse_synthsr_path(path)
if record:
synth_records.append(record)

return pd.DataFrame(synth_records)
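
For instance, assuming the FreeSurfer derivatives follow the `<pipeline>/<sub>/<ses>/fs_out/...` layout implied by the default glob:

```python
from dl_morphometrics_helpers import constants as cfg
from dl_morphometrics_helpers.data_processing import get_synthsr_for_pipeline

# Catalog SynthSR outputs for one pipeline (pipeline name taken from cfg.pipelines)
synth_df = get_synthsr_for_pipeline(cfg.drv_dir / "freesurfer", "recon-all_clinical_t1")
print(synth_df[["subject", "session", "pipeline_src"]].head())
```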


def load_brain_confounds(pipeline: str, suffix: str) -> pd.DataFrame:
"""
Load BrainConfounds.tsv for a specific pipeline with proper column naming.

Args:
pipeline: Pipeline name (e.g., 'recon-all', 'recon-all_clinical_t1')
suffix: Suffix to append to metric columns (e.g., '_ra', '_ract1')

Returns:
DataFrame with renamed columns
"""
fn = cfg.stats_dir / pipeline / "BrainConfounds.tsv"
df = pd.read_csv(fn, sep="\t")
df.columns = ["subject", "session"] + [m + suffix for m in cfg.metric_cols]
return df


def load_and_merge_brain_confounds() -> pd.DataFrame:
"""
Load and merge all brain confounds data according to pipeline configuration.

Returns:
Merged and scaled DataFrame with all pipeline data
"""
dfs = {}

for pipeline, suffix, how in cfg.pipelines:
df = load_brain_confounds(pipeline, suffix)
dfs[suffix] = (df, how)

# Merge them in sequence
merged, _ = dfs["_ra"] # start with recon-all (_ra)
for _, suffix, _how in cfg.pipelines[1:]:
df, merge_how = dfs[suffix]
merged = merged.merge(df, on=["subject", "session"], how=merge_how)

# Fix column names
merged.columns = (
merged.columns.str.strip().str.replace(r"[\s\-]+", "_", regex=True).str.lower()
)

# Scale down by 10^4 (i.e. values expressed in units of 10^4 mm^3)
merged.iloc[:, 2:] /= 10000

return merged
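
Finally, a possible end-to-end chaining of the helpers in this module, assuming the per-pipeline `BrainConfounds.tsv` files exist under `cfg.stats_dir` (this ordering is illustrative, not prescribed by the module):

```python
from dl_morphometrics_helpers.data_processing import load_and_merge_brain_confounds, merge_age_data

# Illustrative chaining: merged pipeline metrics, then ages attached
confounds = load_and_merge_brain_confounds()    # one row per subject/session
confounds_with_age = merge_age_data(confounds)  # fills age_years from both TSV sources
print(confounds_with_age.filter(like="_ra").describe())
```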