|
1 | 1 |
|
2 | | -import { Project, Publication, ResearchArea, PortfolioCase } from './types'; |
| 2 | +import { Project, Publication, ResearchArea, PortfolioCase, MLEvidenceSection } from './types'; |
3 | 3 |
|
4 | 4 | export const RESEARCH_AREAS: ResearchArea[] = [ |
5 | 5 | { |
@@ -98,32 +98,113 @@ export const PROJECTS: Project[] = [ |
98 | 98 | export const PORTFOLIO_CASES: PortfolioCase[] = [ |
99 | 99 | { |
100 | 100 | title: "Programmed Splicing Kinetics in the Inflammatory Response", |
101 | | - biologicalQuestion: "Do splicing delays in NF-κB–responsive genes reflect a programmed regulatory mechanism, or are they stochastic noise? Answering this required moving from descriptive catalogs of splicing events to a quantitative kinetic framework operating at intron resolution.", |
| 101 | + problem: "Do splicing delays in NF-κB–responsive genes reflect a programmed regulatory mechanism, or are they stochastic noise? Answering this required moving from descriptive catalogs of splicing events to a quantitative kinetic framework operating at intron resolution.", |
| 102 | + data: "Kinetic RNA-seq time series from LPS-stimulated macrophages across 7 timepoints; ~12,000 annotated introns across the NF-κB–responsive transcriptome.", |
102 | 103 | whatIBuilt: "An end-to-end Python pipeline to quantify intron excision dynamics from kinetic RNA-seq data across the NF-κB–responsive transcriptome.", |
103 | | - howItWorks: [ |
| 104 | + methodsStack: [ |
104 | 105 | "STAR alignment of kinetic RNA-seq time series", |
105 | 106 | "Custom interval engineering for intron-level quantification", |
106 | 107 | "Adapted Completed Splicing Index (CoSI) to quantify splicing completion per intron and timepoint", |
107 | 108 | "Python-based aggregation, normalization, and visualization across replicates and conditions", |
108 | 109 | "Follow-up sequence modeling to prioritize candidate regulatory features" |
109 | 110 | ], |
110 | | - engineeringChallenge: "Extracting accurate intron-level kinetics requires precise interval handling, isoform-aware quantification, and robust normalization across temporal datasets at a resolution standard RNA-seq workflows are not designed to support out of the box.", |
111 | | - outcome: "Identified a class of 'bottleneck introns' that delay inflammatory gene expression. Minigene assays experimentally validated that weak 5' splice donors drive this delay in a subset of targets, while downstream model interpretation highlighted additional putative non-canonical regulatory motifs. First-author manuscript currently in review at eLife." |
| 111 | + result: "Identified a class of 'bottleneck introns' that delay inflammatory gene expression. Minigene assays experimentally validated that weak 5' splice donors drive this delay. First-author manuscript in review at eLife.", |
| 112 | + whyItMatters: "RNA processing rate is an underexplored axis of gene regulation; these delays shape the timing of the inflammatory response and may represent a target for therapeutic modulation.", |
| 113 | + signalsForML: "Custom metric design (CoSI), sequence feature integration, quantitative kinetic modeling, foundation for downstream deep learning target generation.", |
| 114 | + signalsForBio: "Kinetic splicing quantification at intron resolution, NF-κB transcriptome biology, integration of computational findings with minigene experimental validation.", |
| 115 | + figurePlaceholderLabel: "Figure: CoSI kinetic profiles — bottleneck vs. non-bottleneck introns" |
112 | 116 | }, |
113 | 117 | { |
114 | 118 | title: "Decoding Immune Cell Type–Specific Splicing with Foundation Models", |
115 | | - biologicalQuestion: "How do B cells, T cells, and macrophages deploy distinct splicing programs from the same genome? Answering this required building cell-type–resolved training labels at scale, adapting a genomic foundation model for splicing prediction, and interpreting the learned sequence features underlying lineage-specific regulation.", |
| 119 | + problem: "How do B cells, T cells, and macrophages deploy distinct splicing programs from the same genome, and can a sequence-to-function model learn those differences? Answering this required building cell-type–resolved training labels at scale and adapting a genomic foundation model for splicing prediction.", |
| 120 | + data: "Bulk RNA-seq from 3 immune lineages; ~50,000 PSI and intron retention labels generated at genome scale; Borzoi model weights (pre-trained on 500bp–196kb sequence contexts).", |
116 | 121 | whatIBuilt: "A unified data engineering, modeling, and interpretation framework for immune cell type–specific splicing prediction.", |
117 | | - howItWorks: [ |
| 122 | + methodsStack: [ |
118 | 123 | "Cell-type–specific GTF construction with StringTie using dominant isoform selection by TPM", |
119 | 124 | "PSI extraction from rMATS and integration of exon inclusion / intron retention labels into genome-scale training targets", |
120 | 125 | "Automated labeling of exons, introns, and intergenic regions with reproducible YAML-driven configuration", |
121 | 126 | "Fine-tuning of Borzoi for splicing prediction using single-task and multitask training strategies", |
122 | | - "Attribution-based interpretation with DeepSHAP / TF-MoDISco to identify cis-regulatory motifs associated with lineage-specific splicing", |
| 127 | + "Attribution-based interpretation with DeepSHAP / TF-MoDISco to identify cis-regulatory motifs", |
123 | 128 | "In silico perturbation analyses to test the functional importance of discovered sequence elements" |
124 | 129 | ], |
125 | | - engineeringChallenge: "This project required solving two tightly linked problems at once: generating high-confidence labels across multiple immune cell types without coordinate drift or label leakage, and adapting a large genomic foundation model to a splicing-specific prediction task without losing interpretability.", |
126 | | - outcome: "Produced a scalable training dataset and modeling framework for immune cell type–specific splicing, improved predictive performance over simpler baselines, and recovered interpretable candidate regulatory motifs associated with exon inclusion and intron retention across lineages. Manuscript in preparation." |
| 130 | + result: "Produced a scalable training dataset and modeling framework for immune cell type–specific splicing. Improved predictive performance over simpler baselines and recovered interpretable candidate regulatory motifs. Manuscript in preparation.", |
| 131 | + whyItMatters: "Sequence-to-function models that predict cell-type–specific splicing could accelerate discovery of therapeutic targets in immune dysregulation and splicing-linked disease.", |
| 132 | + signalsForML: "Foundation model fine-tuning (LoRA/PEFT), multi-task learning across cell types, attribution-based interpretability, HPC multi-GPU training, reproducible ML pipelines.", |
| 133 | + signalsForBio: "Cell-type–specific transcript annotation, PSI quantification across immune lineages, biologically grounded motif discovery, integration with RBP expression data.", |
| 134 | + figurePlaceholderLabel: "Figure: TF-MoDISco motif clusters — lineage-specific splicing regulators" |
| 135 | + }, |
| 136 | + { |
| 137 | + title: "Adipose Tissue Transcriptomics in Lung Cancer Cachexia", |
| 138 | + problem: "Does Kras\u1d33\u00b9\u00b2\u1d30/+ lung tumor induction cause transcriptional reprogramming in perigonadal white adipose tissue, and what pathways drive early adipose wasting?", |
| 139 | + data: "Bulk RNA-seq from gWAT of Kras\u1d33\u00b9\u00b2\u1d30/+ and WT littermate mice at 3 and 6 weeks post-induction. Snoke DB, van der Velden JL, Dearborn J, et al. Cell Reports 2025.", |
| 140 | + whatIBuilt: "DESeq2 differential expression pipeline, fgsea hallmark pathway analysis, and an interactive R/Shiny volcano and GSEA explorer published as Supplemental Figure S9.", |
| 141 | + methodsStack: [ |
| 142 | + "DESeq2 differential expression analysis", |
| 143 | + "fgsea hallmark pathway enrichment (MSigDB)", |
| 144 | + "R/Shiny interactive app with adjustable padj and |LFC| thresholds", |
| 145 | + "Volcano plots with hover tooltips and gene name annotation", |
| 146 | + "Reactive DEG count summary table", |
| 147 | + "Shinylive/WebAssembly browser deployment" |
| 148 | + ], |
| 149 | + result: "Identified early lipid metabolism and inflammatory pathway changes in gWAT preceding overt cachexia. Published as Supplemental Figure S9, Cell Reports (2025). DOI: 10.1016/j.celrep.2025.116278.", |
| 150 | + whyItMatters: "Cachexia affects ~50% of cancer patients and is an independent predictor of mortality; characterizing adipose transcriptional programs may reveal intervention points.", |
| 151 | + signalsForML: "Pathway-level summarization of high-dimensional transcriptomic data, statistical pipeline design, DESeq2 normalization and modeling.", |
| 152 | + signalsForBio: "Bulk RNA-seq end-to-end, differential expression, GSEA, adipose and cancer biology, published collaborative analysis.", |
| 153 | + figurePlaceholderLabel: "Figure: Volcano plot — gWAT DEGs at 3 and 6 weeks" |
| 154 | + } |
| 155 | +]; |
| 156 | + |
| 157 | +export const ML_EVIDENCE_SECTIONS: MLEvidenceSection[] = [ |
| 158 | + { |
| 159 | + title: "Model Architecture & Foundation Model Fine-Tuning", |
| 160 | + accentColor: 'blue', |
| 161 | + items: [ |
| 162 | + "Transformer and CNN-hybrid architectures for long-range sequence context", |
| 163 | + "Task-specific fine-tuning of foundation models (Borzoi) via LoRA/PEFT", |
| 164 | + "Single-task and multitask prediction across B cell, T cell, and macrophage contexts", |
| 165 | + "Evaluation through attribution, perturbation, and embedding-space analyses" |
| 166 | + ], |
| 167 | + figure: { |
| 168 | + label: "Fine-tuning loss curves — Borzoi on splicing targets", |
| 169 | + caption: "Training and validation loss for PSI and intron retention prediction tasks across immune lineages." |
| 170 | + } |
| 171 | + }, |
| 172 | + { |
| 173 | + title: "Training Infrastructure & HPC Workflows", |
| 174 | + accentColor: 'green', |
| 175 | + items: [ |
| 176 | + "Config-driven experiments — YAML/JSON parameterization with full run logging", |
| 177 | + "PyTorch Lightning — clean training loops and multi-GPU support", |
| 178 | + "Mixed-precision training — fp16/bf16 for memory efficiency on large models", |
| 179 | + "Structured checkpointing — resume-from-checkpoint workflows for long runs", |
| 180 | + "SLURM orchestration — job submission, GPU allocation, restartable training", |
| 181 | + "WandB + Optuna — experiment tracking and hyperparameter search" |
| 182 | + ] |
| 183 | + }, |
| 184 | + { |
| 185 | + title: "Attribution & Interpretability", |
| 186 | + accentColor: 'purple', |
| 187 | + items: [ |
| 188 | + "DeepLIFT / DeepSHAP via Captum for per-nucleotide attribution", |
| 189 | + "TF-MoDISco motif discovery from attribution maps", |
| 190 | + "SEA / FIMO (MEME Suite) for motif scanning and validation", |
| 191 | + "In silico mutagenesis — motif scrambling/deletion with batch prediction on perturbed sequences", |
| 192 | + "Embedding-space visualization with UMAP for exploratory analysis" |
| 193 | + ], |
| 194 | + figure: { |
| 195 | + label: "Attribution heatmap — per-nucleotide DeepSHAP scores", |
| 196 | + caption: "Nucleotide-resolution attribution scores highlighting splice site and intronic regulatory elements." |
| 197 | + } |
| 198 | + }, |
| 199 | + { |
| 200 | + title: "Reproducibility Practices", |
| 201 | + accentColor: 'orange', |
| 202 | + items: [ |
| 203 | + "Workflows parameterized via config files rather than hardcoded paths", |
| 204 | + "Conda environments pinned and versioned per project", |
| 205 | + "Outputs versioned alongside input metadata for auditability", |
| 206 | + "Pipeline logic separated from execution logic for portability across HPC environments" |
| 207 | + ] |
127 | 208 | } |
128 | 209 | ]; |
129 | 210 |
|
@@ -174,3 +255,6 @@ export const PRESENTATIONS: string[] = [ |
174 | 255 | "Transformer Models in Genomics (Guest Lecture)", |
175 | 256 | "Selective RNA Depletion to Enhance Single-Cell Transcriptomics" |
176 | 257 | ]; |
| 258 | + |
| 259 | +export const COMP_BIO_PROJECTS = PROJECTS.filter((_, i) => i <= 1); |
| 260 | +export const ML_PROJECTS = PROJECTS.filter((_, i) => i >= 2); |
0 commit comments