broadinstitute
diff --git a/‎.gitignore‎
Lines changed: 10 additions & 0 deletions b/‎.gitignore‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎R/bican.mccarroll.eqtl/DESCRIPTION‎
Lines changed: 8 additions & 1 deletion b/‎R/bican.mccarroll.eqtl/DESCRIPTION‎
Lines changed: 8 additions & 1 deletion
diff --git a/‎R/bican.mccarroll.eqtl/NAMESPACE‎
Lines changed: 59 additions & 0 deletions b/‎R/bican.mccarroll.eqtl/NAMESPACE‎
Lines changed: 59 additions & 0 deletions
diff --git a/‎R/bican.mccarroll.eqtl/R/cell_type_pairwise_cor_matrix.R‎
Lines changed: 106 additions & 0 deletions b/‎R/bican.mccarroll.eqtl/R/cell_type_pairwise_cor_matrix.R‎
Lines changed: 106 additions & 0 deletions
diff --git a/‎R/bican.mccarroll.eqtl/R/combine_expression_across_cell_types.R‎
Lines changed: 69 additions & 0 deletions b/‎R/bican.mccarroll.eqtl/R/combine_expression_across_cell_types.R‎
Lines changed: 69 additions & 0 deletions
diff --git a/‎R/bican.mccarroll.eqtl/R/egene_union_pairs.R‎
Lines changed: 74 additions & 0 deletions b/‎R/bican.mccarroll.eqtl/R/egene_union_pairs.R‎
Lines changed: 74 additions & 0 deletions
@@ -6,3 +6,13 @@
 **/venv
 **/*.Rcheck
 R/*.tar.gz
+
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.env
+.venv
+env/
+venv/
@@ -2,7 +2,14 @@ Package: bican.mccarroll.eqtl
 Type: Package
 Title: Eqtl analysis for the bican manuscript
 Version: 0.1.0
-Imports: data.table, ggplot2, ggvenn, cowplot,digest,logger, rlang
+Imports: data.table, ggplot2, ggvenn, cowplot, digest, logger, rlang,
+    ComplexHeatmap, circlize, grid, grDevices, stats, utils,
+    VariantAnnotation, GenomicRanges, IRanges, ggbeeswarm
+Remotes:
+    bioc::ComplexHeatmap,
+    bioc::VariantAnnotation,
+    bioc::GenomicRanges,
+    bioc::IRanges
 Authors@R: c(
     person(
       "James", "Nemesh",
 
@@ -1,6 +1,7 @@
 # Generated by roxygen2: do not edit by hand
 
 export(build_pair_effect_table)
+export(combine_expression_across_cell_types)
 export(compare_all_eQTL_runs)
 export(compare_eqtl_runs)
 export(compare_eqtl_runs_ctr)
@@ -9,39 +10,97 @@ export(eqtl_cluster_filtering_trajectories)
 export(filter_covariates_to_subpopulation)
 export(filter_metacells_to_subpopulation)
 export(filter_significant_index)
+export(get_cell_type_pairwise_cor_matrix)
+export(get_egene_union_pairs)
+export(get_heatmap_index_snp_median_expression)
+export(get_index_snp_slope_matrix_with_impute)
+export(get_index_snp_start_distance)
 export(get_or_build_augmented_indices_for_sign)
+export(get_pval_nominal_matrix)
+export(get_pval_nominal_threshold_matrix)
+export(get_sig_coloc)
+export(get_slope_matrix)
+export(plot_cell_type_pairwise_cor)
 export(plot_effect_sizes)
+export(plot_eqtl_distance_to_tss_boxplot)
+export(plot_fisher_exact)
+export(plot_gene_snp)
 export(plot_pair_effects)
 export(plot_pair_pvals)
 export(read_all_pairs_file)
 export(read_index_file)
+export(run_eqtl_manuscript_pipeline)
+export(run_eqtl_manuscript_pipeline_defaults)
 export(select_reference_pairs)
+importFrom(ComplexHeatmap,Heatmap)
+importFrom(ComplexHeatmap,draw)
+importFrom(GenomicRanges,GRanges)
+importFrom(IRanges,IRanges)
+importFrom(VariantAnnotation,geno)
+importFrom(VariantAnnotation,readVcf)
+importFrom(circlize,colorRamp2)
 importFrom(cowplot,draw_label)
 importFrom(cowplot,ggdraw)
 importFrom(cowplot,plot_grid)
 importFrom(data.table,":=")
 importFrom(data.table,as.data.table)
+importFrom(data.table,data.table)
+importFrom(data.table,fifelse)
+importFrom(data.table,frank)
 importFrom(data.table,fread)
+importFrom(data.table,fwrite)
 importFrom(data.table,is.data.table)
+importFrom(data.table,melt)
+importFrom(data.table,rbindlist)
 importFrom(data.table,setDF)
+importFrom(data.table,setnames)
+importFrom(data.table,tstrsplit)
+importFrom(data.table,uniqueN)
 importFrom(digest,digest)
+importFrom(ggbeeswarm,geom_beeswarm)
+importFrom(ggbeeswarm,geom_quasirandom)
 importFrom(ggplot2,aes)
 importFrom(ggplot2,annotate)
+importFrom(ggplot2,coord_cartesian)
+importFrom(ggplot2,element_blank)
 importFrom(ggplot2,element_text)
+importFrom(ggplot2,expansion)
+importFrom(ggplot2,facet_wrap)
 importFrom(ggplot2,geom_abline)
+importFrom(ggplot2,geom_boxplot)
+importFrom(ggplot2,geom_errorbarh)
 importFrom(ggplot2,geom_histogram)
 importFrom(ggplot2,geom_point)
 importFrom(ggplot2,geom_segment)
+importFrom(ggplot2,geom_text)
+importFrom(ggplot2,geom_violin)
+importFrom(ggplot2,geom_vline)
 importFrom(ggplot2,ggplot)
+importFrom(ggplot2,ggsave)
 importFrom(ggplot2,ggtitle)
 importFrom(ggplot2,labs)
+importFrom(ggplot2,margin)
 importFrom(ggplot2,scale_color_manual)
+importFrom(ggplot2,scale_x_continuous)
+importFrom(ggplot2,scale_y_continuous)
 importFrom(ggplot2,theme)
 importFrom(ggplot2,theme_bw)
+importFrom(ggplot2,theme_classic)
+importFrom(ggplot2,unit)
 importFrom(ggvenn,ggvenn)
 importFrom(grDevices,dev.off)
 importFrom(grDevices,pdf)
+importFrom(grDevices,svg)
+importFrom(grid,gpar)
+importFrom(grid,grid.text)
+importFrom(grid,unit)
 importFrom(logger,log_info)
+importFrom(logger,log_warn)
 importFrom(rlang,.data)
+importFrom(stats,fisher.test)
+importFrom(stats,lm)
+importFrom(stats,median)
+importFrom(stats,p.adjust)
 importFrom(stats,qchisq)
+importFrom(stats,sd)
 importFrom(utils,write.table)
@@ -0,0 +1,106 @@
+# slope_matrix_path="/broad/bican_um1_mccarroll/RNAseq/analysis/CAP_freeze_3_analysis/eqtls/script_output/LEVEL_3/slope_matrix_qval_0.01.tsv"
+# pval_nominal_matrix_path="/broad/bican_um1_mccarroll/RNAseq/analysis/CAP_freeze_3_analysis/eqtls/script_output/LEVEL_3/pval_nominal_matrix_qval_0.01.tsv"
+# pval_nominal_threshold_matrix_path="/broad/bican_um1_mccarroll/RNAseq/analysis/CAP_freeze_3_analysis/eqtls/script_output/LEVEL_3/pval_nominal_threshold_matrix_qval_0.01.tsv"
+# egene_union_pairs_path="/broad/bican_um1_mccarroll/RNAseq/analysis/CAP_freeze_3_analysis/eqtls/script_output/LEVEL_3/egene_union_pairs_qval_0.01.tsv"
+# region_cell_type_path="/broad/bican_um1_mccarroll/RNAseq/analysis/CAP_freeze_3_analysis/eqtls/results/region_cell_type.tsv"
+# output_path="/broad/bican_um1_mccarroll/RNAseq/analysis/CAP_freeze_3_analysis/eqtls/script_output/LEVEL_3/cell_type_pairwise_r_squared.tsv"
+# bican.mccarroll.eqtl::get_cell_type_pairwise_cor_matrix(slope_matrix_path, pval_nominal_matrix_path, pval_nominal_threshold_matrix_path, egene_union_pairs_path, region_cell_type_path, output_path)
+
+
+#' Compute pairwise squared Spearman correlations of eQTL effect sizes across cell types
+#'
+#' For each pair of cell type / region groups, identifies eGene-variant pairs
+#' that are nominally significant in at least one of the two groups (using
+#' \code{pval_nominal < pval_nominal_threshold}), then computes the Spearman
+#' correlation of slopes for those pairs.  The correlations are squared before
+#' being returned, so the sign of each correlation is not retained.
+#'
+#' Each of the three input matrices is first restricted to the eGene union
+#' pairs.  After that step they must contain exactly the same
+#' (\code{phenotype_id}, \code{variant_id}) rows in the same order; an error
+#' is raised otherwise, because slopes, p-values and thresholds are compared
+#' row by row.
+#'
+#' Only columns named \code{<cell_type>__<region>} that are present in all
+#' three matrices are used.  A pair of groups with no nominally significant
+#' eGene-variant pair gets a value of 0.  If \code{stats::cor} cannot compute
+#' a correlation for the selected rows it returns \code{NA}, which is kept.
+#'
+#' @param slope_matrix_path Character scalar.  Path to the slope matrix TSV
+#'   (output of \code{\link{get_slope_matrix}}).
+#' @param pval_nominal_matrix_path Character scalar.  Path to the pval_nominal matrix TSV
+#'   (output of \code{\link{get_pval_nominal_matrix}}).
+#' @param pval_nominal_threshold_matrix_path Character scalar.  Path to the pval_nominal_threshold
+#'   matrix TSV (output of \code{\link{get_pval_nominal_threshold_matrix}}).
+#' @param egene_union_pairs_path Character scalar.  Path to the eGene union pairs TSV
+#'   (output of \code{\link{get_egene_union_pairs}}).
+#' @param region_cell_type_path Character scalar.  Path to a tab-delimited file
+#'   with columns \code{cell_type} and \code{region}.
+#' @param output_path Character scalar or \code{NULL}.  If non-NULL, the
+#'   R-squared matrix is written to this path as a tab-delimited file whose
+#'   first column, \code{cell_type}, holds the row names.
+#'
+#' @return A symmetric numeric matrix of pairwise squared Spearman
+#'   correlations (cell types x cell types), with row and column names of the
+#'   form \code{<cell_type>__<region>}.
+#'
+#' @export
+#' @importFrom data.table fread fwrite
+#' @importFrom logger log_info
+get_cell_type_pairwise_cor_matrix <- function(slope_matrix_path,
+                                              pval_nominal_matrix_path,
+                                              pval_nominal_threshold_matrix_path,
+                                              egene_union_pairs_path,
+                                              region_cell_type_path,
+                                              output_path = NULL) {
+
+    key_cols <- c("phenotype_id", "variant_id")
+
+    slope_dt <- data.table::fread(slope_matrix_path)
+    pval_dt <- data.table::fread(pval_nominal_matrix_path)
+    pval_threshold_dt <- data.table::fread(pval_nominal_threshold_matrix_path)
+    egene_dt <- data.table::fread(egene_union_pairs_path, select = key_cols)
+
+    # Filter to eGene union pairs (merge also sorts each table by the keys)
+    slope_dt <- merge(slope_dt, egene_dt, by = key_cols)
+    pval_dt <- merge(pval_dt, egene_dt, by = key_cols)
+    pval_threshold_dt <- merge(pval_threshold_dt, egene_dt, by = key_cols)
+
+    # Everything below compares row k of one table with row k of the others,
+    # so the three tables must describe the same pairs in the same order.
+    # Fail loudly instead of silently correlating misaligned rows.
+    has_same_rows <- function(other_dt) {
+        nrow(other_dt) == nrow(slope_dt) &&
+            all(vapply(
+                key_cols,
+                function(k) identical(slope_dt[[k]], other_dt[[k]]),
+                logical(1)
+            ))
+    }
+    if (!has_same_rows(pval_dt) || !has_same_rows(pval_threshold_dt)) {
+        stop(
+            "slope, pval_nominal and pval_nominal_threshold matrices do not ",
+            "contain the same (phenotype_id, variant_id) rows after ",
+            "filtering to the eGene union pairs",
+            call. = FALSE
+        )
+    }
+
+    # Filter to cell type/region columns present in all three matrices
+    region_cell_type_dt <- data.table::fread(region_cell_type_path)
+    ct_cols <- paste0(region_cell_type_dt$cell_type, "__", region_cell_type_dt$region)
+    ct_cols <- intersect(ct_cols, names(slope_dt))
+    ct_cols <- intersect(ct_cols, names(pval_dt))
+    ct_cols <- intersect(ct_cols, names(pval_threshold_dt))
+
+    slope_m <- as.matrix(slope_dt[, ct_cols, with = FALSE])
+    pval_m <- as.matrix(pval_dt[, ct_cols, with = FALSE])
+    pval_thresh_m <- as.matrix(pval_threshold_dt[, ct_cols, with = FALSE])
+
+    n <- length(ct_cols)
+    logger::log_info("Computing pairwise Spearman correlations for {n} cell type/regions")
+
+    # Per-group significance mask, computed once instead of once per pair of
+    # groups.  The is.na() guards make missing p-values or thresholds count
+    # as "not significant", so the mask never contains NA.
+    sig_m <- !is.na(pval_thresh_m) & !is.na(pval_m) & (pval_m < pval_thresh_m)
+
+    cor_matrix <- matrix(NA_real_, nrow = n, ncol = n,
+                         dimnames = list(ct_cols, ct_cols))
+
+    # The statistic is symmetric in (i, j): compute each unordered pair once
+    # (including the diagonal) and mirror it.
+    for (i in seq_len(n)) {
+        for (j in seq_len(i)) {
+            # Pairs significant in at least one of the two groups
+            sig_idx <- which(sig_m[, i] | sig_m[, j])
+
+            if (length(sig_idx) == 0) {
+                pair_cor <- 0
+            } else {
+                pair_cor <- stats::cor(
+                    slope_m[sig_idx, i], slope_m[sig_idx, j],
+                    use = "pairwise.complete.obs", method = "spearman"
+                )
+            }
+
+            cor_matrix[i, j] <- pair_cor
+            cor_matrix[j, i] <- pair_cor
+        }
+    }
+
+    r_squared <- cor_matrix^2
+
+    logger::log_info("Correlation matrix complete")
+
+    if (!is.null(output_path)) {
+        out_dt <- data.table::as.data.table(r_squared, keep.rownames = "cell_type")
+        data.table::fwrite(out_dt, output_path, sep = "\t")
+        logger::log_info("Written to: {output_path}")
+    }
+
+    return(r_squared)
+}
@@ -0,0 +1,69 @@
+# eqtl_dir <- "/broad/bican_um1_mccarroll/RNAseq/analysis/CAP_freeze_3_analysis/eqtls/results/LEVEL_3"
+# region_cell_type_path <- "/broad/bican_um1_mccarroll/RNAseq/analysis/CAP_freeze_3_analysis/eqtls/manuscript_data/region_cell_type.tsv"
+# output_path <- "/broad/bican_um1_mccarroll/RNAseq/analysis/CAP_freeze_3_analysis/eqtls/manuscript_data/combined_tpm_expression_across_cell_types.tsv"
+# bican.mccarroll.eqtl::combine_expression_across_cell_types(eqtl_dir, region_cell_type_path, output_path)
+
+
+#' Combine gene expression TPM across cell types into a single matrix
+#'
+#' For each cell type / region group, reads the per-sample gene expression
+#' TPM BED file, drops its first three columns (chr, start, end), appends the
+#' group name to every sample column header, and full-outer-joins all groups
+#' into one wide matrix keyed by gene ID (\code{pid}).  Genes missing from a
+#' group get \code{NA} in that group's sample columns.
+#'
+#' Works for any number of groups >= 1; with a single group the reformatted
+#' table for that group is returned as is.
+#'
+#' @param eqtl_dir Character scalar.  Path to the eQTL results directory
+#'   (e.g. \code{.../LEVEL_3}).  Each cell type subdirectory should contain
+#'   a file named \code{<ct>__<reg>.gene_expression_tpm.bed.gz} whose fourth
+#'   column is named \code{pid}.
+#' @param region_cell_type_path Character scalar.  Path to a tab-delimited
+#'   file with columns \code{cell_type} and \code{region}.
+#' @param output_path Character scalar or \code{NULL}.  If non-NULL, the
+#'   combined matrix is written to this path as a tab-delimited file.
+#'
+#' @return A \code{data.table} with column \code{pid} (gene ID) and one
+#'   column per sample per cell type, named \code{<sample>_<ct>__<reg>}.
+#'
+#' @export
+#' @importFrom data.table fread fwrite
+#' @importFrom logger log_info
+combine_expression_across_cell_types <- function(eqtl_dir,
+                                                  region_cell_type_path,
+                                                  output_path = NULL) {
+
+    region_cell_type_dt <- data.table::fread(region_cell_type_path)
+    ct_keys <- paste0(region_cell_type_dt$cell_type, "__", region_cell_type_dt$region)
+    logger::log_info("Combining expression for {length(ct_keys)} cell type/region groups")
+
+    if (length(ct_keys) == 0) {
+        stop("No cell type/region groups found in: ", region_cell_type_path,
+             call. = FALSE)
+    }
+
+    # Read one group's BED file and return a table of pid + renamed samples.
+    read_and_format <- function(ct_key) {
+        bed_path <- file.path(eqtl_dir, ct_key,
+                              paste0(ct_key, ".gene_expression_tpm.bed.gz"))
+        logger::log_info("Reading: {ct_key}")
+        dt <- data.table::fread(bed_path)
+
+        # Remove chr, start, end columns (first 3)
+        dt[, c(1, 2, 3) := NULL]
+
+        # The join below is keyed on "pid" and the renaming skips the first
+        # remaining column, so that column has to be the gene ID.
+        if (!identical(names(dt)[1], "pid")) {
+            stop("Expected column 4 of ", bed_path, " to be named 'pid'",
+                 call. = FALSE)
+        }
+
+        # Rename sample columns: append _<ct_key> (skip first col which is pid)
+        old_names <- names(dt)[-1]
+        new_names <- paste0(old_names, "_", ct_key)
+        data.table::setnames(dt, old_names, new_names)
+
+        return(dt)
+    }
+
+    expression_list <- lapply(ct_keys, read_and_format)
+
+    # Merge all cell types by pid (full outer join).  Reduce() returns the
+    # single table unchanged when there is only one group, whereas
+    # seq(2, length(x)) would count down to c(2, 1) and index out of bounds.
+    combined_dt <- Reduce(
+        function(merged_dt, next_dt) {
+            merge(merged_dt, next_dt, by = "pid", all = TRUE)
+        },
+        expression_list
+    )
+
+    logger::log_info("Combined: {nrow(combined_dt)} genes x {ncol(combined_dt) - 1} sample columns")
+
+    if (!is.null(output_path)) {
+        data.table::fwrite(combined_dt, output_path, sep = "\t")
+        logger::log_info("Written to: {output_path}")
+    }
+
+    return(combined_dt)
+}
@@ -0,0 +1,74 @@
+# eqtl_dir="/broad/bican_um1_mccarroll/RNAseq/analysis/CAP_freeze_3_analysis/eqtls/results/LEVEL_3"
+# region_cell_type_path="/broad/bican_um1_mccarroll/RNAseq/analysis/CAP_freeze_3_analysis/eqtls/results/region_cell_type.tsv"
+# qval_threshold=0.05
+# output_path="/broad/bican_um1_mccarroll/RNAseq/analysis/CAP_freeze_3_analysis/eqtls/script_output/LEVEL_3/egene_union_pairs_qval_0.05.tsv"
+# bican.mccarroll.eqtl::get_egene_union_pairs(eqtl_dir, region_cell_type_path, qval_threshold, output_path)
+
+
+#' Collect the union of significant eGene-variant pairs over all cell type / region groups
+#'
+#' Walks the rows of the region-cell-type table.  For every
+#' (cell type, region) row the tensorQTL cis-eQTL index file is read, rows
+#' with \code{qval < qval_threshold} are kept, and the per-group tables are
+#' stacked.  The stacked table is then reduced to one row per
+#' (phenotype_id, variant_id) pair, retaining the smallest q-value seen for
+#' that pair.
+#'
+#' Index files are looked up at
+#' \code{<eqtl_dir>/<cell_type>__<region>/<cell_type>__<region>.cis_qtl.txt.gz}.
+#'
+#' @param eqtl_dir Character scalar.  Base directory containing per-cell-type
+#'   eQTL result subdirectories.
+#' @param region_cell_type_path Character scalar.  Path to a tab-delimited file
+#'   with columns \code{cell_type} and \code{region}.
+#' @param qval_threshold Numeric scalar.  q-value threshold for calling an
+#'   eGene significant.  Default \code{0.05}.
+#' @param output_path Character scalar or \code{NULL}.  If non-NULL, the result
+#'   table is written to this path as a tab-delimited file.
+#'
+#' @return A \code{data.table} with columns \code{phenotype_id}, \code{variant_id},
+#'   and \code{qval}, containing one row per unique eGene-variant pair.
+#'
+#' @export
+#' @importFrom data.table fread fwrite rbindlist
+#' @importFrom logger log_info
+get_egene_union_pairs <- function(eqtl_dir,
+                                  region_cell_type_path,
+                                  qval_threshold = 0.05,
+                                  output_path = NULL) {
+
+    pair_cols <- c("phenotype_id", "variant_id")
+    region_cell_type_dt <- data.table::fread(region_cell_type_path)
+
+    # One filtered index table per row of the region-cell-type table
+    per_group <- lapply(seq_len(nrow(region_cell_type_dt)), function(row_idx) {
+        cell_type <- region_cell_type_dt$cell_type[row_idx]
+        region    <- region_cell_type_dt$region[row_idx]
+        group_key <- paste0(cell_type, "__", region)
+        eqtl_file <- file.path(eqtl_dir, group_key,
+                               paste0(group_key, ".cis_qtl.txt.gz"))
+
+        logger::log_info("Reading eQTL index: {eqtl_file}")
+
+        dt <- data.table::fread(eqtl_file, select = c("phenotype_id", "variant_id", "qval"))
+        is_egene <- dt$qval < qval_threshold
+        dt <- dt[is_egene, ]
+
+        logger::log_info("  {cell_type} / {region}: {nrow(dt)} eGenes at qval < {qval_threshold}")
+        dt
+    })
+
+    stacked <- data.table::rbindlist(per_group)
+
+    # Sort so the lowest qval of each pair comes first, then keep that first
+    # row per (phenotype_id, variant_id).
+    data.table::setorder(stacked, phenotype_id, variant_id, qval)
+    first_per_pair <- unique(stacked, by = pair_cols)
+    result <- first_per_pair[, c(pair_cols, "qval"), with = FALSE]
+
+    logger::log_info("Total unique eGene-variant pairs: {nrow(result)}")
+
+    if (!is.null(output_path)) {
+        data.table::fwrite(result, output_path, sep = "\t")
+        logger::log_info("Written to: {output_path}")
+    }
+
+    return(result)
+}