diff --git a/02_differential_expression/differential_expression.qmd b/02_differential_expression/differential_expression.qmd index 8363caf..6e5ff2e 100644 --- a/02_differential_expression/differential_expression.qmd +++ b/02_differential_expression/differential_expression.qmd @@ -890,6 +890,84 @@ for (contrast in names(de_list)) { tagList(dt_list) ``` +```{r viz-gsea, results = "asis", fig.width = 8, eval = run_FA} +cat("## Visualizations: GSEA enrichment plots for top pathways") + +cat("Each plot shows the enrichment score for a given significant pathway. The enrichment score is calculated by taking our full list of genes (not just significantly DE), ranking it by fold change (high to low), and then moving through the list in order, increasing the score when the gene is part of the pathway and decreasing the score when it is not. The enrichment score graph will point upwards (positive) for upregulated pathways and downwards (negative) for downregulated pathways. The black bars at the bottom indicate the positions of the pathway genes in the full gene list. (See [the GSEA User Guide](https://www.gsea-msigdb.org/gsea/doc/GSEAUserGuideTEXT.htm#_Enrichment_Score_(ES)) for further explanation.)") + +# # if you want to subset to just the top pathways +# cat("We will look at the top 10 pathways by p-value.") + +# Go through each contrast in the de_list list object and make enrichment plots for GSEA results +for (contrast in names(de_list)) { + # pull ranked list used to run GSEA: sorted numeric vector of LFC values where names are gene symbols + gene_ranks <- de_list[[contrast]][["all"]] %>% filter(!is.na(padj)) %>% + # fix missing gene names + mutate(gene_name_fixed = ifelse(gene_name == "" | is.na(gene_name), gene_id, gene_name)) %>% + # sort by LFC + arrange(desc(lfc)) %>% + # named vector + dplyr::select(gene_name_fixed, lfc) %>% deframe() + + # # plot the ranked fold changes + # barplot(sort(gene_ranks, decreasing = T)) + + # pull pathway information: list of pathways where each element is a character vector of gene symbols + pathways_to_plot <- fa_gsea_list[[contrast]] %>% ungroup() %>% + # # top pathways only + # dplyr::slice_min(order_by = padj, n = 10) %>% + dplyr::select(pathway, genes) %>% + # list of genes per pathway as comma-separated strings + deframe() %>% as.list() %>% + # list of genes per pathway as vectors + lapply(function(gene_names) stringr::str_split_1(gene_names, ",")) + + # make plot for all or top pathways + lapply(names(pathways_to_plot), function(pathway_name) { + # # make sure all the gene names match + # print(all(pathways_to_plot[[pathway_name]] %in% names(gene_ranks))) + + # plot leading edge enrichment graph + plot_enrich <- plotEnrichment(pathway = pathways_to_plot[[pathway_name]], + stats = gene_ranks) + + # wrap long titles + labs(title = str_wrap(str_replace_all(pathway_name, pattern = "_", replacement = " "), + width = 20)) + + # add a line representing the expression of all genes + # going from red on left (upregulated) to blue on right (downregulated) + # first set up a DF with gene, LFC, and rank + gene_ranks_number <- gene_ranks %>% + enframe(name = "gene", value = "lfc") %>% + rowid_to_column(var = "rank") + # # break rank into intervals (higher interval = higher expression = lower rank) + # # reference: enrichplot::gseaplot2 "GSEA plot that mimic the plot generated by broad institute's GSEA software" + # # https://github.com/YuLab-SMU/enrichplot/blob/devel/R/gseaplot.R + # v <- seq(1, sum(gene_ranks_number$rank), length.out = 9) + # break rank into intervals separately for up- and down-regulated genes + v_up <- seq(1, sum(gene_ranks_number[gene_ranks_number$lfc > 0, "rank"]), length.out = 5) + v_dn <- seq(max(v_up), sum(gene_ranks_number[gene_ranks_number$lfc < 0, "rank"]), length.out = 5) + v <- c(v_up, v_dn[-1]) # first value of v_dn is the same as last of v_up, so drop it + gene_ranks_number$interval <- findInterval(rev(cumsum(gene_ranks_number$rank)), v) + if (min(gene_ranks_number$interval) == 0) { gene_ranks_number$interval <- gene_ranks_number$interval + 1 } + # add to plot + plot_enrich <- plot_enrich + + geom_line(data = gene_ranks_number, + aes(x = rank, y = 0, color = interval), + linewidth = 10, alpha = 0.5, + # plot on top of existing graph using new data + inherit.aes = FALSE) + + # change colors: dark red -> white -> dark blue + scale_color_gradientn(colors = c(rev(brewer.pal(5, "Blues")), brewer.pal(5, "Reds"))) + + # remove legend + theme(legend.position = "none") + + # final plot + plot_enrich + }) +} +``` + # Pathway Analysis- Over-representation Over-Representation Analysis (ORA) is a statistical method used to determine whether a predefined set of genes (e.g., genes belonging to a specific biological pathway or function) is over-represented (or enriched) among a list of differentially expressed genes (DEGs) from RNA-seq data. Adventages of ORA: