rnaseq-reports/02_differential_expression/differential_expression.qmd at acb00b837ca6b60773105021f64fb458fbd5b932 · bcbio/rnaseq-reports · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
---
title: "Differential Expression"
author: "Harvard Chan Bioinformatics Core"
date: "`r Sys.Date()`"
format:
  html:
    code-fold: true
    code-tools: true
    code-overflow: wrap
    df-print: paged
    highlight-style: pygments
    number-sections: true
    self-contained: true
    theme: default
    toc: true
    toc-location: left
    toc-expand: false
    lightbox: true
    page-layout: full
params:
  ## Combatseq and ruv can both be false or ONLY ONE can be true
  # column is the factor of interest in your data that you are trying to evaluate
  # For example, this might be "condition" or "treatment"
  column: "sample_type"
  # This is a list of contrasts that you would like to evaluate
  # The syntax for each item in the list is: c("column_name", "treatment_name", "control_name")
  contrasts:
    value:
      - ["sample_type", "tumor", "normal"]
      - ["sample_type", "normal", "tumor"]
  # These next two subset lines are useful if you would like to subset your data in someway before you run the analysis
  # The column of data in you would like to subset, perhaps: "cell_type" or "batch"
  subset_column: null
  # Then, which values for that subset_column would you like to keep, like "macrophages" or "batch_2"
  subset_value: null
  # Reference genome used
  genome: hg38
  # If you would like to run RUV, set to TRUE, otherwise FALSE
  ruv: false
  # If you would like to run ComBatseq, set to TRUE, otherwise FALSE
  combatseq: false
  params_file: ../00_params/params-example.R # example data
  # Path to the information.R file
  project_file: ../information.R
  # Path to the directory with the functions
  functions_file: ../00_libs
---

Template developed with materials from https://hbctraining.github.io/main/.

```{r libraries}
# NOTE: This code will check version, this is our recommendation, it may work on other versions
# Requires R version 4
stopifnot(R.version$major >= 4)
# If the minor version number is less than .3.1, send a warning that we recommend R version 4.3.1 or higher
if (compareVersion(R.version$minor, "3.1") < 0) warning("We recommend >= R4.3.1")
# Requires BiocManager version 3.18 or later
stopifnot(compareVersion(as.character(BiocManager::version()), "3.18") >= 0)
```

This code is in this ![](https://img.shields.io/badge/status-stable-green) revision.

```{r load-params, cache = FALSE, message = FALSE, warning=FALSE}
library(tidyverse)
library(purrr)
# 1. Set up input files in this R file (params_de.R)
source(params$params_file)
# 2. Set up project file (already done from QC probably)
source(params$project_file)
# 3. Load custom functions to load data from coldata/metrics/counts
purrr::map(list.files(params$functions_file, pattern = "*.R$", full.names = T), source) %>% invisible()

# Assign the genome parameter to the object genome
# IMPORTANT set these values if you are not using the parameters in the header (lines 23-44)
genome <- params$genome
# If the genome name is matching to human or mouse, use their respective annotation, otherwise give a warning
if (grepl("hg|human", genome, ignore.case = TRUE)) {
  library(org.Hs.eg.db)
  ann.org <- org.Hs.eg.db
} else if (grepl("mm|mouse", genome, ignore.case = TRUE)) {
  library(org.Mm.eg.db)
  ann.org <- org.Mm.eg.db
} else {
  warning("If you are working on an organism other than human or mouse: Bioconductor packages are also available for a number of other organisms. This code should work for any of the organisms with an eg.db package listed at https://bioconductor.org/packages/release/BiocViews.html#___Organism -- please search for your organism and assign ann.org manually.")
}

# Assigns the column object from the parameters in the YAML header to the column object
column <- params$column
# Assigns the contrasts object from the parameters in the YAML header to the contrasts object
contrasts <- params$contrasts
# Assigns the subset_column object from the parameters in the YAML header to the subset_column object
subset_column <- params$subset_column
# Assigns the subset_value object from the parameters in the YAML header to the subset_value object
subset_value <- params$subset_value
# Assigns the ruv boolean object from the parameters in the YAML header to the run_ruv object
run_ruv <- params$ruv
# Assigns the combatseq boolean object from the parameters in the YAML header to the run_combatseq object
run_combatseq <- params$combatseq
# If the run_ruv object and/or run_combatseq is TRUE, set the run_rmv boolean to TRUE, otherwise set run_rmv boolean to FALSE
run_rmv <- run_ruv | run_combatseq
# Assign the value of column to factor_of_interest, this will allow you to change factor_of_interest throughout the Rmd if needed, while maintaining the value of column
factor_of_interest <- column
```

```{r load-libraries, cache = FALSE, message = FALSE, warning=FALSE}
# Load required libraries
library(knitr)
library(tidyverse)
library(glue)
library(janitor)
library(purrr)
library(ggpubr)
library(rtracklayer)
library(DESeq2)
library(vegan)
library(EnhancedVolcano)
library(DEGreport)
library(pheatmap)
library(msigdbr)
library(fgsea)
library(htmltools)
library(ggforce)
library(RColorBrewer)
library(ggprism)
library(ggprism)
library(grafify)
library(viridis)
library(ashr)

# Overwrite the default filter function with dplyr's filter function
filter <- dplyr::filter
# Overwrite the default select function with dplyr's select function
select <- dplyr::select

# Setting the base theme that ggplot2 will use to theme_prism with the standard text size being 12
ggplot2::theme_set(theme_prism(base_size = 12))
# Setting scale color palette to the palette "kelly" from grafify
# You can select different palettes from here:
# https://grafify-vignettes.netlify.app/colour_palettes.html
scale_colour_discrete <- function(...) {
  scale_colour_manual(...,
    values = as.vector(grafify:::graf_palettes[["kelly"]])
  )
}
# Setting scale fill palette to the palette "kelly" from grafify
# You can select different palettes from here:
# https://grafify-vignettes.netlify.app/colour_palettes.html
scale_fill_discrete <- function(...) {
  scale_fill_manual(...,
    values = as.vector(grafify:::graf_palettes[["kelly"]])
  )
}
# Setting color to the palette "kelly" from grafify
# You can select diffferent palettes from here:
# https://grafify-vignettes.netlify.app/colour_palettes.html
colors <- as.vector(grafify:::graf_palettes[["kelly"]])

# Setting options for Rmd code chunks
opts_chunk[["set"]](
  cache = FALSE, # When the cache is turned on, knitr will skip the execution of this code chunk if it has been executed before and nothing in the code chunk has changed since then. For a cached code chunk, its output and objects will be automatically loaded from the previous run, as if the chunk were executed again.
  cache.lazy = FALSE, # Whether to only load cached objects when needed
  dev = c("png", "pdf"), # Specifies the graphical devices to use figures
  error = TRUE, # Whether to stop rendering on an error
  highlight = TRUE, # Whether to enable syntax highlighting for the code in the chunk
  message = FALSE, # Whether to preserve messages emitted by message() in the final output
  prompt = FALSE, # Whether an R prompt (>) is displayed at the beginning of each line of R input code in rendered documents
  tidy = FALSE, # Whether or not R code in a chunk is formatted automatically before rendering, similar to the R package, styler
  warning = FALSE, # Whether to preserve warnings in the output
  echo = TRUE, # Whether to include R source code in the final knitted document
  fig.height = 4) # Sets the default height of plot figures generated in inches

# Set seed for reproducibility
set.seed(1234567890L)
```

```{r sanitize-datatable}
# Create a function to clean up data frames
sanitize_datatable <- function(df, ...) {
  # Remove dashes from row names and column names which cause wrapping
  DT::datatable(df, ...,
    rownames = gsub("-", "_", rownames(df)),
    colnames = gsub("-", "_", colnames(df)),
    options = list(scrollX = TRUE, autoWidth = TRUE),
    class = "stripe hover"
  )
}
```

```{r load-data, message=F, warning=F}
# This code will load from nf-core folder
# NOTE make sure to set numerator and denominator
# Use the load_coldata function from the sourced load_data.R within the 00_params directory, to subset the colodata_fn dataframe that was pointed to when sourcing params.R within the 00_params directory
# The factors for this subsetting were set in thr params section of the YAML code chunk.
# The default subsetting is none since subset_column and subset_value are both set to null initially
# The column object here is given to ensure that the column of interest exists in the loaded dataframe
coldata <- load_coldata(
  coldata_fn, column,
  subset_column, subset_value
)
# Set the row names from the coldata dataframe to be a column called sample
# This column should already exist from the Nextflow run
coldata$sample <- row.names(coldata)

# Use the load_counts function from the sourced load_data.R within the 00_params directory, to load the counts data from the dataframe pointed to when sourcing params.R within the 00_params directory
counts <- load_counts(counts_fn)
# If subsetting parameters were provided, this will subset the counts data to match the samples in the coldata dataframe
counts <- counts[, colnames(counts) %in% coldata$sample]

# Use the load_metrics function from the sourced load_data.R within the 00_params directory, to load the metrics from the multiqc, along with the values of se_object and gtf_fn defined in the sourced params.R within the 00_params directory and also the recently created counts data, which is used to estimate the tRNA/rRNA rate (r_and_t_rna_rate).
metrics <- load_metrics(se_object, multiqc_data_dir, gtf_fn, counts) %>%
  left_join(coldata, by = c("sample")) %>% # Join the coldata table to the end of object produced from the load_metrics function
  as.data.frame() # Format it as a dataframe
# Assign the sample names to be the row names in the metrics object
rownames(metrics) <- metrics$sample
# If subsetting parameters were provided, this will subset the metrics data to match the samples in the coldata dataframe
metrics <- subset(metrics, metrics$sample %in% coldata$sample)

# If the sample names in the counts (column) don't match in order or string of the metrics file, reorder them in the counts object
counts <- counts[, rownames(metrics)]
# If the sample names in the coldata object (rows) don't match in order or string of the metrics file, reorder them in the counts object
coldata <- coldata[rownames(metrics), ]

# This will relevel the first contrast in your contrasts list with with the "control_name" as the baseline value defined in the params section of the YAML
coldata[[contrasts[[1]][1]]] <- relevel(as.factor(coldata[[contrasts[[1]][1]]]), contrasts[[1]][3])
# Stop if the column names in counts does not match and in the same order as the row names in metrics
stopifnot(all(colnames(counts) == rownames(metrics)))
```

# Overview

-   Project: `r project`
-   PI: `r PI`
-   Analyst: `r analyst`
-   Experiment: `r experiment`
-   Aim: `r aim`

```{r load-counts-data}
# Query AnnotationHub and assign it to ah object
ah <- AnnotationHub::AnnotationHub()

# Retrieve the annotations from Annotation Hub dependent on your reference genome
if (grepl("hg|human", genome, ignore.case = TRUE)) { # If the reference genome is human
  ens <- AnnotationHub::query(ah, c("Homo sapiens", "EnsDb")) # Retrieve the recent human annotations
  ens <- ens[["AH119325"]] # Assign a human annotation, may need to change
} else if (grepl("mm|mouse", genome, ignore.case = TRUE)) { # If the reference genome is mouse
  ens <- AnnotationHub::query(ah, c("Mus musculus", "EnsDb")) # Retrieve the recent mouse annotations
  ens <- ens[["AH116340"]] # Assign a mouse annotation, may need to change
} else { # If the reference genome is not human or mouse, prin this warning
  cat("Warning: Please, get a table that contains gene_id, gene_name and entrez to continue using this template.")
}

# Create a gene-level dataframe
# NOTE: Change this code if you are not using AnnotationHub
#       The rdata is a table with gene_id, gene_name and entrez columns
rdata <- ensembldb::genes(ens, return.type = "data.frame") %>% # Extract ens annotation as a dataframe
  dplyr::select(gene_id, gene_name,
    entrez = entrezid,
    gene_biotype, description
  ) %>% # Retrieve certain columns and rename the entrez column
  dplyr::filter(gene_id %in% rownames(counts)) %>% # Retain only gene annotations in the counts matrix
  dplyr::distinct(gene_id, .keep_all = TRUE) # Retain only unique gene_ids and also retain all of the other columns with .keep_all = TRUE

# Keep one entrez Id per gene
rdata$entrez <- purrr::map(rdata$entrez, 1) %>% unlist()
# Filter not needed genes
# rdata <- rdata %>% filter(!grepl("ncRNA", gene_biotype)) %>% # Remove non-coding RNAs
#   filter(!grepl('Mir', gene_name)) %>% # Remove miRNAs
#   filter(!grepl('Snor', gene_name)) # Remove snoRNAs
# counts <- counts[rdata$gene_id,] # Filter the counts matrix after removing these not needed genes
```

# Set up

We recommend avoiding manual filtering prior to running DESeq2, as the software performs internal filtering as part of its pipeline. However, pre-filtering may be appropriate in certain contexts to improve computational efficiency or address data imbalances, such as:

- A high proportion of dropouts (zero counts), where filtering can reduce computational burden
- A large number of samples, where pre-filtering low-expressed features can improve performance
- Strongly unbalanced group sizes, where group-specific filtering strategies may help mitigate bias or improve sensitivity

```{r setup}
# Establish a null DDS object for DESeq2
dds_to_use <- DESeqDataSetFromMatrix(counts, coldata, design = ~1)

# Perform a variance-stabilized transformation and assign the DESeqTransform object to vsd_before
vsd_before <- vst(dds_to_use)
# Extract the variance-stabilized transformed (normalized) counts and assign them to norm_matrix
norm_matrix <- assay(vsd_before)
```

# PCA and group level variance.

**Principal Component Analysis (PCA)** is a dimensionality reduction technique that transforms high-dimensional data into a smaller set of uncorrelated variables called principal components. In gene expression analysis, PCA reduces thousands of gene expression measurements into a few components that capture the most variance across samples. This helps visualize overall patterns, identify outliers, and detect batch effects or biological groupings.

Dispersion estimates are a key part of the DESEQ2 analysis. DESEQ2 uses data from all samples and all genes to generate a relationship between level expression and variance and then shrinks per gene dispersions to match this distribution. If one group has higher variance than all others this will affect the dispersion estimates. Here we visually check that the variance per group is similar using a PCA. The ellipses are minimal volume enclosing ellipses using the Khachiyan algorithm.

**It is best practice NOT to subset your data unless one group has significantly higher variance than the others. The best dispersion estimates are obtained with more data.**

**This code automatically uses the column value from the header. You can also manually add a factor of interest to define the groups. One can be created by combining multiple metadata columns using the paste0 function.**

```{r set-group, eval=FALSE, echo=FALSE}
## Example of creating a group covariate
# meta$group <- paste0(meta$sex, "_", meta$age, "_", meta$treatment)

# Re-assign the factor of interest in case it has change
# You can modify this to test for different factors of interest with:
# factor_of_interest <- "insert column name for covariate of interest"
factor_of_interest <- column
```

```{r plot-pca}
# Run PCA on the normalized counts
pca <- degPCA(norm_matrix,
  metadata = metrics, # Using the metrics object as our metadata
  condition = factor_of_interest, # Specifying the condition as our factor_of_interest from the above code chunk
  name = "sample", # Column from metadata to use to label the points in the PCA plot
  data = TRUE # Include the data
)

pca$plot + # Create plot
  ggtitle(paste0("All samples", "\nPCA using ", nrow(vsd_before), " genes")) + # Add title to plot
  theme(plot.title = element_text(hjust = 0.5)) + # Center title on plot
  geom_mark_ellipse(aes(color = .data[[column]])) + # Create ellipses on plot
  scale_color_grafify("kelly") # Set color parameter for plot
```

## Analysis of the variance by group

Groups in a univariate analysis can also differ with regard to their mean values, variation around those means, or both. In univariate analyses, dispersion can be examined using Levene’s test.  PERMDISP is a multivariate extension of Levene’s test to examine whether groups differ in variability. In essence, PERMDISP involves calculating the distance from each data point to its group centroid and then testing whether those distances differ among the groups. [Source](https://uw.pressbooks.pub/appliedmultivariatestatistics/chapter/permdisp/)

Here we apply this test to our variance stabilized data. We calculate distances between samples and then use the `betadisper()` function from the popular vegan package. We get two overall p-values where significant means that the dispersions are different between groups. The first p-value comes from the `anova()` function and the second from the `permutest()` function. We also get pairwise p-values for every group-group comparison.

**This analysis allows us to confirm that the variance is the same in all groups (non-significant p-value), so that the DE analysis will be accurate.** Dispersion estimates are a key part of the DESeq2 analysis. DESeq2 uses data from all samples and all genes to generate a relationship between level expression and variance and then shrinks per-gene dispersions to match this distribution. If one group has higher variance than all others, this will affect the dispersion estimates.

```{r PERMDISP}
# Use vegdist() to compute pair-wise dissimilarity indices between samples from the normalized count matrix
# Note, we need to transpose (use t())) the counts matrix for vegdist() to compute dissimilarity indices between the samples
vare.disa <- vegdist(t(assay(vsd_before)))

# Evaluate the homogeneity of dispersion within the groups
mod <- betadisper(vare.disa, group = metrics[[factor_of_interest]])
# Test the homogeniety of dispersion between groups  with ANOVA.
# ANOVA tests is there is overall significant variation in variance between groups.
anova(mod)
# Test the homogeniety of dispersion between groups  with a permutation test
# The permutation test considers pairwise comparisons so you can see which group(s) is different
permutest(mod, pairwise = TRUE)
```

# Covariate analysis

Multiple factors related to the experimental design or quality of sequencing may influence the outcomes of a given RNA-seq experiment. To further determine whether any confounding covariate risks affecting the results of our differential expression analyses, it is useful to assess the correlation between covariates and principal component (PC) values.

Here, we are using `DEGreport::degCovariates()` to explore potential correlations between variables provided in the metadata and all PCs that account for at least 5% of the variability in the data. If applicable, significant correlations (FDR < 0.1) are circled. **This diagnostic plot helps us determine which variables we may need to add to our DE model.**

```{r run-covariates, fig.height = 6, fig.width = 10}
# Visualize covariates for the PCs
degCovariates(
  counts = norm_matrix, # Normalized counts matrix
  metadata = metrics # Metadata to overlay as covariates
)
```

# Data modeling

```{r setup-deseq2}
# Enter your design formula
# The format should be:
# as.formula(paste0("~ ", column_to_control_for, " + ", factor_of_interest))
formula <- as.formula(paste0("~ ", " + ", column))
# Stop if the column names in counts does not match and in the same order as the row names in metrics
stopifnot(all(colnames(counts) == rownames(coldata)))

# Create the DDS object for DESeq2 and assign it to the object dds_to_use
dds_to_use <- DESeqDataSetFromMatrix(
  countData = counts, # Provide the raw counts data
  colData = coldata, # Provide the metadata
  design = formula # Provide the design formula
)

# Carry out a variance-stabilized transformation of the data in the DDS object and assign it to vsd_before
# Note: vsd_before and norm_matrix was previously assigned in the setup code chunk  w/o variables in the formula so this is overwriting that
# NOTE: VST won’t regress out this when normalizing
vsd_before <- vst(dds_to_use)
# Extract the variance-stabilized transformed (normalized) counts and assign them to norm_matrix
norm_matrix <- assay(vsd_before)
# We are going to modify coldata in the RUV section when is turn on, hence want to retain the original
new_cdata <- coldata
```

For this study, this formula is  recommended: `r as.character(formula)`

```{r note-ruv, eval=F, echo=FALSE}
#### IF YOU ARE RUNNING RUV OR COMBATSEQ RUN THE CHUNKS BELOW OTHERWISE SKIP TO Differential Expression SECTION or remove this section
```

## Remove unwanted variation

Removing unwanted variation from RNA-seq analysis is essential to ensure that the results reflect biological rather than technical differences. Methods like ComBat, RUVseq, or surrogate variable analysis (SVA) can be applied to adjust for batch effects, library preparation discrepancies, or other confounders. These techniques model and subtract the unwanted variation, enhancing the ability to detect true biological signals in the data. Proper normalization and careful experimental design are also crucial steps to mitigate such unwanted variation.

### Assessing unknown factors

```{r note-ruv-print, results='asis'}
# Print out text discussing is RUVseq will be done
# If the RUV parameter in the params section of the YAML is TRUE, then
if (run_ruv) {
  cat("When performing differential expression analysis, it is important to ensure that any detected differences are truly a result of the experimental comparison being made and not any additional variability in the data.")
} else { # If the RUV parameter in the params section of the YAML is FALSE, then
  cat("There is no need to assess unknown factor for this study.")
}
```

```{r do_RUV, eval=run_ruv, echo=run_ruv}
# TOFIX Add to template: check correlation of dummy variables produced by ruvseq with existing covariates in metadata
# NOTE: RUVseq is used when you don’t know where the unwanted variation is coming from. The package utilizes dummy variable(s), 1-5 used, start with 1, look at PCA, decide if you want more separation. Add any known-created RUV variables to DESeq2 formula. The normalized matrix produced – only for visualization, not for input into DESeq2.
library(RUVSeq)

# If you want to skip the code, just set up formula to be your model in the next chunk of code
# Vector containing the  variable of interest
design <- coldata[[column]]
# Formats the variable of interest into a matrix
# Levels are the rows and columns are the number of elements of a given level
# The value is the index of the positions in the vector
diffs <- makeGroups(design)
# Re-assigning the variance stabilized transformation count matrix to a object called dat
dat <- norm_matrix
# By default RUVSeq is running one variable,
ruvset_2 <- RUVs(
  x = dat, # Variance stabilized transformed counts data
  cIdx = rownames(dat), # Vector of the genes
  k = 1, # Number of unknown covariates to find
  scIdx = diffs, # Matrix formatted variable of interest
  isLog = TRUE, # The variance stabilized transformation is approximately a log transformation
  round = FALSE # Whether to round the normalized measures to form pseudocounts
)
# W is the estimation of the unknown source of variation that we can now add to DESeq2
vars <- ruvset$W
# Create a new colData with W added to it
new_cdata <- cbind(coldata, vars)
# New formula with W
formula <- as.formula(paste0(
  "~ ",
  paste0(
    colnames(new_cdata)[grepl("W", colnames(new_cdata))],
    collapse = " + "
  ), " + ", column
))
# Overwrite the normalized count matrix with the normalized counts from RUVSeq
# NOTE: Only use this for visualization
norm_matrix <- ruvset$normalizedCounts
# Create a PCA
pca2 <- degPCA(
  counts = norm_matrix, # Provide the adjusted counts from RUVSeq
  metadata = new_cdata, # Provide metadata
  condition = column # Provide the variable of interest to color the plot by
) + ggtitle("After RUV") # Add a title
# Plot PCA
pca2 +
  scale_color_grafify("kelly") # Apply color scale
```

```{r after_RUV, eval=run_ruv}
# Make DDS object
dds_to_use <- DESeqDataSetFromMatrix(counts, new_cdata, design = formula)
# Apply VST
vsd_to_use <- vst(dds_to_use, blind = FALSE)
```

### Remove Batch Effects

```{r combat-text , eval=run_combatseq, results='asis', echo=run_combatseq}
# NOTE Combatseq (part of the SVA package) - corrected count, removes the effects while retaining the structure of the data. Used in a scenario where you know what covariate/batch is. Do not add known-removed covariates to DESeq2 formula. Also, don’t attempt to remove biological effect (e.g. donor), this is not conceptually valid; best for technical variation.
library(sva)

cat("Here we apply Combat-seq (https://github.com/zhangyuqing/ComBat-seq) to try to remove batch effects so we can better tease out the effects of interest.
Combat-seq uses a negative binomial regression to model batch effects, providing adjusted data by mapping the original data to an expected distribution if there were no batch effects. The adjusted data preserves the integer nature of counts, so that it is compatible with the assumptions of state-of-the-art differential expression software (e.g. edgeR, DESeq2, which specifically request untransformed count data).")
```

```{r print-no-ruv-or-batch, eval=!run_combatseq, results='asis', echo=run_combatseq}
cat("There is no need to remove known factors like batch effect in this study.")
```

```{r set_variable_combatseq, eval=run_combatseq, echo=run_combatseq}
# NOTE: work on this code if you need to run ComBat-seq
# Set your batch effect variable here this is the variable that ComBat-seq will try to remove

## Column name of your batch variable
to_remove <- "batch"

## Column name of of your variable(s) of interest
to_keep <- "sample_type"

# Re-assign the column of coldata with the batch information as a factor
coldata[[to_remove]] <- as.factor(coldata[[to_remove]])
# Re-assign the column of coldata with the variable of interest as a factor
coldata[[to_keep]] <- as.factor(coldata[[to_keep]])

# Assign the column with batch information to the batch object
batch <- coldata[[to_remove]]
# Assign the column with variable of interest information to the treatment object
treatment <- coldata[[to_keep]]

# If you have multiple variables of interest you will need to assign each to a vector

# treatment1 <- metrics[[to_keep]]
# treatment2 <- metrics[[to_keep]]
# treatment3 <- metrics[[to_keep]]

# Column bind all of the vectors with columns containing variables of interest into one object
# imp <- cbind(as.numeric(as.character(treatment1)),as.numeric(as.character(treatment2)), as.numeric(as.character(treatment3)))
```

```{r do_combatseq, eval=run_combatseq}
# Run ComBat-seq
adjusted_counts <- ComBat_seq(
  counts = as.matrix(counts), # Provide the counts matrix
  batch = batch, # Provide the batch information
  group = treatment # Provide the variable of interest information
)

# NOTE: Running ComBat-seq with multiple variables of interest
# adjusted_counts <- ComBat_seq(
#   counts = as.matrix(counts), # Provide the counts matrix
#   batch = batch, # Provide the batch information
#   covar_mod = imp # Provide the variables of interest information
# )
```

```{r after_combatseq, eval=run_combatseq}
# Re-enter the adjusted count from ComBat-seq into DESeq
# NOTE: Make sure the formula doesn't contain the covariates used in ComBat-seq above
dds_to_use <- DESeqDataSetFromMatrix(
  countData = adjusted_counts, # Provide the adjusted counts from ComBat-seq
  colData = coldata, # Provide coldata
  design = formula # Provide design formula, notably without the covariates used in ComBat-seq
)
# Perform a variance-stabilized transformation on the ComBat-seq adjusted count data
vsd_combat <- vst(dds_to_use, blind = FALSE)

# Retrieve the variance-stabilized transformed counts data and assign it to the object norm_matrix
norm_matrix <- assay(vsd_combat)

# Run degPCA to use prcomp and make a plot
pca_combat <- degPCA(
  counts = norm_matrix, # Variance-stabilized transformed counts data after ComBat-seq
  metadata = coldata, # Provide metadata
  condition = column # Provide the variable of interest to color the plot by
) +
  ggtitle("After Combatseq") # Add title

# Plot PCA from dgePCA
pca_combat +
  scale_color_grafify("kelly") # Apply color scale
```

# Differential Expression

Differential gene expression analysis of count data was performed using the Bioconductor R package, DESeq2, which fits the count data to a negative binomial model.

Before fitting the model, we often look at a metric called dispersion, which is a measure for variance which also takes into consideration mean expression. A dispersion value is estimated for each individual gene, then 'shrunken' to a more accurate value based on expected variation for a typical gene exhibiting that level of expression. Finally, the shrunken dispersion value is used in the final GLM fit.

We use the below dispersion plot, which should show an inverse relationship between dispersion and mean expression, to get an idea of whether our data is a good fit for the model.

```{r run-deseq2}
# Perform differential expression analysis on the DDS object
de <- DESeq(dds_to_use)
# Plot the dispersion for the data
DESeq2::plotDispEsts(de)
```

Because it is difficult to accurately detect and quantify the expression of lowly expressed genes, differences in their expression between treatment conditions can be unduly exaggerated after the model is fit. We correct for this so that gene LFC is not dependent overall on basal gene expression level.

In cases there are multiple groups and conditions across groups is recommended to use dummy variables instead of interaction terms: https://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#interactions.

The LRT is useful for testing multiple terms at once, for example testing 3 or more levels of a factor at once, or all interactions between two variables. The LRT for count data is conceptually similar to an analysis of variance (ANOVA) calculation in linear regression, except that in the case of the Negative Binomial GLM, we use an analysis of deviance (ANODEV), where the deviance captures the difference in likelihood between a full and a reduced model.

```{r lfc-shrink}
# NOTE: We recommend LRT for time series
# resultsNames(de) # Check the order is right
# Create a list that mirrors the contrasts in the contrasts list object where the pattern is:
# <column_of_comparison>_<treatment_name>_vs_<control_name>
names_to_use <- lapply(contrasts, function(contrast) {
  coef <- paste0(contrast[1], "_", contrast[2], "_vs_", contrast[3])
})
# Currently the contrasts list object doesn't have names for the items in the list
# Assign the names from names_to_use to the names in the contrasts list object
names(contrasts) <- names_to_use

# Create a list called `de_list` to hold the differential expression information for each contrast
# Go through each contrast in the contrasts list object and
de_list <- lapply(contrasts, function(contrast) {
  # Extract the results (LFC, pvalue, padj, etc.) for the contrast
  resLFC <- results(de, contrast = contrast)
  # Create a string for the contrast following this pattern:
  # <column_of_comparison>_<treatment_name>_vs_<control_name>
  coef <- paste0(contrast[1], "_", contrast[2], "_vs_", contrast[3])
  # Apply LFC shrinkage with `ashr`
  # NOTE: Use `ashr` for comparisons with many groups to be able to pull out all the contrasts; otherwise `apeglm` is fine. It shrinks less.
  resLFCS <- lfcShrink(
    dds = de, # Provide the DDS object
    contrast = contrast, # Provide the contrast
    type = "ash" # Use `ashr` to apply the shrinkage
  )
  # If you want to use `apeglm` instead of `ashr` then use:
  # resLFCS <- lfcShrink(
  #   dds = de, # Provide the DDS object
  #   coef = coef, # Provide the contrast
  #   type = "apeglm" # Use `apeglm` to apply the shrinkage
  # )

  # Create a new results data frame with some extra information
  res <- as.data.frame(resLFCS) %>% # Extract the results from the resLFCS as a dataframe
    rownames_to_column("gene_id") %>% # Move the rownames to a column of data called "gene_id"
    left_join(rdata, by = "gene_id") %>% # Combine the results with the rdata object by the gene_id and rdata holds information about each gene
    relocate(gene_name) %>% # Move the "gene_name" column to be the first column
    dplyr::rename(lfc = log2FoldChange) %>% # Rename the column log2FoldChange to lfc
    mutate(pi = abs(lfc) * -log10(padj)) %>% # Create a new column named pi that multiplies the absolute value of log fold change by the log10 adjusted adjusted p-value
    arrange(-pi) # Arrange the data frame by the highest values of pi

  # Filter out genes that have no expression or were filtered out by DESEQ2
  res <- res[res$baseMean > 0, ] %>% # Remove genes with no expression
    drop_na(padj) %>% # Remove genes who don't have an adjusted p-value (lots of lowly expressed genes)
    drop_na(pvalue) # Remove genes who don't have a p-value (lots of lowly expressed genes)

  # Extract the significant genes
  res_sig <- res %>%
    filter(padj < 0.05) %>% # Retain the values that have an adjusted p-value less than 0.05
    arrange(padj) %>% # Arrange the genes by the lowest adjusted p-value
    mutate(gene_name = ifelse(test = gene_name == "" | is.na(gene_name), # If the gene_name value for a gene is NA or blank then
      yes = gene_id, # Use the gene_id as the gene_name
      no = gene_name # Maintain the gene_name
    ))
  # Create a list object called results
  results <- list(
    lfc = resLFC, # Assign the unshrunken results data to `lfc`
    lfcs = resLFCS, # Assign the shrunken results data to `lfcs`
    all = res, # Assign the full differential expression results to `all`
    sig = res_sig # Assign the significantly differentially expressed genes to `sig`
  )
  # Return the `results` list object to be a list within the object `de_list`
  return(results)
})

# NOTE: If you manually add any other comparison to the list with the following variables, the code below will make the plots for those as well:
# de_list <- c(de_list, new_comparison=list(lfc=resLFC,
#                                           lfcs=resLFCS,
#                                           all=res, sig=res_sig))
```

## MA plot

This plot can help to:

- Identify Differential Expression: Genes that show a significant log-fold change (M value away from 0) indicate changes in expression between conditions.
- Assess Data Quality: The plot can help in identifying biases or systematic errors in the data. Ideally, most points should scatter around the M=0 line, indicating that there is no significant systematic difference between the conditions.
- Visualize data dispersion: The distribution of points along the A-axis gives a sense of the spread of expression levels and any patterns or anomalies in the dataset.

::: {.panel-tabset}

```{r after_lfc_shrink, message=F, warning=F}
#| results: asis
#| fig-width: 4
#| fig-height: 4
# For each of the contrasts in the de_list list object to create MA plots
for (contrast in names(de_list)) {
  # Create a tabset of the contrast
  cat("### ", contrast, "\n\n")
  # Use the unshrunken (lfc) counts for creating an MA plot
  p1 <- degMA(as.DEGSet(de_list[[contrast]]$lfc)) +
    ggtitle("Before LFC Shrinking") # Add title
  # Print the unshrunken counts MA plot
  print(p1)
  # Use the shrunken (lfc) counts for creating an MA plot
  p2 <- degMA(as.DEGSet(de_list[[contrast]]$lfcs), limit = 2) +
    ggtitle("After LFC Shrinking") # Add title
  print(p2) # Print the shrunken counts MA plot
  cat("\n\n") # Print two new lines
}
```
:::

## Differentially Expressed Genes

```{r}
#| results: asis

dt_list <- list()
for (contrast in names(de_list)) {
  res_sig <- de_list[[contrast]][["sig"]]
  dt_list <- c(
    dt_list,
    list(h3(contrast)),
    list(DT::datatable(res_sig,
      options = list(
        scrollX = TRUE, autoWidth = TRUE
      ),
      class = "stripe hover"
    ))
  )
}
tagList(dt_list)
```

## Volcano plot

This volcano plot shows the genes that are significantly up- and down-regulated as a result of the analysis comparison. The points highlighted in purple are genes that are significantly differentially expressed with a large effect (padj < 0.05 and model coefficient > ±0.5). Points in pink are significant with a small effect (padj < 0.05 and model coefficient < ±0.5) and points in blue are not significant but have a large effect (padj > 0.05 and model coefficient > ±0.5). Grey points are non-significant. The dashed lines correspond to the cutoff values of padj and model coefficient that we have chosen.

::: {.panel-tabset}
```{r}
#| results: asis
#| fig-height: 6

for (contrast in names(de_list)) {
  cat("### ", contrast, "\n\n")
  res <- de_list[[contrast]][["all"]]
  res_mod <- res
  show <- as.data.frame(res_mod[1:10, c("lfc", "padj", "gene_name")])
  p1 <- EnhancedVolcano(res_mod,
    lab = res_mod$gene_name,
    pCutoff = 0.05,
    selectLab = c(show$gene_name),
    FCcutoff = 0.5,
    x = "lfc",
    y = "padj",
    title = contrast,
    col = c("darkgrey", "lightblue", "plum1", "purple"),
    subtitle = "",
    drawConnectors = TRUE,
    max.overlaps = Inf
  )
  print(p1)
  cat("\n\n")
}
```
:::

## Heatmap

::: {.panel-tabset}
```{r}
#| results: asis

for (contrast in names(de_list)) {
  cat("### ", contrast, "\n\n")
  res_sig <- de_list[[contrast]][["sig"]]
  ma <- norm_matrix[res_sig$gene_id, ]
  colma <- coldata[, c(column), drop = FALSE]
  ma_colors <- lapply(colnames(colma), function(c) {
    l.col <- colors[1:length(unique(colma[[c]]))]
    names(l.col) <- sort(unique(colma[[c]]))
    l.col
  })
  names(ma_colors) <- colnames(colma)
  p1 <- pheatmap(ma,
    color = inferno(10),
    cluster_rows = TRUE,
    show_rownames = FALSE,
    annotation = colma,
    annotation_colors = ma_colors,
    border_color = NA,
    fontsize = 10,
    scale = "row",
    fontsize_row = 10,
    height = 20
  )
  print(p1)
  cat("\n\n")
}
```
:::

## Plot Top 16 Genes

::: {.panel-tabset}
```{r}
#| results: asis
#| fig-height: 6
#| fig-width: 8

n <- 16
for (contrast in names(de_list)) {
  cat("### ", contrast, "\n\n")
  res_sig <- de_list[[contrast]][["sig"]]
  top_n <- res_sig %>%
    slice_min(order_by = padj, n = n, with_ties = FALSE) %>%
    dplyr::select(gene_name, gene_id)
  top_n_exp <- norm_matrix %>%
    as.data.frame() %>%
    rownames_to_column("gene_id") %>%
    pivot_longer(cols = !gene_id, names_to = "sample", values_to = "log2_expression") %>%
    right_join(y = top_n, relationship = "many-to-many") %>%
    left_join(y = coldata, by = "sample")
  p1 <- ggplot(top_n_exp, aes_string(x = column, y = "log2_expression")) +
    geom_boxplot(outlier.shape = NA, linewidth = 0.5, color = "grey") +
    geom_point() +
    facet_wrap(~gene_name) +
    ggtitle(str_interp("Expression of Top ${n} DEGs")) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
  print(p1)
  cat("\n\n")
}
```
:::

# Pathway Enrichment

From the set of differentially expressed genes and using publicly available information about gene sets involved in biological processes and functions, we can calculate which biological processes and functions are significantly perturbed as a result of the treatment.

```{r setup-fa}
# Call the appropriate functional analysis databases for your analysis
if (grepl("hg|human", genome, ignore.case = TRUE)) { # If the genome value in the params section of the YAML contains hg or human, then
  all_in_life <- get_databases_v2(sps = "human") # Get the human pathway databases using the get_databases_v2 function form the FA.R file within the 00_libs directory
  run_FA <- TRUE # Assign the value of TRUE to a object called run_FA and it will allow the functional analysis sections to run
} else if (grepl("mm|mouse", genome, ignore.case = TRUE)) { # If the genome value in the params section of the YAML contains mm or mouse, then
  all_in_life <- get_databases_v2(sps = "mouse") # Get the mouse pathway databases using the get_databases_v2 function form the FA.R file within the 00_libs directory
  run_FA <- TRUE # Assign the value of TRUE to a object called run_FA and it will allow the functional analysis sections to run
} else { # If your organism isn't mouse or human then
  warning("If you are working on an organism other than human or mouse: This pathway enrichment is based on annotations from MSigDB https://www.gsea-msigdb.org/gsea/msigdb which only has human and mouse collections. For non-model organism pathway analysis, please see the template rnaseq/03_functional/Nonmodel_Organism_Pathway_Analysis.Rmd") # Print this error message
  run_FA <- FALSE # Assign the value of FALSE to a object called run_FA and it will not allow the functional analysis sections to run
}
```

# Pathway Analysis- GSEA

Gene Set Enrichment Analysis (GSEA) is a computational method used to determine whether a predefined set of genes shows statistically significant, concordant differences between two biological states (e.g., disease vs. normal) in RNA-seq data or other types of gene expression data. Advantages of GSEA.

- Biological Insight: Helps in understanding the underlying biological processes and pathways affected, rather than focusing on individual genes.
- Incorporation of Prior Knowledge: Utilizes predefined gene sets, allowing integration of existing biological knowledge.
- Contextual Relevance: Can reveal subtle but coordinated changes in biologically meaningful gene sets that might not be apparent when looking at individual genes.

**In the GSEA results, NES (normalized enrichment score) refers to the direction of regulation of the pathway. NES < 0 means downregulated, while NES > 0 means upregulated.**

```{r run-gsea, warning=F, message=F, eval=run_FA}
# Go through each contrast in the de_list list object and assign data frames of significantly enriched pathways to to different elements of the list
fa_gsea_list <- lapply(de_list, function(contrast) {
  # For a given contrast pull out the full differential expression results
  res <- contrast[["all"]]
  # Creating the input for the GSEA analysis
  gsea_input <- res %>% # Start with the full differential expression results (NOTE: these do exclude genes without any expression, p-value or adjusted p-value)
    filter(!is.na(padj)) %>% # Remove any adjusted p-values that are NA, but there should be since they were filtered when making the de_list object
    dplyr::select(gene_id, lfc) # Extract the columns "gene_id" and "lfc"
  # Obtain the ENSEMBL ID and gene symbols for the genes in the gsea_input object
  # NOTE: Change to the correct species if not working in human or mouse
  input_entrezid <- AnnotationDbi::select(
    x = ann.org, # Annotation database to search
    keys = gsea_input$gene_id, # The gene_ids to search for
    keytype = "ENSEMBL", # The gene_ids (keys) are ENSEMBL annotations
    columns = c("ENTREZID", "SYMBOL")
  ) # Return the queried ENSEMBL ID, along with the ENTREZ ID and the gene symbol
  # Combine gene annotation with the lfc information
  input_entrezid <- inner_join(
    x = gsea_input, # Combine the ENSEMBL ID and lfc input with
    y = input_entrezid, # The ENTREZ ID and gene symbol information
    by = c("gene_id" = "ENSEMBL")
  ) %>% # Join these table together by common ENSEMBL IDs
    filter(!is.na(ENTREZID)) %>% # Remove the genes without an ENTREZ ID
    distinct(ENTREZID, .keep_all = TRUE) %>% # Retain the unique ENTREZ IDs and all columns of data
    arrange(desc(lfc)) # Arrange the data frame by descending values of lfc

  # Run the run_fgsea_v2 function found in FA.R within the 00_libs directory to run the GSEA analysis and assign the output to the object tb
  tb <- run_fgsea_v2(
    input = input_entrezid, # Query for the GSEA analysis
    all_in_life = all_in_life
  ) # Annotation databases to query against
  tb %>%
    filter(padj < 0.05) %>%
    arrange(padj) # Filter and sort the GSEA output for results with an adjust p-value less than 0.05 and return them
})
```

```{r print-gsea, results='asis', eval=run_FA}
# NOTE: DT::datatables doesn't work with tabset and for loops
# You can use the following code to print dynamically or call manually sanitize_datatable() multiple times
# Create an empty list to assign contrast values and DT data tables to
dt_list <- list()

# For each of the contrasts in the de_list list object to extract the name of the contrast and the GSEA results to be rendered into an HTML table
for (contrast in names(de_list)) {
  # Assign the significantly enriched pathways to the object res_sig
  res_sig <- fa_gsea_list[[contrast]] %>% # Use the GSEA obect for a given contrast. NOTE: These have already been filter for adjusted p-value less than 0.05.
    filter(padj < 0.05) %>% arrange(padj) # Filter and sort for adjusted p-values of less than 0.05
  # Populate the dt_list list
  dt_list <- c(
    dt_list, # With the contents of dt_list from the previous loop
    list(h3(contrast)), # Odd numbered objects in the list will be the contrast name in a HTML renderable Heading 3 size
    list(sanitize_datatable(res_sig)) # Even numbered object in the list will hold HTML renderable data table from the DT package containing the significantly enriched pathways
  )
}

# Render the dt_list into HTML
tagList(dt_list)
```

```{r viz-gsea, results = "asis", fig.width = 8, eval = run_FA}
cat("## Visualizations: GSEA enrichment plots for top pathways")

cat("Each plot shows the enrichment score for a given significant pathway. The enrichment score is calculated by taking our full list of genes (not just significantly DE), ranking it by fold change (high to low), and then moving through the list in order, increasing the score when the gene is part of the pathway and decreasing the score when it is not. The enrichment score graph will point upwards (positive) for upregulated pathways and downwards (negative) for downregulated pathways. The black bars at the bottom indicate the positions of the pathway genes in the full gene list. (See [the GSEA User Guide](https://www.gsea-msigdb.org/gsea/doc/GSEAUserGuideTEXT.htm#_Enrichment_Score_(ES)) for further explanation.)")

# # if you want to subset to just the top pathways
# cat("We will look at the top 10 pathways by p-value.")

# Go through each contrast in the de_list list object and make enrichment plots for GSEA results
for (contrast in names(de_list)) {
  # pull ranked list used to run GSEA: sorted numeric vector of LFC values where names are gene symbols
  gene_ranks <- de_list[[contrast]][["all"]] %>% filter(!is.na(padj)) %>%
    # fix missing gene names
    mutate(gene_name_fixed = ifelse(gene_name == "" | is.na(gene_name), gene_id, gene_name)) %>%
    # sort by LFC
    arrange(desc(lfc)) %>%
    # named vector
    dplyr::select(gene_name_fixed, lfc) %>% deframe()

  # # plot the ranked fold changes
  # barplot(sort(gene_ranks, decreasing = T))

  # pull pathway information: list of pathways where each element is a character vector of gene symbols
  pathways_to_plot <- fa_gsea_list[[contrast]] %>% ungroup() %>%
    # # top pathways only
    # dplyr::slice_min(order_by = padj, n = 10) %>%
    dplyr::select(pathway, genes) %>%
    # list of genes per pathway as comma-separated strings
    deframe() %>% as.list() %>%
    # list of genes per pathway as vectors
    lapply(function(gene_names) stringr::str_split_1(gene_names, ","))

  # make plot for all or top pathways
  lapply(names(pathways_to_plot), function(pathway_name) {
    # # make sure all the gene names match
    # print(all(pathways_to_plot[[pathway_name]] %in% names(gene_ranks)))

    # plot leading edge enrichment graph
    plot_enrich <- plotEnrichment(pathway = pathways_to_plot[[pathway_name]],
                                  stats = gene_ranks) +
      # wrap long titles
      labs(title = str_wrap(str_replace_all(pathway_name, pattern = "_", replacement = " "),
                            width = 20))

    # add a line representing the expression of all genes
    # going from red on left (upregulated) to blue on right (downregulated)
    # first set up a DF with gene, LFC, and rank
    gene_ranks_number <- gene_ranks %>%
      enframe(name = "gene", value = "lfc") %>%
      rowid_to_column(var = "rank")
    # # break rank into intervals (higher interval = higher expression = lower rank)
    # # reference: enrichplot::gseaplot2 "GSEA plot that mimic the plot generated by broad institute's GSEA software"
    # # https://github.com/YuLab-SMU/enrichplot/blob/devel/R/gseaplot.R
    # v <- seq(1, sum(gene_ranks_number$rank), length.out = 9)
    # break rank into intervals separately for up- and down-regulated genes
    v_up <- seq(1,         sum(gene_ranks_number[gene_ranks_number$lfc > 0, "rank"]), length.out = 5)
    v_dn <- seq(max(v_up), sum(gene_ranks_number[gene_ranks_number$lfc < 0, "rank"]), length.out = 5)
    v <- c(v_up, v_dn[-1]) # first value of v_dn is the same as last of v_up, so drop it
    gene_ranks_number$interval <- findInterval(rev(cumsum(gene_ranks_number$rank)), v)
    if (min(gene_ranks_number$interval) == 0) { gene_ranks_number$interval <- gene_ranks_number$interval + 1 }
    # add to plot
    plot_enrich <- plot_enrich +
      geom_line(data = gene_ranks_number,
                aes(x = rank, y = 0, color = interval),
                linewidth = 10, alpha = 0.5,
                # plot on top of existing graph using new data
                inherit.aes = FALSE) +
      # change colors: dark red -> white -> dark blue
      scale_color_gradientn(colors = c(rev(brewer.pal(5, "Blues")), brewer.pal(5, "Reds"))) +
      # remove legend
      theme(legend.position = "none")

    # final plot
    plot_enrich
  })
}
```

# Pathway Analysis- Over-representation

Over-Representation Analysis (ORA) is a statistical method used to determine whether a predefined set of genes (e.g., genes belonging to a specific biological pathway or function) is over-represented (or enriched) among a list of differentially expressed genes (DEGs) from RNA-seq data. Adventages of ORA:

- Simplicity: Easy to perform and interpret.
- Biological Insight: Helps to identify pathways and processes that are significantly affected in the condition studied.
- Prior Knowledge Integration: Utilizes existing biological knowledge through predefined gene sets.

```{r run-ora, warning=F, message=F, eval=run_FA}
# Go through each contrast in the de_list list object and
fa_list <- lapply(de_list, function(contrast) {
  # For a given contrast pull out the full differential expression results
  res <- contrast[["all"]]

  # Extract the universe of genes to the ORA on and assign it to the object universe
  universe <- res %>% # Start with the full differential expression results (NOTE: these do exclude genes without any expression, p-value or adjusted p-value)
    filter(!is.na(padj)) %>% # Remove any adjusted p-values that are NA, but there should be since they were filtered when making the de_list object
    pull(gene_id) # Extract the gene_id column
  # Subset the full annotations to only contain universe we will evaluate within
  universe_mapping <- rdata %>% # Start with the full annotations
    filter(gene_id %in% universe, !is.na(entrez)) # Only retain the the annotations that are in the universe of genes (baseMean greater than 0, along with defined p-value and adjusted p-value) and have an ENTREZ ID
  # AnnotationDbi::select(ann.org, universe, 'ENSEMBL', columns=c('ENTREZID', 'SYMBOL'))

  # Create the input vector of genes for the ORA with a positive or negative lfc
  ora_input <- res %>% # Start with the full differential expression results (NOTE: these do exclude genes without any expression, p-value or adjusted p-value)
    filter(!is.na(padj), padj < 0.01, abs(lfc) > 0.3) %>% # Remove any adjusted p-values that are NA, but there should be since they were filtered when making the de_list object, and only keep genes with adjusted p-values less than 0.01 and the absolute value of the lfc is greater than 0.3
    pull(gene_id) # Extract the gene ID
  # Create the ENTREZ ID input for the ORA
  # NOTE: Change to the correct species if not working in human or mouse
  input_entrezid <- rdata %>% # Start with the full annotations