NGSBcell/NGS_DEPipeline.Rmd

---
title: "Next-Generation Sequencing Single Cell Differential Expression Analysis Pipeline"
author: "Anni Liu"
date: "June 19, 2022"
output: 
  rmdformats::downcute:
    downcute_theme: "chaos"
    fig_height: 4.5
    fig_width: 4.5 
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(
  options(scipen = 999, digits = 4),
  cache = TRUE,
  error = FALSE,
  message = FALSE,
  warning = FALSE,
  tidy.opts = list(width.cutoff = 60),
  tidy = TRUE,
  fig.width = 12,
  fig.height = 8
)
```


# Key points for the differential expression analysis pipeline

-   Sequence reads quality control
      + rule: **trim_galore_trim**
-   Quantify expression 
      + rule: **kallisto_quant**
-   Differential gene expression analysis with R 
      + **DEReport.Rmd**
  * Principal component analysis for different cell types
      + key functions: `rlog()` and `plotPCA()`
  * Volcano plot
      + key functions: `ggplot()`, `geom_point()`, `do.call()`, and `grid.arrange()`
  * Pairwise comparison for differential gene expression between cell types
      + key functions: `DESeq()` and `results()`

# Build the snakemake rule of taking FASTQ files from the seqencing facility, assessing the quality of reads, and quantifying abundances of transcripts from RNA-Seq data
```{python eval=FALSE}
# configfile: "config.yaml"
from collections import Counter
import re
SAMPLE_NAME = sorted(set(glob_wildcards("data/samples/{fname}_R1_001_copy{replicate}.fastq.gz").fname)) # Automatically detect the sorted unique sample names
SAMPLE_NAME_ALL = sorted(glob_wildcards("data/samples/{fname}_R1_001_copy{replicate}.fastq.gz").fname) # Automatically detect the sorted sample names
REPLICATE = sorted(set(glob_wildcards("data/samples/{fname}_R1_001_copy{replicate}.fastq.gz").replicate)) # Considering the user may have different numbers of technical replicates for each sample
CONDITION = [re.split(r"['_'\d]", element)[0] for element in SAMPLE_NAME] # Automatically detect the sorted conditions
CONDITION_ALL = [re.split(r"['_'\d]", element)[0] for element in SAMPLE_NAME_ALL] # Automatically detect the sorted conditions
BIOLOGICAL_REPLICATE = list(Counter(CONDITION).values()) # Automatically detect the number of conditions (i.e., the number of Bmem)
CONVERTED_LIST = [str(i) for i in BIOLOGICAL_REPLICATE] # Convert each integer element in a list into a string

rule all:
    input:
        expand("kallisto_results/{target}/abundance.tsv",
        target = SAMPLE_NAME),
        expand("kallisto_results/{target}/run_info.json",
        target = SAMPLE_NAME)
    output: "DEReport.html"
    params:
        sample_list = '"' + '","'.join(SAMPLE_NAME) + '"', # Construct a long string containing each individual sample name separated by comma, with the initial and ending double quotes
        condition_list = '"' + '","'.join(CONDITION) + '"', # Construct a long string containing each individual condition name corresponding to each individual sample name separated by comma, with the initial and ending double quotes
        condition_times = ",".join(CONVERTED_LIST) # Construct a long string containing the frequency of each condition separated by comma
    shell:
        """
        Rscript -e 'library(rmarkdown); library(rmdformats); rmarkdown::render(input = "DEReport.Rmd", params = list("SAMPLE_NAME" = c({params.sample_list}), "CONDITION" = c({params.condition_list}), "BIOLOGICAL_REPLICATE" = c({params.condition_times})), output_format = "all", output_file = "{output}")'
        """

rule trim_galore_trim:
    input:
        "data/samples/{sample}_R1_001_copy{replicate}.fastq.gz"
    output:
        "trimmed_reads/{sample}/{sample}_R1_001_copy{replicate}_trimmed.fq.gz"
    threads: 4
    shell:
        "trim_galore {input} "
        "--cores {threads} "
        "--quality 20 "
        "--output_dir trimmed_reads/{wildcards.sample} "
        "--no_report_file"

rule kallisto_quant:
    input:
        expand("trimmed_reads/{{sample}}/{{sample}}_R1_001_copy{replicate}_trimmed.fq.gz",
        replicate = REPLICATE)
    output:
        "kallisto_results/{sample}/abundance.tsv",
        "kallisto_results/{sample}/run_info.json"
    params:
        index = "mus_musculus/transcriptome.idx"
    threads: 4
    shell:
        "kallisto quant "
        "--index {params.index} "
        "--plaintext "
        "--output-dir kallisto_results/{wildcards.sample} "
        "--single -l 200 -s 20 "
        "{input}"
```


# Flow from expression quantification to differential expression analysis in R
```{r eval=FALSE}

# The following contents are extracted from DEReport.Rmd
---
title: "Differential Expression Analysis Report"
author: "Anni Liu"
output: 
  rmdformats::downcute:
    downcute_theme: "chaos"
date: "June 6 - June 12, 2022"
params:
  SAMPLE_NAME: [""]
  CONDITION: [""]
  BIOLOGICAL_REPLICATE: [""]
  
---

# Load libraries
library(tidyverse)
library(tximport)

# Set the file path
dir <- c("/Users/anniliu/Desktop/DE_pipeline")

# Propagate SAMPLE_NAME from params -to-> {params.sample_list} in `rule all` -to-> YAML in head of rmarkdown -to-> here: getElement(params, "SAMPLE_NAME")
sample_name <- getElement(params, "SAMPLE_NAME")
files <- file.path(dir, "kallisto_results", sample_name, "abundance.tsv")
names(files) <- sample_name # Assign individual names to individual tsv file path

# Import tx2 gene file for mice
library(readr)
tx2gene_mice <- read.table("/Users/anniliu/Desktop/DE_pipeline/mus_musculus/transcripts_to_genes.txt",
                           header = FALSE) %>%
  select(-V3) # Select away gene names
head(tx2gene_mice) # Take a peek at the first 6 records of the file

# Generate the count matrix for each sample
txi.kallisto.tsv <- tximport(files, 
                             type = "kallisto", 
                             tx2gene = tx2gene_mice, 
                             ignoreAfterBar = TRUE) # Split the rownames at the first bar “|”, and only includes the first “ENST” identifier.

# Look at the structure of count data
cts <- txi.kallisto.tsv$counts
str(cts)

# Look at the number of sequence fragments that are assigned to each gene
head(cts) %>% # Take a peek at the first 6 records of gene expression in raw counts
  knitr::kable()

# Create a data frame of sample information
condition_name <- getElement(params, "CONDITION")
condition_times <- getElement(params, "BIOLOGICAL_REPLICATE")
# print(condition_name)
# print(condition_times)
coldata <- data.frame(condition = condition_name, type = rep("single-read"))
rownames(coldata) <- sample_name
coldata %>%
  knitr::kable()

# Factorize two variables
coldata$condition <- factor(coldata$condition)
coldata$type <- factor(coldata$type)

# Construct DESeqDataSetFromMatrix
library(DESeq2)
dds <- DESeqDataSetFromMatrix(countData = round(cts),
                              colData = coldata,
                              design = ~ condition)
dds

# Pre-filtering least variable genes
keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]
head(dds)

# Perform differential expression analysis
dds <- DESeq(dds)

# Extract the pairwise combination of 2 conditions
pair_condition <- combn(unique(condition_name), 2, simplify = FALSE)
pair_number <- length(pair_condition)
res <- list(length = pair_number)

# Return a result table with base mean, log2 fold changes, p values and adjusted p values for each pairwise comparison of 2 conditions
for (i in 1:pair_number){
  contrast <- c("condition", pair_condition[[i]])
    # print(contrast)
  res[i] <- list(results(dds, contrast = contrast))
}
print(res)

# Batch volcano plot
plot_array <- list()
for (i in 1:pair_number) {
  res_new <- res[[i]] %>%
    data.frame() %>%
    rownames_to_column(var = "ensgene") %>%
    mutate(threshold = padj < 0.5)

  plot_array[[i]] <- ggplot(res_new) +
    geom_point(mapping = aes(x = log2FoldChange,
                             y = -log10(padj),
                             color = threshold)) +
    labs(
      title = paste0(pair_condition[[i]][1], " vs ", pair_condition[[i]][2]),
      x = "log2 fold change",
      y = "-log10 adjusted p-value"
    ) +
    xlim(-10, 10) + 
    scale_color_manual(values = c("black", "#CFB53B")) +
    theme_bw() +
    guides(color = "none")
}
library(gridExtra)
do.call("grid.arrange", c(plot_array, ncol = 3))

# Perform principal component analysis
plotPCA(rlog(dds), intgroup = "condition")
```