ATACSeq/Integration/SingleCell/Integration_Full_v1.qmd

---
title: "Annotate cell types of human PBMC scATACseq datasets using human PBMC scRNAseq as reference"
format: html
editor: visual
execute: 
  echo: true
---

# Question of interest
How to annotate cells from an scATAC-seq experiment by leveraging a scRNA-seq dataset with annotated cell types?

```{r}
# Load image
load(file = "20240620_scRNA_scATAC_image.RData")

# Save image
save.image(file = "20240620_scRNA_scATAC_image.RData")
```

# Preprocess scRNAseq and scATACseq data
## Attach the libraries {style="blue"}

```{r}
#| label: load-packages
#| message: false
#| echo: false
# install.packages('Seurat')
# devtools::install_github('satijalab/seurat-data')
# BiocManager::install("EnsDb.Hsapiens.v86")
# BiocManager::install("biovizBase")
library(SeuratData)
options(timeout = 1000) # positive integer. The timeout for some Internet operations, in seconds. Default 60 (seconds) but can be set from environment variable R_DEFAULT_INTERNET_TIMEOUT.
# InstallData("pbmcMultiome.SeuratData")
library(Seurat)
packageVersion("Seurat")
# [1] '5.0.3'
library(Signac) # Analysis of Single-Cell Chromatin Data
library(biovizBase)
library(EnsDb.Hsapiens.v86)
# This package loads an SQL connection to a database containing annotations from Ensembl
# This data package was made from resources at Ensembl on Thu May 18 16:32:27 2017 and based on the 86
library(ggplot2)
library(cowplot)
library(Matrix)
```

## Load data

```{r}
pbmc_rna <- LoadData("pbmcMultiome", "pbmc.rna")
pbmc_atac <- LoadData("pbmcMultiome", "pbmc.atac")
dim(pbmc_atac)
dim(pbmc_rna)
```

## Convert pbmc_rna\[\["RNA"\]\] into another class type "Assay5"

```{r}
pbmc_rna[["RNA"]] <- as(pbmc_rna[["RNA"]], Class = "Assay5")
class(pbmc_rna[["RNA"]])
```

## Repeat QC steps

```{r}
pbmc_rna_qc <- subset(pbmc_rna, seurat_annotations != "filtered") 
pbmc_rna@meta.data[["seurat_annotations"]]
dim(pbmc_rna[["RNA"]])
dim(pbmc_rna_qc[["RNA"]]) 

pbmc_atac_qc <- subset(pbmc_atac, seurat_annotations != "filtered")
```

## Run standard scRNAseq analysis

Workflow: Normalize data -> Identify highly variable features -> Scale data -> Build PCA -> Build UMAP

```{r}
pbmc_rna_norm <- NormalizeData(object = pbmc_rna_qc, normalization.method = "LogNormalize", scale.factor = 10000)
pbmc_rna_vf <- FindVariableFeatures(object = pbmc_rna_norm, selection.method = "vst", clip.max = "auto")
dim(pbmc_rna_vf)

pbmc_rna_scale <- ScaleData(object = pbmc_rna_vf) 
pbmc_rna_pca <- RunPCA(pbmc_rna_scale, npcs = 50)
pbmc_rna_umap <- RunUMAP(object = pbmc_rna_pca, dims = 1:30, umap.method = "uwot")
```

## Run standard scATACseq analysis with added gene annotation information

Workflow: Extract genomic ranges from EnsDb object -> Set genome and annotation -> Build TF-IDF normalization -> Find top features -> Build SVD -> Build UMAP

```{r}
annotations <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86, standard.chromosomes = TRUE)
annotations@seqnames
# factor-Rle of length 3021151 with 25 runs
# Lengths:  92459  65287 276555 135207 190802 141516 182677 182178 116305 ... 133785  64670 107112   6155  51812 113543  33885     26
# Values :     X      20     1      6      3      7      12     11     4  ...     5      22     10     Y      18     15     21     MT
# Levels(25): X 20 1 6 3 7 12 11 4 17 2 16 8 19 9 13 14 5 22 10 Y 18 15 21 MT

seqlevelsStyle(annotations) <- "UCSC" # Rename the seqlevels of an object according to a given style
annotations@seqnames
# factor-Rle of length 3021151 with 25 runs
# Lengths:  92459  65287 276555 135207 190802 141516 182677 182178 116305 ... 133785  64670 107112   6155  51812 113543  33885     26
# Values :  chrX   chr20  chr1   chr6   chr3   chr7   chr12  chr11  chr4  ...  chr5   chr22  chr10  chrY   chr18  chr15  chr21  chrM 
# Levels(25): chrX chr20 chr1 chr6 chr3 chr7 chr12 chr11 chr4 chr17 chr2 ... chr9 chr13 chr14 chr5 chr22 chr10 chrY chr18 chr15 chr21 chrM

genome(x = annotations) <- "hg38" 
# Set the genome as hg38 for a ChromatinAssay

Annotation(object = pbmc_atac_qc) <- annotations 
# Set the annotation as annotations for a ChromatinAssay

pbmc_atac <- RunTFIDF(object = pbmc_atac_qc) 
pbmc_atac <- FindTopFeatures(object = pbmc_atac, min.cutoff = "q0") 

pbmc_atac <- RunSVD(object = pbmc_atac)
pbmc_atac_umap <- RunUMAP(pbmc_atac, 
                          reduction = "lsi", 
                          dims = 2:30, 
                          reduction.name = "umap_atac", 
                          reduction.key = "atacUMAP_")
pbmc_atac_umap@reductions$umap_atac
```

## Visualize the results from scRNAseq and scATACseq

```{r}
#| label: fig-bill-dims-species
#| fig-cap: A side-by-side UMAP plots of scRNAseq (annotated) and scATACseq cells.
#| fig-width: 11
#| fig-asp: 0.618
#| fig-alt: AA side-by-side UMAP plots of scRNAseq (annotated) and scATACseq cells.
#| warning: false
#| output-location: column-fragment

# Predict annotations for the scATAC-seq cells
pbmc_rna_viz <- DimPlot(pbmc_rna_umap, 
                        group.by = "seurat_annotations", 
                        label = TRUE) + 
  Seurat::NoLegend() + 
  ggtitle("RNA")

pbmc_atac_viz <- DimPlot(pbmc_atac_umap, 
                         group.by = "orig.ident", 
                         label = FALSE) + 
  NoLegend() + 
  ggtitle("ATAC")

pbmc_rna_viz + pbmc_atac_viz

combined_viz <- (pbmc_rna_viz + pbmc_atac_viz) & xlab("UMAP 1") & ylab("UMAP 2") & theme(axis.title = element_text(size = 16))
ggsave(filename = "./EDA_scRNAseq_scATACseq.jpg", height = 7, width = 12, plot = combined_viz, dpi = 300)
```

![](EDA_scRNAseq_scATACseq.jpg){#fig-integration}

See @fig-integration for an illustration. More info: <https://quarto.org/docs/authoring/cross-references.html>


## Identify anchors between scRNA-seq and scATAC-seq data

```{r}
# Estimate the transcriptional activity of each gene by quantifying ATAC-seq counts in the 2 kb-upstream region and gene body
library(Signac)
gene_act <- GeneActivity(object = pbmc_atac_umap, features = VariableFeatures(object = pbmc_rna_umap))
length(VariableFeatures(object = pbmc_rna_umap)) 
dim(gene_act) 
View(gene_act)
gene_act@Dimnames

# Allocate gene activities to a new assay
pbmc_atac_umap[["ACTIVITY"]] <- CreateAssayObject(counts = gene_act)

# Normalize gene activities
DefaultAssay(pbmc_atac_umap) <- "ACTIVITY" 
pbmc_atac_umap <- NormalizeData(object = pbmc_atac_umap, normalization.method = "LogNormalize", scale.factor = 10000)
pbmc_atac_umap <- ScaleData(object = pbmc_atac_umap, features = rownames(pbmc_atac_umap))

# Detect anchors
transfer_anchors <- FindTransferAnchors(reference = pbmc_rna_umap, 
                                        query = pbmc_atac_umap, # Use the gene activity 
                                        features = VariableFeatures(object = pbmc_rna_umap),
                                        reference.assay = "RNA", 
                                        query.assay = "ACTIVITY", 
                                        reduction = "cca")
```

## Label-transfer the annotations from the scRNA-seq cells to scATAC-seq cells

```{r}
celltype_preds <- TransferData(anchorset = transfer_anchors, 
                               refdata = pbmc_rna_umap$seurat_annotations, # Transfer cell type from scRNAseq to scATACseq
                               weight.reduction = pbmc_atac_umap[["lsi"]], 
                               dims = 2:30)

class(pbmc_atac_umap[["lsi"]])

pbmc_atac_umap <- AddMetaData(pbmc_atac_umap, metadata = celltype_preds)

pbmc_atac_umap$predicted.id |> View() # Predicted cell types
pbmc_atac_umap$prediction.score.CD4.Naive |> View() # Confidence score of the predicted cell type CD4 Naive for each ATAC-seq cell
pbmc_atac_umap$prediction.score.max |> View() # Quantify the uncertainty associated with the predicted cell type for each ATAC-seq cell
```

## Compare predicted cell types and ground-truth in scATACseq data through figures and tables
```{r}
pbmc_atac_umap$preds_correct <- pbmc_atac_umap$predicted.id == pbmc_atac$seurat_annotations
sum(pbmc_atac_umap$predicted.id == pbmc_atac$seurat_annotations)/length(pbmc_atac_umap$predicted.id) # Overall accuracy is 0.780922

predict_viz <- DimPlot(pbmc_atac_umap, group.by = "predicted.id", label = TRUE) + 
  NoLegend() + 
  ggtitle("Predicted cell type")

truth_viz <- DimPlot(pbmc_atac_umap, group.by = "seurat_annotations", label = TRUE) + 
  NoLegend() + 
  ggtitle("Actual cell type")

predict_viz | truth_viz

(preds_tab <- table(pbmc_atac$seurat_annotations, pbmc_atac_umap$predicted.id)) # Row names are true cell types

(preds_tab_norm <- preds_tab/rowSums(preds_tab))
# ! Normalize by the number of cells in each true cell type; each cell means the fraction of cells of each type which are correctly predicted among all cells of each type

(preds_tab_df <- as.data.frame(preds_tab_norm))

(preds_heatmap <- ggplot(preds_tab_df, aes(Var1, Var2, fill = Freq)) + geom_tile() + scale_fill_gradient(name = "Fraction of cells",
    low = "#ffffc8", high = "#7d0025") + xlab("Ground-truth cell type") + ylab("Predicted cell type") +
    theme_cowplot() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)))

preds_metadata <- FetchData(pbmc_atac_umap, vars = c("prediction.score.max", "preds_correct"))

correct_num <- length(which(pbmc_atac$seurat_annotations == pbmc_atac_umap$predicted.id))
incorrect_num <- length(which(pbmc_atac$seurat_annotations != pbmc_atac_umap$predicted.id))

(preds_density <- ggplot(preds_metadata, aes(prediction.score.max, fill = preds_correct, colour = preds_correct)) + 
  geom_density(alpha = 0.5) + theme_cowplot() + 
  scale_fill_discrete(name = "Cell type prediction", labels = c(paste0("FALSE (n = ", incorrect_num, ")"), paste0("TRUE (n = ", correct_num, ")"))) + 
  scale_color_discrete(name = "Annotation Correct", labels = c(paste0("FALSE (n = ", incorrect_num, ")"), paste0("TRUE (n = ", correct_num, ")"))) + xlab("Prediction Score"))

preds_heatmap + preds_density
```

## Visualize scRNA-seq and scATAC-seq cells on the same plot
```{r}
genes_target <- VariableFeatures(pbmc_rna_umap)

ref_data <- GetAssayData(pbmc_rna_umap, assay = "RNA", slot = "data")[genes_target, ]

impute_atac <- TransferData(anchorset = transfer_anchors, # Repurpose previous calculated transfer_anchors
                            refdata = ref_data, 
                            weight.reduction = pbmc_atac_umap[["lsi"]],
                            dims = 2:30)

pbmc_atac_umap[["RNA"]] <- impute_atac

coembed_obj <- merge(x = pbmc_rna_umap, y = pbmc_atac_umap)

coembed_obj <- ScaleData(coembed_obj, features = genes_target, do.scale = FALSE, do.center = TRUE)
coembed_obj <- RunPCA(object = coembed_obj, features = genes_target, verbose = FALSE)
coembed_obj <- RunUMAP(object = coembed_obj, dims = 1:30)

DimPlot(coembed_obj, group.by = c("orig.ident", "seurat_annotations"))
```