SingleCellRNASeq/Seurat/SkinCell.Rmd

---
title: "Analysis of Mouse Skin Single Cell RNA-Seq Data with Seurat"
author: "Anni Liu"
date: '`r format(Sys.Date(), "%B %d, %Y")`'
output: 
  html_document:
    code_folding: show
---

```{r, shortcut, include=FALSE}
## RStudio keyboard shortcut
# Cursor at the beginning of a command line: Ctrl+A
# Cursor at the end of a command line: Ctrl+E
# Clear all the code from your console: Ctrl+L
# Create a pipe operator %>%: Ctrl+Shift+M (Windows) or Cmd+Shift+M (Mac)
# Create an assignment operator <-: Alt+- (Windows) or Option+-(Mac) 
# Knit a document (knitr): Ctrl+Shift+K (Windows) or Cmd+Shift+K (Mac)
# Comment or uncomment current selection: Ctrl+Shift+C (Windows) or Cmd+Shift+C (Mac)
```

# Load `Cell Ranger` results into `Seurat` object
```{r}
# One-time setup if Seurat is not installed yet:
# install.packages("Seurat")
library(Seurat)

# Sample label; "CTRL" stands for the control sample.
samID <- "CTRL"

# Read10X() reads the three Cell Ranger files from the folder:
# barcodes.tsv.gz, features.tsv.gz (gene ID) and matrix.mtx.gz.
x10_file <- Read10X(data.dir = "./data/CTRL/")

# Build the Seurat object.
#   min.cells:    include features detected in at least this many cells.
#   min.features: include cells where at least this many features are detected.
obj <- CreateSeuratObject(
  counts = x10_file,
  project = samID,
  min.cells = 5,
  min.features = 200
)

# Store the sample label as the `dset` column of the cell metadata
# (inspect it with obj@meta.data[["dset"]]).
obj$dset <- samID

# Optional: prefix every cell barcode with the sample label
# (add.cell.id is the prefix added to the cell names).
obj <- RenameCells(object = obj, add.cell.id = samID)

# Mitochondrial content per cell. PercentageFeatureSet() returns the
# percentage of all counts that belong to the features matching `pattern`.
obj[["percent.mt"]] <- PercentageFeatureSet(object = obj, pattern = "^mt-")
```


# View the cell meta information
```{r}
# `meta.data` holds the per-cell information: the cell barcode (row names),
# mitochondrial content, dataset label, clustering results, etc.
meta_preview <- obj@meta.data[1:6, ]
meta_preview

# The row names are the cell barcodes.
rownames(meta_preview)

# Values for the first cell. The numbers quoted below are the ones the author
# noted for the CTRL data; they will differ for other inputs.
obj@meta.data[["nCount_RNA"]][1]   # read count of this cell (1290 reads noted)
obj@meta.data[["nFeature_RNA"]][1] # genes detected in this cell (535 noted)
obj@meta.data[["percent.mt"]][1]   # percent mitochondrial reads (43% noted)

# Number of cells kept in the object (7939 noted).
dim(obj@meta.data)[1]
```


# View the raw count matrix
```{r}
# Use the GetAssayData() accessor rather than reaching into S4 slots
# (obj@assays$RNA@counts): the slot layout is an implementation detail that
# differs between Seurat assay classes, while the accessor is the supported API.
# On Seurat >= 5 the argument is called `layer`; `slot` is still accepted there.

# Raw count matrix: each row is a gene symbol, each column is a cell.
head(GetAssayData(obj, assay = "RNA", slot = "counts"))

# The "data" slot is where normalized expression is stored. NormalizeData()
# has not been run at this point of the workflow, so it does not hold
# normalized values yet (in Seurat v3/v4 it is still a copy of the raw counts).
head(GetAssayData(obj, assay = "RNA", slot = "data"))
```


# Evaluate the data
```{r}
# Per-cell QC metrics:
#   nCount_RNA:   read counts per cell
#   nFeature_RNA: number of genes per cell
#   percent.mt:   mitochondrial percentage per cell
qc_metrics <- c("nCount_RNA", "nFeature_RNA", "percent.mt")

# Violin plots of the three metrics; pt.size sets the size of the points
# drawn over each violin.
VlnPlot(object = obj, features = qc_metrics, pt.size = 0.2)

# Read counts against gene counts. Points in the upper right corner suggest
# droplets in which two or more cells were captured together.
FeatureScatter(object = obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")

# Read counts against mitochondrial content. Points in the upper left corner
# suggest cell debris: low nuclear gene content (nCount_RNA) together with
# high mitochondrial gene content (percent.mt).
FeatureScatter(object = obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
```


# Remove the droplets/cells with high mitochondrial content and low read counts
```{r}
# Distribution of the mitochondrial content before filtering.
summary(obj@meta.data[["percent.mt"]])

# Keep a copy of the unfiltered object for the before/after comparison.
obj_unfiltered <- obj

# Cut-off for the mitochondrial content. 10 percent is the usual choice; for
# cell types with a high mitochondrial content (e.g. immune or muscle cells)
# a value above 10 may be more appropriate.
mt_cutH <- 10

# Keep only the cells below the cut-off.
obj <- subset(obj, subset = percent.mt < mt_cutH)
```


# Evaluate the filtered data
```{r}
# Each plot is drawn twice: first for the unfiltered object, then for the
# filtered one, so the effect of the mitochondrial cut-off can be compared.
qc_features <- c("nCount_RNA", "nFeature_RNA", "percent.mt")

# Violin plots of the QC metrics.
VlnPlot(object = obj_unfiltered, features = qc_features, pt.size = 0.2)
VlnPlot(object = obj, features = qc_features, pt.size = 0.2)

# Read counts against gene counts.
FeatureScatter(object = obj_unfiltered, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
FeatureScatter(object = obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")

# Read counts against mitochondrial content.
FeatureScatter(object = obj_unfiltered, feature1 = "nCount_RNA", feature2 = "percent.mt")
FeatureScatter(object = obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
```


# Determine the cell cycle phase 
```{r}
# TD: update the following codes
# Convert the human marker genes for S phase and G2/M phase into the corresponding mouse genes
# Ref: https://bioconductor.org/packages/release/bioc/vignettes/biomaRt/inst/doc/accessing_ensembl.html#step1-identifying-the-database-you-need
# Map human gene symbols to mouse gene symbols through Ensembl BioMart.
#
# Arguments:
#   x:      character vector of human HGNC symbols.
#   mirror: Ensembl mirror passed to biomaRt::useEnsembl() (default "www").
# Returns:
#   Character vector of the unique, non-empty MGI symbols linked to `x`;
#   character(0) when `x` is empty.
# Errors:
#   Stops when the biomaRt package is not installed. Needs network access.
# NOTE(review): cross-species getLDS() queries have been reported to fail
# against some Ensembl releases; if that happens, pointing useEnsembl() at an
# archived release may be needed -- confirm against the biomaRt documentation.
convertHumanGeneList <- function(x, mirror = "www") {
  # Nothing to look up: avoid opening remote connections at all.
  if (length(x) == 0) {
    return(character(0))
  }

  # Fail loudly; require() would only return FALSE and let the next call
  # fail with an unrelated "could not find function" error.
  if (!requireNamespace("biomaRt", quietly = TRUE)) {
    stop(
      "Package 'biomaRt' is required to convert gene symbols.",
      call. = FALSE
    )
  }

  human <- biomaRt::useEnsembl(
    biomart = "ensembl",
    dataset = "hsapiens_gene_ensembl",
    mirror = mirror
  )
  mouse <- biomaRt::useEnsembl(
    biomart = "ensembl",
    dataset = "mmusculus_gene_ensembl",
    mirror = mirror
  )

  # Linked query: column 1 is the human symbol, column 2 the mouse symbol.
  orthologs <- biomaRt::getLDS(
    attributes = "hgnc_symbol",
    filters = "hgnc_symbol",
    values = x,
    mart = human,
    attributesL = "mgi_symbol",
    martL = mouse,
    uniqueRows = TRUE
  )

  mouse_symbols <- unique(as.character(orthologs[, 2]))
  # Missing or empty symbols cannot be used as feature names; drop them.
  mouse_symbols[!is.na(mouse_symbols) & nzchar(mouse_symbols)]
}
# Translate the human cell-cycle marker lists in `cc.genes` into mouse
# symbols: first the S phase markers, then the G2/M phase markers.
s_gene <- convertHumanGeneList(x = cc.genes[["s.genes"]])
g2m_gene <- convertHumanGeneList(x = cc.genes[["g2m.genes"]])
```


# Estimate the cell cycle
```{r}
# Normalize before scoring. The object still holds raw counts only
# (NormalizeData() has not been called earlier in this workflow), and the
# Seurat cell-cycle workflow scores normalized expression, not raw counts.
obj <- NormalizeData(obj)

# Score each cell for S and G2/M phase. This adds the metadata columns
# S.Score, G2M.Score and Phase; set.ident = TRUE also makes Phase the
# active identity of the cells.
obj <- CellCycleScoring(
  obj,
  s.features = s_gene,
  g2m.features = g2m_gene,
  set.ident = TRUE
)
obj@meta.data[1:2, ]

# Number of cells per dataset (Var1) and cell cycle phase (Var2).
yd_dat <- as.data.frame(table(obj@meta.data$dset, obj@meta.data$Phase))
head(yd_dat)

# Stacked bar plot of the phase counts.
library(ggplot2)
ggplot(yd_dat, aes(x = Var1, y = Freq, fill = Var2)) +
  geom_col(position = "stack") +
  labs(x = "", y = "Counts", fill = "Phase") +
  theme_classic()
```


# *Normalize the data
```{r}
# Select highly variable genes. RunPCA() uses the variable features by
# default and stops with "Variable features haven't been set" when there are
# none, and nothing earlier in this workflow sets them.
obj <- FindVariableFeatures(obj, selection.method = "vst", nfeatures = 2000)

# Scale the data while regressing out mitochondrial content and cell cycle.
# CellCycleScoring() names its columns "S.Score" and "G2M.Score" (capital S
# in "Score"); names that do not match a metadata column are not regressed.
# features = rownames(obj) keeps every gene in the scaled matrix, so marker
# genes outside the variable set can still be drawn by DoHeatmap().
obj <- ScaleData(
  obj,
  features = rownames(obj),
  vars.to.regress = c("percent.mt", "S.Score", "G2M.Score", "Phase")
)
```


# Construct the principal component analysis 
```{r}
set.seed(1000)

# Principal component analysis.
#   npcs:    total number of PCs to compute and store.
#   verbose: whether to print the top genes associated with high/low
#            loadings for the PCs.
pcs_to_compute <- 30
obj <- RunPCA(object = obj, npcs = pcs_to_compute, verbose = FALSE)

# Elbow plot to estimate how many principal components to keep;
# ndims is the number of PCs shown.
ElbowPlot(object = obj, ndims = pcs_to_compute)

# Number of PCs carried forward, chosen from the elbow plot.
numPC <- 15
```


# Compute the nearest neighbor graph
```{r}
set.seed(1000)

# Number of PCs to use, as determined by the elbow plot.
maxPC <- numPC
pc_dims <- seq_len(maxPC)

# Build the shared nearest-neighbor (SNN) graph on the selected PCs.
obj <- FindNeighbors(object = obj, reduction = "pca", dims = pc_dims)

# Cluster the cells with the SNN modularity optimization based algorithm.
# resolution: use a value above (below) 1.0 to obtain a larger (smaller)
# number of clusters.
obj <- FindClusters(object = obj, resolution = 0.5)
```


# Visualize the clusters with UMAP and t-SNE
```{r}
# Compute both two-dimensional embeddings from the same PCs used for clustering.
embedding_dims <- seq_len(maxPC)
obj <- RunUMAP(object = obj, reduction = "pca", dims = embedding_dims)
obj <- RunTSNE(object = obj, reduction = "pca", dims = embedding_dims)

# UMAP plot.
DimPlot(object = obj, reduction = "umap")

# t-SNE plot.
DimPlot(object = obj, reduction = "tsne")
```


# Identify marker genes for each cluster
```{r}
# Make the cluster assignment (obj@meta.data$seurat_clusters) the active identity.
Idents(obj) <- "seurat_clusters"

# Find the positive marker genes of every cluster.
#   min.pct:         only test genes detected in at least this fraction of
#                    cells in either of the two populations; skipping rarely
#                    expressed genes speeds the function up.
#   logfc.threshold: only test genes showing, on average, at least this
#                    difference (log-scale) between the two groups; a higher
#                    value is faster but can miss weaker signals.
# The defaults of both arguments depend on the Seurat version; see
# ?FindAllMarkers.
clust.markers <- FindAllMarkers(
  object = obj,
  only.pos = TRUE,
  min.pct = 0.25,
  logfc.threshold = 0.25
)
head(clust.markers)
```


# Select top 5 marker genes for each cluster
```{r}
# dplyr provides %>%, group_by(), slice_max() and ungroup(); it has to be
# attached here because nothing earlier in the document loads it.
library(dplyr)

# For each cluster keep the 5 marker genes with the largest avg_log2FC
# (a column of `clust.markers`). slice_max() replaces the superseded top_n();
# like top_n() it keeps ties, and it returns the rows sorted within each
# cluster. ungroup() stops the grouping from leaking into later steps.
topG <- clust.markers %>%
  group_by(cluster) %>%
  slice_max(order_by = avg_log2FC, n = 5) %>%
  ungroup()
head(topG)
```


# Advanced plots
## Adjust point size
```{r}
# UMAP plot with a custom point size (pt.size).
DimPlot(object = obj, reduction = "umap", pt.size = 0.2)
```


## Label clusters
```{r}
# UMAP plot with the cluster labels drawn on the plot itself. The legend is
# dropped because the labels already identify the clusters.
umap_labelled <- DimPlot(object = obj, reduction = "umap", pt.size = 0.2, label = TRUE)
umap_labelled + NoLegend()
```


## *Make a heatmap of top 5 marker gene expression
```{r}
# Heatmap of the top marker genes selected per cluster, without the legend.
marker_heatmap <- DoHeatmap(object = obj, features = topG[["gene"]])
marker_heatmap + NoLegend()
```


## Make a feature plot of marker gene expression
```{r}
# Show where on the UMAP each marker gene is expressed, and in roughly what
# share of the cells.
gene_marker <- c("Krt1", "Pthlh", "Krt14", "Cenpa", "Shh")
FeaturePlot(object = obj, features = gene_marker, pt.size = 0.2)
```


## !Make a ridge plot of marker gene expression
```{r}
# Ridge plot: distribution of the expression level of each marker gene
# within every cluster.
RidgePlot(object = obj, features = gene_marker)
```


# Compare the cell cycle phases in each cluster
```{r}
# Cells per cluster (rows) and cell cycle phase (columns).
tbl <- table(obj@meta.data$seurat_clusters, obj@meta.data$Phase)
tbl_dat <- as.data.frame(tbl)

# Total number of cells per cluster; rowSums() already names the totals by
# cluster.
to <- rowSums(tbl)

# Attach to every row the total of ITS cluster by looking the cluster up by
# name. The lookup must be driven by tbl_dat$Var1: indexing with
# match(names(to), tbl_dat$Var1) yields one value per cluster only and is
# then silently recycled over the rows.
tbl_dat$to <- unname(to[as.character(tbl_dat$Var1)])
tbl_dat$prop <- tbl_dat$Freq / tbl_dat$to
tbl_dat[1:2, ]

# Stacked bar plot of the phase proportions per cluster.
ggplot(tbl_dat, aes(x = Var1, y = prop, fill = Var2)) +
  geom_col(position = "stack") +
  labs(x = "Seurat_clusters", y = "Proportion", fill = "Phase") +
  theme_classic()

# One cluster may need to be removed because of a potential cell cycle bias:
# it has a much smaller proportion of cells in G1 phase.
# NOTE(review): the original note says "the second cluster" while the code
# drops the cluster labelled 2. Seurat numbers clusters from 0, so label 2 is
# the third cluster -- confirm which one is meant.
cluster_to_drop <- "2"
cellID <- rownames(obj@meta.data)[obj@meta.data$seurat_clusters != cluster_to_drop]
obj_rm <- obj[, cellID]
saveRDS(obj_rm, "scSeq_Seurat_clean.rds")
```


# More ...
## Manipulate data with Seurat
```{r}
library(Seurat)

# Read the saved Seurat object from disk.
obj2 <- readRDS(file = "./data/scSeq_CKO_1kCell_ori.rds")

# Mitochondrial content of each cell: percentage of counts coming from the
# features whose names match "^mt-".
obj2[["percent.mt"]] <- PercentageFeatureSet(object = obj2, pattern = "^mt-")

# First six rows of the cell metadata.
obj2@meta.data[1:6, ]
```


## Perform the quality control
```{r}
# QC metrics per cell: read counts (nCount_RNA), gene counts (nFeature_RNA)
# and mitochondrial content (percent.mt).
qc_features <- c("nCount_RNA", "nFeature_RNA", "percent.mt")

# Violin plot of each metric.
VlnPlot(object = obj2, features = qc_features, cols = "#82AC7C", pt.size = 0.3)

# nCount_RNA vs nFeature_RNA. Points in the upper right corner suggest
# droplets in which two or more cells were captured together.
FeatureScatter(object = obj2, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")

# nCount_RNA vs percent.mt. Points in the upper left corner may be cell
# debris: low nuclear gene content (nCount_RNA) together with high
# mitochondrial gene content (percent.mt).
FeatureScatter(object = obj2, feature1 = "nCount_RNA", feature2 = "percent.mt")

# Remove the cells with percent.mt >= 10.
summary(obj2@meta.data[["percent.mt"]])
mt.cutoff <- 10
obj3 <- subset(obj2, subset = percent.mt < mt.cutoff)

# Compare the data before (obj2) and after (obj3) filtering.
VlnPlot(object = obj2, features = qc_features, pt.size = 0.3)
VlnPlot(object = obj3, features = qc_features, pt.size = 0.3)

FeatureScatter(object = obj2, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
FeatureScatter(object = obj3, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")

FeatureScatter(object = obj2, feature1 = "nCount_RNA", feature2 = "percent.mt")
FeatureScatter(object = obj3, feature1 = "nCount_RNA", feature2 = "percent.mt")
```


## Evaluate the cell cycle
```{r}
# Estimate the cell cycle phase of each cell and make a table to describe how many cells per phase
## Convert the human marker genes for S phase and G2/M phase into the corresponding mouse genes
# Map human gene symbols to mouse gene symbols through Ensembl BioMart.
#
# Arguments:
#   x:      character vector of human HGNC symbols.
#   mirror: Ensembl mirror passed to biomaRt::useEnsembl() (default "useast").
# Returns:
#   Character vector of the unique, non-empty MGI symbols linked to `x`;
#   character(0) when `x` is empty.
# Errors:
#   Stops when the biomaRt package is not installed. Needs network access.
# NOTE(review): cross-species getLDS() queries have been reported to fail
# against some Ensembl releases; if that happens, pointing useEnsembl() at an
# archived release may be needed -- confirm against the biomaRt documentation.
convertHumanGeneList <- function(x, mirror = "useast") {
  # Nothing to look up: avoid opening remote connections at all.
  if (length(x) == 0) {
    return(character(0))
  }

  # Fail loudly; require() would only return FALSE and let the next call
  # fail with an unrelated "could not find function" error.
  if (!requireNamespace("biomaRt", quietly = TRUE)) {
    stop(
      "Package 'biomaRt' is required to convert gene symbols.",
      call. = FALSE
    )
  }

  human <- biomaRt::useEnsembl(
    biomart = "ensembl",
    dataset = "hsapiens_gene_ensembl",
    mirror = mirror
  )
  mouse <- biomaRt::useEnsembl(
    biomart = "ensembl",
    dataset = "mmusculus_gene_ensembl",
    mirror = mirror
  )

  # Linked query: column 1 is the human symbol, column 2 the mouse symbol.
  orthologs <- biomaRt::getLDS(
    attributes = "hgnc_symbol",
    filters = "hgnc_symbol",
    values = x,
    mart = human,
    attributesL = "mgi_symbol",
    martL = mouse,
    uniqueRows = TRUE
  )

  mouse_symbols <- unique(as.character(orthologs[, 2]))
  # Missing or empty symbols cannot be used as feature names; drop them.
  mouse_symbols[!is.na(mouse_symbols) & nzchar(mouse_symbols)]
}
# Mouse versions of the S phase and G2/M phase marker genes.
s_gene <- convertHumanGeneList(cc.genes$s.genes)
g2m_gene <- convertHumanGeneList(cc.genes$g2m.genes)

# Score each cell of the filtered object. This adds the metadata columns
# S.Score, G2M.Score and Phase; set.ident = TRUE also makes Phase the
# active identity.
# NOTE(review): assumes the expression data stored in the loaded object is
# already normalized; if it is raw counts, run NormalizeData(obj3) first --
# confirm how the .rds file was produced.
obj3 <- CellCycleScoring(
  obj3,
  s.features = s_gene,
  g2m.features = g2m_gene,
  set.ident = TRUE
)

# Inspect the object that was just scored (obj3, not the `obj` object from
# the first part of the document).
obj3@meta.data[1:2, ]

## Show the number of cells in each cell cycle
# Group by the `dset` column when the loaded object has one; otherwise fall
# back to `orig.ident`, which Seurat objects carry by default.
phase_group <- if ("dset" %in% colnames(obj3@meta.data)) "dset" else "orig.ident"
as.data.frame(table(obj3@meta.data[[phase_group]], obj3@meta.data$Phase))
```


## Normalize the data 
```{r}
# RunPCA() uses the variable features by default and stops when none are
# set. Whether the loaded object already carries them cannot be told from
# here, so select them only when they are missing.
if (length(VariableFeatures(obj3)) == 0) {
  obj3 <- FindVariableFeatures(obj3, selection.method = "vst", nfeatures = 2000)
}

# Scale the data while regressing out mitochondrial content (percent.mt) and
# cell cycle (S.Score, G2M.Score, Phase). CellCycleScoring() names its
# columns "S.Score" and "G2M.Score" (capital S in "Score"); names that do not
# match a metadata column are not regressed.
# features = rownames(obj3) keeps every gene in the scaled matrix, so marker
# genes outside the variable set can still be drawn by DoHeatmap().
obj3 <- ScaleData(
  obj3,
  features = rownames(obj3),
  vars.to.regress = c("percent.mt", "S.Score", "G2M.Score", "Phase")
)
```


## Cluster the data
```{r}
# Run the principal component analysis (PCA) and use the elbow plot to
# estimate how many PCs best represent this data.
set.seed(1000)
pcs_to_compute <- 30
obj3 <- RunPCA(object = obj3, npcs = pcs_to_compute, verbose = FALSE)
ElbowPlot(object = obj3, ndims = pcs_to_compute)

# Number of PCs carried forward.
numPC <- 10
maxPC <- numPC
pc_dims <- seq_len(maxPC)

# Nearest neighbor graph on the selected PCs, followed by clustering.
obj3 <- FindNeighbors(object = obj3, reduction = "pca", dims = pc_dims)
obj3 <- FindClusters(object = obj3, resolution = 0.5)

# UMAP and t-SNE embeddings on the same PCs, then one plot of each.
obj3 <- RunUMAP(object = obj3, reduction = "pca", dims = pc_dims)
obj3 <- RunTSNE(object = obj3, reduction = "pca", dims = pc_dims)
DimPlot(object = obj3, reduction = "umap")
DimPlot(object = obj3, reduction = "tsne")
```


## Identify marker genes for each cluster
```{r}
# dplyr provides %>%, group_by(), slice_max() and ungroup(); it has to be
# attached here because nothing earlier in the document loads it.
library(dplyr)

# Find the positive marker genes of each cluster, with the cluster
# assignment as the active identity.
obj3 <- SetIdent(obj3, value = "seurat_clusters")
cluster.marker <- FindAllMarkers(
  obj3,
  only.pos = TRUE,
  min.pct = 0.25,
  logfc.threshold = 0.25
)

# For each cluster keep the five markers with the largest avg_log2FC.
# slice_max() replaces the superseded top_n(); like top_n() it keeps ties,
# and it returns the rows sorted within each cluster. ungroup() stops the
# grouping from leaking into later steps.
top5.gene <- cluster.marker %>%
  group_by(cluster) %>%
  slice_max(order_by = avg_log2FC, n = 5) %>%
  ungroup()
head(top5.gene)

# Heatmap of the selected marker genes, without the legend.
DoHeatmap(obj3, features = top5.gene$gene) +
  NoLegend()
```


## Show the top marker genes of each cluster in FeaturePlot and RidgePlot
```{r}
# Marker genes to plot: the top genes selected for each cluster.
gene.marker <- top5.gene[["gene"]]

# Feature plot: where on the UMAP each marker gene is expressed, and in
# roughly what share of the cells.
FeaturePlot(object = obj3, features = gene.marker, pt.size = 0.2)

# Ridge plot: distribution of the expression level of each marker gene
# within every cluster.
RidgePlot(object = obj3, features = gene.marker)
```