-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathIntegration_Full_v1.qmd
264 lines (202 loc) · 10.6 KB
/
Integration_Full_v1.qmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
---
title: "Annotate cell types of human PBMC scATACseq datasets using human PBMC scRNAseq as reference"
format: html
editor: visual
execute:
echo: true
---
# Question of interest
How to annotate cells from an scATAC-seq experiment by leveraging a scRNA-seq dataset with annotated cell types?
```{r}
# Load image
load(file = "20240620_scRNA_scATAC_image.RData")
# Save image
save.image(file = "20240620_scRNA_scATAC_image.RData")
```
# Preprocess scRNAseq and scATACseq data
## Attach the libraries {style="blue"}
```{r}
#| label: load-packages
#| message: false
#| echo: false
# install.packages('Seurat')
# devtools::install_github('satijalab/seurat-data')
# BiocManager::install("EnsDb.Hsapiens.v86")
# BiocManager::install("biovizBase")
library(SeuratData)
options(timeout = 1000) # positive integer. The timeout for some Internet operations, in seconds. Default 60 (seconds) but can be set from environment variable R_DEFAULT_INTERNET_TIMEOUT.
# InstallData("pbmcMultiome.SeuratData")
library(Seurat)
packageVersion("Seurat")
# [1] '5.0.3'
library(Signac) # Analysis of Single-Cell Chromatin Data
library(biovizBase)
library(EnsDb.Hsapiens.v86)
# This package loads an SQL connection to a database containing annotations from Ensembl
# This data package was made from resources at Ensembl on Thu May 18 16:32:27 2017 and based on the 86
library(ggplot2)
library(cowplot)
library(Matrix)
```
## Load data
```{r}
pbmc_rna <- LoadData("pbmcMultiome", "pbmc.rna")
pbmc_atac <- LoadData("pbmcMultiome", "pbmc.atac")
dim(pbmc_atac)
dim(pbmc_rna)
```
## Convert pbmc_rna\[\["RNA"\]\] into another class type "Assay5"
```{r}
pbmc_rna[["RNA"]] <- as(pbmc_rna[["RNA"]], Class = "Assay5")
class(pbmc_rna[["RNA"]])
```
## Repeat QC steps
```{r}
pbmc_rna_qc <- subset(pbmc_rna, seurat_annotations != "filtered")
[email protected][["seurat_annotations"]]
dim(pbmc_rna[["RNA"]])
dim(pbmc_rna_qc[["RNA"]])
pbmc_atac_qc <- subset(pbmc_atac, seurat_annotations != "filtered")
```
## Run standard scRNAseq analysis
Workflow: Normalize data -> Identify highly variable features -> Scale data -> Build PCA -> Build UMAP
```{r}
pbmc_rna_norm <- NormalizeData(object = pbmc_rna_qc, normalization.method = "LogNormalize", scale.factor = 10000)
pbmc_rna_vf <- FindVariableFeatures(object = pbmc_rna_norm, selection.method = "vst", clip.max = "auto")
dim(pbmc_rna_vf)
pbmc_rna_scale <- ScaleData(object = pbmc_rna_vf)
pbmc_rna_pca <- RunPCA(pbmc_rna_scale, npcs = 50)
pbmc_rna_umap <- RunUMAP(object = pbmc_rna_pca, dims = 1:30, umap.method = "uwot")
```
## Run standard scATACseq analysis with added gene annotation information
Workflow: Extract genomic ranges from EnsDb object -> Set genome and annotation -> Build TF-IDF normalization -> Find top features -> Build SVD -> Build UMAP
```{r}
annotations <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86, standard.chromosomes = TRUE)
annotations@seqnames
# factor-Rle of length 3021151 with 25 runs
# Lengths: 92459 65287 276555 135207 190802 141516 182677 182178 116305 ... 133785 64670 107112 6155 51812 113543 33885 26
# Values : X 20 1 6 3 7 12 11 4 ... 5 22 10 Y 18 15 21 MT
# Levels(25): X 20 1 6 3 7 12 11 4 17 2 16 8 19 9 13 14 5 22 10 Y 18 15 21 MT
seqlevelsStyle(annotations) <- "UCSC" # Rename the seqlevels of an object according to a given style
annotations@seqnames
# factor-Rle of length 3021151 with 25 runs
# Lengths: 92459 65287 276555 135207 190802 141516 182677 182178 116305 ... 133785 64670 107112 6155 51812 113543 33885 26
# Values : chrX chr20 chr1 chr6 chr3 chr7 chr12 chr11 chr4 ... chr5 chr22 chr10 chrY chr18 chr15 chr21 chrM
# Levels(25): chrX chr20 chr1 chr6 chr3 chr7 chr12 chr11 chr4 chr17 chr2 ... chr9 chr13 chr14 chr5 chr22 chr10 chrY chr18 chr15 chr21 chrM
genome(x = annotations) <- "hg38"
# Set the genome as hg38 for a ChromatinAssay
Annotation(object = pbmc_atac_qc) <- annotations
# Set the annotation as annotations for a ChromatinAssay
pbmc_atac <- RunTFIDF(object = pbmc_atac_qc)
pbmc_atac <- FindTopFeatures(object = pbmc_atac, min.cutoff = "q0")
pbmc_atac <- RunSVD(object = pbmc_atac)
pbmc_atac_umap <- RunUMAP(pbmc_atac,
reduction = "lsi",
dims = 2:30,
reduction.name = "umap_atac",
reduction.key = "atacUMAP_")
pbmc_atac_umap@reductions$umap_atac
```
## Visualize the results from scRNAseq and scATACseq
```{r}
#| label: fig-bill-dims-species
#| fig-cap: A side-by-side UMAP plots of scRNAseq (annotated) and scATACseq cells.
#| fig-width: 11
#| fig-asp: 0.618
#| fig-alt: AA side-by-side UMAP plots of scRNAseq (annotated) and scATACseq cells.
#| warning: false
#| output-location: column-fragment
# Predict annotations for the scATAC-seq cells
pbmc_rna_viz <- DimPlot(pbmc_rna_umap,
group.by = "seurat_annotations",
label = TRUE) +
Seurat::NoLegend() +
ggtitle("RNA")
pbmc_atac_viz <- DimPlot(pbmc_atac_umap,
group.by = "orig.ident",
label = FALSE) +
NoLegend() +
ggtitle("ATAC")
pbmc_rna_viz + pbmc_atac_viz
combined_viz <- (pbmc_rna_viz + pbmc_atac_viz) & xlab("UMAP 1") & ylab("UMAP 2") & theme(axis.title = element_text(size = 16))
ggsave(filename = "./EDA_scRNAseq_scATACseq.jpg", height = 7, width = 12, plot = combined_viz, dpi = 300)
```
![](EDA_scRNAseq_scATACseq.jpg){#fig-integration}
See @fig-integration for an illustration. More info: <https://quarto.org/docs/authoring/cross-references.html>
## Identify anchors between scRNA-seq and scATAC-seq data
```{r}
# Estimate the transcriptional activity of each gene by quantifying ATAC-seq counts in the 2 kb-upstream region and gene body
library(Signac)
gene_act <- GeneActivity(object = pbmc_atac_umap, features = VariableFeatures(object = pbmc_rna_umap))
length(VariableFeatures(object = pbmc_rna_umap))
dim(gene_act)
View(gene_act)
gene_act@Dimnames
# Allocate gene activities to a new assay
pbmc_atac_umap[["ACTIVITY"]] <- CreateAssayObject(counts = gene_act)
# Normalize gene activities
DefaultAssay(pbmc_atac_umap) <- "ACTIVITY"
pbmc_atac_umap <- NormalizeData(object = pbmc_atac_umap, normalization.method = "LogNormalize", scale.factor = 10000)
pbmc_atac_umap <- ScaleData(object = pbmc_atac_umap, features = rownames(pbmc_atac_umap))
# Detect anchors
transfer_anchors <- FindTransferAnchors(reference = pbmc_rna_umap,
query = pbmc_atac_umap, # Use the gene activity
features = VariableFeatures(object = pbmc_rna_umap),
reference.assay = "RNA",
query.assay = "ACTIVITY",
reduction = "cca")
```
## Label-transfer the annotations from the scRNA-seq cells to scATAC-seq cells
```{r}
celltype_preds <- TransferData(anchorset = transfer_anchors,
refdata = pbmc_rna_umap$seurat_annotations, # Transfer cell type from scRNAseq to scATACseq
weight.reduction = pbmc_atac_umap[["lsi"]],
dims = 2:30)
class(pbmc_atac_umap[["lsi"]])
pbmc_atac_umap <- AddMetaData(pbmc_atac_umap, metadata = celltype_preds)
pbmc_atac_umap$predicted.id |> View() # Predicted cell types
pbmc_atac_umap$prediction.score.CD4.Naive |> View() # Confidence score of the predicted cell type CD4 Naive for each ATAC-seq cell
pbmc_atac_umap$prediction.score.max |> View() # Quantify the uncertainty associated with the predicted cell type for each ATAC-seq cell
```
## Compare predicted cell types and ground-truth in scATACseq data through figures and tables
```{r}
pbmc_atac_umap$preds_correct <- pbmc_atac_umap$predicted.id == pbmc_atac$seurat_annotations
sum(pbmc_atac_umap$predicted.id == pbmc_atac$seurat_annotations)/length(pbmc_atac_umap$predicted.id) # Overall accuracy is 0.780922
predict_viz <- DimPlot(pbmc_atac_umap, group.by = "predicted.id", label = TRUE) +
NoLegend() +
ggtitle("Predicted cell type")
truth_viz <- DimPlot(pbmc_atac_umap, group.by = "seurat_annotations", label = TRUE) +
NoLegend() +
ggtitle("Actual cell type")
predict_viz | truth_viz
(preds_tab <- table(pbmc_atac$seurat_annotations, pbmc_atac_umap$predicted.id)) # Row names are true cell types
(preds_tab_norm <- preds_tab/rowSums(preds_tab))
# ! Normalize by the number of cells in each true cell type; each cell means the fraction of cells of each type which are correctly predicted among all cells of each type
(preds_tab_df <- as.data.frame(preds_tab_norm))
(preds_heatmap <- ggplot(preds_tab_df, aes(Var1, Var2, fill = Freq)) + geom_tile() + scale_fill_gradient(name = "Fraction of cells",
low = "#ffffc8", high = "#7d0025") + xlab("Ground-truth cell type") + ylab("Predicted cell type") +
theme_cowplot() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)))
preds_metadata <- FetchData(pbmc_atac_umap, vars = c("prediction.score.max", "preds_correct"))
correct_num <- length(which(pbmc_atac$seurat_annotations == pbmc_atac_umap$predicted.id))
incorrect_num <- length(which(pbmc_atac$seurat_annotations != pbmc_atac_umap$predicted.id))
(preds_density <- ggplot(preds_metadata, aes(prediction.score.max, fill = preds_correct, colour = preds_correct)) +
geom_density(alpha = 0.5) + theme_cowplot() +
scale_fill_discrete(name = "Cell type prediction", labels = c(paste0("FALSE (n = ", incorrect_num, ")"), paste0("TRUE (n = ", correct_num, ")"))) +
scale_color_discrete(name = "Annotation Correct", labels = c(paste0("FALSE (n = ", incorrect_num, ")"), paste0("TRUE (n = ", correct_num, ")"))) + xlab("Prediction Score"))
preds_heatmap + preds_density
```
## Visualize scRNA-seq and scATAC-seq cells on the same plot
```{r}
genes_target <- VariableFeatures(pbmc_rna_umap)
ref_data <- GetAssayData(pbmc_rna_umap, assay = "RNA", slot = "data")[genes_target, ]
impute_atac <- TransferData(anchorset = transfer_anchors, # Repurpose previous calculated transfer_anchors
refdata = ref_data,
weight.reduction = pbmc_atac_umap[["lsi"]],
dims = 2:30)
pbmc_atac_umap[["RNA"]] <- impute_atac
coembed_obj <- merge(x = pbmc_rna_umap, y = pbmc_atac_umap)
coembed_obj <- ScaleData(coembed_obj, features = genes_target, do.scale = FALSE, do.center = TRUE)
coembed_obj <- RunPCA(object = coembed_obj, features = genes_target, verbose = FALSE)
coembed_obj <- RunUMAP(object = coembed_obj, dims = 1:30)
DimPlot(coembed_obj, group.by = c("orig.ident", "seurat_annotations"))
```