-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPseudobulk_DE.Rmd
154 lines (131 loc) · 5.15 KB
/
Pseudobulk_DE.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
---
title: "Pseudobulk Differential Expression Analysis in Single Cell RNA-Seq Data"
author: "Anni Liu"
date: '`r format(Sys.Date(), "%B %d, %Y")`'
output:
html_document:
code_folding: show
---
# Attach the libraries
```{r}
# Reference: https://github.com/ScienceComputing/Project_Programming/blob/main/UsefulR/Attach_Download_Package.R
bio_pkgs <- c("edgeR", "S4Vectors", "SingleCellExperiment", "apeglm", "DESeq2")
gen_pkgs <- c("data.table", "tidyverse", "Matrix.utils", "Matrix", "reshape2", "pheatmap", "png", "RColorBrewer", "cowplot", "readr")
for (pkg in c(bio_pkgs, gen_pkgs)) {
if (!require(pkg, character.only = T) & (pkg %in% bio_pkgs)) {
BiocManager::install(pkg)
library(pkg, character.only = T)
} else if (!require(pkg, character.only = T) & (pkg %in% gen_pkgs)) {
install.packages(pkg)
library(pkg, character.only = T)
} else {
library(pkg, character.only = T)
}
}
```
# Load the data
```{r}
# Reference: https://github.com/satijalab/seurat/wiki/Assay
seurat_obj <- read_rds(multi_seurat_obj_harmony_2.rds)
seurat_count <- seurat_obj@assays$RNA@counts
seurat_meta <- [email protected]
seurat_meta$cluster_id <- factor([email protected])
sce_obj <- SingleCellExperiment(assays = list(counts = seurat_count), colData = seurat_meta)
write_rds(sce_obj, file = "sce_obj.rds")
```
# Explore the data
```{r}
assays(sce_obj)
sce_count <- counts(sce_obj)
dim(sce_count); sce_count[1:10, 1:10]
sce_meta <- colData(sce_obj)
dim(sce_meta); head(sce_meta)
```
# Generate the pseudobulk count by cell type
```{r}
(cluster_name <- levels(sce_meta$cluster_id))
(sample_name <- levels(sce_meta$sample_id))
col_name <- c("cluster_id", "sample_id")
pb_group <- sce_meta[, col_name]; head(pb_group) # Record the sample id and cluster id per barcode/cell
pb_count <- aggregate.Matrix(t(sce_count), groupings = pb_group, fun = "sum")
structure(pb_count); pb_count[1:10, 1:10]
pb_count_t <- t(pb_count)
col_prefix <- sub("_.*", "", colnames(pb_count_t)) # tstrsplit(colnames(pb_count_t), "_")[[1]]
num_cluster <- length(cluster_name)
pb_count_list <- vector(mode = "list", length = num_cluster)
for (i in 1:num_cluster) {
col_target <- which(col_prefix == cluster_name[i])
pb_count_t[, col_target] -> pb_count_list[[i]]
cluster_name[i] -> names(pb_count_list)[i]
}
write_rds(pb_count_list, file = "pb_count_list.rds")
```
# Format the metadata
```{r}
col_name <- c("group_id", "patient_id", "sample_id") # !
pb_meta <- as.data.frame(sce_meta)[, col_name]
pb_meta <- pb_meta[!duplicated(pb_meta), ]
dim(pb_meta); head(pb_meta)
rownames(pb_meta) <- pb_meta$sample_id
# Estimate the frequency of cells mapping to each cluster per sample
freq_cell_cluster_sample <- table(sce_meta$cluster_id, sce_meta$sample_id)
pb_meta_list <- vector(mode = "list", length = num_cluster)
for (i in 1:num_cluster) {
meta_df <- data.frame(cluster_sample_id = colnames(pb_count_list[[i]]))
meta_df$cluster_id <- sub("_.*", "", meta_df$cluster_sample_id) # !
meta_df$sample_id <- sub(".*_", "", meta_df$cluster_sample_id) # !
cell_count_per_sample <- freq_cell_cluster_sample[which(rownames(freq_cell_cluster_sample) == unique(meta_df$cluster_id)), ]
cell_count_per_sample_filter <- cell_count_per_sample[cell_count_per_sample > 0]
sample_order <- match(meta_df$sample_id, colnames(cell_count_per_sample_filter))
cell_count_per_sample_order <- cell_count_per_sample_filter[, sample_order]
meta_df$cell_count <- cell_count_per_sample_order
meta_df_2 <- plyr::join(x = meta_df, y = pb_meta, by = intersect(colnames(meta_df), colnames(pb_meta)), type = "left")
rownames(meta_df_2) <- meta_df_2$cluster_sample_id
pb_meta_list[[i]] <- meta_df_2
names(pb_meta_list)[i] <- unique(meta_df_2$cluster_id)
}
str(pb_meta_list)
write_rds(pb_meta_list, file = "pb_meta_list.rds")
```
# Perform DESeq2-based differential expression analysis
## Build a DESeq2 object
```{r}
all(names(pb_count_list) == names(pb_meta_list))
cluster_target <- which(names(pb_count_list) == "NK cells")
nk_count <- pb_count_list[[cluster_target]]; nk_count[1:10, 1:10]
nk_meta <- pb_meta_list[[cluster_target]]; head(nk_meta)
all(colnames(nk_count) == rownames(nk_meta))
dds <- DESeqDataSetFromMatrix(countData = nk_count, colData = nk_meta, design = ~ group_id)
```
## Sample-level quality control
```{r}
vsd <- vst(dds, blind = T)
DESeq2::plotPCA(vsd, ntop = 500, intgroup = "group_id")
DESeq2::plotPCA(vsd, ntop = 500, intgroup = "cell_count")
vsd_cor <- cor(assay(vsd))
pheatmap(vsd_cor, annotation = nk_meta[, c("group_id"), drop = F])
```
## Model raw counts per gene using `DESeq`
```{r}
dds <- DESeq(dds)
plotDispEsts(dds)
```
## Test for differential expression
```{r}
resultsNames(dds)
res <- results(dds, name = "group_id_kd_vs_wt",
alpha = 0.05)
res_shrink <- lfcShrink(dds = dds, coef = "group_id_kd_vs_wt", res = res, type = "apeglm")
res_all <- res_shrink |>
data.frame() |>
rownames_to_column(var = "gene") |>
as_tibble() |>
dplyr::arrange(padj)
res_sig <- dplyr::filter(res_all, padj < 0.05) |>
dplyr::arrange(padj)
log2FC_up <- 0.58
log2FC_down <- -1
res_sig_up <- dplyr::filter(res_sig, log2FoldChange >= log2FC_up)
res_sig_down <- dplyr::filter(res_sig, log2FoldChange <= log2FC_down)
```
# Visualization