-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDifferentialAnalysis.Rmd
277 lines (242 loc) · 15.4 KB
/
DifferentialAnalysis.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
---
title: "ATAC-Seq Data Analysis with Mouse Tissue Data - Perform Differential and Functional Enrichment Analysis of Peaks"
author: "Anni Liu"
date: '`r format(Sys.Date(), "%B %d, %Y")`'
output:
html_document:
code_folding: show
---
# !Differential ATACseq peaks - to identify changes in read counts of non-redundant peaks in open regions between one condition and another condition
Here we will establish a collection of unique peaks that are found in a minimum of 2 samples. These peaks will serve as the basis for evaluating alterations in nucleosome-free ATAC-seq data using `DESeq2`.
TD: alternative approach [DiffBind](https://bioconductor.org/packages/release/bioc/vignettes/DiffBind/inst/doc/DiffBind.pdf)
## Gplimpse the peak data across samples
```{r}
# Overall goal: to examine peaks across all our samples and condense them into a unified collection of unique peaks. Subsequently, we generate a matrix indicating whether each peak is present or absent in each sample.----
library(GenomicRanges)
library(rtracklayer)
(peak_dir <- dir("peak_data", full.name = T)) # 6 samples
peak_all <- lapply(peak_dir, ChIPQC:::GetGRanges, simple = T)
lapply(1:length(peak_all), function (i) peak_all[[i]] |> head(3))
for (i in 1:length(peak_all)) { print(peak_all[[i]] |> head(3)) }
# GRanges object with 3 ranges and 0 metadata columns:
# seqnames ranges strand
# <Rle> <IRanges> <Rle>
# [1] chr1 3672288-3672369 *
# [2] chr1 3994901-3995006 *
# [3] chr1 4142676-4142762 *
# -------
# seqinfo: 22 sequences from an unspecified genome; no seqlengths
# GRanges object with 3 ranges and 0 metadata columns:
# seqnames ranges strand
# <Rle> <IRanges> <Rle>
# [1] chr1 3994858-3994957 *
# [2] chr1 4560267-4560343 *
# [3] chr1 4571695-4572204 *
# -------
# seqinfo: 22 sequences from an unspecified genome; no seqlengths
# GRanges object with 3 ranges and 0 metadata columns:
# seqnames ranges strand
# <Rle> <IRanges> <Rle>
# [1] chr1 3433991-3434095 *
# [2] chr1 3575895-3575972 *
# [3] chr1 3670961-3671038 *
# -------
# seqinfo: 22 sequences from an unspecified genome; no seqlengths
# GRanges object with 3 ranges and 0 metadata columns:
# seqnames ranges strand
# <Rle> <IRanges> <Rle>
# [1] chr1 3130336-3130413 *
# [2] chr1 3400112-3400250 *
# [3] chr1 3433954-3434042 *
# -------
# seqinfo: 22 sequences from an unspecified genome; no seqlengths
# GRanges object with 3 ranges and 0 metadata columns:
# seqnames ranges strand
# <Rle> <IRanges> <Rle>
# [1] chr1 3671695-3671781 *
# [2] chr1 4466776-4466928 *
# [3] chr1 4602508-4602618 *
# -------
# seqinfo: 22 sequences from an unspecified genome; no seqlengths
# GRanges object with 3 ranges and 0 metadata columns:
# seqnames ranges strand
# <Rle> <IRanges> <Rle>
# [1] chr1 4471412-4471506 *
# [2] chr1 4785622-4785781 *
# [3] chr1 4786096-4786271 *
# -------
# seqinfo: 22 sequences from an unspecified genome; no seqlengths
```
## Obtain a set of non-redundant peaks
```{r}
(peak_set <- GRangesList(peak_all) |> unlist() |> reduce(min.gapwidth=1L))
# reduce first orders the ranges in x from left to right, then merges the overlapping or adjacent ones.
# min.gapwidth: Ranges separated by a gap of at least min.gapwidth positions are not merged.
# GRanges object with 89769 ranges and 0 metadata columns:
# seqnames ranges strand
# <Rle> <IRanges> <Rle>
# [1] chr1 3130336-3130413 *
# [2] chr1 3400112-3400250 *
# [3] chr1 3433954-3434095 *
# [4] chr1 3515020-3515102 *
# [5] chr1 3575895-3576078 *
# ... ... ... ...
# [89765] chrY 90805058-90805151 *
# [89766] chrY 90807391-90807562 *
# [89767] chrY 90808761-90808841 *
# [89768] chrY 90812961-90813237 *
# [89769] chrY 90829708-90829782 *
# -------
# seqinfo: 22 sequences from an unspecified genome; no seqlengths
class(peak_set)
# [1] "GRanges"
# attr(,"package")
# [1] "GenomicRanges"
# IRanges::reduce: transform all the ranges together as a set to produce a new set of ranges.
```
## Filter the non-redundant peaks
```{r}
##----We estimate how often each of these non-redundant peaks occurs in each sample----
overlap_logic <- lapply(peak_all, function(peak_each) peak_set %over% peak_each)
# query %over% subject; *if query overlaps with subject, return T; otherwise, return F.
# Search for interval overlaps between two "range-based" objects: a query and a subject
overlap_mat <- do.call(cbind, overlap_logic)
dim(overlap_mat) # Return a T/F matrix with 89769 rows and 6 columns
colnames(overlap_mat) <- basename(dir("peak_data"))
overlap_mat[1:6, 1:6]
mcols(peak_set) <- overlap_mat # Assign overlap_mat as metadata columns of all_peaks_set
peak_set[1:2, ]
##----*We remove peaks found in the mm10 blacklist and on chrM (mitochondrial chromosome) before conducting differential tests, to eliminate the possibility of artifact differential call results----
blacklist_mm10 <- import.bed("peak_data/ENCFF547MET.bed.gz")
(peak_set_filter <- peak_set[(!peak_set %over% blacklist_mm10) & (seqnames(peak_set) != "chrM")])
##----*We determine how many times each of our non-redundant/reduced peaks appears in the 6 samples, and select peaks found in a minimum of 2 samples----
mcols(peak_set_filter)
# DataFrame with 89654 rows and 6 columns
peak_count_across_samples <- mcols(peak_set_filter) |> as.data.frame() |> rowSums()
table(peak_count_across_samples)
(peak_set_filter_2 <- peak_set_filter[peak_count_across_samples >= 2, ])
# GRanges object with 36320 ranges and 6 metadata columns
```
## Estimate the number of reads affiliated with each non-redundant peak region
```{r}
##----*We count paired reads landing in unique/non-redundant/reduced peak regions that are found in a minimum of 2 samples----
library(GenomicAlignments)
bam2count <- dir("bam_data/", full.names = T, pattern = "*.\\.bam$")
# *: Match any sequence of characters.
# \\.: Match a literal dot (.) character. The double backslash (\\) is used to escape the dot because dot has a special meaning in regular expressions.
# bam$: Match the string "bam" at the end of a file name, followed by the end of the line.
read_count_per_peak <- summarizeOverlaps(features = peak_set_filter_2,
reads = bam2count,
singleEnd = F)
# features is A GRanges or a GRangesList object of genomic regions of interest.
colnames(read_count_per_peak) <- c("hind_brain_1", "hind_brain_2", "kidney_1", "kidney_2", "liver_1", "liver_2")
save(read_count_per_peak, 'peak_data/read_count_per_peak.RData') # Suitable for BMAseq
```
## Run `DESeq2`-based differential tests to examine changes in read counts of non-redundant peaks in open regions between one condition and another condition
```{r}
##----We run DESeq2 for deriving differential ATACseq peaks----
library(DESeq2)
load("peak_data/read_count_per_peak.RData")
atac_count <- read_count_per_peak
meta_data <- data.frame(Group = factor(c("hind_brain", "hind_brain", "kidney", "kidney", "liver", "liver")),
row.names = colnames(atac_count))
atac_dds <- DESeqDataSetFromMatrix(countData = assay(atac_count),
colData = meta_data,
design = ~ Group,
rowRanges = rowRanges(atac_count)) # !
atac_dds <- DESeq(atac_dds)
# estimating size factors
# estimating dispersions
# gene-wise dispersion estimates # This should be changed into peak-wise dispersion estimates
# mean-dispersion relationship
# -- note: fitType='parametric', but the dispersion trend was not well captured by the
# function: y = a/x + b, and a local regression fit was automatically substituted.
# specify fitType='local' or 'mean' to avoid this message next time.
# final dispersion estimates
# fitting model and testing
(kidney_vs_hindbrain <- results(atac_dds, contrast = c("Group", "kidney", "hind_brain"), format = "GRanges"))
# format
# character, either "DataFrame", "GRanges", or "GRangesList", whether the results should be printed as a DESeqResults DataFrame, or if the results DataFrame should be attached as metadata columns to the **GRanges** or GRangesList rowRanges of the DESeqDataSet. If the rowRanges is a GRangesList, and GRanges is requested, the **range of each gene will be returned**
# ! This GRanges format is very important, as it allows us to subset the results to specific open regions (e.g., promoters)
# contrast
# this argument specifies what comparison to extract from the object to build a results table. one of either:
# a character vector with exactly three elements: the name of a factor in the design formula, the name of the numerator level for the fold change, and the name of the denominator level for the fold change (simplest case)
# a list of 2 character vectors: the names of the fold changes for the numerator, and the names of the fold changes for the denominator. these names should be elements of resultsNames(object). if the list is length 1, a second element is added which is the empty character vector, character(). (more general case, can be to combine interaction terms and main effects)
# a numeric contrast vector with one element for each element in resultsNames(object) (most general case)
(kidney_vs_hindbrain_order <- kidney_vs_hindbrain[order(kidney_vs_hindbrain$pvalue)])
(kidney_vs_hindbrain_up_region_500 <- kidney_vs_hindbrain_order[kidney_vs_hindbrain_order$log2FoldChange > 0, ][1:500])
(kidney_vs_hindbrain_down_region_500 <- kidney_vs_hindbrain_order[kidney_vs_hindbrain_order$log2FoldChange < 0, ][1:500])
##----We subset results to open regions within promoters----
library(TxDb.Mmusculus.UCSC.mm10.knownGene)
(promoter_region <- promoters(x = TxDb.Mmusculus.UCSC.mm10.knownGene,
upstream = 500,
downstream = 500))
# **upstream** defines the number of nucleotides toward the 5' end, and **downstream** defines the number toward the 3' end, relative to the transcription start site (TSS). Promoter regions are formed by merging the upstream and downstream ranges.
# promoters: returns an object of the same type and length as x containing promoter ranges. Promoter ranges extend around the transcription start site (TSS) which is defined as start(x) for ranges on the + or * strand and as end(x) for ranges on the - strand. The upstream and downstream arguments define the number of nucleotides in the 5' and 3' direction, respectively. More precisely, the output range is defined as
# (start(x) - upstream) to (start(x) + downstream - 1) -> for ranges on the + or * strand, and as
# (end(x) - downstream + 1) to (end(x) + upstream) -> for ranges on the - strand.
# seqnames ranges strand | tx_id tx_name
# <Rle> <IRanges> <Rle> | <integer> <character>
# ENSMUST00000193812.1 chr1 3072753-3073752 + | 1 ENSMUST00000193812.1
# ENSMUST00000082908.1 chr1 3101516-3102515 + | 2 ENSMUST00000082908.1
# ENSMUST00000192857.1 chr1 3252257-3253256 + | 3 ENSMUST00000192857.1
# ENSMUST00000161581.1 chr1 3466087-3467086 + | 4 ENSMUST00000161581.1
# ENSMUST00000192183.1 chr1 3531295-3532294 + | 5 ENSMUST00000192183.1
# ... ... ... ... . ... ...
# ENSMUST00000184505.1 chrUn_GL456381 16222-17221 - | 142442 ENSMUST00000184505.1
# ENSMUST00000178705.1 chrUn_GL456385 30743-31742 + | 142443 ENSMUST00000178705.1
# ENSMUST00000180206.1 chrUn_GL456385 32219-33218 + | 142444 ENSMUST00000180206.1
# ENSMUST00000179505.7 chrUn_JH584304 59168-60167 - | 142445 ENSMUST00000179505.7
# ENSMUST00000178343.1 chrUn_JH584304 59191-60190 - | 142446 ENSMUST00000178343.1
# -------
# seqinfo: 66 sequences (1 circular) from mm10 genome
(kidney_vs_hindbrain_order_promoter <- kidney_vs_hindbrain_order[(!is.na(kidney_vs_hindbrain_order$padj)) & (kidney_vs_hindbrain_order$padj < 0.05) & (kidney_vs_hindbrain_order %over% promoter_region), ])
# GRanges object with 571 ranges and 6 metadata columns:
# seqnames ranges strand | baseMean log2FoldChange lfcSE stat pvalue padj
# <Rle> <IRanges> <Rle> | <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
# [1] chr4 22488379-22488925 * | 96.8199 -3.484755 0.272029 -12.81021 1.43736e-37 5.01798e-33
# [2] chr1 24613428-24614006 * | 285.2533 1.465584 0.161832 9.05623 1.35040e-19 2.35719e-15
# [3] chr9 3020206-3020334 * | 808.0880 -0.793603 0.112104 -7.07919 1.45004e-12 6.32781e-09
# [4] chr16 18213704-18214070 * | 27.5220 -4.536604 0.676970 -6.70133 2.06525e-11 6.00833e-08
# [5] chr7 4123750-4124017 * | 22.3862 -4.253127 0.649868 -6.54460 5.96544e-11 1.22506e-07
# ... ... ... ... . ... ... ... ... ... ...
# [567] chr5 77265720-77265949 * | 16.8217 1.859305 0.627672 2.96222 0.00305428 0.0489793
# [568] chr5 91402874-91403158 * | 10.0064 2.585540 0.873043 2.96153 0.00306118 0.0490450
# [569] chr12 11436515-11436838 * | 26.0419 -1.188009 0.401368 -2.95990 0.00307737 0.0492365
# [570] chr11 78550637-78550963 * | 73.5612 -0.807302 0.273209 -2.95489 0.00312784 0.0498840
# [571] chr17 44078643-44079094 * | 28.9860 -1.301105 0.440351 -2.95470 0.00312973 0.0498913
# -------
# seqinfo: 22 sequences from an unspecified genome; no seqlengths
# Make a HTML table to review the results in IGV
library(tracktables)
a <- makebedtable(kidney_vs_hindbrain_order_promoter, "kidney_vs_hindbrain_order_promoter.html", 'peak_result')
browseURL(a)
```
## Annotate differential ATACseq regions with gene information
```{r}
library(ChIPseeker)
kidney_vs_hindbrain_order_promoter_annotate <- annotatePeak(
peak = kidney_vs_hindbrain_order_promoter,
TxDb = TxDb.Mmusculus.UCSC.mm10.knownGene,
verbose = F)
atac_promoter_differential_binding <- as.data.frame(kidney_vs_hindbrain_order_promoter_annotate)
atac_promoter_differential_binding |> tail()
```
## Detect functional enrichment of differential ATACseq regions
```{r}
library(clusterProfiler)
atac_go <- enrichGO(gene = atac_differential_binding$geneId,
OrgDb = "org.Mm.eg.db",
ont = "BP",
maxGSSize = 5000) # Maximal size of genes annotated for testing
atac_go@result |> View()
```
## Save DESeq2 and enrichGO results in an excel workbook
```{r}
wb <- createWorkbook()
addWorksheet(wb, sheetName = "Differential_Peaks_Annotated")
addWorksheet(wb, sheetName = "Functional_Enrichment")
writeData(wb, sheet = 1, atac_promoter_differential_binding)
writeData(wb, sheet = 2, atac_go)
saveWorkbook(wb, file = "peak_result/promoter_diff_atac_result.xlsx")
```