-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathIdentifyMotif.Rmd
114 lines (98 loc) · 5.51 KB
/
IdentifyMotif.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
---
title: "ATAC-Seq Data Analysis with Mouse Tissue Data - Identify Motifs"
author: "Anni Liu"
date: '`r format(Sys.Date(), "%B %d, %Y")`'
output:
html_document:
code_folding: show
---
# Map peaks to motifs using `motifmatchr`
```{r}
library(motifmatchr)
opts <- list()
opts[["tax_group"]] <- "vertebrates"
opts[["collection"]] <- "CORE"
opts[["all_versions"]] <- F # Only include the latest version of a motif
motifs_database <- getMatrixSet(JASPAR2022, opts)
?matchMotifs # Find motif matches; matchMotifs(pwms, subject, ...)
load("./peak_data/read_count_per_peak.RData")
(atac_count <- read_count_per_peak)
(peak_range <- rowRanges(atac_count))
```
## Center the GRanges of peak regions
```{r}
library(BSgenome.Mmusculus.UCSC.mm10)
genome_mm <- BSgenome.Mmusculus.UCSC.mm10
(peak_range_center <- resize(peak_range, fix = "center", width = 100)) # Retrieve the middle range with left 50 and right 50 positions
# (peak_range_center_pos <- resize(peak_range, fix = "center", width = 1))
peak_range_center |> class()
# [1] "GRanges"
# attr(,"package")
# [1] "GenomicRanges"
```
## Get sequences of peak regions
```{r}
(peak_sequence <- getSeq(genome_mm , peak_range_center)) # Retrieve sequences of peak regions by aligning the GRanges of peak data with the GRanges of a reference BSGenome object
names(peak_sequence) <- as.character(peak_range_center) # Assign the chromosome:ranges to each sequence
peak_sequence
length(peak_sequence) # [1] 36320
```
## Find where the peaks match the motifs - `out = 'positions'`
```{r}
##----We locate the motif positions by scanning for 10 selected motifs from JASPAR2022 using the sequence of the first 200 ATACseq peaks----
motifs_position <- matchMotifs(motifs_database[1:10], peak_sequence[1:200], out = "positions")
# Value
# Either returns a SummarizedExperiment with a sparse matrix with values set to TRUE for a match (if out == 'matches'), a SummarizedExperiment with a matches matrix as well as matrices with the maximum motif score and total motif counts (if out == 'scores'), or a GenomicRangesList or a list of IRangesList with all the positions of matches (if out == 'positions')
length(motif_positions) # 10
lapply(1:length(motifs_position), function (i) motifs_position[[i]] |> head(3)) # In each selected motifs from the database, each of 200 ATACseq peak regions are examined if they match the motif sequence
names(motifs_database[1:10]) == names(motifs_position)
motif_position$MA0006.1 # Each motif has 200 elements, each of which shows the start, end, width, strand, and score of the matched motif in each centered ATACseq peak with width 100; if element is empty, suggesting there is no motif in this peak
##----We view the peaks that match a particular motif----
ma0006.1_hit <- motifs_position$MA0006.1
names(ma0006.1_hit) <- names(peak_sequence[1:200])
unlist(ma0006.1_hit, use.names = T)
# IRanges object with 6 ranges and 2 metadata columns:
# start end width | strand score
# <integer> <integer> <integer> | <character> <numeric>
# chr1:4785445-4785544 42 47 6 | - 10.8638
# chr1:4785445-4785544 67 72 6 | - 10.8638
# chr1:4807733-4807832 10 15 6 | - 10.8638
# chr1:4807733-4807832 82 87 6 | - 10.8638
# chr1:16619333-16619432 45 50 6 | + 10.8638
# chr1:16665174-16665273 13 18 6 | + 10.8638
# !What can we gain from the above results?
# The position of first peak hit which matches the motif MA0006.1 starts from (4785445 + 42) to (4785445 + 47).
```
## Find if the peaks match the motifs, with no interest in matching positions - `out = 'matches'`
```{r}
##----We examine if the first 200 ATACseq peaks match 10 selected motifs----
(motif_hit <- matchMotifs(motifs_database[1:10], peak_sequence[1:200], out = "matches"))
# class: SummarizedExperiment
# dim: 200 10 # ! # Row: number of examined peaks; column: number of selected motifs
# metadata(0):
# assays(1): motifMatches
# rownames: NULL
# rowData names(0):
# colnames(10): MA0004.1 MA0006.1 ... MA0059.1 MA0066.1
# colData names(1): name
peak_map_motif_mat <- motifMatches(motif_hit)
peak_map_motif_mat[1:6, 1:6] # This is a sparse matrix: . -> there is no motif in this peak; | -> there is a motif in this peak
# 6 x 6 sparse Matrix of class "lgCMatrix"
# MA0004.1 MA0006.1 MA0019.1 MA0029.1 MA0030.1 MA0031.1
# [1,] . . . . . .
# [2,] . . . . . .
# [3,] . . . . . .
# [4,] . . . . . .
# [5,] . . . . . .
# [6,] . . . . . .
##----We estimate the number of peaks matching each motif----
library(Matrix)
colSums(peak_map_motif_mat)
# MA0004.1 MA0006.1 MA0019.1 MA0029.1 MA0030.1 MA0031.1 MA0040.1 MA0051.1 MA0059.1 MA0066.1
# 7 4 0 3 2 2 6 0 2 2
# Alternative way without attaching the Matrix library
colSums(as.array(peak_map_motif_mat))
##----We retrieve the seqnames, ranges and other meta data information of peaks which map to a particular motif----
# peak_range_center <- resize(peak_ranges, fix = "center", width = 100)
(peak_map_ma0006.1 <- peak_range_center[peak_map_motif_mat[, "MA0006.1"] == 1]) # GRanges object with 727 ranges and 6 metadata columns
```