-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDAR_DEG_contact_ratios_with_contrast.R
More file actions
153 lines (127 loc) · 4.47 KB
/
DAR_DEG_contact_ratios_with_contrast.R
File metadata and controls
153 lines (127 loc) · 4.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#
# Script to decide on how to merge DAR/DEG windows
# Produces randomized background data for enrichment tests
#
options(readr.num_threads = 1)
options(scipen = 99999)
options(stringsAsFactors = FALSE)
options(digits = 7)
library(tidyverse)
library(ggrepel)
library(ggbeeswarm)
library(patchwork)
outdir = "DARDEG"
###################
# Get DAR/DEG regions before overlapping with HITS
dars = read_tsv("./data/DARs_uq_windows.bed") %>% arrange(chrom, start)
degs = read_tsv("/data/DEGS_uq_windows.bed") %>% arrange(chrom, start)
chrOrder = paste0("chr", c(1:19))
dars = dars %>% filter(chrom %in% chrOrder)
degs = degs %>% filter(chrom %in% chrOrder)
###################
# Determine merge distance effect by incrementing window join size
regioncounts = c()
for(dmax in 1:10) {
BINS = 30000 * dmax
dardeg_windows = bind_rows(dars, degs) %>%
distinct(chrom, start, end) %>%
arrange(chrom, start) %>%
mutate(
grp = cumsum(start - lag(start, default = 0) > BINS),
cgrp = cumsum(start - lag(start, default = 0)),
dist = start - lag(start)
) %>%
group_by(chrom, grp) %>%
mutate(
grpstart = min(start),
grpend = max(start),
grplength = max(start) - min(start),
grpstr = sprintf("%s:%s-%s", chrom, grpstart, grpend)
) %>%
arrange(chrom, grp) %>%
distinct(grpstr)
regioncounts = regioncounts %>% bind_rows(data.frame(dmax = dmax, regions = nrow(dardeg_windows)))
}
ggplot(regioncounts %>% mutate(loss = lag(regions, default = 0) - regions), aes(x = as.factor(dmax), y = abs(loss))) +
geom_point() +
scale_y_log10() +
ggtitle("Number of regions lost joining [dmax] windows") +
theme_bw()
bind_rows(dars %>% mutate(g = "dar"), degs %>% mutate(g = "deg")) %>%
filter(chrom %in% paste0("chr", 1:6)) %>%
ggplot(aes(xmin = start, xmax = pmax(end, start + 50000), ymin = 0, ymax = 1, fill = g)) +
geom_rect() +
facet_grid(chrom ~ .) +
ggtitle("Positions of DAR and DEG loci (chroms 1-6)") +
theme_bw()
###################
# Create randomized regions and define merged DAR/DEG blocks
BINS = 30000
dardeg_windows = bind_rows(dars, degs) %>%
distinct(chrom, start, end) %>%
group_by(chrom) %>%
arrange(start) %>%
mutate(
grp = cumsum(start - lag(start, default = 0) > BINS),
dist = start - lag(start)
) %>%
group_by(chrom, grp) %>%
mutate(
grpstart = min(start),
grpend = max(start),
grplength = max(start) - min(start) + 30000,
grpstr = sprintf("%s:%s-%s", chrom, grpstart, grpend)
) %>%
ungroup() %>%
arrange(chrom, grp)
# Inspect grouping
dardeg_windows %>% distinct(chrom, grp) %>% count()
dardeg_windows %>% distinct(grpstr) %>% count()
dars %>% count()
dars %>% count(chrom)
degs %>% count()
degs %>% count(chrom)
dardeg_windows %>% group_by(chrom) %>% summarize(gpm = max(grp))
dardeg_windows %>% distinct(chrom, grp)
# Plot distribution of distances between unmerged regions
dardeg_windows.dists = dardeg_windows %>% filter(!is.na(dist), dist > 30000) %>% count(dist = log10(dist))
ggplot(dardeg_windows.dists, aes(x = 10^dist, y = n)) +
geom_point() +
scale_x_log10() +
scale_y_log10() +
ggtitle("Distance between DAR/DEG regions before merging") +
theme_bw()
###################
# Randomize regions for background generation
g.background = read_tsv("genome_windows_30000.bed", col_names = c("chrom", "start", "end")) %>%
group_by(chrom) %>%
summarise(glen = plyr::round_any(max(end), 30000, floor))
g.background %>% count(chrom) %>% print(n = 22)
# Join genomic background size info
dardeg_windows = dardeg_windows %>% left_join(g.background, by = c("chrom"))
system(sprintf("mkdir -p ./results/%s/randomregions/", outdir))
numrand = 100
rands = unique(plyr::round_any(runif(400, 30000 * 10, 30000 * 10000), 30000))[1:numrand]
shift = c(0, sort(rands))
i = 0
for(k in shift) {
i = i + 1
print(sprintf("%s: %s", i, k))
shifted_windows = dardeg_windows %>%
mutate(
start.adj = (start + k) %% glen,
end.adj = (end + k) %% glen,
grpstart.shifted = grpstart + k,
grpend.shifted = grpend + k,
g2 = grpend.shifted > glen,
g1 = grpstart.shifted < glen & g2
) %>%
filter(!g1) # Exclude regions that wrap around chromosome ends
shifted_windows %>%
write_tsv(sprintf("./results/%s/randomregions/dardegs_%d.tsv", outdir, k))
shifted_windows %>%
select(chrom, start = start.adj, end = end.adj, grp, grplength) %>%
write_tsv(sprintf("./results/%s/randomregions/dardegs_%d.bed", outdir, k))
}
length(shift)
sum(duplicated(shift))