-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathgrabUnmappedReads.R
More file actions
115 lines (82 loc) · 3.43 KB
/
grabUnmappedReads.R
File metadata and controls
115 lines (82 loc) · 3.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
libs <- c("stringr",
"dplyr",
"ggplot2",
"GenomicRanges",
"ShortRead",
"BiocParallel",
"BSgenome")
null <- suppressMessages(sapply(libs, library, character.only=TRUE))
options(stringsAsFactors=FALSE)
options(dplyr.width = Inf)
#' increase output width to console width
wideScreen <- function(howWide=as.numeric(strsplit(system('stty size', intern=T), ' ')[[1]])[2]) {
options(width=as.integer(howWide))
}
wideScreen()
site.res <- read.csv("./truth.site.call.txt", header=TRUE, sep="\t")
site.res.nf <- dplyr::filter(site.res,
is.na(subjectHits.uniq) &
is.na(subjectHits.multi))
keyFile <- list.files(".", pattern="keys.RData", recursive=TRUE)
lostSiteName <- with(site.res.nf,
paste(chr,
ifelse(strand=="+", "p", "m"),
position, sep="")
)
lapply(keyFile, function(f) {
f=keyFile[60] # 60, 61, 62
keys <- get(load(f))
keys$sitename <- str_match(keys$names, "chr.*[pm]\\d+")[,1]
keys.sitename <- str_match(keys$names, "chr.*[pm]\\d+")
if( !any(keys.sitename %in% lostSiteName) ) return(NULL)
return(TRUE)
key.lostSite <- unique(keys.sitename[keys.sitename %in% lostSiteName])
load(sub("keys.RData", "hits.R1.RData", f))
load(sub("keys.RData", "hits.R2.RData", f))
hits.R1$siteName <- str_match(names(hits.R1), "chr.*[pm]\\d+")[,1]
hits.R2$siteName <- str_match(names(hits.R2), "chr.*[pm]\\d+")[,1]
hits.R1 <- subset(hits.R1, siteName %in% key.lostSite)
strand(hits.R1) <- c("+","-")[3-as.integer((factor(as.factor(strand(hits.R1)), levels=c("+","-"))))]
hits.R2 <- subset(hits.R2, siteName %in% key.lostSite)
hits <- c(hits.R1, hits.R2)
hits$readID <- str_match(names(hits), "\\d+$")[,1]
hits <- split(hits, hits$readID)
fa.R1 <- readFasta(sub("keys.RData", "R1-1.fa", f))
fa.R2 <- readFasta(sub("keys.RData", "R2-1.fa", f))
lostSite.keys <- subset(keys, sitename %in% lostSiteName)
lostSite.keys$fa1 <- as.character(sread(fa.R1)[lostSite.keys$R1])
lostSite.keys$fa2 <- as.character(sread(fa.R2)[lostSite.keys$R2])
read.1 <- ShortRead(DNAStringSet( lostSite.keys$fa1 ),
BStringSet( lostSite.keys$names ) )
read.2 <- ShortRead(DNAStringSet( lostSite.keys$fa2 ),
BStringSet( lostSite.keys$names ) )
writeFasta(read.1, file="lostSite.R1.fa")
writeFasta(read.2, file="lostSite.R2.fa")
} )
R1.gr <- subset(hits[[1]], from=="R1")
names(R1.gr) <- NULL
R2.gr <- subset(hits[[1]], from=="R2")
names(R2.gr) <- NULL
R1R2 <- merge(as.data.frame(R1.gr),
as.data.frame(R2.gr),
by=c("siteName", "readID", "seqnames", "strand"),
suffixes = c(".1",".2"))
R1R2$tBaseInsert.1 <- NULL
R1R2$from.1 <- NULL
R1R2$tBaseInsert.2 <- NULL
R1R2$from.2 <- NULL
R1R2$position <- with(R1R2, ifelse(strand=="+", start.2, end.2))
R1R2$breakpoint <- with(R1R2, ifelse(strand=="+", end.1, start.1))
subset(R1R2,
seqnames=="chr8" &
abs(start.2 - 42377425) < 5000 )
tmp <- subset(R1R2,
seqnames=="chr8" &
abs(start.1 - 42377425) < 5000 )
options("showHeadLines"=7)
options("showTailLines"=2)
## Revert to default values
options("showHeadLines"=NULL)
options("showTailLines"=NULL)
options("showTailLines"=Inf)
options("showHeadLines"=Inf)