-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathtest-pipeline-noinput.R
330 lines (310 loc) · 10.5 KB
/
test-pipeline-noinput.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
library(testthat)
library(PeakSegPipeline)
library(data.table)
context("noinput")
## first test bigWigCoverage.
count.dt <- fread("
chrom chromStart chromEnd coverage
chr1 0 10 1
chr1 10 20 2
chr2 30 40 1
")
chromInfo <- data.table(chrom=c("chr1", "chr2"), chromEnd=50)
work.dir <- tempdir()
chromInfo.txt <- file.path(work.dir, "chromInfo.txt")
fwrite(chromInfo, chromInfo.txt, sep="\t", col.names=FALSE)
input.bedGraph <- file.path(work.dir, "input.bedGraph")
fwrite(count.dt, input.bedGraph, col.names=FALSE, sep="\t")
count.bigWig <- file.path(work.dir, "count.bigWig")
bedGraphToBigWig(
input.bedGraph,
chromInfo.txt,
count.bigWig)
bigWig.dt <- bigWigCoverage(count.bigWig)
test_that("bigWigCoverage stats are OK", {
expect_equal(bigWig.dt$total.coverage, 40)
expect_equal(bigWig.dt$mean.coverage, 0.4)
expect_equal(bigWig.dt$total.bases, 100)
})
test_that("intermediate file is deleted", {
count.bedGraph <- file.path(work.dir, "count.bedGraph")
expect_true(!file.exists(count.bedGraph))
})
## bigger genome example, int overflow.
chromInfo <- data.table(
chrom=c("chr1", "chr2"),
chromEnd=.Machine$integer.max)
fwrite(chromInfo, chromInfo.txt, sep="\t", col.names=FALSE)
bedGraphToBigWig(
input.bedGraph,
chromInfo.txt,
count.bigWig)
bigWig.dt <- bigWigCoverage(count.bigWig)
test_that("no integer overflow for big chroms", {
expect_true(is.finite(bigWig.dt$total.bases))
expect_true(is.finite(bigWig.dt$mean.coverage))
})
## also test denormalize.
norm.dt <- fread("
chrom chromStart chromEnd coverage
chr1 9979 9993 0.07
chr1 9993 9998 0.14
chr1 9998 10024 0.21
chr1 10024 10043 0.28
chr1 10043 10049 0.35
chr1 10049 10054 0.42
chr1 10054 10091 0.48
chr1 10091 10135 0.55
chr1 10135 10149 0.48
chr1 10149 10154 0.42
")
chromInfo <- fread("
chr1 100000
")
dir.create(work.dir <- file.path(tempdir(), "folder (bad name)"))
chromInfo.txt <- file.path(work.dir, "chromInfo.txt")
fwrite(chromInfo, chromInfo.txt, sep="\t", col.names=FALSE)
input.bedGraph <- file.path(work.dir, "input.bedGraph")
fwrite(norm.dt, input.bedGraph, col.names=FALSE, sep="\t")
norm.bigWig <- file.path(work.dir, "norm.bigWig")
bedGraphToBigWig(
input.bedGraph,
chromInfo.txt,
norm.bigWig)
denorm.bigWig <- file.path(work.dir, "denorm.bigWig")
test_that("intermediate bedGraph files are deleted", {
denormalizeBigWig(norm.bigWig, denorm.bigWig)
denorm.bedGraph <- file.path(work.dir, "denorm.bedGraph")
norm.bedGraph <- file.path(work.dir, "norm.bedGraph")
expect_true(!file.exists(denorm.bedGraph))
expect_true(!file.exists(norm.bedGraph))
})
denorm.dt <- readBigWig(denorm.bigWig, "chr1", 0, 100000)
test_that("coverage converted to integers", {
expect_true(is.integer(denorm.dt$count))
})
## second test, 0.1 0.2 etc.
norm.dt <- fread("
chrom chromStart chromEnd coverage
chr1 9993 9998 0.1
chr1 9998 10024 0.2
chr1 10024 10043 0.3
chr1 10043 10049 0.4
chr1 10049 10054 0.5
chr1 10054 10091 0.6
chr1 10091 10135 0.7
chr1 10135 10149 0.8
chr1 10149 10154 0.9
")
chromInfo <- fread("
chr1 100000
")
work.dir <- tempdir()
chromInfo.txt <- file.path(work.dir, "chromInfo.txt")
fwrite(chromInfo, chromInfo.txt, sep="\t", col.names=FALSE)
input.bedGraph <- file.path(work.dir, "input.bedGraph")
fwrite(norm.dt, input.bedGraph, col.names=FALSE, sep="\t")
norm.bigWig <- file.path(work.dir, "norm.bigWig")
bedGraphToBigWig(
input.bedGraph,
chromInfo.txt,
norm.bigWig)
denorm.bigWig <- file.path(work.dir, "denorm.bigWig")
denormalizeBigWig(norm.bigWig, denorm.bigWig)
denorm.dt <- readBigWig(denorm.bigWig, "chr1", 0, 100000)
test_that("coverage counts 0:9", {
expect_identical(denorm.dt$count, 1:9)
})
test_that("denorm.bedGraph.sorted is deleted", {
denorm.bedGraph.sorted <- sub("bigWig$", "bedGraph.sorted", denorm.bigWig)
expect_false(file.exists(denorm.bedGraph.sorted))
})
test_that("denorm.chromInfo is deleted", {
denorm.chromInfo <- sub("bigWig$", "chromInfo", denorm.bigWig)
expect_false(file.exists(denorm.chromInfo))
})
## Then test pipeline.
test.data.dir <- file.path("~/PeakSegPipeline-test")
non.integer.dir <- file.path(test.data.dir, "non-integer (bad)")
unlink(non.integer.dir, recursive=TRUE, force=TRUE)#start fresh.
demo.dir <- file.path(test.data.dir, "noinput (bad)")
index.html <- file.path(demo.dir, "index.html")
download.to <- function
(u, f, writeFun=if(grepl("bigWig", f))writeBin else writeLines){
if(!file.exists(f)){
require(httr)
f.dir <- dirname(f)
dir.create(f.dir, showWarnings=FALSE, recursive=TRUE)
request <- GET(u)
stop_for_status(request)
writeFun(content(request), f)
}
}
res.list <- list(
walltime = 3600, #in seconds.
ncpus=1,
ntasks=1,
chunks.as.arrayjobs=TRUE)
## Download bigWig files from github.
bigWig.part.vec <- c(
"Input/MS010302",
"bcell/MS010302",
## "Input/MS002202",
## "kidney/MS002202",
## "Input/MS026601",
## "bcell/MS026601",
## "Input/MS002201",
"kidney/MS002201"
)
label.txt <- "
chr10:33,061,897-33,162,814 noPeaks
chr10:33,456,000-33,484,755 peakStart kidney
chr10:33,597,317-33,635,209 peakEnd kidney
chr10:33,662,034-33,974,942 noPeaks
chr10:35,182,820-35,261,001 noPeaks
chr10:35,261,418-35,314,654 peakStart bcell kidney
chr10:35,343,031-35,398,459 peakEnd bcell kidney
chr10:38,041,023-38,102,554 noPeaks
chr10:38,296,008-38,307,179 peakStart bcell kidney
chr10:38,379,045-38,391,967 peakStart bcell kidney
chr10:38,404,899-38,412,089 peakEnd bcell kidney
chr10:38,413,073-38,444,133 noPeaks
chr10:38,585,584-38,643,190 noPeaks
chr10:38,643,191-38,650,766 peakStart bcell kidney
chr10:38,731,066-38,750,574 peakEnd bcell kidney
chr10:38,750,960-38,790,663 noPeaks
"
chrom.sizes.file <- tempfile()
chrom.sizes <- data.table(chrom="chr10", bases=128616069)
fwrite(chrom.sizes, chrom.sizes.file, sep="\t", col.names=FALSE)
repos.url <- "https://raw.githubusercontent.com/tdhock/input-test-data/master/"
for(bigWig.part in bigWig.part.vec){
bigWig.file <- file.path(
non.integer.dir, "samples",
bigWig.part, "coverage.bigWig")
bigWig.url <- paste0(repos.url, bigWig.part, ".bigwig")
download.to(bigWig.url, bigWig.file)
demo.bigWig <- sub("non-integer", "noinput", bigWig.file)
if(!file.exists(demo.bigWig)){
dir.create(dirname(demo.bigWig), showWarnings=FALSE, recursive=TRUE)
denormalizeBigWig(bigWig.file, demo.bigWig)
}
}
for(set.dir in c(non.integer.dir, demo.dir)){
labels.file <- file.path(set.dir, "labels", "some_labels.txt")
dir.create(dirname(labels.file), showWarnings=FALSE, recursive=TRUE)
writeLines(label.txt, labels.file)
problems.bed <- file.path(set.dir, "problems.bed")
unlink(problems.bed)
cat("chr10 60000 17974675
chr10 18024675 38818835
chr10 38868835 39154935
chr10 42746000 46426964
chr10 47529169 47792476
chr10 47892476 48055707
chr10 48105707 49095536
chr10 49195536 51137410
chr10 51187410 51398845
chr10 51448845 125869472
chr10 125919472 128616069
", file=problems.bed)
}
test_that("pipeline error for non-integer data in bigWigs", {
expect_error({
jobs_create_run(non.integer.dir, steps=1)
}, "non-integer data in")
})
test_that("coverage error for non-integer data in bigWigs", {
non.int.prob.dirs <- Sys.glob(file.path(
non.integer.dir, "samples", "*", "*", "problems", "*"))
unlink(file.path(non.int.prob.dirs, "coverage.bedGraph"))
one.prob.dir <- non.int.prob.dirs[1]
expected.err <- paste("non-integer data in", one.prob.dir)
expect_error({
problem.coverage(one.prob.dir)
}, expected.err, fixed=TRUE)
})
## Set time limit manually.
(sample.dir.vec <- Sys.glob(file.path(
demo.dir, "samples", "*", "*")))
prob.dir.vec <- file.path(
sample.dir.vec, "problems", "chr10:18024675-38818835")
limit.dt <- data.table(minutes=5)
for(prob.dir in prob.dir.vec){
dir.create(prob.dir, showWarnings=FALSE, recursive=TRUE)
limit.file <- file.path(prob.dir, "target.minutes")
fwrite(limit.dt, limit.file, col.names=FALSE)
}
## Remove one sampleID/problems dir to simulate what happens when
## running jobs_create (which does not create problems dirs) then
## jobs_submit.
one.problems.dir <- dirname(prob.dir)
unlink(one.problems.dir, recursive=TRUE)
test_that("problem.coverage makes a directory", {
prob <- problem.coverage(prob.dir)
expect_true(file.exists(file.path(prob.dir, "coverage.bedGraph")))
})
limit.file <- file.path(prob.dir, "target.minutes")
fwrite(limit.dt, limit.file, col.names=FALSE)
## test for informative error early if ucsc not available.
path.vec <- stop.without.ucsc()
prog <- path.vec[["bigWigInfo"]]
old.mode <- file.info(prog)$mode
Sys.chmod(prog, "444") #read, not write, not exec.
test_that("jobs_create fails if UCSC not available", {
expect_error({
jobs_create(demo.dir, verbose=1)
}, "bigWigInfo")
})
Sys.chmod(prog, old.mode)
jobs <- jobs_create(demo.dir, verbose=1)
test_that("jobs_create returns dt", {
expect_identical(names(jobs), c("step", "fun", "arg"))
expect_is(jobs, "data.table")
})
## Pipeline should run to completion using SLURM. See .travis.yml file
## for how to configure SLURM for testing on Ubuntu.
if(FALSE){
## If the jobs are not being scheduled then check sinfo - if node
## State is down, then maybe need to bring node back up
## https://slurm.schedmd.com/faq.html#return_to_service
system("sudo scontrol update NodeName=localhost State=RESUME")
}
sleep.fun <- function(i){
system(paste("squeue -u", Sys.getenv("USER")))
10
}
unlink(index.html)
test_that("index.html is created via batchtools", {
reg.list <- jobs_submit_batchtools(jobs, res.list)
reg <- reg.list[[length(reg.list)]]
result <- batchtools::waitForJobs(reg=reg, sleep=sleep.fun)
expect_true(file.exists(index.html))
log.glob <- file.path(
shQuote(normalizePath(demo.dir)),
"registry", "*", "logs", "*")
system(paste("tail -n 10000", log.glob))
})
test_that("entries of peaks matrix are 0/1", {
mat.tsv.gz <- normalizePath(file.path(demo.dir, "peaks_matrix_sample.tsv.gz"))
peak.dt <- fread(cmd=paste("zcat", shQuote(mat.tsv.gz)))
class.vec <- as.character(sapply(peak.dt, class))
expected.class.vec <- c("character", rep("integer", length(bigWig.part.vec)))
expect_identical(class.vec, expected.class.vec)
binary.mat <- as.matrix(peak.dt[, -1, with=FALSE])
expect_true(all(binary.mat %in% c(0,1)))
})
some.jobs <- jobs[step >= 5]
unlink(index.html)
test_that("run only steps 5-6 creates index.html", {
reg.list <- jobs_submit_batchtools(some.jobs, res.list)
reg <- reg.list[[length(reg.list)]]
result <- batchtools::waitForJobs(reg=reg, sleep=sleep.fun)
expect_true(file.exists(index.html))
for(step.i in names(reg.list)){
log.glob <- file.path(
shQuote(normalizePath(demo.dir)),
"registry", step.i, "logs", "*")
system(paste("tail -n 10000", log.glob))
}
})