-
Notifications
You must be signed in to change notification settings - Fork 34
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add FastQC to Optimus and SmartSeq2 #202
base: master
Are you sure you want to change the base?
Changes from 11 commits
5abc522
6646b3c
497f2a0
6f3bda6
23e57ed
2436d2f
0905e04
5adc1a6
e866909
78b4ef3
b296533
abb02f0
2b84308
e794f71
c139eef
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
task FastQC {
  # Runs FastQC over a fixed-size window of reads from each input fastq,
  # subsetting first so runtime is bounded regardless of input size.
  Array[File] fastq_files
  File? limits_file
  Int startRead = 250000
  Int nRead = 250000
  String docker = "quay.io/biocontainers/fastqc:0.11.8--1"
  Int machine_mem_mb = 3850
  Int disk = 100
  Int preemptible = 3

  # Draft-2 workaround: interpolates to a literal '$' so the command section
  # can reference bash variables without WDL trying to substitute them.
  String dollar = "$"

  parameter_meta {
    fastq_files: "input fastq files"
    limits_file: "(optional) limits file to use with fastqc"
    startRead: "(optional) start fastqc at the nth read of the file"
    nRead: "(optional) use (at most) n reads for fastqc"
    docker: "(optional) the docker image containing the runtime environment for this task"
    disk: "(optional) the amount of disk space (GiB) to provision for this task"
    preemptible: "(optional) if non-zero, request a pre-emptible instance and allow for this number of preemptions before running the task on a non preemptible machine"
    machine_mem_mb: "(optional) the amount of memory (MiB) to provision for this task"
  }

  command <<<
    set -e

    mkdir outputs
    declare -a fastqs=()
    for fastq in ${sep=' ' fastq_files}
    do
      # Name the subset after the source file plus the window used, so the
      # FastQC report titles are self-describing. Quote bash expansions so
      # paths containing spaces survive; use $() over legacy backticks.
      outname=$(basename "${dollar}fastq" .fastq.gz)_skip${startRead}_read${nRead}.fastq
      # Each fastq record is 4 lines: keep reads (startRead, startRead + nRead].
      zcat "${dollar}fastq" | head -n ${4*(startRead + nRead)} | tail -n ${4*nRead} > "${dollar}outname"
      fastqs+=("${dollar}outname")
    done

    # --limits is only passed when a limits file was provided.
    fastqc "${dollar}{fastqs[@]}" -o outputs ${"--limits " + limits_file}
  >>>

  runtime {
    docker: docker
    memory: "${machine_mem_mb} MiB"
    disks: "local-disk ${disk} HDD"
    preemptible: preemptible
  }

  output {
    Array[File] fastqc_htmls = glob("outputs/*.html")
    Array[File] fastqc_zips = glob("outputs/*.zip")
  }
}
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,6 +11,7 @@ import "RunEmptyDrops.wdl" as RunEmptyDrops | |
import "ZarrUtils.wdl" as ZarrUtils | ||
import "Picard.wdl" as Picard | ||
import "UmiCorrection.wdl" as UmiCorrection | ||
import "FastQC.wdl" as FastQC | ||
|
||
workflow Optimus { | ||
meta { | ||
|
@@ -25,6 +26,9 @@ workflow Optimus { | |
Array[File]? i1_fastq | ||
String sample_id | ||
|
||
# fastqc input | ||
File? fastqc_limits | ||
|
||
# organism reference parameters | ||
File tar_star_reference | ||
File annotations_gtf | ||
|
@@ -51,6 +55,7 @@ workflow Optimus { | |
r2_fastq: "reverse read, contains cDNA fragment generated from captured mRNA" | ||
i1_fastq: "(optional) index read, for demultiplexing of multiple samples on one flow cell." | ||
sample_id: "name of sample matching this file, inserted into read group header" | ||
fastqc_limits: "(optional) limits file for fastqc" | ||
tar_star_reference: "star genome reference" | ||
annotations_gtf: "gtf containing annotations for gene tagging (must match star reference)" | ||
ref_genome_fasta: "genome fasta file (must match star reference)" | ||
|
@@ -77,6 +82,13 @@ workflow Optimus { | |
r2_unmapped_bam = FastqToUBam.bam_output, | ||
whitelist = whitelist | ||
} | ||
|
||
call FastQC.FastQC as FastQC { | ||
input: | ||
fastq_files = [r1_fastq[index], r2_fastq[index], non_optional_i1_fastq[index]], | ||
limits_file = fastqc_limits, | ||
disk = ceil((size(r1_fastq[index], "GiB") + size(r2_fastq[index], "GiB") + size(non_optional_i1_fastq[index], "GiB")) * 1.2 + 10) | ||
} | ||
} | ||
|
||
# if the index is not passed, proceed without it. | ||
|
@@ -87,9 +99,18 @@ workflow Optimus { | |
r2_unmapped_bam = FastqToUBam.bam_output, | ||
whitelist = whitelist | ||
} | ||
|
||
call FastQC.FastQC as FastQCNoIndex { | ||
input: | ||
fastq_files = [r1_fastq[index], r2_fastq[index]], | ||
limits_file = fastqc_limits, | ||
disk = ceil((size(r1_fastq[index], "GiB") + size(r2_fastq[index], "GiB")) * 1.2 + 10) | ||
} | ||
} | ||
|
||
File barcoded_bam = select_first([AttachBarcodes.bam_output, AttachBarcodesNoIndex.bam_output]) | ||
Array[File] fastqc_output_htmls = select_first([FastQC.fastqc_htmls,FastQCNoIndex.fastqc_htmls]) | ||
Array[File] fastqc_output_zips = select_first([FastQC.fastqc_zips,FastQCNoIndex.fastqc_zips]) | ||
} | ||
|
||
call Merge.MergeSortBamFiles as MergeUnsorted { | ||
|
@@ -222,5 +243,9 @@ workflow Optimus { | |
|
||
# zarr | ||
Array[File]? zarr_output_files = OptimusZarrConversion.zarr_output_files | ||
|
||
# fastqc | ||
Array[Array[File]] fastqc_htmls = fastqc_output_htmls | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Unless you really want [comment truncated in export] There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. agreed, will do. |
||
Array[Array[File]] fastqc_zips = fastqc_output_zips | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,13 +3,14 @@ task ValidateOptimus { | |
File matrix | ||
File gene_metrics | ||
File cell_metrics | ||
|
||
Array[File] fastqc_htmls | ||
Int required_disk = ceil((size(bam, "G") + size(matrix, "G")) * 1.1) | ||
|
||
String expected_bam_hash | ||
String expected_matrix_hash | ||
String expected_gene_metric_hash | ||
String expected_cell_metric_hash | ||
Array[String] expected_fastqc_html_hashes | ||
|
||
command <<< | ||
|
||
|
@@ -51,6 +52,14 @@ task ValidateOptimus { | |
fail=true | ||
fi | ||
|
||
for htmlfile in ${sep=' ' fastqc_htmls}; do | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I guess you can't easily check the zips here? The test could at least assert that the number of zip files generated is correct. |
||
hash=$(md5sum $htmlfile | awk '{print $1}') | ||
if [[ " ${sep=' ' expected_fastqc_html_hashes} " != *" $hash "* ]]; then | ||
>&2 echo "fastq_html_hash ($hash) did not match expected hash (${sep=' ' expected_fastqc_html_hashes})" | ||
fail=true | ||
fi | ||
done | ||
|
||
if [ $fail == "true" ]; then exit 1; fi | ||
|
||
>>> | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

If you are expecting multiple zips or htmls then I think `select_all()` would be better here.

There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

Since only one of `FastQC` and `FastQCNoIndex` will ever be run, using `select_all()` would just result in an extra unnecessary level of array.