-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathSnakefile
More file actions
124 lines (112 loc) · 4.24 KB
/
Snakefile
File metadata and controls
124 lines (112 loc) · 4.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
"""
Process Single nuclei reads with cellranger
"""
import os
import glob
import pandas
# Input and Output directories
CELLRANGER_DIR = "cellranger"
FASTQC_DIR = "fastqc"
MULTIQC_DIR = "multiqc"
COMBINED_10X_METRICS = "10x_metrics.csv"
# References
CELLRANGER_REFERENCE = "cellranger-mm10/refdata-gex-mm10-2020-A"
# Snake variables
# Read pattern once it is transferred out of raw_data folder to scratch
# This specific pattern is required by cellranger
BCL2_READS_PATTERN = "{{sample}}_S1_L001_R{read}_001.fastq.gz"
BCL2_FASTQC_PATTERN = "{{sample}}_S1_L001_R{read}_001_fastqc.{ext}"
BCL2_MULTIQC_IN_PATTERN = "{sample}_S1_L001_R{{read}}_001_fastqc.zip"
# Variables for subsetting of 250k reads/cell datasets
# 325M reads is the maximum number of reads for samples that are 25k reads/cell
MAX_READS = 325_000_000
def out(*dirs):
return expand(os.path.join(*dirs), sample=SAMPLES)
rule all:
input:
out(CELLRANGER_DIR, "{sample}"),
COMBINED_10X_METRICS,
expand(os.path.join(MULTIQC_DIR, "R{read}/R{read}_multiqc_report.html"), read=["1", "2"]),
CELL_RANGER_MEM_GB = "70"
rule cellranger:
input:
expand(os.path.join(SCRATCH_FASTQ_DIR, "{{sample}}", BCL2_READS_PATTERN), read=["1", "2"])
output:
cellranger = directory(os.path.join(SCRATCH_CELLRANGER_DIR, "{sample}")),
# metrics_summary links explicitly to metrics_summary rule since its inputs aren't created yet
# Snakemake also creates the os.path.join(SCRATCH_CELLRANGER_DIR, '{sample}', 'outs') directory
# Cell Ranger doesn't like that (google "NOT A PIPESTANCE" for details)
# So we need to delete {output.cellranger} in the shell script before running cellranger
metrics_summary = os.path.join(SCRATCH_CELLRANGER_DIR, "{sample}/outs/metrics_summary.csv")
resources:
cpus_per_task = 10,
mem_mb = f"{CELL_RANGER_MEM_GB}GB",
runtime = '60h'
shell: # see above comments for why we rm -rf
"""
set +u
module load Bioinformatics cellranger/7.1.0
set -u
echo "CellRanger version: $(cellranger --version)"
return_wd="$PWD"
mkdir -p {SCRATCH_CELLRANGER_DIR}
cd {SCRATCH_CELLRANGER_DIR}
rm -rf {output.cellranger}
cellranger count \
--id {wildcards.sample} \
--transcriptome {CELLRANGER_REFERENCE} \
--fastqs {SCRATCH_FASTQ_DIR}/{wildcards.sample} \
--include-introns true \
--localcores {resources.cpus_per_task} \
--localmem {CELL_RANGER_MEM_GB}
cd $return_wd
"""
# Get sequencing saturation from cellranger report
rule metrics_summary:
input: SAMPLE_TO_10X.values()
output: COMBINED_10X_METRICS
run:
import pandas as pd
import re
# e.g. 6887-CL-3
regex = re.compile('[0-9]{4}-[A-Z]{2}-[0-9]')
results = []
for metrics_path in input:
key = regex.search(metrics_path).group(0)
val = pd.read_csv(metrics_path).set_axis([key]).rename_axis("Sample")
results.append(val)
pd.concat(results).to_csv(output[0])
rule fastqc:
input: expand(os.path.join(SCRATCH_FASTQ_DIR, "{{sample}}", BCL2_READS_PATTERN), read=["1", "2"])
output:
expand(os.path.join(FASTQC_DIR, BCL2_FASTQC_PATTERN),
read=["1", "2"], ext=["html", "zip"])
log: "logs/{sample}_fastqc.log"
resources:
cpus_per_task = 1,
mem_mb = f"30GB",
runtime = '20h'
shell:
"""
module load Bioinformatics fastqc
fastqc --version
fastqc --outdir {FASTQC_DIR} {input} &> {log}
"""
rule multiqc:
input: out(FASTQC_DIR, BCL2_MULTIQC_IN_PATTERN)
# {read} is the wildcard
params:
outdir = os.path.join(MULTIQC_DIR, "R{read}")
output:
rep = os.path.join(MULTIQC_DIR, "R{read}", "R{read}_multiqc_report.html"),
filelist = temp(os.path.join(MULTIQC_DIR, "R{read}", "filelist.txt"))
shell:
"""
rm -rf {params.outdir}
mkdir {params.outdir}
echo {input} | tr " " "\n" > {output.filelist}
multiqc --version
multiqc --file-list {output.filelist} \
--filename R{wildcards.read}_multiqc_report \
--outdir {params.outdir} --force
"""