Skip to content

Commit 1bd8d4f

Browse files
authored
Merge pull request #69 from cnobles/update_1.1.0
Update 1.1.0
2 parents 20920fa + fd49b79 commit 1bd8d4f

35 files changed

+1650
-616
lines changed

.version

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
v1.0.2
1+
v1.1.0

README.md

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,17 @@ To get started, checkout the iGUIDE documentation at [iGUIDE.ReadTheDocs.io](htt
1414

1515
### Changelog:
1616

17-
**v1.0.0 (August 15th,2019)**
17+
**v1.1.0 (March 8th, 2020)**
18+
19+
* Modified how samples designated as Mock are treated during the analysis
20+
* Mock samples can now be indicated by "None" or "Control" as well
21+
(case-insensitive)
22+
* Abundance can now be selected as [Read], [UMI], or [Fragment]{default} within
23+
config parameters and this selection will identify the abundance method used
24+
for analysis
25+
* Added support for alternative UMI method (dx.doi.org/10.17504/protocols.io.wikfccw)
26+
27+
**v1.0.0 (August 15th, 2019)**
1828

1929
* Release of version 1.0.0!!!
2030
* iGUIDE is a computational pipeline that supports the detection of DSBs induced

Snakefile

Lines changed: 44 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,24 @@ elif ".tsv" in config["Sample_Info"]:
6666
else:
6767
raise SystemExit("\n Sample Info file needs to contain extention '.csv' or '.tsv'.\n")
6868

69+
# Default params if not included in config
70+
if not "maxNcount" in config:
71+
config["maxNcount"] = 1
72+
73+
if not "demultiCores" in config:
74+
demulti_cores = snakemake.utils.available_cpu_count()
75+
else:
76+
demulti_cores = min(
77+
config["demultiCores"], snakemake.utils.available_cpu_count()
78+
)
79+
80+
if not "skipDemultiplexing" in config:
81+
config["skipDemultiplexing"] = False
82+
83+
if not "Alternate_UMI_Method" in config:
84+
config["Alternate_UMI_Method"] = False
85+
86+
6987
# Sample information
7088
sampleInfo = import_sample_info(
7189
config["Sample_Info"], config["Sample_Name_Column"], delim)
@@ -75,28 +93,19 @@ READ_TYPES=config["Read_Types"]
7593
READS=config["Genomic_Reads"]
7694
REQ_TYPES=READS[:]
7795

78-
if config["UMItags"]:
96+
if config["UMItags"] and not config["Alternate_UMI_Method"]:
7997
REQ_TYPES.append("I2")
8098

8199
R1_LEAD=choose_sequence_data(config["R1_Leading_Trim"], sampleInfo)
82100
R1_OVER=choose_sequence_data(config["R1_Overreading_Trim"], sampleInfo)
83101
R2_LEAD=choose_sequence_data(config["R2_Leading_Trim"], sampleInfo)
84-
R2_LEAD_ODN=choose_sequence_data(config["R2_Leading_Trim_ODN"], sampleInfo)
85102
R2_OVER=choose_sequence_data(config["R2_Overreading_Trim"], sampleInfo)
86103

87-
# Default params if not included in config
88-
if not "maxNcount" in config:
89-
config["maxNcount"] = 1
90-
91-
if not "demultiCores" in config:
92-
demulti_cores = snakemake.utils.available_cpu_count()
104+
if config["Alternate_UMI_Method"]:
105+
R1_LEAD_ODN=choose_sequence_data(config["R1_Leading_Trim_ODN"], sampleInfo)
93106
else:
94-
demulti_cores = min(
95-
config["demultiCores"], snakemake.utils.available_cpu_count()
96-
)
107+
R2_LEAD_ODN=choose_sequence_data(config["R2_Leading_Trim_ODN"], sampleInfo)
97108

98-
if not "skipDemultiplexing" in config:
99-
config["skipDemultiplexing"] = False
100109

101110
## Memory and default params
102111
if not "demultiMB" in config:
@@ -160,7 +169,10 @@ rule all:
160169
stats=RUN_DIR + "/reports/runstats." + RUN + ".html"
161170

162171
# Architecture Rules
163-
include: "rules/arch.rules"
172+
if (config["Alternate_UMI_Method"]):
173+
include: "rules/arch.umi_alt_method.rules"
174+
else:
175+
include: "rules/arch.rules"
164176

165177
# Processing Rules
166178
if (config["skipDemultiplexing"]):
@@ -169,11 +181,18 @@ else:
169181
include: "rules/demulti.rules"
170182

171183
include: "rules/binning.rules"
172-
include: "rules/trim.rules"
184+
185+
if (config["Alternate_UMI_Method"]):
186+
include: "rules/trim.umi_alt_method.rules"
187+
else:
188+
include: "rules/trim.rules"
173189

174190
if (config["UMItags"]):
175-
include: "rules/umitag.rules"
176-
UMIseqs = sampleInfo["barcode2"]
191+
if (config["Alternate_UMI_Method"]):
192+
include: "rules/umitag.umi_alt_method.rules"
193+
else:
194+
include: "rules/umitag.rules"
195+
UMIseqs = sampleInfo["barcode2"]
177196
else:
178197
include: "rules/umitag_stub.rules"
179198

@@ -182,10 +201,16 @@ include: "rules/filt.rules"
182201
if (config["Aligner"] == "BLAT" or config["Aligner"] == "blat"):
183202
include: "rules/consol.rules"
184203
include: "rules/align.blat.rules"
185-
include: "rules/quality.blat.rules"
204+
if (config["Alternate_UMI_Method"]):
205+
include: "rules/quality.blat.umi_alt_method.rules"
206+
else:
207+
include: "rules/quality.blat.rules"
186208
elif (config["Aligner"] == "BWA" or config["Aligner"] == "bwa"):
187209
include: "rules/consol_stub.rules"
188-
include: "rules/align.bwa.rules"
210+
if (config["Alternate_UMI_Method"]):
211+
include: "rules/align.bwa.umi_alt_method.rules"
212+
else:
213+
include: "rules/align.bwa.rules"
189214
include: "rules/quality.sam.rules"
190215
else:
191216
raise SystemExit(

configs/cluster.config.yml

Lines changed: 0 additions & 8 deletions
This file was deleted.

configs/simulation.config.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ Supplemental_Info : "sampleInfo/simulation.supp.csv"
55
Ref_Genome : "hg38"
66
Aligner : "blat"
77
UMItags : TRUE
8+
Abundance_Method : "Fragment"
89

910
# Sequence files
1011
Seq_Path : "etc/tests/Data"

configs/umi_alt_example.config.yml

Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
# Run configuration
2+
Run_Name : "umi_alt_example"
3+
Sample_Info : "sampleInfo/umi_alt_example.sampleInfo.csv"
4+
Supplemental_Info : "sampleInfo/umi_alt_example.supp.csv"
5+
Ref_Genome : "hg38"
6+
Aligner : "bwa"
7+
UMItags : TRUE
8+
Abundance_Method : "UMI"
9+
Alternate_UMI_Method : TRUE
10+
11+
# Sequence files
12+
Seq_Path : "analysis/umi_alt_example/input_data"
13+
R1: "Undetermined_S0_L001_R1_001.fastq.gz"
14+
R2: "Undetermined_S0_L001_R2_001.fastq.gz"
15+
I1: "Undetermined_S0_L001_I1_001.fastq.gz"
16+
I2: "Undetermined_S0_L001_I2_001.fastq.gz"
17+
18+
# SampleInfo formatting
19+
Sample_Name_Column : "sampleName"
20+
21+
# Sequence information
22+
R2_Leading_Trim : "NNNNNNNNSCTACAAGAGCGGTGAGT"
23+
R2_Overreading_Trim : "GTTTAATTGAGTTGTCATATGTTAATAACGG" # Minus option, use "CCGTTATTAACATATGACAACTCAATTAAAC" for Plus.
24+
R1_Leading_Trim : "NNNCCGTTATTAACATATGACAACTCAATTAAAC" # Minus option, use "NNNGTTTAATTGAGTTGTCATATGTTAATAACGG" for Plus.
25+
R1_Leading_Trim_ODN : "." # Minus option, use "TAT" for Plus.
26+
R1_Overreading_Trim : "ACTCACCGCTCTTGTAG"
27+
28+
# Target sequence information, do not include PAM sequence with gRNAs
29+
Target_Sequences:
30+
EMXs1 : "GAGTCCGAGCAGAAGAAGAA"
31+
On_Target_Sites :
32+
EMXs1 : "chr2:+:72933869"
33+
34+
# Specimen target treatment
35+
# Data can be input here or referenced in sampleInfo by just putting
36+
# 'sampleInfo:{column}'. For simplicity, if all specimens were treated with the
37+
# same target sequence(s), the sample can be named 'all' followed by the treated
38+
# target sequence name(s). If placing data in sampleInfo, delimit multiple
39+
# target sequence names with a semicolon ';'.
40+
Treatment :
41+
all : "EMXs1"
42+
43+
# Specimen nuclease treatment
44+
# Similar to target treatment above, this parameter dictates which nuclease(s)
45+
# were used on the specimens. This refers to the class of nuclease, such as
46+
# Cas9 or Cpf1, which behave differently when they edit DNA. Notation can follow
47+
# the same as above, if all specimens were treated with the same class of
48+
# nuclease, then just specify 'all : "{nuclease_profile}"', or list out by
49+
# specimen. Additionally you can specify the column in sampleInfo in the same
50+
# format as above. Currently, iGUIDE does not support processing for specimens
51+
# with multiple classes of nuclease profiles. Only one profile can be specified
52+
# per specimen.
53+
Nuclease :
54+
all : "Cas9"
55+
56+
# Nuclease profile
57+
# Whatever name is given to the nuclease, specify parameters below that dictate
58+
# how the nuclease cuts DNA. 'PAM' - protospacer adjacent motif - should be
59+
# specified here and can contain ambiguous nucleotides. 'PAM_Loc' indicates the
60+
# location of the PAM with respect to the pattern, either '5p', '3p' or FALSE.
61+
# 'PAM_Tol' indicates the tolerance for mismatches in the PAM sequence (ignored
62+
# if PAM is FALSE). 'Cut_Offset' indicates the offset from the 5' nucleotide of
63+
# the PAM sequence where the nuclease creates a double strand break, unless PAM
64+
# is FALSE, then the 5' position of the target sequence (also accepts
65+
# "mid_insert" to specify middle of region between paired alignments).
66+
# 'Insert_size' is used if target sequences are expected to flank each other for
67+
# editing, such as with TALENs, and indicates the expected size of the insert.
68+
# To input a range, delimit the min and max by a colon, ie. 15:21. All names of
69+
# nucleases used to treat specimens need to have a profile. Additional profiles
70+
# should be added under the 'Nuclease_Profiles' parameter.
71+
Nuclease_Profiles :
72+
Cas9 :
73+
PAM : "NGG"
74+
PAM_Loc : "3p"
75+
PAM_Tol : 1
76+
Cut_Offset : -4
77+
Insert_size : FALSE
78+
79+
# ------------------------------------------------------------------------------
80+
# iGUIDE configuration
81+
Read_Types : ["R1", "R2"]
82+
Genomic_Reads : ["R1", "R2"]
83+
readNamePattern : "'[\\w\\:\\-\\+]+'"
84+
85+
# Memory Management (in MB units)
86+
defaultMB : 2000
87+
demultiMB : 40000
88+
trimMB : 8000
89+
filtMB : 4000
90+
consolMB : 4000
91+
alignMB : 20000
92+
qualCtrlMB : 16000
93+
assimilateMB : 16000
94+
evaluateMB : 8000
95+
reportMB : 4000
96+
97+
# Demultiplexing parameters
98+
skipDemultiplexing : TRUE
99+
barcode1Length : 8
100+
barcode2Length : 8
101+
barcode1 : "I1"
102+
barcode2 : "I2"
103+
bc1Mismatch : 0
104+
bc2Mismatch : 0
105+
maxNcount : 1
106+
107+
# Sequence trimming
108+
## R1 sequence
109+
R1leadMismatch : 3
110+
R1odnMismatch : 0
111+
R1overMismatch : 4
112+
R1overMaxLength : 20
113+
## R2 sequence
114+
R2leadMismatch : 2
115+
R2overMismatch : 4
116+
R2overMaxLength : 20
117+
118+
# Binning
119+
bins : 10
120+
level : 25000
121+
122+
# Reference Alignment
123+
BLATparams : "-tileSize=11 -stepSize=9 -minIdentity=85 -maxIntron=5 -minScore=27 -dots=1000 -out=psl -noHead"
124+
BWAparams : "-k 30 -w 2500 -P -L 25 -a"
125+
126+
# Post-alignment filtering
127+
maxAlignStart : 5
128+
minPercentIdentity : 95
129+
minTempLength : 30
130+
maxTempLength : 2500
131+
132+
# Post-processing
133+
refGenes :
134+
file : "genomes/hg38.refSeq.ext.nomodel.rds"
135+
symbolCol : "name2"
136+
oncoGeneList :
137+
file : "http://bushmanlab.org/assets/doc/allOnco_Feb2017.tsv"
138+
symbolCol : "symbol"
139+
specialGeneList :
140+
file : "http://bushmanlab.org/assets/doc/humanLymph.tsv"
141+
symbolCol : "symbol"
142+
maxTargetMismatch : 6
143+
upstreamDist : 100
144+
downstreamDist : 30
145+
pileUpMin : 3
146+
recoverMultihits : FALSE
147+
148+
# Report
149+
suppFile : TRUE
150+
tables : FALSE
151+
figures : TRUE
152+
reportData : FALSE
153+
infoGraphic : TRUE
154+
signature : "Christopher L. Nobles, Ph.D. [Bushman Lab]"

docs/changelog.rst

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,25 @@
33
ChangeLog
44
=========
55

6+
**v1.1.0 (March 8th, 2020)**
7+
8+
- Modified how samples designated as Mock are treated during the analysis
9+
- Mock samples can now be indicated by "None" or "Control" as well
10+
(case-insensitive)
11+
- Abundance can now be selected as [Read], [UMI], or [Fragment]{default} within
12+
config parameters and this selection will identify the abundance method used
13+
for analysis
14+
- Added support for alternative UMI method (dx.doi.org/10.17504/protocols.io.wikfccw)
15+
16+
**v1.0.2 (February 15th, 2020)**
17+
18+
- Bugfix: UMItags set to FALSE will now process through to completion
19+
- Rebuild: Updated to build version 1.0.1
20+
21+
**v1.0.1 (December 3rd, 2019)**
22+
23+
- Bugfix: Updated Gene set enrichment test within report
24+
625
**v1.0.0 (August 15th, 2019)**
726

827
- Complete support for BLAT and BWA aligners

docs/usage.rst

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -518,10 +518,20 @@ Run configuration
518518
listed here.
519519

520520
``UMItags``
521-
This is a logical parameter indicating whether to use unique molecular indices
522-
(UMI) sequence tags ('TRUE') or to only use unique fragments lengths (see
523-
`SonicAbundance <https://doi.org/10.1093/bioinformatics/bts004>`) to quantify
524-
abundances of unique observations.
521+
This is a logical parameter indicating whether to capture unique molecular
522+
indices (UMI) sequence tags ('TRUE') during processing. **Note:** Ambiguous
523+
nucleotides will need to be identified in the barcode fields of the sampleInfo
524+
file. Please see supplied simulated sampleInfo file for example.
525+
526+
``Abundance_Method``
527+
Options include 'Read', 'UMI', or 'Fragment' (default) for selecting the
528+
abundance method. 'Fragment' refers to the use of unique fragment lengths
529+
(see `SonicAbundance <https://doi.org/10.1093/bioinformatics/bts004>`_) to
530+
quantify abundances of unique observations. 'UMI' will change the abundance
531+
method to use the unique molecular indices (**Note** that the ``UMItags``
532+
option will need to be set to TRUE for this feature to work). 'Read' will
533+
change the abundance counts to read counts, yet this method may be unreliable
534+
due to PCR jackpotting or bias.
525535

526536

527537
Sequence files
File renamed without changes.

etc/tests/simulation.digests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,4 +13,4 @@ file2 :
1313
file3 :
1414
name : "stats.eval.simulation.csv"
1515
path : "analysis/simulation/output/stats.eval.simulation.csv"
16-
md5 : "26cd4b00fa40212cd01027d1c11cd76f"
16+
md5 : "3c4eff47a2f419a9b47c45259ac70778"

0 commit comments

Comments
 (0)