-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathCUT & Tag_UP Stream
More file actions
360 lines (257 loc) · 9.15 KB
/
CUT & Tag_UP Stream
File metadata and controls
360 lines (257 loc) · 9.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
### 202302 cuttag
# Hyacinth Meng
### 0.Pre.
```
# Install mamba (fast conda solver) and multiqc with explicit channels.
conda install -c conda-forge mamba
conda install -c bioconda multiqc
# Bioconda recipe names are lowercase: "FastQC", "Bowtie2", "Picard", "SEACR"
# and "deepTools" are not valid package names and the original command fails.
mamba install -c bioconda fastqc bowtie2 samtools bedtools picard seacr deeptools
# Directory that will hold the raw FASTQ files.
mkdir -p 1.fq
```
### 1. FASTQ file preparation
```
# Project root (absolute or relative path). The second assignment immediately
# overwrote the first in the original — keep only the batch you are running.
#projPath="/home/zju/桌面/202302_SXiaoHui/ANNO"
projPath="/home/zju/桌面/202302_SXiaoHui/nuohe"
# Collect every raw fq.gz found under any Rawdata directory into $projPath/1.fq.
# `find -path ... -name ...` replaces the fragile `find | grep | grep` chain,
# and -print0 / xargs -0 keeps filenames with spaces or non-ASCII intact.
mkdir -p "$projPath/1.fq"
sudo find "$projPath" -path '*Rawdata*' -name '*.fq.gz' -print0 \
  | xargs -0 -I{} mv -- {} "$projPath/1.fq/"
```
### 2. QC (before trimming)
```bash
# FastQC on every raw FASTQ. The original piped `mkdir` into `ls` and ran a
# bare `xargs fastqc` inside the loop that never received "$i" — so fastqc
# was invoked with no input files at all.
mkdir -p ./2.fastqc
for fq in ./1.fq/*.fq.gz; do
  fastqc -t 80 -o ./2.fastqc/ "$fq"
done
# Combine all reports into one MultiQC summary, written directly into the
# target directory instead of moving `*mu*` from the cwd afterwards.
mkdir -p ./3.merged_QC
multiqc ./2.fastqc -o ./3.merged_QC
```
### 3.fastp
```
# Output directory for trimmed reads.
# Sample naming convention: <id>_R1.fq.gz / <id>_R2.fq.gz (e.g. G5_R1.fq.gz).
# NOTE: the original `nohup mkdir ./4.cleanfastq | ls ...` piped mkdir's
# (empty) output into ls; mkdir must run on its own before the loop.
mkdir -p ./4.cleanfastq
# All samples: derive the id list from the R1 files, then trim with fastp
# (min length 36, min quality 20, 16 worker threads).
ls ./1.fq | grep R1 | cut -d _ -f 1 \
  | while read -r id; do
      fastp -i ./1.fq/${id}_R1.fq.gz -I ./1.fq/${id}_R2.fq.gz \
        -o ./4.cleanfastq/${id}_R1.fq.gz -O ./4.cleanfastq/${id}_R2.fq.gz \
        -l 36 -q 20 --compression=6 -R ./4.cleanfastq/${id} \
        -w 16 -h ./4.cleanfastq/${id}.fastp.html -j ./4.cleanfastq/${id}.fastp.json
    done &
# Single-sample rerun. The original `-w 16 \ # comment` broke the line
# continuation: nothing may follow the backslash, so the comment ended the
# command and the -h/-j options were executed as a separate (bogus) command.
for id in sxh-6; do
  fastp -i ./1.fq/${id}_R1.fq.gz -I ./1.fq/${id}_R2.fq.gz \
    -o ./4.cleanfastq/${id}_R1.fq.gz -O ./4.cleanfastq/${id}_R2.fq.gz \
    -l 36 -q 20 --compression=6 -R ./4.cleanfastq/${id} \
    -w 16 \
    -h ./4.cleanfastq/${id}.fastp.html -j ./4.cleanfastq/${id}.fastp.json
done
```
### 4. QC (after trimming)
```bash
# FastQC on trimmed reads. The original loop piped filenames into `while` but
# then ran a bare `xargs fastqc` that never referenced "$i" — a no-op.
for fq in ./4.cleanfastq/*.fq.gz; do
  fastqc -t 80 -o ./4.cleanfastq/ "$fq"
done
# Per-sample rerun (replace the id list with real sample ids).
for id in s; do
  fastqc -o ./4.cleanfastq/ -t 80 ./4.cleanfastq/${id}_R1.fq.gz \
    ./4.cleanfastq/${id}_R2.fq.gz
done
# Combine all reports; `mkdir | multiqc` piped mkdir into multiqc by mistake.
mkdir -p ./5.merged_QC_clean
multiqc ./4.cleanfastq -o ./5.merged_QC_clean
```
### 5.Alignment
#### 5.1 index
```
## Build the bowtie2 reference genome index if needed, e.g.:
##   nohup bowtie2-build hg38.fa hg38 > nohup01.out 2>&1 &
# Human (GRCh38). bowtie2-build requires an index basename as its second
# argument; the original command omitted it and would fail with a usage error.
bowtie2-build --threads 40 Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz ./hg38
# E. coli strain genome index (spike-in reference); "eoliindex" typo fixed.
bowtie2-build --threads 40 GCF_000005845.2_ASM584v2_genomic.fna.gz ./ecoliindex
# Mouse (GRCm39).
bowtie2-build --threads 50 Mus_musculus.GRCm39.dna.primary_assembly.fa.gz ./mm39
```
#### 5.2 Alignment to HG38.
```
# Bowtie2 index basename for hg38.
ref="./0.ref/hg38index/hg38"
mkdir -p ./6.alignment
# All samples. CUT&Tag-standard settings: end-to-end, very sensitive, proper
# pairs only (--no-mixed), fragment size 10-700 bp. NOTE(review): in the
# original, `nohup` applied only to `ls`, not the whole pipeline; it is the
# trailing `&` that actually backgrounds the loop.
ls ./4.cleanfastq | grep R1.fq.gz | cut -d _ -f 1 \
  | while read -r id; do
      bowtie2 --end-to-end --very-sensitive --no-mixed --phred33 \
        -I 10 -X 700 -p 80 -x "$ref" \
        -1 ./4.cleanfastq/${id}_R1.fq.gz -2 ./4.cleanfastq/${id}_R2.fq.gz \
        -S ./6.alignment/${id}.sam 2> ./6.alignment/${id}_bowtie2.txt
    done &
# Explicit sample-list variant.
for id in sxh-2 sxh-3 sxh-4 sxh-5 sxh-6; do
  bowtie2 --end-to-end --very-sensitive --no-mixed --phred33 \
    -I 10 -X 700 -p 80 -x "$ref" \
    -1 ./4.cleanfastq/${id}_R1.fq.gz -2 ./4.cleanfastq/${id}_R2.fq.gz \
    -S ./6.alignment/${id}.sam 2> ./6.alignment/${id}_bowtie2.txt
done &
```
### 6. Remove duplicates (optional)
```
# Per the CUT&Tag protocol, the apparent duplication rate is low for
# high-quality datasets and even apparent 'duplicates' are likely true
# fragments — so duplicate removal is optional.
## Sort by coordinate (all samples). The original `nohup mkdir ... | ls ...`
## piped mkdir's output into ls; create the directory on its own first.
mkdir -p ./7.removeDuplicate
ls ./6.alignment | grep sam | cut -d . -f 1 \
  | while read -r id; do
      picard SortSam I=./6.alignment/${id}.sam \
        O=./7.removeDuplicate/${id}.sorted.sam \
        SORT_ORDER=coordinate
    done &
# Explicit sample-list variant.
for id in G2 G3 G4 G5 G6; do
  picard SortSam I=./6.alignment/${id}.sam \
    O=./7.removeDuplicate/${id}.sorted.sam \
    SORT_ORDER=coordinate
done &
## Mark and remove duplicate read pairs.
ls ./6.alignment | grep sam | cut -d . -f 1 \
  | while read -r id; do
      picard MarkDuplicates \
        I=./7.removeDuplicate/${id}.sorted.sam \
        O=./7.removeDuplicate/${id}.sorted.rmDup.sam \
        REMOVE_DUPLICATES=true \
        METRICS_FILE=./7.removeDuplicate/${id}_picard.rmDup.txt
    done
```
### 7. Assess fragment distribution
#####
```
# Fragment size distribution. -F 0x04 drops unmapped reads; SAM column 9
# (TLEN) is the signed template length, so we count |TLEN| and halve the
# count (each fragment's length appears on both mates of the pair).
mkdir -p ./8.fragmentLen
# All samples. NOTE: the original had a duplicated `do` after the `while`
# line — a shell syntax error that aborted the whole snippet.
ls ./4.cleanfastq | grep R1 | cut -d _ -f 1 \
  | while read -r id; do
      samtools view -F 0x04 ./7.removeDuplicate/${id}.sorted.rmDup.sam | \
        awk -F'\t' 'function abs(x){return ((x < 0.0) ? -x : x)} {print abs($9)}' | \
        sort | uniq -c | awk -v OFS="\t" '{print $2, $1/2}' \
        > ./8.fragmentLen/${id}_fragmentLen.txt &
    done
# Explicit sample-list variant.
for id in B1 B2 B3 B4 B5 B6 G1 G2 G3 G4 G5 G6; do
  samtools view -F 0x04 ./7.removeDuplicate/${id}.sorted.rmDup.sam | \
    awk -F'\t' 'function abs(x){return ((x < 0.0) ? -x : x)} {print abs($9)}' | \
    sort | uniq -c | awk -v OFS="\t" '{print $2, $1/2}' \
    > ./8.fragmentLen/${id}_fragmentLen.txt &
done
```
### 8.filter and file format transform
```
# Get aligned paired-end BED files.
ls ./7.removeDuplicate | grep sorted.rmDup.sam | cut -d . -f 1 \
  | while read -r id; do
      ## Keep mapped reads only (-F 0x04) and convert SAM -> BAM.
      samtools view -@ 80 -bS -F 0x04 ./7.removeDuplicate/${id}.sorted.rmDup.sam \
        > ./7.removeDuplicate/${id}.mapped.bam
      ## BAM -> paired-end BED (one line per read pair).
      bedtools bamtobed -i ./7.removeDuplicate/${id}.mapped.bam -bedpe \
        > ./7.removeDuplicate/${id}.bed
    done
# Keep pairs on the same chromosome with fragment length < 1000 bp.
# NOTE: the .bed files live in ./7.removeDuplicate — the original listed
# ./4.cleanfastq, which contains no .bed files, so this loop did nothing.
ls ./7.removeDuplicate | grep '\.bed' | cut -d . -f 1 | sort -u \
  | while read -r id; do
      awk '$1==$4 && $6-$2 < 1000 {print $0}' ./7.removeDuplicate/${id}.bed \
        > ./7.removeDuplicate/${id}.clean.bed
    done
## Extract only the fragment columns (chrom, pair start, pair end) and sort.
## The original had a duplicated `do`, listed the wrong directory, and put a
## stray `\ >` on one physical line, breaking the redirect.
ls ./7.removeDuplicate | grep clean.bed | cut -d . -f 1 \
  | while read -r id; do
      cut -f 1,2,6 ./7.removeDuplicate/${id}.clean.bed \
        | sort -k1,1 -k2,2n -k3,3n \
        > ./8.fragmentLen/${id}.fragments.bed
    done
# Assess replicate reproducibility: map each fragment's midpoint to a 500 bp
# bin and count fragments per bin (chrom, bin midpoint, count).
binLen=500
ls ./4.cleanfastq | grep R1 | cut -d _ -f 1 \
  | while read -r id; do
      awk -v w=$binLen '{print $1, int(($2 + $3)/(2*w))*w + w/2}' \
          ./8.fragmentLen/${id}.fragments.bed \
        | sort -k1,1V -k2,2n | uniq -c \
        | awk -v OFS="\t" '{print $2, $3, $1}' \
        | sort -k1,1V -k2,2n \
        > ./8.fragmentLen/${id}.fragmentsCount.bin$binLen.bed
    done
```
### 9.peak calling
```
# ------------------------------------------------------------------
# Peak calling. SEACR is the CUT&Tag-native caller; MACS2 in paired-end
# (BAMPE) mode was used in this paper.
# ------------------------------------------------------------------
# The output directories must exist before the loops run: the shell opens
# the `2>` redirect target before macs2 starts, so a missing directory
# makes every iteration fail immediately.
mkdir -p ../9.p0.05macs2 ../9.p0.1macs2
# p < 0.05
for id in sxh-1.mapped.bam sxh-2.mapped.bam sxh-3.mapped.bam sxh-4.mapped.bam sxh-5.mapped.bam sxh-6.mapped.bam
do
  macs2 callpeak -t ${id} \
    -g hs -f BAMPE \
    -n macs2_peak_p0.05.${id} \
    --outdir ../9.p0.05macs2 -p 0.05 --keep-dup all \
    2> ../9.p0.05macs2/macs2Peak_summary.${id}.txt
done
# p < 0.1 (looser threshold for comparison)
for id in sxh-1.mapped.bam sxh-2.mapped.bam sxh-3.mapped.bam sxh-4.mapped.bam sxh-5.mapped.bam sxh-6.mapped.bam
do
  macs2 callpeak -t ${id} \
    -g hs -f BAMPE \
    -n macs2_peak_p0.1.${id} \
    --outdir ../9.p0.1macs2 -p 0.1 --keep-dup all \
    2> ../9.p0.1macs2/macs2Peak_summary.${id}.txt
done
```
## IGV Visualization
```
##== linux command ==##
# Regenerate mapped BAM + paired-end BED (same as the filter section) if the
# files are not already present.
ls ./7.removeDuplicate | grep sorted.rmDup.sam | cut -d . -f 1 \
  | while read -r id; do
      samtools view -@ 80 -bS -F 0x04 ./7.removeDuplicate/${id}.sorted.rmDup.sam \
        > ./7.removeDuplicate/${id}.mapped.bam
      ## Convert into paired-end BED format.
      bedtools bamtobed -i ./7.removeDuplicate/${id}.mapped.bam -bedpe \
        > ./7.removeDuplicate/${id}.bed
    done
# bigwig files. NOTE: the original `mkdir ./11.bigwig ls ...` passed "ls" as
# an extra argument to mkdir instead of starting a new command.
mkdir -p ./11.bigwig
ls ./7.removeDuplicate | grep sorted.rmDup.sam | cut -d . -f 1 \
  | while read -r id; do
      samtools sort -@ 80 -o ./7.removeDuplicate/${id}.sorted.bam \
        ./7.removeDuplicate/${id}.mapped.bam
    done
ls ./7.removeDuplicate | grep sorted.rmDup.sam | cut -d . -f 1 \
  | while read -r id; do
      samtools index ./7.removeDuplicate/${id}.sorted.bam
    done
ls ./7.removeDuplicate | grep sorted.rmDup.sam | cut -d . -f 1 \
  | while read -r id; do
      bamCoverage -b ./7.removeDuplicate/${id}.sorted.bam -o ./11.bigwig/${id}_raw.bw
    done
# Single-sample rerun.
for id in G6; do
  bamCoverage -b ./7.removeDuplicate/${id}.sorted.bam -o ./11.bigwig/${id}_raw.bw
done
# Depth-normalized alternative. Every line must be commented (the original
# commented only the first line, leaving bare option lines that the shell
# would try to execute as commands):
# bamCoverage --bam a.bam -o a.SeqDepthNorm.bw \
#   --binSize 10 \
#   --normalizeUsing RPGC \
#   --effectiveGenomeSize 2150570000 \
#   --ignoreForNormalization chrX \
#   --extendReads
```
### 7.2.1 Heatmap over transcription units
```
##== linux command ==##
# Heatmap of signal over gene bodies: TSS-3kb .. TES+3kb with the body
# scaled to 5 kb. Quote "$projPath" — the project path contains non-ASCII
# characters and could contain spaces.
cores=8
computeMatrix scale-regions -S "$projPath"/alignment/bigwig/D1.bw \
  "$projPath"/alignment/bigwig/D2.bw \
  "$projPath"/alignment/bigwig/M1.bw \
  "$projPath"/alignment/bigwig/M2.bw \
  -R "$projPath"/data/hg38_gene/hg38_gene.tsv \
  --beforeRegionStartLength 3000 \
  --regionBodyLength 5000 \
  --afterRegionStartLength 3000 \
  --skipZeros -o "$projPath"/data/hg38_gene/matrix_gene.mat.gz -p "$cores"
# Plot, sorting regions by total signal.
plotHeatmap -m "$projPath"/data/hg38_gene/matrix_gene.mat.gz \
  -out "$projPath"/data/hg38_gene/Histone_gene.png --sortUsing sum
```