-
Notifications
You must be signed in to change notification settings - Fork 263
Expand file tree
/
Copy path1Pipeline.sh
More file actions
2129 lines (1826 loc) · 104 KB
/
1Pipeline.sh
File metadata and controls
2129 lines (1826 loc) · 104 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
[TOC]
# 1EasyMetagenome Pipeline (1易宏基因组流程)
# Authors(作者): Yong-Xin Liu(刘永鑫), Defeng Bai(白德凤), Tong Chen(陈同) et al.
# Version(版本): 1.24, 2025/11/22
# Operation System(操作系统): Linux Ubuntu 22.04+ / CentOS 7.7+
# Homepage(主页): https://github.com/YongxinLiu/EasyMetagenome
# Cititon(引文): Bai, et al. 2025. EasyMetagenome: A User‐Friendly and Flexible Pipeline for Shotgun Metagenomic Analysis in Microbiome Research. iMeta 4: e70001. https://doi.org/10.1002/imt2.70001
# 1. Data preprocessing(数据预处理)
## 1.1 Preparing(准备工作)
# 1. First-time use, please refer to the `0Install.sh` to install the software and database (approximately 1-3 days, only once).
# 2. Copy the EasyMetagenome workflow `1Pipeline.sh` to the project folder, e.g., in this case, `meta`.
# 3. Prepare the sequencing data (seq/*.fq.gz) and sample metadata (result/metadata.txt) in the project folder.
# 1. 首次使用请参照`0Install.sh`脚本,安装软件和数据库(大约1-3天,仅一次)
# 2. 易宏基因组(EasyMetagenome)流程`1Pipeline.sh`复制到项目文件夹,如本次为meta
# 3. 项目文件夹准备测序数据(seq/*.fq.gz)和样本元数据(result/metadata.txt)
# **Software, database and work directory settings(数据库、软件和工作目录设置)**
# **The follwoing paragraph must run before(分析前必须运行)**
# Conda directory(软件安装目录), e.g. /anaconda3 , `conda env list` view software list,
soft=~/miniconda3
# database(db, 数据库位置), e.g. /db or ~/db
db=~/db
# work directory(wd, 工作目录),e.g. meta
wd=~/meta
# Create and enter the working directory 创建并进入工作目录
mkdir -p $wd && cd $wd
# Create sequences, temporary files, and a results directory 创建序列,临时文件和结果目录
mkdir -p seq temp result
# Add software/scripts to environment variables 添加软件/脚本至环境变量
PATH=$soft/bin:$soft/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:$db/EasyMicrobiome/linux:$db/EasyMicrobiome/script
echo $PATH
# **Metadata and sequence files (元数据和序列文件) **
# Metadata (元数据)
# Edit metadata in Excel then save txt format in result, demo in result or download (编写元数据metadata.txt并保存至result目录,此处下载演示)
wget -c http://www.imeta.science/github/EasyMetagenome/result/metadata.txt
mv metadata.txt result/metadata.txt
# Format check: ^I is tab, $ is linux new line, ^M$ is windows new line, ^M is Mac new line
# 格式预览,^I为制表符,$为Linux换行,^M$为Windows回车,^M为Mac换行符
cat -A result/metadata.txt
# Format windows ^M$ to linux $, delete ^M in regulation expression is '\r'
# 转换Windows回车为Linux换行,即删除^M (正则表达式为\r)
sed -i 's/\r//' result/metadata.txt
cat -A result/metadata.txt
# Sequencing data (序列文件)
# Using filezilla upload to seq directory (用户使用filezilla上传测序文件至seq目录)
# This test uses theChina National Center for Bioinformation's GSA network for downloading data. Backup links are also provided via iMeta and BaiduNetDisk.
# 本次测试国家生信息中心GSA网络下载,同时提供iMeta、百度网盘备用站点数据下载链接
# Site 1. GSA download links and batch rename by metadata (站点1. 国家生信息中心下载链接,按实验设计编号批量下载并改名)
# GSA: https://ngdc.cncb.ac.cn/gsa/ search CRA032890, find link test one example C3
wget -c ftp://download.big.ac.cn/gsa5/CRA032890/1/CRR2274865/CRR2274865_r1.fq.gz -O seq/C3_1.fq.gz
wget -c ftp://download.big.ac.cn/gsa5/CRA032890/1/CRR2274865/CRR2274865_r2.fq.gz -O seq/C3_2.fq.gz
awk 'BEGIN{OFS=FS="\t"}{system("wget -c ftp://download.big.ac.cn/gsa5/"$8"/"$9"/"$9"_r1.fq.gz -O seq/"$1"_1.fq.gz")}' \
<(tail -n+2 result/metadata.txt)
awk 'BEGIN{OFS=FS="\t"}{system("wget -c ftp://download.big.ac.cn/gsa5/"$8"/"$9"/"$9"_r2.fq.gz -O seq/"$1"_2.fq.gz")}' \
<(tail -n+2 result/metadata.txt)
# Site 2. iMeta Science download link (站点2. iMeta下载链接)
cd seq
awk '{system("wget -c http://www.imeta.science/github/EasyMetagenome/seq/"$1"_1.fq.gz")}' <(tail -n+2 ../result/metadata.txt)
awk '{system("wget -c http://www.imeta.science/github/EasyMetagenome/seq/"$1"_2.fq.gz")}' <(tail -n+2 ../result/metadata.txt)
cd ..
# Site 3. BaiduNetDisk (站点3. 百度网盘) in /db/meta/seq directory from https://pan.baidu.com/s/1Ikd_47HHODOqC3Rcx6eJ6Q?pwd=0315
# ls show file info, -l (l: list) show detail, -sh s: size, h: human readable
# ls查看文件大小等信息,-l 列出详细信息 (l: list),-sh 显示人类可读方式文件大小 (s: size; h: human readable)
ls -lsh seq/*.fq.gz
# Statistic basic info of sequences 统计序列信息
time seqkit stat seq/*.fq.gz > result/seqkit.txt
cat result/seqkit.txt
# **Sequence file format check (序列文件格式检查)**
# Use zless/zcat to view compressible files and check the sequence quality format
# (quality values in uppercase are in standard Phred33 format, lowercase in Phred64 format);
# check if the pair-end sequence IDs have duplicate names, and if so, rename them.
# Refer to **Appendix – Quality Control kneaddata: End-to-End Mismatch After Host Removal; Sequence Renaming**.
# zless/zcat查看可压缩文件,检查序列质量格式(质量值大写字母为标准Phred33格式,小写字母为Phred64);
# 检查双端序列ID是否重名,如重名需要改名。参考**附录 —— 质控kneaddata,去宿主后双端不匹配;序列改名**。
# Set a sample name be variable i, which can be reused multiple times, reducing the number of modifications required.
# 设置某个样本名为变量i,后面可多次重用,减少修改次数
i=C1
# zless is used to view compressed files; spacebar to turn pages, q to exit; head specifies the number of lines to display.
# zless查看压缩文件,空格翻页,q退出; head指定显示行数
zless seq/${i}_1.fq.gz | head -n4
# **Summary of Working Directory and File Structure**
# **工作目录和文件结构总结**
# ├── pipeline.sh
# ├── result
# │ └── metadata.txt
# ├── seq
# │ ├── C1_1.fq.gz
# │ ├── ...
# │ └── Y1_2.fq.gz
# └── temp
# * `1pipeline.sh` is the analysis workflow code;
# * The `seq` directory contains 6 samples of short-reads paired-end 150 bp sequencing and 12 sequence files;
# * `temp` is a temporary folder that stores intermediate analysis files. It can be deleted entirely after analysis to save space;
# * `result` contains important node files and formatted analysis results figures. `metadata.txt` is also located here.
# * 1pipeline.sh是分析流程代码;
# * seq目录中有2个样本Illumina双端测序,4个序列文件;
# * temp是临时文件夹,存储分析中间文件,结束可全部删除节约空间
# * result是重要节点文件和整理化的分析结果图表,实验设计metadata.txt也在此
## 1.2 Fastp Quality Control(质量控制)
# Create a directory to record software versions and citations
# 创建目录,记录软件版本和引文
mkdir -p temp/qc result/qc
fastp
# Single sample quality control (单样本质控)
i=`tail -n+2 result/metadata.txt|cut -f1|head -n1`
fastp -i seq/${i}_1.fq.gz -I seq/${i}_2.fq.gz \
-o temp/qc/${i}_1.fastq -O temp/qc/${i}_2.fastq
# Multiple samples are processed in parallel; this step requires 5 times the space of the original data.
# 多样本并行,此步占用原始数据5x空间
# -j 2: indicates processing 2 samples simultaneously; for 6 samples, j2, 2minmutes
# -j 2: 表示同时处理2个样本;6个样本j2, 2分钟(minutes, m)
time tail -n+2 result/metadata.txt|cut -f1|rush -j 2 \
"fastp -i seq/{1}_1.fq.gz -I seq/{1}_2.fq.gz \
-j temp/qc/{1}_fastp.json -h temp/qc/{1}_fastp.html \
-o temp/qc/{1}_1.fastq -O temp/qc/{1}_2.fastq \
> temp/qc/{1}.log 2>&1"
# Summary of Quality Control Results (质控后结果汇总)
echo -e "SampleID\tRaw\tClean" > temp/fastp
for i in `tail -n+2 result/metadata.txt|cut -f1`;do
echo -e -n "$i\t" >> temp/fastp
grep 'total reads' temp/qc/${i}.log|uniq|cut -f2 -d ':'|tr '\n' '\t' >> temp/fastp
echo "" >> temp/fastp
done
sed -i 's/ //g;s/\t$//' temp/fastp
# Sort by metadata (按metadata排序)
awk 'BEGIN{FS=OFS="\t"}NR==FNR{a[$1]=$0}NR>FNR{print a[$1]}' temp/fastp result/metadata.txt \
> result/qc/fastp.txt
cat result/qc/fastp.txt
## 1.3 KneadData Host removal (去宿主)
# The kneaddata relies on bowtie2 to align with the host sequence, and then filters out non-host sequences for downstream analysis.
# kneaddata是流程主要依赖bowtie2比对宿主,然后筛选非宿主序列用于下游分析。
# Create directories, activate the environment, and record versions.
# 创建目录、启动环境、记录版本
mkdir -p temp/hr
conda activate kneaddata
kneaddata --version # v0.12.3
# Single-sample host removal (单样本去宿主)
i=`tail -n+2 result/metadata.txt|cut -f1 | head -n1`
time kneaddata -i1 temp/qc/${i}_1.fastq -i2 temp/qc/${i}_2.fastq \
-o temp/hr \
--bypass-trim --bypass-trf --reorder \
--bowtie2-options '--very-sensitive --dovetail' \
-db ${db}/kneaddata/human/hg_39 \
--remove-intermediate-output -v -t 3
# Multi-sample host removal, this step occupies 5 times the space of the original data, 3m
# 多样本去宿主,此步占用原始数据5x空间,5m
time tail -n+3 result/metadata.txt | cut -f1 | rush -j 2 \
"kneaddata \
-i1 temp/qc/{1}_1.fastq -i2 temp/qc/{1}_2.fastq \
-o temp/hr/ \
--bypass-trim --bypass-trf --reorder \
--bowtie2-options '--very-sensitive --dovetail' \
-db ${db}/kneaddata/human/hg_39 \
--remove-intermediate-output -v -t 3"
# To check the size, * matches any number of characters, and ? matches any single character.
# 查看大小,*匹配任意多个字符,?匹配任意一个字符
ls -shtr temp/hr/*_paired_?.fastq
# Simplified renaming (简化改名)
# Ubuntu System renaming (Ubuntu系统改名)
rename 's/_1_kneaddata_paired//' temp/hr/*.fastq
# CentOS System renaming (CentOS系统改名)
rename '_1_kneaddata_paired' '' temp/hr/*.fastq
# Summary of Quality Control Results (质控结果汇总)
kneaddata_read_count_table --input temp/hr \
--output temp/kneaddata.txt
# Filter key results column (筛选重点结果列)
cut -f 1,2,5,6 temp/kneaddata.txt | sed 's/_1_kneaddata//' > result/qc/sum.txt
# View table alignment (对齐方式查看表格)
csvtk -t pretty result/qc/sum.txt
# Verify if the IDs match in pair-end reads (校验ID是否配对)
paste <(head -n40 temp/hr/`tail -n+2 result/metadata.txt|cut -f1|head -n1`_1.fastq|grep @) <(head -n40 temp/hr/`tail -n+2 result/metadata.txt|cut -f1|head -n1`_2.fastq|grep @)
# Large file cleanup: samples with high host content can save >90% of space.
# 大文件清理,高宿主含量样本可节约>90%空间
# Using the absolute path of a command ensures that it is a parameterless command. Administrators can use aliases to define commands with parameters, which can affect the operation results.
# 使用命令的绝对路径确保使用无参数的命令,管理员用alias自定义命令含参数,影响操作结果
/bin/rm -rf temp/hr/*contam* temp/hr/*unmatched* temp/hr/reformatted* temp/hr/_temp*
ls -l temp/hr/
# After confirming the host removal results, the intermediate files after quality control can be deleted
# 确认去宿主结果后,可以删除质控后中间文件
rm temp/qc/*.fastq
# Human data should be published as dehosted files to reduce the risk of privacy leaks.
# 人类数据建议发布去宿主后的文件,减少隐私泄露风险
# Read-based HUMAnN4/Kraken2 (二、基于读长分析)
## 2.1 HUMAnN4分析
# HUMAnN 4: https://docs.google.com/document/d/1rCx5JkuO7wCKWrL8_-UJx_FkopJAfcDFtZktgPspak0/edit?pli=1&tab=t.0
### HUMAnN4 taxonomic and functional annotation (物种和功能分析)
# Prepare input file (准备输入文件)
# HUMAnN requires a file containing paired-end sequences to be merged as input.
# A for loop is used to merge paired-end sequences in batches according to the experimental design sample names.
# Note the asterisk (*) and question mark (?), which represent multiple and a single character, respectively.
# Of course, you must also pay attention to the fact that each line of code ends with a backslash (\\).
# HUMAnN要求双端序列合并的文件作为输入,for循环根据实验设计样本名批量双端序列合并。
# 注意星号(\*)和问号(?),分别代表多个和单个字符。当然大家更不能溜号,行分割的代码行末有一个\\
mkdir -p temp/concat
# Merge pair-end files into a single file (双端合并为单个文件)
for i in `tail -n+2 result/metadata.txt|cut -f1`;do
cat temp/hr/${i}_?.fastq \
> temp/concat/${i}.fq; done &
# Check sample quantity and size (查看样品数量和大小)
ls -shl temp/concat/*.fq
# The data is too large and the computation time is long. You can use head to truncate the single-end analysis to a 20M sequence, i.e., 3G, with 80M lines. See Appendix: HUANN2 Reduce Input File Speedup.
# 数据太大,计算时间长,可用head对单端分析截取20M序列,即3G,行数为80M行,详见附录:HUMAnN2减少输入文件加速
# HUMAnN4 analysis tutorial and test (教学和测试)
# * Taxonomic annotation call MetaPhlAn4 (物种组成调用)
# * Input(输入):temp/concat/*.fq Merged sequences (合并的序列)
# * Output(输出):temp/humann4/
# * Y1_pathabundance.tsv
# * Y1_pathcoverage.tsv
# * Y1_genefamilies.tsv
# Start the humann4 environment and check the database configuration (启动环境并检查数据库配置)
conda activate humann4
mkdir -p temp/humann4
humann --version # v4.0.0.alpha.1
humann_config
# Single-sample test (单样本测试): Y1 657M, 8p, 32min
i=`tail -n+2 result/metadata.txt|cut -f1 | head -n1`
time humann --input temp/concat/${i}.fq --threads 8 \
--metaphlan-options "--input_type fastq --bowtie2db ${db}/metaphlan4 --index mpa_vOct22_CHOCOPhlAnSGB_202403 --offline -t rel_ab_w_read_stats --nproc 8" \
--output temp/humann4
# Multi-sample parallel computing, test data with 6 samples in dual parallel: 2h, recommended 16p, 3h/6G;
# 多样本并行计算,测试数据6个样本双并行:2h,推荐16p,3h/6G;
# -n+3 start from second samples, --threads set 8/16/32 to accelerate
time tail -n+3 result/metadata.txt | cut -f1 | rush -j 2 \
"humann --input temp/concat/{1}.fq --threads 8 \
--metaphlan-options '--input_type fastq --bowtie2db ${db}/metaphlan4 --index mpa_vOct22_CHOCOPhlAnSGB_202403 --offline -t rel_ab_w_read_stats --nproc 8' \
--output temp/humann4/ "
# (Optional) Run MetaPhlAn4 separately (可选)单独运行MetaPhlAn4
metaphlan -v # MetaPhlAn version 4.1.1 (11 Mar 2024)
mkdir -p temp/metaphlan4
i=`tail -n+2 result/metadata.txt|cut -f1 | head -n1`
# taxonomy classification 8p, 4min
time metaphlan --input_type fastq temp/hr/${i}_1.fastq \
temp/metaphlan4/${i}.txt --nproc 8 --bowtie2db ${db}/metaphlan4 --index mpa_vOct22_CHOCOPhlAnSGB_202403 --offline
### Taxonomic composition table (物种组成表)
# Output(输出):
# result/metaphlan4/taxonomy.tsv
# result/metaphlan4/taxonomy.spf spf for STAMP(用于STAMP分析)
# Sample Merge, correct name, preview (样品合并、修正ID、预览) | sed '2i #metaphlan4'
mkdir -p result/metaphlan4
merge_metaphlan_tables.py temp/humann4/*_metaphlan_profile.tsv | sed 's/_1_metaphlan//g' | tail -n+2 | sed '1 s/clade_name/ID/' | sed '2i #metaphlan4' \
> result/metaphlan4/taxonomy.tsv
csvtk -t stat result/metaphlan4/taxonomy.tsv
head -n5 result/metaphlan4/taxonomy.tsv
# Format to stamp spf(转换为STAMP输入spf格式)
# duplicate rows redandency by sort & uniq
metaphlan_to_stamp.pl result/metaphlan4/taxonomy.tsv \
|sort -r | uniq > result/metaphlan4/taxonomy.spf
# stat and view, 14 columns, 274 rows
csvtk -t stat result/metaphlan4/taxonomy.spf
head result/metaphlan4/taxonomy.spf
# STAMP not support unclassified, filtered left 203 row
grep -v 'unclassified' result/metaphlan4/taxonomy.spf > result/metaphlan4/taxonomy2.spf
csvtk -t stat result/metaphlan4/taxonomy2.spf
head result/metaphlan4/taxonomy2.spf
# Download metadata.txt & taxonomy2.spf, and open in STAMP software 结果文件下载可用STAMP分析
## 2.4 Functional composition analysis (功能组成分析)
# 整合后的输出:
# * result/metaphlan4/taxonomy.tsv 物种丰度表
# * result/metaphlan4/taxonomy.spf 物种丰度表(用于stamp分析)
# * result/humann3/pathabundance_relab_unstratified.tsv 通路丰度表
# * result/humann3/pathabundance_relab_stratified.tsv 通路物种组成丰度表
# * stratified(每个菌对此功能通路组成的贡献)和unstratified(功能组成)
# Samples merging table (样本合并为功能组成表)
mkdir -p result/humann4
humann_join_tables --input temp/humann4 \
--file_name pathabundance \
--output result/humann4/path.tsv
# format sampleID, stat and view, 样本名统一、统计和预览
sed -i 's/_Abundance//g' result/humann4/path.tsv
csvtk -t stat result/humann4/path.tsv
head -n5 result/humann4/path.tsv
# Normolization to relative abundance relab(1) or per million cpm(1,000,000) (标准化为相对丰度relab或百万比cpm)
humann_renorm_table \
--input result/humann4/path.tsv \
--units relab \
--output result/humann4/path_relab.tsv
head -n5 result/humann4/path_relab.tsv
# Stratify into function related species (stratified) and function only (unstratified) 分层为功能包括物种组成和仅功能组成
humann_split_stratified_table \
--input result/humann4/path_relab.tsv \
--output result/humann4/
### Difference comparison and bar chart (差异比较和柱状图)
# * Input: Pathway abundance table result/humann4/path.tsv and experimental design result/metadata.txt
# * Intermediate: Pathway abundance table file containing grouping information result/humann4/path.pcl
# * Output: result/humann4/associate.txt
# * 输入数据:通路丰度表格 result/humann4/path.tsv和实验设计 result/metadata.txt
# * 中间数据:包含分组信息的通路丰度表格文件 result/humann4/path.pcl
# * 输出结果:result/humann4/associate.txt
# Add grouping to pathway abundance (在通路丰度中添加分组)
# List of samples ID (提取样品列表)
head -n1 result/humann4/path.tsv | sed 's/# Pathway HUMAnN v4.0.0.alpha.1/SampleID/' | tr '\t' '\n' > temp/header
# The sample corresponds to group; in this example group is the 3rd column ($3) 样本对应分组,本示例分组为第3列($3)
awk 'BEGIN{FS=OFS="\t"}NR==FNR{a[$1]=$3}NR>FNR{print a[$1]}' result/metadata.txt temp/header | tr '\n' '\t'|sed 's/\t$/\n/' > temp/group
# replace grouping
cat <(head -n1 result/humann4/path.tsv) temp/group <(tail -n+2 result/humann4/path.tsv) \
> result/humann4/path.pcl
head -n5 result/humann4/path.pcl
tail -n5 result/humann4/path.pcl
# For intergroup comparisons, small sample sizes may no significant differences
# 组间比较,样本量少无差异
# Backup demo data from HMP (hmp_pathabund.pcl replace to path.pcl)
# wget -c http://www.imeta.science/github/EasyMetagenome/result/humann2/hmp_pathabund.pcl
# cp -f hmp_pathabund.pcl result/humann4/path.pcl
# Set input file 设置输入文件名
pcl=result/humann4/path.pcl
# Count the number of rows and columns in the table 7x4789 (统计表格行、列数量)
csvtk -t stat ${pcl}
head -n3 ${pcl} | cut -f 1-5
# For the KW test with grouping, note the grouping column names in the second column 按分组KW检验,注意第二列的分组列名
humann_associate --input ${pcl} \
--focal-metadatum Group --focal-type categorical \
--last-metadatum Group --fdr 0.2 \
--output result/humann4/associate.txt
# 4 colume include ID, mean, P-value and Q-value, 322 rows
csvtk -t stat result/humann4/associate.txt
head -n5 result/humann4/associate.txt
# barplot show pathway taxonomic composition: L-lysine fermentation to acetate and butanoate
# 柱状图展示通路的物种组成,如:L-赖氨酸发酵生成乙酸和丁酸
# Set significant pathway ID from associate.txt,--sort sum metadata 按丰度和分组排序
# eg. P163-PWY (P 0.03, Q 0.09)/ 1CMET2-PWY (P 0.12, Q 0.18)
path=P163-PWY
grep $path result/humann4/associate.txt
humann_barplot \
--input ${pcl} --focal-feature ${path} \
--focal-metadata Group --last-metadata Group \
--output result/humann4/barplot_${path}.pdf --sort sum metadata
### KEGG annotation (注释)
# Support GO, PFAM, eggNOG, level4ec, KEGG, etc. Detail see `humann_regroup_table -h`。
# regroup gene family into KO(uniref90_ko), options eggNOG(uniref90_eggnog) or EC(uniref90_level4ec)
for i in `tail -n+2 result/metadata.txt|cut -f1`;do
humann_regroup_table \
-i temp/humann4/${i}_2_genefamilies.tsv \
-g uniref90_ko \
-o temp/humann4/${i}_ko.tsv
done
# Merge sample and correct name (合并并修正样本名)
humann_join_tables \
--input temp/humann4/ \
--file_name ko \
--output result/humann4/ko.tsv
sed -i '1s/_Abundance-RPKs//g' result/humann4/ko.tsv
tail result/humann4/ko.tsv
# Similar to pathabundance, it can perform operations such as normalization, stratified, and barplot.
# 与pathabundance类似,可进行标准化renorm、分层stratified、柱状图barplot等操作
# Stratify into function related species (stratified) and function only (unstratified)
# 分层为功能包括物种组成和仅功能组成
humann_split_stratified_table \
--input result/humann4/ko.tsv \
--output result/humann4/
wc -l result/humann4/ko*
# KO to level 1/2/3, 9, 53 and 332 respectively (KO合并为更高层级L3/L2/L1)
summarizeAbundance.py \
-i result/humann4/ko_unstratified.tsv \
-m ${db}/EasyMicrobiome/kegg/KO1-4.txt \
-c 2,3,4 -s ',+,+,' -n raw --dropkeycolumn \
-o result/humann4/KEGG
wc -l result/humann4/KEGG*
## 2.2 GraPhlAn taxonomic and phylogenetic trees 高颜值物种或进化树
# Use export2graphlan to create plotting files
# 使用export2graphlan制作绘图文件
conda activate graphlan
# Detail parameters in PPT or run `export2graphlan.py --help`
export2graphlan.py --skip_rows 1,2 -i result/metaphlan4/taxonomy.tsv \
--tree temp/merged_abundance.tree.txt \
--annotation temp/merged_abundance.annot.txt \
--most_abundant 100 --abundance_threshold 20 --least_biomarkers 10 \
--annotations 3,4 --external_annotations 7
# graphlan annotation
graphlan_annotate.py --annot temp/merged_abundance.annot.txt \
temp/merged_abundance.tree.txt temp/merged_abundance.xml
# output PDF figure, annoat and legend
graphlan.py temp/merged_abundance.xml result/metaphlan4/graphlan.pdf --external_legends
## 2.3 LEfSe Differential analysis (差异分析物种)
# Set input & output director for different versions (可调目录尝试不同版本)
# For example, 12 sample examples, replace result to result12 (例如12个样本示例)
# wget -c http://www.imeta.science/db/EasyMetagenome/result12.zip
# unzip result12.zip
result=result
# Prepare the input file, and change the sample name to the group name (this can be done manually).
# 准备输入文件,修改样本品为组名(可手动修改)
# Extract the sample rows and replace them with one row for each sample, then change the ID to SampleID.
# 提取样本行替换为每个样本一行,修改ID为SampleID
sed -i 's/\r//g' $result/metaphlan4/taxonomy.tsv
head -n1 $result/metaphlan4/taxonomy.tsv|tr '\t' '\n'|sed '1 s/ID/SampleID/' > temp/sampleid
head -n3 temp/sampleid
# 提取SampleID对应的分组Group(假设为metadata.txt中第二列$2),替换换行\n为制表符\t,再把行末制表符\t替换回换行
awk 'BEGIN{OFS=FS="\t"}NR==FNR{a[$1]=$3}NR>FNR{print a[$1]}' \
$result/metadata.txt temp/sampleid|tr '\n' '\t'|sed 's/\t$/\n/' >temp/groupid
cat temp/groupid
sed 's/SampleID/subject_id/' temp/sampleid | tr '\n' '\t'|sed 's/\t$/\n/' > temp/sampleid2
# 合并分组和数据(替换表头)
cat temp/groupid temp/sampleid2 <(tail -n+2 $result/metaphlan4/taxonomy.tsv) | grep -v '#' > $result/metaphlan4/lefse.txt
head -n3 $result/metaphlan4/lefse.txt
# Method 1. ImageGP 2 https://www.bic.ac.cn/BIC/#/analysis?page=b%27MzY%3D%27&tool_type=tool
# 方法1. 推荐 https://www.bic.ac.cn/BIC/ 中LEfSe分析
# Error delete the subject_id line (在线出错,请删除subject_id行)
# Method 2. LEfSe command line analysis
# 方法2. LEfSe命令行分析
# Example:https://github.com/SegataLab/lefse/blob/master/example/bioconda-lefse_run.sh
conda activate lefse
result=result
lefse_run.py -h # LEfSe 1.1.01
# LEfSe format (转换lefse格式),s 2 correction for dependent comparison
lefse_format_input.py $result/metaphlan4/lefse.txt \
temp/input.in -c 1 -s 2 -o 1000000
# LEfSe run
lefse_run.py temp/input.in temp/input.res
# Plot cladogram (绘制物种树注释差异)
lefse_plot_cladogram.py temp/input.res \
$result/metaphlan4/lefse_cladogram.pdf --format pdf
# Plot features barplot (绘制所有差异特征的柱状图)
lefse_plot_res.py temp/input.res \
$result/metaphlan4/lefse_res.pdf --format pdf
# Draw a bar chart of a single feature (绘制单个特征柱状图)
# View differences features sorting by abundance (按丰度排序查看显著差异特征)
grep -v '-' temp/input.res | sort -k3,3n
# Select a feature to draw, such as Actinobacteria (指定特征绘图,如放线菌)
lefse_plot_features.py -f one --format pdf \
--feature_name "k__Bacteria.p__Actinobacteria" \
temp/input.in temp/input.res \
$result/metaphlan4/lefse_Actinobacteria.pdf
# (Optional) Batch plot all difference feature bar charts (可选)批量绘制所有差异特征柱状图
lefse_plot_features.py -f diff \
--archive none --format pdf \
temp/input.in temp/input.res \
$result/metaphlan4/lefse_
## 2.4 Kraken2+Bracken taxonomic classification and abundance estimation (物种注释和丰度估计)
# Kraken2 can quickly perform species annotation and quantification at the read level,
# and can also perform sequence species annotation at the contig, gene, and metagenomic assembly (MAG/bin) levels.
# Kraken2可以快速完成读长(read)层面的物种注释和定量,还可以进行重叠群(contig)、基因(gene)、宏基因组组装基因组(MAG/bin)层面的序列物种注释。
# Start the Kraken2 working environment(启动kraken2工作环境)
conda activate kraken2
# Record software version(记录软件版本)
kraken2 --version # 2.1.6
mkdir -p temp/kraken2
### Kraken2 taxonomic classification (物种注释)
# Input(输入): temp/hr/{1}_?.fastq, {1} representative sample name (代表样本名)
# Database(数据库): -db ${db}/kraken2/pluspf16g/
# Output(输出结果): temp/kraken2/{1}_report and {1}_output
# Feature table(物种丰度表): result/kraken2/taxonomy_count.txt
# Select a database based on server memory size or specific analytical needs
# 根据服务器内存大小或具体分析需求选择数据库
# pluspf16g for small memory / pluspf(100G) for human/animial / pluspfp(214G) for plant/soil
type=pluspf16g
# test first sample, 2min
i=`tail -n+2 result/metadata.txt|cut -f1 | head -n1`
time kraken2 --db ${db}/kraken2/${type}/ \
--paired temp/hr/${i}_?.fastq \
--threads 2 --use-names --report-zero-counts \
--report temp/kraken2/${i}.report \
--output temp/kraken2/${i}.output
# output classified and unclassified reads
# --unclassified-out "temp/kraken2/${i}.unclassified_reads#.fasta" \
# --classified-out "temp/kraken2/${i}.classified_reads#.fasta" \
# Batch processing of multiple samples to generate reports consumes a lot of memory but is fast; therefore, multi-task parallelism is not recommended.
# 多样本批处理生成report,内存消耗大但速度快,不建议用多任务并行
for i in `tail -n+3 result/metadata.txt | cut -f1`;do
kraken2 --db ${db}/kraken2/${type} \
--paired temp/hr/${i}_?.fastq \
--threads 2 --use-names --report-zero-counts \
--report temp/kraken2/${i}.report \
--output temp/kraken2/${i}.output; done
# Use Krakentools to convert the report to MPA format.
# 使用krakentools转换report为mpa格式
for i in `tail -n+2 result/metadata.txt | cut -f1`;do
kreport2mpa.py -r temp/kraken2/${i}.report \
--display-header -o temp/kraken2/${i}.mpa; done
# Display by percentage (optional) 按照百分比展示(可选)
for i in `tail -n+2 result/metadata.txt | cut -f1`;do
kreport2mpa.py -r temp/kraken2/${i}.report \
--percentages --display-header -o temp/kraken2/${i}.p.mpa; done
# Merged samples into a table (合并样本为表格)
mkdir -p result/kraken2
# Same row number, sort ensure consistent (结果行数相同sort确保排序一致)
tail -n+2 result/metadata.txt | cut -f1 | rush -j 1 \
'tail -n+2 temp/kraken2/{1}.mpa | LC_ALL=C sort | cut -f 2 | sed "1 s/^/{1}\n/" \
> temp/kraken2/{1}_count '
# sample1 as header(提取第一样本品行名为表行名)
tail -n+2 temp/kraken2/`tail -n 1 result/metadata.txt | cut -f 1`.mpa | LC_ALL=C sort | cut -f 1 | \
sed "1 s/^/Taxonomy\n/" > temp/kraken2/0header_count
head -n3 temp/kraken2/0header_count
# paste merge into table (合并样本为表格)
ls temp/kraken2/*count
paste temp/kraken2/*count > result/kraken2/tax_count.mpa
# Check table and statistics (检查表格及统计)
head -n 5 result/kraken2/tax_count.mpa
csvtk -t stat result/kraken2/tax_count.mpa
### Bracken abundance estimation (丰度估计)
# Parameter description (参数简介):
# -d database, -i input Kraken2 report file
# -r read length, usually 150, -o outputs the re-estimated abundance
# -l is the taxonomic level, domain (D), phylum (P), class (C), order (O), family (F), genus (G), and species (S) levels
# -t is the threshold, defaults to 0, larger values result in higher reliability
# -d为数据库,-i为输入kraken2报告文件
# r是读长,通常为150,o输出重新估计的值
# l为分类级,可选域D、门P、纲C、目O、科F、属G、种S级别丰度估计
# t是阈值,默认为0,越大越可靠
# Re-estimate the abundance of each sample in a loop.
# Please modify the tax and recalculate P and S once each.
# 循环重新估计每个样品的丰度,请修改tax分别重新计算P和S各1次
mkdir -p temp/bracken
# read length (测序数据长度)、
readLen=150
# Only those present in 20% of the samples are retained (20%样本中存在才保留)
prop=0.2
# Classification is set into D, P, C, O, F, G, S, with commonly used categories being Kingdom (D), Phylum (P), Genus (G), and Species (S).
# 设置分类级D,P,C,O,F,G,S,常用界D门P和属G种S
for tax in P G S;do
# tax=P
for i in `tail -n+2 result/metadata.txt | cut -f1`;do
# i=C1
bracken -d ${db}/kraken2/${type}/ \
-i temp/kraken2/${i}.report \
-r ${readLen} -l ${tax} -t 0 \
-o temp/bracken/${i}.brk \
-w temp/bracken/${i}.report; done
# bracken结果合并成表: 需按表头排序,提取第6列reads count,并添加样本名
bash ${db}/EasyMicrobiome/script/bracken_integration.sh -t ${tax} \
-i temp/bracken/*.brk \
-o result/kraken2/bracken.${tax}.txt
# 需要确认行数一致才能按以下方法合并
# wc -l temp/bracken/*.report
# bracken结果合并成表: 需按表头排序,提取第6列reads count,并添加样本名
# tail -n+2 result/metadata.txt | cut -f1 | rush -j 1 \
# 'tail -n+2 temp/bracken/{1}.brk | LC_ALL=C sort | cut -f6 | sed "1 s/^/{1}\n/" \
# > temp/bracken/{1}.count'
# 提取第一样本品行名为表行名
# h=`tail -n1 result/metadata.txt|cut -f1`
# tail -n+2 temp/bracken/${h}.brk | LC_ALL=C sort | cut -f1 | \
# sed "1 s/^/Taxonomy\n/" > temp/bracken/0header.count
# 检查文件数,为n+1
# ls temp/bracken/*count | wc
# paste合并样本为表格,并删除非零行
# paste temp/bracken/*count > result/kraken2/bracken.${tax}.txt
# 统计行列,默认去除表头
csvtk -t stat result/kraken2/bracken.${tax}.txt
# 按频率过滤,-r可标准化,-e过滤(microbiome_helper)
Rscript ${db}/EasyMicrobiome/script/filter_feature_table.R \
-i result/kraken2/bracken.${tax}.txt \
-p ${prop} \
-o result/kraken2/bracken.${tax}.${prop}
# head result/kraken2/bracken.${tax}.${prop}
done
csvtk -t stat result/kraken2/bracken.?.txt result/kraken2/bracken.?.$prop
# Personalized results filtering, e.g. Chordata (个性化结果筛选,如脊索动物)
# Phylum level removal of chordate (humans) 门水平去除脊索动物(人)
grep 'Chordata' result/kraken2/bracken.P.${prop}
grep -v 'Chordata' result/kraken2/bracken.P.${prop} > result/kraken2/bracken.P.${prop}-H
# Remove host contamination by species name, humans as an example: Homo sapiens (按物种名手动去除宿主污染,以人为例)
grep 'Homo sapiens' result/kraken2/bracken.S.${prop}
grep -v 'Homo sapiens' result/kraken2/bracken.S.${prop} \
> result/kraken2/bracken.S.${prop}-H
# Clean big annotation files for each sequence (清理每条序列的注释大文件)
/bin/rm -rf temp/kraken2/*.output
### Diversity and Visualization (多样性和可视化)
# Alpha diversity:Berger Parker’s (BP), Simpson’s (Si), inverse Simpson’s (ISi), Shannon’s (Sh)
echo -e "SampleID\tBerger Parker\tSimpson\tinverse Simpson\tShannon" > result/kraken2/alpha.txt
for i in `tail -n+2 result/metadata.txt|cut -f1`;do
echo -e -n "$i\t" >> result/kraken2/alpha.txt
for a in BP Si ISi Sh;do
alpha_diversity.py -f temp/bracken/${i}.brk -a $a | cut -f 2 -d ':' | tr '\n' '\t' >> result/kraken2/alpha.txt
done
echo "" >> result/kraken2/alpha.txt
done
cat result/kraken2/alpha.txt
# Beta diversity
beta_diversity.py -i temp/bracken/*.brk --type bracken > result/kraken2/beta.txt
cat result/kraken2/beta.txt
# Krona plot
for i in `tail -n+2 result/metadata.txt|cut -f1`;do
kreport2krona.py -r temp/bracken/${i}.report -o temp/bracken/${i}.krona --no-intermediate-ranks
ktImportText temp/bracken/${i}.krona -o result/kraken2/krona.${i}.html
done
# Pavian Sankey diagram (桑基图):https://fbreitwieser.shinyapps.io/pavian/
# Online visualization: `Upload sample set` (temp/bracken/*.report), supports multiple samples; `Sample` view results; `Configure Sankey` to configure graph styles; `Save Network` to download the graph webpage.
# 在线可视化:,左侧菜单,Upload sample set (temp/bracken/*.report),支持多样本同时上传;Sample查看结果,Configure Sankey配置图样式,Save Network下载图网页
# For diversity/composition visualization, see 3StatPlot.sh (多样性分析/物种组成可视化见3StatPlot.sh)
# 3. Assemble-based (三、组装分析流程)
## 3.1 Assemble (组装)
# Start working environment (启动工作环境)
conda activate megahit
### MEGAHIT Assembly (组装)
# Delete directory for rerun (删除旧文件夹才能重新运行)
# /bin/rm -rf temp/megahit
# Assembly,demo 3p 80m, TB may n days (TB数据要几天)
megahit -v # MEGAHIT v1.2.9
/usr/bin/time -v megahit -t 3 \
-1 `tail -n+2 result/metadata.txt|cut -f1|sed 's/^/temp\/hr\//;s/$/_1.fastq/'|tr '\n' ','|sed 's/,$//'` \
-2 `tail -n+2 result/metadata.txt|cut -f1|sed 's/^/temp\/hr\//;s/$/_2.fastq/'|tr '\n' ','|sed 's/,$//'` \
-o temp/megahit
# Stat contig size, 160M, eg. 18sample 100G data 10h contigs 1.8G
# If too many contigs, filter by length to reduce data and improve gene integrity. See appendix megahit
# 如果contigs太多,可以按长度筛选,降低数据量,提高基因完整度,详见附录megahit
seqkit stat temp/megahit/final.contigs.fa
# Preview first 6 rows and 60 characters each line (预览重叠群最前6行,前60列字符)
head -n6 temp/megahit/final.contigs.fa | cut -c1-60
# Delete temporary files (删除临时文件)
/bin/rm -rf temp/megahit/intermediate_contigs
### metaSPAdes fine assembly (option) 精细组装(可选)
# More time and memory consume (精细但使用内存和时间更多)
mkdir -p temp/metaspades
/usr/bin/time -v -o metaspades.py.log metaspades.py -t 3 -m 100 \
`tail -n+2 result/metadata.txt|cut -f1|sed 's/^/temp\/hr\//;s/$/_1.fastq/'|sed 's/^/-1 /'| tr '\n' ' '` \
`tail -n+2 result/metadata.txt|cut -f1|sed 's/^/temp\/hr\//;s/$/_2.fastq/'|sed 's/^/-2 /'| tr '\n' ' '` \
-o temp/metaspades
# View calculate time (Elapsed time, 2.5h) and Memory consume (Maximum resident set size, 22G)
cat metaspades.py.log
# 219M,contigs bigger than megahit
ls -sh temp/metaspades/contigs.fasta
seqkit stat temp/metaspades/contigs.fasta
# Select VIP result, Delete temporary files (删除临时文件)
mv temp/metaspades/contigs.fasta temp/metaspades.fa
/bin/rm -rf temp/metaspades/
# Note: metaSPAdes supports hybrid assembly of second and third generation systems (see appendix).
# Additionally, there are OPERA-MS assembly solutions for second and third generation systems.
# 注:metaSPAdes支持二、三代混合组装,见附录,此外还有OPERA-MS组装二、三代方案
### QUAST assembly evaluation (组装评估)
mkdir -p result/megahit
# QUAST evaluation, generating reports in various formats such as tsv/txt text, html webpage, and PDF.
# QUAST评估,生成report文本tsv/txt、网页html、PDF等格式报告
quast.py temp/megahit/final.contigs.fa \
-o result/megahit/quast -t 2
# (opt) megahit versus metaspades
quast.py --label "megahit,metapasdes" \
temp/megahit/final.contigs.fa \
temp/metaspades.fa \
-o result/quast
# (opt)metaquast access (更全面评估,需下载相关数据库受网速影响可能时间很长或失败)
# metaquast based on silva, and top 50 species genome, 18min
time metaquast.py temp/megahit/final.contigs.fa \
-o result/megahit/metaquast
## 3.2 Gene prediction, cluster & quantitfy(基因预测、去冗余和定量)
### metaProdigal Gene prediction (基因预测)
# Input file: Assembled sequences temp/megahit/final.contigs.fa
# Output file: Gene sequence predicted by prodigal temp/prodigal/gene.fa
# For large gene sequences, refer to the appendix for prodigal gene file splitting and parallel computation.
# 输入文件:组装的序列 result/megahit/final.contigs.fa
# 输出文件:prodigal预测的基因序列 temp/prodigal/gene.fa
# 基因大,可参考附录prodigal拆分基因文件,并行计算
mkdir -p temp/prodigal
# prodigal meta mode, 8m, > and 2>&1 record gene.log, ~1G 1h
time prodigal -i temp/megahit/final.contigs.fa \
-d temp/prodigal/gene.fa \
-o temp/prodigal/gene.gff \
-p meta -f gff > temp/prodigal/gene.log 2>&1
# Check if the log has completed (查看日志是否运行完成)
tail temp/prodigal/gene.log
# Count of genes (基因数量) 261K, 6G18s3M
seqkit stat temp/prodigal/gene.fa
# Count complete genes, if the data volume is large, select complete gene, 73K
# 统计完整基因数量,数据量大可只用完整基因部分
grep -c 'partial=00' temp/prodigal/gene.fa
# Select complete gene (提取完整基因)
seqkit grep -n -r -p "partial=00" temp/prodigal/gene.fa > temp/prodigal/full_length.fa
seqkit stat temp/prodigal/full_length.fa
# Opt. Single sample assemble bacth run (可选单样本组装可批量运行)
for i in `tail -n+2 result/metadata.txt | cut -f1`;do
mkdir -p temp/prodigal/${i}/
prodigal -i result/megahit/${i}/final.contigs.fa \
-d temp/prodigal/${i}/gene.fa \
-o temp/prodigal/${i}/gene.gff \
-p meta -f gff > temp/prodigal/${i}/gene.log 2>&1
done
### cd-hit cluster & redundancy (基因聚类/去冗余)
# Input file: gene sequences predicted by prodigal temp/prodigal/gene.fa
# Output files: Deredundant gene and protein sequences: result/NR/nucleotide.fa, result/NR/protein.fa
# 输入文件:prodigal预测的基因序列 temp/prodigal/gene.fa
# 输出文件:去冗余后的基因和蛋白序列:result/NR/nucleotide.fa, result/NR/protein.fa
mkdir -p result/NR
# aS coverage, c similarity, G local alignment, g optimal solution, T multithreading, M memory 0 unlimited
# aS覆盖度,c相似度,G局部比对,g最优解,T多线程,M内存0不限制
# 2M 384p 8m,3M384p15m,2千万需要2000h,多线程可加速
time cd-hit-est -i temp/prodigal/gene.fa \
-o result/NR/nucleotide.fa \
-aS 0.9 -c 0.95 -G 0 -g 0 -T 0 -M 0
# Counting the number of non-redundant genes, the number of genes in a single assembly does not decrease significantly, such as 3M-2M. Multiple batch assemblies show high redundancy.
# 统计非冗余基因数量,单次拼接结果数量下降不大,如3M-2M,多批拼接冗余度高
grep -c '>' result/NR/nucleotide.fa
# Translate nucleic acids into corresponding protein sequences, and use `--trim` to remove trailing asterisks.
# 翻译核酸为对应蛋白序列, --trim去除结尾的*
seqkit translate --trim result/NR/nucleotide.fa \
> result/NR/protein.fa
# Redundancy removal for both batches of data was accelerated using cd-hit-est-2d, see appendix.
# 两批数据去冗余使用cd-hit-est-2d加速,见附录
# batch samples (样本批量运行)
for i in `tail -n+2 result/metadata.txt | cut -f1`;do
mkdir -p result/NR/${i}
cd-hit-est -i temp/prodigal/${i}/gene.fa \
-o result/NR/${i}/nucleotide.fa \
-aS 0.9 -c 0.95 -G 0 -g 0 -T 0 -M 0
grep -c '>' result/NR/${i}/nucleotide.fa
seqkit translate --trim result/NR/${i}/nucleotide.fa \
> result/NR/${i}/protein.fa
done
### salmon quantitfy (基因定量)
# Input file: Redundancy-free gene sequence: result/NR/nucleotide.fa
# Output file: Salmon quantification: result/salmon/gene.count, gene.TPM
# 输入文件:去冗余后的基因序列:result/NR/nucleotide.fa
# 输出文件:Salmon定量:result/salmon/gene.count, gene.TPM
mkdir -p temp/salmon
salmon -v # 1.10.3
# build index, -t sequence, -i index (建索引, -t序列, -i 索引, 2m)
time salmon index -t result/NR/nucleotide.fa \
-p 3 -i temp/salmon/index
# Quantitative: -l automatic library type selection, p threads, --meta metagenomics
# 定量,l文库类型自动选择,p线程,--meta宏基因组, 10s
i=C1
time salmon quant -i temp/salmon/index -l A -p 3 --meta \
-1 temp/hr/${i}_1.fastq -2 temp/hr/${i}_2.fastq \
-o temp/salmon/${i}.quant
# parallel, 1m, 18 samples 30m
time tail -n+2 result/metadata.txt | cut -f1 | rush -j 2 \
"salmon quant -i temp/salmon/index -l A -p 3 --meta \
-1 temp/hr/{1}_1.fastq -2 temp/hr/{1}_2.fastq \
-o temp/salmon/{1}.quant"
# Merge (合并)
mkdir -p result/salmon
salmon quantmerge --quants temp/salmon/*.quant \
-o result/salmon/gene.TPM
salmon quantmerge --quants temp/salmon/*.quant \
--column NumReads -o result/salmon/gene.count
sed -i '1 s/.quant//g' result/salmon/gene.*
# Preview (预览结果)
head -n3 result/salmon/gene.*
## 3.3 Functional gene annotation (功能基因注释)
# input:result/NR/protein.fa
# COG: result/eggnog/cogtab.count
# result/eggnog/cogtab.count.spf (for STAMP)
# KO table:result/eggnog/kotab.count
# CAZy Carbohydrate:result/dbcan3/cazytab.count
# Expanded to other databases (可拓展其它数据库)
### eggNOG gene annotation(COG/KEGG/CAZy基因注释)
# eggNOG: https://github.com/eggnogdb/eggnog-mapper
# Run and record the software version (运行并记录软件版本)
conda activate eggnog
emapper.py --version
mkdir -p temp/eggnog
# emapper-2.1.7 / Expected eggNOG DB version: 5.0.2 / diamond version 2.0.15
# run emapper, 3p 100m, default diamond 1e-3; 2M,32p,1.5h
time emapper.py --data_dir ${db}/eggnog \
-i result/NR/protein.fa --cpu 3 -m diamond --override \
-o temp/eggnog/output
# Format the results and display the table headers (格式化结果并显示表头)
grep -v '^##' temp/eggnog/output.emapper.annotations | sed '1 s/^#//' \
> temp/eggnog/output
csvtk -t headers -v temp/eggnog/output
# create COG/KO/CAZy abundance table
mkdir -p result/eggnog
# Show help (显示帮助)
summarizeAbundance.py -h
# Summary, the 7 columns COG_category are alphabetically separated, and the 12 columns KEGG_ko and 19 columns CAZy are comma-separated. The original values are summed.
# 汇总,7列COG_category按字母分隔,12列KEGG_ko和19列CAZy按逗号分隔,原始值累加
summarizeAbundance.py \
-i result/salmon/gene.TPM \
-m temp/eggnog/output --dropkeycolumn \
-c '7,12,19' -s '*+,+,' -n raw \
-o result/eggnog/eggnog
sed -i 's#^ko:##' result/eggnog/eggnog.KEGG_ko.raw.txt
sed -i '/^-/d' result/eggnog/eggnog*
head -n3 result/eggnog/eggnog*
# eggnog.CAZy.raw.txt eggnog.COG_category.raw.txt eggnog.KEGG_ko.raw.txt
# format to STAMP spf format
awk 'BEGIN{FS=OFS="\t"} NR==FNR{a[$1]=$2} NR>FNR{print a[$1],$0}' \
${db}/EasyMicrobiome/kegg/KO_description.txt \
result/eggnog/eggnog.KEGG_ko.raw.txt | \
sed 's/^\t/Unannotated\t/' \
> result/eggnog/eggnog.KEGG_ko.TPM.spf
head -n 5 result/eggnog/eggnog.KEGG_ko.TPM.spf
# KO to level 1/2/3
summarizeAbundance.py \
-i result/eggnog/eggnog.KEGG_ko.raw.txt \
-m ${db}/EasyMicrobiome/kegg/KO1-4.txt \
-c 2,3,4 -s ',+,+,' -n raw --dropkeycolumn \
-o result/eggnog/KEGG
head -n3 result/eggnog/KEGG*
# CAZy
awk 'BEGIN{FS=OFS="\t"} NR==FNR{a[$1]=$2} NR>FNR{print a[$1],$0}' \
${db}/EasyMicrobiome/dbcan2/CAZy_description.txt result/eggnog/eggnog.CAZy.raw.txt | \
sed 's/^\t/Unannotated\t/' > result/eggnog/eggnog.CAZy.TPM.spf
head -n 3 result/eggnog/eggnog.CAZy.TPM.spf
# COG
awk 'BEGIN{FS=OFS="\t"} NR==FNR{a[$1]=$2"\t"$3} NR>FNR{print a[$1],$0}' \
${db}/EasyMicrobiome/eggnog/COG.anno result/eggnog/eggnog.COG_category.raw.txt > \
result/eggnog/eggnog.COG_category.TPM.spf
head -n 3 result/eggnog/eggnog.COG_category.TPM.spf
### CARD耐药基因
# CARD: https://card.mcmaster.ca/
# GitHub: https://github.com/arpcard/rgi
# Result:protein.json, upload to CARD; protein.txt, annotation list
mkdir -p result/card
conda activate rgi
rgi main -v # 6.0.5
# Simplified Protein ID (简化蛋白ID)
cut -f 1 -d ' ' result/NR/protein.fa > temp/protein.fa
grep '>' result/NR/protein.fa | head -n 3
grep '>' temp/protein.fa | head -n 3
# Protein-level annotation ARG
# rgi load -i $db/card/card.json --card_annotation $db/card/card.fasta
time rgi main -i temp/protein.fa -t protein \
-n 9 -a DIAMOND --include_loose --clean \
-o result/card/protein
head -n3 result/card/protein.txt
# WARNING baeR ---> hsp.bits: 140.6 <class 'float'> ? <class 'str'> Exception : <class 'KeyError'> -> '2885' -> Model(2885) missing in database. Please generate new database.
# Software and database have bugs that do not affect the results (新版软件与数据库bug,不影响主体结果)
# (Optional) Gene-level annotation ARG (可选)基因层面注释ARG
cut -f 1 -d ' ' result/NR/nucleotide.fa > temp/nucleotide.fa
grep '>' temp/nucleotide.fa | head -n3
rgi main -i temp/nucleotide.fa -t contig \
-n 9 -a DIAMOND --include_loose --clean \
-o result/card/nucleotide
head -n3 result/card/nucleotide.txt
# (Optional) Overlap Contigs Level Annotation (ARG) (可选)重叠群层面注释ARG
cut -f 1 -d ' ' temp/megahit/final.contigs.fa > temp/contigs.fa
grep '>' temp/contigs.fa | head -n3
time rgi main -i temp/contigs.fa -t contig \
-n 9 -a DIAMOND --include_loose --clean \
-o result/card/contigs
head result/card/contigs.txt
## 3.4 Kraken2 gene taxonomic annotation (基因物种注释)
# Generate report in default taxid output
conda activate kraken2
# 16g 60.69%, pf 73.02%, pfp 74.20%
kraken2 --db ${db}/kraken2/pluspf16g \
result/NR/nucleotide.fa \
--threads 3 \
--report temp/NRgene.report \
--output temp/NRgene.output
# Genes & taxid list
grep '^C' temp/NRgene.output | cut -f 2,3 | sed '1 i Name\ttaxid' \
> temp/NRgene.taxid
# Add taxonomy
awk 'BEGIN{FS=OFS="\t"} NR==FNR{a[$1]=$0} NR>FNR{print $1,a[$2]}' \
$db/EasyMicrobiome/kraken2/taxonomy.txt \
temp/NRgene.taxid > result/NR/nucleotide.tax
conda activate eggnog
summarizeAbundance.py \
-i result/salmon/gene.TPM \
-m result/NR/nucleotide.tax --dropkeycolumn \
-c '2,3,4,5,6,7,8,9' -s ',+,+,+,+,+,+,+,' -n raw \
-o result/NR/tax
wc -l result/NR/tax*|sort -n
# 4. Binning (四、分箱挖掘单菌基因组)
## MetwWRAP binning (分箱)
# GitHub: https://github.com/bxlab/metaWRAP
# For mining single-bacterial genomes, a single sample size of 6GB+ is recommended, and for complex samples such as soil, a data size of 30GB+ is recommended.
# The demonstration data consists of 6 samples (approximately 1GB), which is insufficient to obtain single-bacterial genomes. Therefore, official sequencing data is used for demonstration and explanation.
# 挖掘单菌基因组,推荐单样本6GB+,复杂样本如土壤推荐数据量30GB+
# 演示数据6个样品~1G,无法获得单菌基因组,这里使用官方测序数据演示讲解
cd $wd
conda activate metawrap
metawrap -v # 1.3.2
mkdir -p temp/bin temp/bin_refine temp/drep_in result/bin
# Input: quaility control & host removal sequences, *_1.fastq和*_2.fastq,
# assemble contigs, temp/megahit/final.contigs.fa
# Output: Binning result: temp/bin
# Stat:temp/bin_refine/metawrap_50_10_bins.stats