mgp1000/somatic.nf at master · MonikaCho/mgp1000 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// Myeloma Genome Project 1000
// Comprehensive pipeline for analysis of matched T/N Multiple Myeloma WGS data
// https://github.com/pblaney/mgp1000

// This portion of the pipeline is used for somatic variant analysis of matched tumor/normal WGS samples.
// It is designed to be run with BAMs that were generated via the Preprocessing module of this pipeline.

import java.text.SimpleDateFormat;
def workflowTimestamp = "${workflow.start.format('MM-dd-yyyy HH:mm')}"

def helpMessage() {
	log.info"""
	                             .------------------------.
	                            |    .-..-. .--. .---.     |
	                            |    : `' :: .--': .; :    |
	                            |    : .. :: : _ :  _.'    |
	                            |    : :; :: :; :: :       |
	                            |    :_;:_;`.__.':_;       |
	                            |   ,-. .--.  .--.  .--.   |
	                            | .'  :: ,. :: ,. :: ,. :  |
	                            |   : :: :: :: :: :: :: :  |
	                            |   : :: :; :: :; :: :; :  |
	                            |   :_;`.__.'`.__.'`.__.'  |
	                             .________________________.

	                                      SOMATIC

	Usage:
	  nextflow run somatic.nf --run_id STR --sample_sheet FILE -profile somatic [-bg] [-resume]
	  [--input_dir PATH] [--output_dir PATH] [--email STR] [--mutect_ref_vcf_concatenated STR]
	  [--battenberg_ref_cached STR] [--annotsv_ref_cached STR] [--vep_ref_cached STR] [--help]

	Mandatory Arguments:
	  --run_id                       STR  Unique identifier for pipeline run
	  --sample_sheet                FILE  CSV file containing the list of samples where the
	                                      first column designates the file name of the normal
	                                      sample, the second column for the file name of the
	                                      matched tumor sample
	  -profile                       STR  Configuration profile to use, must use somatic

	Main Options:
	  -bg                           FLAG  Runs the pipeline processes in the background, this
	                                      option should be included if deploying pipeline with
	                                      real data set so processes will not be cut if user
	                                      disconnects from deployment environment
	  -resume                       FLAG  Successfully completed tasks are cached so that if
	                                      the pipeline stops prematurely the previously
	                                      completed tasks are skipped while maintaining their
	                                      output
	  --input_dir                   PATH  Directory that holds BAMs and associated index files,
	                                      this should be given as an absolute path
	                                      [Default: input/preprocessedBams/]
	  --output_dir                  PATH  Directory that will hold all output files this should
	                                      be given as an absolute path
	                                      [Default: output/]
	  --email                        STR  Email address to send workflow completion/stoppage
	                                      notification
	  --mutect_ref_vcf_concatenated  STR  Indicates whether or not the gnomAD allele frequency
	                                      reference VCF used for MuTect2 processes has been
	                                      concatenated, this will be done in a process of the
	                                      pipeline if it has not, this does not need to be done
	                                      for every separate run after the first
	                                      [Default: yes | Available: yes, no]
	  --battenberg_ref_cached        STR  Indicates whether or not the reference files used for
	                                      Battenberg have been downloaded/cached locally, this
	                                      will be done in a process of the pipeline if it has
	                                      not, this does not need to be done for every separate
	                                      run after the first
	                                      [Default: yes | Available: yes, no]
	  --annotsv_ref_cached           STR  Indicates whether or not the AnnotSV reference filee
	                                      used for annotation have been downloaded/cached
	                                      locally, this will be done in a process of the
	                                      pipeline if it has not, this does not need to be done
	                                      for every separate run after the first
	                                      [Default: yes | Available: yes, no]
	  --vep_ref_cached               STR  Indicates whether or not the VEP reference files used
	                                      for annotation have been downloaded/cached locally,
	                                      this will be done in a process of the pipeline if it
	                                      has not, this does not need to be done for every
	                                      separate run after the first
	                                      [Default: yes | Available: yes, no]
	  --cpus                         INT  Globally set the number of cpus to be allocated
	  --memory                       STR  Globally set the amount of memory to be allocated,
	                                      written as '##.GB' or '##.MB'
	  --queue_size                   INT  Set max number of tasks the pipeline will launch
	                                      [Default: 100]
	  --executor                     STR  Set the job executor for the run
	                                      [Default: slurm | Available: local, slurm, lsf]
	  --help                        FLAG  Prints this message

	Toolbox Options:
	  --telomerecat                  STR  Indicates whether or not to use this tool
	                                      [Default: on | Available: off, on]
	  --telomerehunter               STR  Indicates whether or not to use this tool
	                                      [Default: on | Available: off, on]
	  --conpair                      STR  Indicates whether or not to use this tool
	                                      [Default: on | Available: off, on]
	  --conpair_min_cov              INT  Manually set the minimum coverage
	                                      [Default: 10]
	  --varscan                      STR  Indicates whether or not to use this tool
	                                      [Default: on | Available: off, on]
	  --mutect                       STR  Indicates whether or not to use this tool
	                                      [Default: on | Available: off, on]
	  --strelka                      STR  Indicates whether or not to use this tool
	                                      [Default: on | Available: off, on]
	  --fragcounter                  STR  Indicates whether or not to use this tool
	                                      [Default: on | Available: off, on]
	  --battenberg                   STR  Indicates whether or not to use this tool
	                                      [Default: on | Available: off, on]
	  --battenberg_min_depth         STR  Manually set the minimum read depth in the normal
	                                      sample for SNP filtering in BAF calculations,
	                                      default is for 30x coverage
	                                      [Default: 10]
	  --controlfreec                 STR  Indicates whether or not to use this tool
	                                      [Default: on | Available: off, on]
	  --controlfreec_read_length     STR  Manually set the read length to be used for the
	                                      mappability
	                                      [Default: 151]
	  --controlfreec_bp_threshold  FLOAT  Manually set the breakpoint threshold value, lower if
	                                      the sample is expected to have large number of CNV
	                                      segments or increase for the opposite assumption
	                                      [Default: 0.8]
	  --controlfreec_ploidy          INT  Manually set the ploidy value
	                                      [Default: 2 | Available: 3, 4]
	  --sclust                       STR  Indicates whether or not to use this tool
	                                      [Default: on | Available: off, on]
	  --sclust_minp                FLOAT  Manually set the minimal expected ploidy
	                                      [Default: 1.5]
	  --sclust_maxp                FLOAT  Manually set the maximal expected ploidy
	                                      [Default: 4.5]
	  --sclust_mutclustering         STR  Manually turn on or off the mutational clustering step
	                                      of the Sclust process, turn off if a solution cannot be
	                                      reached after lowering lambda value
	                                      [Default: on | Available: off]
	  --sclust_lambda                STR  Manually set the degree of smoothing for clustering
	                                      mutations, increasing the value should resolve issues
	                                      with QP iterations related errors
	                                      [Default: 1e-7]
	  --facets                       STR  Indicates whether or not to use this tool
	                                      [Default: on | Available: off, on]
	  --facets_min_depth             STR  Manually set the minimum read depth in the normal
	                                      sample for SNP filtering in BAF calculations,
	                                      default is for 30x coverage
	                                      [Default: 20]
	  --manta                        STR  Indicates whether or not to use this tool
	                                      [Default: on | Available: off, on]
	  --svaba                        STR  Indicates whether or not to use this tool
	                                      [Default: on | Available: off, on]
	  --delly                        STR  Indicates whether or not to use this tool
	                                      [Default: on | Available: off, on]
	  --igcaller                     STR  Indicates whether or not to use this tool
	                                      [Default: on | Available: off, on]

	Consensus Workflow Options:
	  --min_consensus_snv_callers    INT  Set minimum of caller agreement for SNV consensus
	                                      [Default: 2]
	  --min_consensus_indel_callers  INT  Set minimum of caller agreement for InDel consensus
	                                      [Default: 2]

	""".stripIndent()
}

// #################################################### \\
// ~~~~~~~~~~~~~ PARAMETER CONFIGURATION ~~~~~~~~~~~~~~ \\

// Declare the defaults for all pipeline parameters
params.input_dir = "${workflow.projectDir}/input/preprocessedBams"
params.output_dir = "${workflow.projectDir}/output"
params.run_id = null
params.sample_sheet = null
params.email = null
params.mutect_ref_vcf_concatenated = "yes"
params.battenberg_ref_cached = "yes"
params.annotsv_ref_cached = "yes"
params.vep_ref_cached = "yes"
params.telomerecat = "on"
params.telomerehunter = "on"
params.conpair = "on"
params.varscan = "on"
params.mutect = "on"
params.strelka = "on"
params.fragcounter = "on"
params.battenberg = "on"
params.controlfreec = "on"
params.sclust = "on"
params.facets = "on"
params.manta = "on"
params.svaba = "on"
params.delly = "on"
params.igcaller = "on"
params.conpair_min_cov = 10
params.battenberg_min_depth = 10
params.controlfreec_read_length = 151
params.controlfreec_bp_threshold = 0.8
params.controlfreec_ploidy = 2
params.sclust_minp = 1.5
params.sclust_maxp = 4.5
params.sclust_mutclustering = "on"
params.sclust_lambda = null
params.facets_min_depth = 20
params.cpus = null
params.memory = null
params.queue_size = 100
params.executor = 'slurm'
params.min_consensus_snv_callers = 2
params.min_consensus_indel_callers = 2
params.help = null

// Print help message if requested
if( params.help ) exit 0, helpMessage()

// Print preemptive error message if user-defined input/output directories does not exist
if( !file(params.input_dir).exists() ) exit 1, "The user-specified input directory does not exist in filesystem."

// Print preemptive error messages if required parameters are not set
if( params.run_id == null ) exit 1, "The run command issued does not have the '--run_id' parameter set. Please set the '--run_id' parameter to a unique identifier for the run."

if( params.sample_sheet == null ) exit 1, "The run command issued does not have the '--sample_sheet' parameter set. Please set the '--sample_sheet' parameter to the path of the normal/tumor pair sample sheet CSV."

// Print preemptive error message if Sclust is set while Mutect2 is not
if( params.sclust == "on" && params.mutect == "off" ) exit 1, "Sclust requires output from Mutect2 to run so both must be turned on"

// Print preemptive error message if Strelka is set while Manta is not
if( params.strelka == "on" && params.manta == "off" ) exit 1, "Strelka requires output from Manta to run so both must be turned on"

// Set channels for reference files
Channel
	.fromPath( 'references/hg38/Homo_sapiens_assembly38.fasta' )
	.into{ reference_genome_fasta_forConpairPileup;
	       reference_genome_fasta_forVarscanSamtoolsMpileup;
	       reference_genome_fasta_forVarscanBamReadcount;
	       reference_genome_fasta_forVarscanBcftoolsNorm;
	       reference_genome_fasta_forMutectCalling;
	       reference_genome_fasta_forMutectFilter;
	       reference_genome_fasta_forMutectBcftools;
	       reference_genome_fasta_forControlFreecSamtoolsMpileup;
	       reference_genome_fasta_forControlFreecCalling;
	       reference_genome_fasta_forManta;
	       reference_genome_fasta_forStrelka;
	       reference_genome_fasta_forStrelkaBcftools;
	       reference_genome_fasta_forSvabaBcftools;
	       reference_genome_fasta_forDelly;
	       reference_genome_fasta_forIgCaller;
	       reference_genome_fasta_forConsensusSnvMpileup;
	       reference_genome_fasta_forConsensusIndelMpileup;
	       reference_genome_fasta_forConsensusSvFpFilter;
	       reference_genome_fasta_forAnnotation }

Channel
	.fromPath( 'references/hg38/Homo_sapiens_assembly38.fasta.fai' )
	.into{ reference_genome_fasta_index_forAlleleCount;
		   reference_genome_fasta_index_forConpairPileup;
	       reference_genome_fasta_index_forVarscanSamtoolsMpileup;
	       reference_genome_fasta_index_forVarscanBamReadcount;
	       reference_genome_fasta_index_forVarscanBcftoolsNorm;
	       reference_genome_fasta_index_forMutectCalling;
	       reference_genome_fasta_index_forMutectFilter;
	       reference_genome_fasta_index_forMutectBcftools;
	       reference_genome_fasta_index_forControlFreecSamtoolsMpileup;
	       reference_genome_fasta_index_forControlFreecCalling;
	       reference_genome_fasta_index_forControlFreecConsensusPrep;
	       reference_genome_fasta_index_forSclustConsensusCnv;
	       reference_genome_fasta_index_forManta;
	       reference_genome_fasta_index_forStrelka;
	       reference_genome_fasta_index_forStrelkaBcftools;
	       reference_genome_fasta_index_forSvabaBcftools;
	       reference_genome_fasta_index_forDelly;
	       reference_genome_fasta_index_forIgCaller;
	       reference_genome_fasta_index_forConsensusSnvMpileup;
	       reference_genome_fasta_index_forConsensusIndelMpileup;
	       reference_genome_fasta_index_forConsensusSvFpFilter;
	       reference_genome_fasta_index_forAnnotation }

Channel
	.fromPath( 'references/hg38/Homo_sapiens_assembly38.dict' )
	.into{ reference_genome_fasta_dict_forConpairPileup;
	       reference_genome_fasta_dict_forVarscanSamtoolsMpileup;
	       reference_genome_fasta_dict_forVarscanBamReadcount;
	       reference_genome_fasta_dict_forVarscanBcftoolsNorm;
	       reference_genome_fasta_dict_forMutectCalling;
	       reference_genome_fasta_dict_forMutectPileupGatherTumor;
	       reference_genome_fasta_dict_forMutectPileupGatherNormal;
	       reference_genome_fasta_dict_forMutectFilter;
	       reference_genome_fasta_dict_forMutectBcftools;
	       reference_genome_fasta_dict_forControlFreecSamtoolsMpileup;
	       reference_genome_fasta_dict_forControlFreecCalling;
	       reference_genome_fasta_dict_forManta;
	       reference_genome_fasta_dict_forStrelka;
	       reference_genome_fasta_dict_forStrelkaBcftools;
	       reference_genome_fasta_dict_forSvaba;
	       reference_genome_fasta_dict_forSvabaBcftools;
	       reference_genome_fasta_dict_forDelly;
	       reference_genome_fasta_dict_forIgCaller;
	       reference_genome_fasta_dict_forConsensusSnvMpileup;
	       reference_genome_fasta_dict_forConsensusIndelMpileup;
	       reference_genome_fasta_dict_forAnnotation }

Channel
	.fromPath( 'references/hg38/wgs_calling_regions.hg38.bed' )
	.into{ gatk_bundle_wgs_bed_forVarscanSamtoolsMpileup;
	       gatk_bundle_wgs_bed_forMutectCalling;
	       gatk_bundle_wgs_bed_forMutectPileup;
	       gatk_bundle_wgs_bed_forControlFreecSamtoolsMpileup;
	       gatk_bundle_wgs_bed_forManta;
	       gatk_bundle_wgs_bed_forStrelka }

Channel
	.fromPath( 'references/hg38/wgs_calling_regions_blacklist.0based.hg38.bed' )
	.into{ gatk_bundle_wgs_bed_blacklist_0based_forDelly;
	       gatk_bundle_wgs_bed_blacklist_0based_forSvaba }

Channel
	.fromList( ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6',
	            'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12',
	            'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18',
	            'chr19', 'chr20', 'chr21', 'chr22', 'chrX', 'chrY',] )
	.into{ chromosome_list_forVarscanSamtoolsMpileup;
	       chromosome_list_forMutectCalling;
	       chromosome_list_forMutectPileup;
	       chromosome_list_forControlFreecSamtoolsMpileup;
	       chromosome_list_forControlFreecMerge;
	       chromosome_list_forSclustBamprocess }

Channel
	.fromPath( 'references/hg38/sex_identification_loci.chrY.hg38.txt' )
	.set{ sex_identification_loci }

Channel
	.fromPath( 'references/hg38/cytoband_autosome_sex_chroms.hg38.bed' )
	.set{ cytoband_bed }

Channel
	.fromPath( 'references/hg38/1000g_pon.hg38.vcf.gz' )
	.set{ panel_of_normals_1000G }

Channel
	.fromPath( 'references/hg38/1000g_pon.hg38.vcf.gz.tbi' )
	.set{ panel_of_normals_1000G_index }

Channel
	.fromPath( 'references/hg38/af-only-gnomad.chr1-9.hg38.vcf.gz' )
	.set{ gnomad_ref_vcf_chromosomes1_9 }

Channel
	.fromPath( 'references/hg38/af-only-gnomad.chr1-9.hg38.vcf.gz.tbi' )
	.set{ gnomad_ref_vcf_chromosomes1_9_index }

Channel
	.fromPath( 'references/hg38/af-only-gnomad.chr10-22.hg38.vcf.gz' )
	.set{ gnomad_ref_vcf_chromosomes10_22 }

Channel
	.fromPath( 'references/hg38/af-only-gnomad.chr10-22.hg38.vcf.gz.tbi' )
	.set{ gnomad_ref_vcf_chromosomes10_22_index }

Channel
	.fromPath( 'references/hg38/af-only-gnomad.chrXYM-alts.hg38.vcf.gz' )
	.set{ gnomad_ref_vcf_chromosomesXYM_alts }

Channel
	.fromPath( 'references/hg38/af-only-gnomad.chrXYM-alts.hg38.vcf.gz.tbi' )
	.set{ gnomad_ref_vcf_chromosomesXYM_alts_index }

if( params.mutect == "on" && params.mutect_ref_vcf_concatenated == "yes" ) {
	Channel
		.fromPath( 'references/hg38/af-only-gnomad.hg38.vcf.gz', checkIfExists: true )
		.ifEmpty{ error "The run command issued has the '--mutect_ref_vcf_concatenated' parameter set to 'yes', however the file does not exist. Please set the '--mutect_ref_vcf_concatenated' parameter to 'no' and resubmit the run command. For more information, check the README or issue the command 'nextflow run somatic.nf --help'"}
		.set{ mutect_gnomad_ref_vcf_preBuilt }

	Channel
		.fromPath( 'references/hg38/af-only-gnomad.hg38.vcf.gz.tbi', checkIfExists: true )
		.ifEmpty{ error "The '--mutect_ref_vcf_concatenated' parameter set to 'yes', however the index file does not exist for the reference VCF. Please set the '--mutect_ref_vcf_concatenated' parameter to 'no' and resubmit the run command. Alternatively, use Tabix to index the reference VCF."}
		.set{ mutect_gnomad_ref_vcf_index_preBuilt }
}

Channel
	.fromPath( 'references/hg38/small_exac_common_3.hg38.vcf.gz' )
	.set{ exac_common_sites_ref_vcf }

Channel
	.fromPath( 'references/hg38/small_exac_common_3.hg38.vcf.gz.tbi' )
	.set{ exac_common_sites_ref_vcf_index }

if( params.battenberg_ref_cached == "yes" ) {
	Channel
		.fromPath( 'references/hg38/battenberg_reference/', checkIfExists: true )
		.ifEmpty{ error "The run command issued has the '--battenberg_ref_cached' parameter set to 'yes', however the directory does not exist. Please set the '--battenberg_ref_cached' parameter to 'no' and resubmit the run command. For more information, check the README or issue the command 'nextflow run somatic.nf --help'"}
		.set{ battenberg_ref_dir_preDownloaded }
}

Channel
	.fromPath( 'references/hg38/SnpGcCorrections.hg38.tsv' )
	.set{ snp_gc_corrections }

Channel
	.fromPath( 'references/hg38/Homo_sapiens_assembly38_autosome_sex_chroms', type: 'dir' )
	.set{ autosome_sex_chromosome_fasta_dir }

Channel
	.fromPath( 'references/hg38/Homo_sapiens_assembly38_autosome_sex_chrom_sizes.txt' )
	.set{ autosome_sex_chromosome_sizes_forControlFreec }

Channel
    .fromPath( 'references/hg38' )
    .into{ fragcounter_gc_mappability_dir_forFragCounterNormal;
           fragcounter_gc_mappability_dir_forFragCounterTumor }

Channel
	.fromPath( 'references/hg38/Hapmap_3.3.hg38.vcf.gz' )
	.set{ hapmap_ref_snps_vcf_forFragCounter }

Channel
	.fromPath( 'references/hg38/Hapmap_3.3.hg38.vcf.gz.tbi' )
	.set{ hapmap_ref_snps_vcf_index_forFragCounter }

Channel
	.fromPath( 'references/hg38/common_all_20180418.vcf.gz' )
	.into{ common_dbsnp_ref_vcf_forControlFreec;
	       common_dbsnp_ref_vcf_forFacets }

Channel
	.fromPath( 'references/hg38/common_all_20180418.vcf.gz.tbi' )
	.into{ common_dbsnp_ref_vcf_index_forControlFreec;
	       common_dbsnp_ref_vcf_index_forFacets }

Channel
	.fromPath( 'references/hg38/mappability_track_85m2.hg38.zip' )
	.set{ mappability_track_85kmer_zip }

Channel
	.fromPath( 'references/hg38/mappability_track_100m2.hg38.zip' )
	.set{ mappability_track_100kmer_zip }

Channel
	.fromPath( 'references/hg38/mappability_track_150m2.hg38.zip' )
	.set{ mappability_track_150kmer_zip }

Channel
	.fromPath( ['references/hg38/Homo_sapiens_assembly38.fasta', 'references/hg38/Homo_sapiens_assembly38.fasta.fai',
	            'references/hg38/Homo_sapiens_assembly38.fasta.64.alt', 'references/hg38/Homo_sapiens_assembly38.fasta.64.amb',
	            'references/hg38/Homo_sapiens_assembly38.fasta.64.ann', 'references/hg38/Homo_sapiens_assembly38.fasta.64.bwt',
	            'references/hg38/Homo_sapiens_assembly38.fasta.64.pac', 'references/hg38/Homo_sapiens_assembly38.fasta.64.sa'] )
	.set{ bwa_ref_genome_files }

Channel
	.fromPath( 'references/hg38/Homo_sapiens_assembly38.dbsnp138.vcf.gz' )
	.set{ dbsnp_known_indel_ref_vcf }

Channel
	.fromPath( 'references/hg38/Homo_sapiens_assembly38.dbsnp138.vcf.gz.tbi' )
	.set{ dbsnp_known_indel_ref_vcf_index }

Channel
	.fromPath( 'references/hg38/simple_and_centromeric_repeats.hg38.bed' )
	.into{ simple_and_centromeric_repeats_bed_forSvaba;
		   simple_and_centromeric_repeats_bed_forSnvBedFilter;
	       simple_and_centromeric_repeats_bed_forIndelBedFilter }

if( params.annotsv_ref_cached == "yes" ) {
     Channel
          .fromPath( 'references/hg38/annotations_human_annotsv_hg38/', type: 'dir', checkIfExists: true )
          .ifEmpty{ error "The run command issued has the '--annotsv_ref_cached' parameter set to 'yes', however the directory does not exist. Please set the '--annotsv_ref_cached' parameter to 'no' and resubmit the run command. For more information, check the README or issue the command 'nextflow run somatic.nf --help'"}
          .into{ annotsv_ref_dir_pre_downloaded_forSvAnnotation;
                 annotsv_ref_dir_pre_downloaded_forCnvAnnotation }
}

if( params.vep_ref_cached == "yes" ) {
	Channel
		.fromPath( 'references/hg38/homo_sapiens_vep_101_GRCh38/', type: 'dir', checkIfExists: true )
		.ifEmpty{ error "The run command issued has the '--vep_ref_cached' parameter set to 'yes', however the directory does not exist. Please set the '--vep_ref_cached' parameter to 'no' and resubmit the run command. For more information, check the README or issue the command 'nextflow run somatic.nf --help'"}
		.set{ vep_ref_dir_preDownloaded }
}

// #################################################### \\
// ~~~~~~~~~~~~~~~~ PIPELINE PROCESSES ~~~~~~~~~~~~~~~~ \\

log.info ''
log.info '################################################'
log.info ''
log.info "           .------------------------.           "
log.info "          |    .-..-. .--. .---.     |          "
log.info "          |    : `' :: .--': .; :    |          "
log.info "          |    : .. :: : _ :  _.'    |          "
log.info "          |    : :; :: :; :: :       |          "
log.info "          |    :_;:_;`.__.':_;       |          "
log.info "          |   ,-. .--.  .--.  .--.   |          "
log.info "          | .'  :: ,. :: ,. :: ,. :  |          "
log.info "          |   : :: :: :: :: :: :: :  |          "
log.info "          |   : :: :; :: :; :: :; :  |          "
log.info "          |   :_;`.__.'`.__.'`.__.'  |          "
log.info "           .________________________.           "
log.info ''
log.info "                    SOMATIC                     "
log.info ''
log.info "~~~ Launch Time ~~~		${workflowTimestamp}"
log.info ''
log.info "~~~ Input Directory ~~~		${params.input_dir}"
log.info ''
log.info "~~~ Output Directory ~~~	${params.output_dir}"
log.info ''
log.info "~~~ Run Report File ~~~		nextflow_report.${params.run_id}.html"
log.info ''
log.info "~~~ Read Length ~~~		${params.controlfreec_read_length}"
log.info ''
log.info '################################################'
log.info ''

// Read user provided sample sheet to set the Tumor/Normal sample pairs
Channel
	.fromPath( params.sample_sheet )
	.splitCsv( header:true )
	.map{ row -> tumor_bam = "${row.tumor}"
				 tumor_bam_index = "${row.tumor}".replaceFirst(/\.bam$/, "")
	             normal_bam = "${row.normal}"
	             normal_bam_index = "${row.normal}".replaceFirst(/\.bam$/, "")
	             return[ file("${params.input_dir}/${tumor_bam}"),
	             		 file("${params.input_dir}/${tumor_bam_index}*.bai"),
	             		 file("${params.input_dir}/${normal_bam}"),
	             		 file("${params.input_dir}/${normal_bam_index}*.bai") ] }
	.into{ tumor_normal_pair_forAlleleCount;
		   tumor_normal_pair_forTelomerecat;
		   tumor_normal_pair_forTelomereHunter;
		   tumor_normal_pair_forConpairPileup;
	       tumor_normal_pair_forVarscanSamtoolsMpileup;
	       tumor_normal_pair_forMutectCalling;
	       tumor_normal_pair_forMutectPileup;
	       tumor_normal_pair_forControlFreecSamtoolsMpileup;
	       tumor_normal_pair_forSclustBamprocess;
	       tumor_normal_pair_forManta;
	       tumor_normal_pair_forSvaba;
	       tumor_normal_pair_forDelly;
	       tumor_normal_pair_forIgCaller }

// Combine reference FASTA index and sex identification loci files into one channel for use in alleleCount process
reference_genome_fasta_index_forAlleleCount.combine( sex_identification_loci )
	.set{ ref_index_and_sex_ident_loci }

// alleleCount ~ determine the sex of each sample to use in downstream analyses
// Runs alleleCounter on the normal BAM at the sex identification loci, then passes the resulting counts
// to sample_sex_determinator.sh, whose stdout becomes the `<pair id>.sexident.txt` file.
// The process also re-emits the staged tumor/normal BAMs and indexes, keyed by the pair id, to the
// downstream channels listed in the output section.
process identifySampleSex_allelecount {
	// Only output files matching *.txt are copied into the sexOfSamples results directory
	publishDir "${params.output_dir}/somatic/sexOfSamples", mode: 'copy', pattern: '*.{txt}'
	tag "${tumor_normal_sample_id}"

	input:
	tuple path(tumor_bam), path(tumor_bam_index), path(normal_bam), path(normal_bam_index), path(reference_genome_fasta_index_forAlleleCount), path(sex_identification_loci) from tumor_normal_pair_forAlleleCount.combine(ref_index_and_sex_ident_loci)

	output:
	tuple val(tumor_normal_sample_id), path(tumor_bam), path(tumor_bam_index), path(normal_bam), path(normal_bam_index) into bams_forVarscanBamReadcount
	tuple val(tumor_normal_sample_id), path(sample_sex) into sex_of_sample_forControlFreecCalling
	tuple val(tumor_normal_sample_id), path(tumor_bam), path(tumor_bam_index) into tumor_bams_forFragCounter
	tuple val(tumor_normal_sample_id), path(normal_bam), path(normal_bam_index) into normal_bams_forFragCounter
	tuple val(tumor_normal_sample_id), path(tumor_bam), path(tumor_bam_index), path(normal_bam), path(normal_bam_index) into bams_forFragCounterPileup
	tuple val(tumor_normal_sample_id), path(tumor_bam), path(tumor_bam_index), path(normal_bam), path(normal_bam_index), path(sample_sex) into bams_and_sex_of_sample_forBattenberg
	tuple val(tumor_normal_sample_id), path(tumor_bam), path(tumor_bam_index), path(normal_bam), path(normal_bam_index) into bams_forFacetsPileup
	tuple val(tumor_normal_sample_id), path(tumor_bam), path(tumor_bam_index), path(normal_bam), path(normal_bam_index) into bams_forConsensusSnvMpileup
	tuple val(tumor_normal_sample_id), path(tumor_bam), path(tumor_bam_index), path(normal_bam), path(normal_bam_index) into bams_forConsensusIndelMpileup
	tuple val(tumor_normal_sample_id), path(sample_sex) into sex_of_sample_forConsensusCnvTransform
	tuple val(tumor_normal_sample_id), path(tumor_bam), path(tumor_bam_index) into bam_forConsensusSvFpFilter
	tuple val(tumor_normal_sample_id), path(sample_sex) into allelecount_output_forConsensusMetadata

	script:
	// Sample ids are the BAM file names truncated at the first dot
	tumor_id = "${tumor_bam.baseName}".replaceFirst(/\..*$/, "")
	normal_id = "${normal_bam.baseName}".replaceFirst(/\..*$/, "")
	tumor_normal_sample_id = "${tumor_id}_vs_${normal_id}"
	// Intermediate allele counts file; it is not declared as an output
	sex_loci_allele_counts = "${tumor_normal_sample_id}.sexloci.txt"
	sample_sex = "${tumor_normal_sample_id}.sexident.txt"
	"""
	alleleCounter \
	--loci-file "${sex_identification_loci}" \
	--hts-file "${normal_bam}" \
	--ref-file "${reference_genome_fasta_index_forAlleleCount}" \
	--output-file "${sex_loci_allele_counts}"

	sample_sex_determinator.sh "${sex_loci_allele_counts}" > "${sample_sex}"
	"""
}

// Telomerecat bam2length ~ estimate the average telomere length
// Runs `telomerecat bam2length` once on the tumor BAM and once on the normal BAM, writing one CSV per sample.
// The two CSVs are published to the telomerecat results directory and are not sent to any downstream channel.
// Only executed when params.telomerecat is "on".
process telomereLengthEstimation_telomerecat {
    publishDir "${params.output_dir}/somatic/telomerecat", mode: 'copy', pattern: '*.{csv}'
    tag "${tumor_normal_sample_id}"

    input:
    tuple path(tumor_bam), path(tumor_bam_index), path(normal_bam), path(normal_bam_index) from tumor_normal_pair_forTelomerecat

    output:
    path normal_telomere_estimates
    path tumor_telomere_estimates

    when:
    params.telomerecat == "on"

    script:
    // Sample ids are the BAM file names truncated at the first dot
    tumor_id = "${tumor_bam.baseName}".replaceFirst(/\..*$/, "")
    normal_id = "${normal_bam.baseName}".replaceFirst(/\..*$/, "")
    tumor_normal_sample_id = "${tumor_id}_vs_${normal_id}"
    // Output CSVs are named per individual sample, not per tumor/normal pair
    normal_telomere_estimates = "${normal_id}.telomerecat.csv"
    tumor_telomere_estimates = "${tumor_id}.telomerecat.csv"
    """
    telomerecat bam2length \
    -p ${task.cpus} \
    -v 1 \
    --output "${tumor_telomere_estimates}" \
    "${tumor_bam}"

    telomerecat bam2length \
    -p ${task.cpus} \
    -v 1 \
    --output "${normal_telomere_estimates}" \
    "${normal_bam}"
    """
}

// TelomereHunter ~ estimate telomere content and composition
// Runs telomerehunter on the tumor BAM with the normal BAM as control, using the cytoband BED as banding file.
// The pair id is passed as --pid and the output declarations collect results from a directory of that name
// in the task work directory. Everything declared as output is published; nothing is sent downstream.
// Only executed when params.telomerehunter is "on".
process telomereEstimation_telomerehunter {
	publishDir "${params.output_dir}/somatic/telomereHunter", mode: 'copy'
	tag "${tumor_normal_sample_id}"

	input:
	tuple path(tumor_bam), path(tumor_bam_index), path(normal_bam), path(normal_bam_index), path(cytoband_bed) from tumor_normal_pair_forTelomereHunter.combine(cytoband_bed)

	output:
	path "${tumor_normal_sample_id}/*.tsv"
	path "${tumor_normal_sample_id}/*.png"
	path "${tumor_normal_sample_id}/control_TelomerCnt_${tumor_normal_sample_id}/*.tsv"
	path "${tumor_normal_sample_id}/control_TelomerCnt_${tumor_normal_sample_id}/TVRs"
	path "${tumor_normal_sample_id}/tumor_TelomerCnt_${tumor_normal_sample_id}/*.tsv"
	path "${tumor_normal_sample_id}/tumor_TelomerCnt_${tumor_normal_sample_id}/TVRs"
	path "${tumor_normal_sample_id}/plots"

	when:
	params.telomerehunter == "on"

	script:
	// Sample ids are the BAM file names truncated at the first dot
	tumor_id = "${tumor_bam.baseName}".replaceFirst(/\..*$/, "")
	normal_id = "${normal_bam.baseName}".replaceFirst(/\..*$/, "")
	tumor_normal_sample_id = "${tumor_id}_vs_${normal_id}"
	"""
	telomerehunter \
	--inputBamTumor "${tumor_bam}" \
	--inputBamControl "${normal_bam}" \
	--outPath . \
	--pid "${tumor_normal_sample_id}" \
	--bandingFile "${cytoband_bed}" \
	--parallel \
	--plotFileFormat png
	"""
}

// ~~~~~~~~~~~~~~~~ Conpair ~~~~~~~~~~~~~~~~ \\
// START

// Bundle the reference FASTA, its index, and its sequence dictionary into one channel for the Conpair Pileup process
reference_genome_fasta_forConpairPileup
	.combine( reference_genome_fasta_index_forConpairPileup )
	.combine( reference_genome_fasta_dict_forConpairPileup )
	.set{ reference_genome_bundle_forConpairPileup }

// Conpair run_gatk_pileup_for_sample ~ generate GATK pileups for the tumor and normal BAMs separately
// Runs Conpair's run_gatk_pileup_for_sample.py once per BAM against the marker BED file shipped under
// ${CONPAIR_DIR}, giving the JVM the task's memory allocation (in GB). Emits the pair id with the tumor
// and normal pileup files. Only executed when params.conpair is "on".
process bamPileupForConpair_conpair {
	tag "${tumor_normal_sample_id}"

	input:
	tuple path(tumor_bam), path(tumor_bam_index), path(normal_bam), path(normal_bam_index), path(reference_genome_fasta_forConpairPileup), path(reference_genome_fasta_index_forConpairPileup), path(reference_genome_fasta_dict_forConpairPileup) from tumor_normal_pair_forConpairPileup.combine(reference_genome_bundle_forConpairPileup)

	output:
	tuple val(tumor_normal_sample_id), path(tumor_pileup), path(normal_pileup) into bam_pileups_forConpair

	when:
	params.conpair == "on"

	script:
	// Sample ids are the BAM file names truncated at the first dot
	tumor_id = "${tumor_bam.baseName}".replaceFirst(/\..*$/, "")
	normal_id = "${normal_bam.baseName}".replaceFirst(/\..*$/, "")
	tumor_normal_sample_id = "${tumor_id}_vs_${normal_id}"
	tumor_pileup = "${tumor_id}.pileup"
	normal_pileup = "${normal_id}.pileup"
	// Marker file path relative to ${CONPAIR_DIR}, without extension; the pileup step uses the .bed version
	hg38_ref_genome_markers = "/data/markers/GRCh38.autosomes.phase3_shapeit2_mvncall_integrated.20130502.SNV.genotype.sselect_v4_MAF_0.4_LD_0.8.liftover"
	// The memory option is spelled out in full (--xmx_java) rather than relying on the option parser
	// accepting a truncated prefix of the option name
	"""
	\${CONPAIR_DIR}/scripts/run_gatk_pileup_for_sample.py \
	--xmx_java "${task.memory.toGiga()}g" \
	--bam "${tumor_bam}" \
	--outfile "${tumor_pileup}" \
	--reference "${reference_genome_fasta_forConpairPileup}" \
	--markers \${CONPAIR_DIR}"${hg38_ref_genome_markers}.bed"

	\${CONPAIR_DIR}/scripts/run_gatk_pileup_for_sample.py \
	--xmx_java "${task.memory.toGiga()}g" \
	--bam "${normal_bam}" \
	--outfile "${normal_pileup}" \
	--reference "${reference_genome_fasta_forConpairPileup}" \
	--markers \${CONPAIR_DIR}"${hg38_ref_genome_markers}.bed"
	"""
}

// Conpair verify_concordance / estimate_tumor_normal_contamination ~ concordance and contamination estimator for tumor–normal pileups
// Runs both Conpair scripts on the tumor and normal pileups of one pair, writing one concordance and one
// contamination text file. Both files are published to the conpair results directory and emitted with the
// pair id for the consensus metadata step. Only executed when params.conpair is "on".
process concordanceAndContaminationEstimation_conpair {
	publishDir "${params.output_dir}/somatic/conpair", mode: 'copy', pattern: '*.{txt}'
	tag "${tumor_normal_sample_id}"

	input:
	tuple val(tumor_normal_sample_id), path(tumor_pileup), path(normal_pileup) from bam_pileups_forConpair

	output:
	tuple val(tumor_normal_sample_id), path(conpair_concordance_file), path(conpair_contamination_file) into conpair_output_forConsensusMetadata

	when:
	params.conpair == "on"

	script:
	conpair_concordance_file = "${tumor_normal_sample_id}.conpair.concordance.txt"
	conpair_contamination_file = "${tumor_normal_sample_id}.conpair.contamination.txt"
	// Marker file path relative to ${CONPAIR_DIR}, without extension; both scripts below use the .txt version
	hg38_ref_genome_markers = "/data/markers/GRCh38.autosomes.phase3_shapeit2_mvncall_integrated.20130502.SNV.genotype.sselect_v4_MAF_0.4_LD_0.8.liftover"
	"""
	\${CONPAIR_DIR}/scripts/verify_concordance.py \
	--min_cov ${params.conpair_min_cov} \
	--min_mapping_quality 10 \
	--min_base_quality 20 \
	--tumor_pileup "${tumor_pileup}" \
	--normal_pileup "${normal_pileup}" \
	--outfile "${conpair_concordance_file}" \
	--markers \${CONPAIR_DIR}"${hg38_ref_genome_markers}.txt"

	\${CONPAIR_DIR}/scripts/estimate_tumor_normal_contamination.py \
	--grid 0.01 \
	--min_mapping_quality 10 \
	--tumor_pileup "${tumor_pileup}" \
	--normal_pileup "${normal_pileup}" \
	--outfile "${conpair_contamination_file}" \
	--markers \${CONPAIR_DIR}"${hg38_ref_genome_markers}.txt"
	"""
}

// END
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ \\


// ~~~~~~~~~~~~~~~~ VarScan2 ~~~~~~~~~~~~~~~~ \\
// START

// Bundle the reference FASTA, its index, and its sequence dictionary for VarScan / SAMtools mpileup ...
reference_genome_fasta_forVarscanSamtoolsMpileup
	.combine( reference_genome_fasta_index_forVarscanSamtoolsMpileup )
	.combine( reference_genome_fasta_dict_forVarscanSamtoolsMpileup )
	.set{ reference_genome_bundle_forVarscanSamtoolsMpileup }

// ... then append the WGS BED file so the process receives references and target regions in one channel
reference_genome_bundle_forVarscanSamtoolsMpileup
	.combine( gatk_bundle_wgs_bed_forVarscanSamtoolsMpileup )
	.set{ reference_genome_bundle_and_bed_forVarscanSamtoolsMpileup }

// VarScan somatic / SAMtools mpileup ~ heuristic/statistic approach to call SNV and indel variants
// Runs once per tumor/normal pair and per chromosome. A samtools mpileup of the normal and tumor BAMs
// (in that order), restricted to the WGS BED positions on the given chromosome, is piped into
// `VarScan somatic`. The SNV and indel VCFs are then bgzip-compressed and tabix-indexed.
// Only executed when params.varscan is "on".
process snvAndIndelCalling_varscan {
	tag "${tumor_normal_sample_id} C=${chromosome}"

	input:
	tuple path(tumor_bam), path(tumor_bam_index), path(normal_bam), path(normal_bam_index), path(reference_genome_fasta_forVarscanSamtoolsMpileup), path(reference_genome_fasta_index_forVarscanSamtoolsMpileup), path(reference_genome_fasta_dict_forVarscanSamtoolsMpileup), path(gatk_bundle_wgs_bed_forVarscanSamtoolsMpileup) from tumor_normal_pair_forVarscanSamtoolsMpileup.combine(reference_genome_bundle_and_bed_forVarscanSamtoolsMpileup)
	each chromosome from chromosome_list_forVarscanSamtoolsMpileup

	output:
	tuple val(tumor_normal_sample_id), path(raw_per_chromosome_snv_vcf), path(raw_per_chromosome_snv_vcf_index), path(raw_per_chromosome_indel_vcf), path(raw_per_chromosome_indel_vcf_index) into raw_per_chromosome_vcfs_forVarscanBcftools

	when:
	params.varscan == "on"

	script:
	// Sample ids are the BAM file names truncated at the first dot
	tumor_id = "${tumor_bam.baseName}".replaceFirst(/\..*$/, "")
	normal_id = "${normal_bam.baseName}".replaceFirst(/\..*$/, "")
	tumor_normal_sample_id = "${tumor_id}_vs_${normal_id}"
	raw_per_chromosome_snv_vcf = "${tumor_normal_sample_id}.${chromosome}.snv.vcf.gz"
	raw_per_chromosome_snv_vcf_index = "${raw_per_chromosome_snv_vcf}.tbi"
	raw_per_chromosome_indel_vcf = "${tumor_normal_sample_id}.${chromosome}.indel.vcf.gz"
	raw_per_chromosome_indel_vcf_index = "${raw_per_chromosome_indel_vcf}.tbi"
	// The bgzip steps read `<prefix>.snv.vcf` / `<prefix>.indel.vcf`, i.e. VarScan is expected to append
	// `.vcf` to the --output-snp / --output-indel prefixes when --output-vcf is set
	"""
	samtools mpileup \
	--no-BAQ \
	--min-MQ 1 \
	--positions "${gatk_bundle_wgs_bed_forVarscanSamtoolsMpileup}" \
	--region "${chromosome}" \
	--fasta-ref "${reference_genome_fasta_forVarscanSamtoolsMpileup}" \
	"${normal_bam}" "${tumor_bam}" \
	| \
	java -Xmx2G -XX:ParallelGCThreads=2 -jar \${VARSCAN} somatic \
	--mpileup 1 \
	--min-coverage-normal 8 \
	--min-coverage-tumor 6 \
	--min-var-freq 0.10 \
	--min-freq-for-hom 0.75 \
	--normal-purity 1.00 \
	--tumor-purity 1.00 \
	--p-value 0.99 \
	--somatic-p-value 0.05 \
	--strand-filter 0 \
	--output-vcf \
	--output-snp "${tumor_normal_sample_id}.${chromosome}.snv" \
	--output-indel "${tumor_normal_sample_id}.${chromosome}.indel"

	bgzip < "${tumor_normal_sample_id}.${chromosome}.snv.vcf" > "${raw_per_chromosome_snv_vcf}"
	tabix "${raw_per_chromosome_snv_vcf}"

	bgzip < "${tumor_normal_sample_id}.${chromosome}.indel.vcf" > "${raw_per_chromosome_indel_vcf}"
	tabix "${raw_per_chromosome_indel_vcf}"
	"""
}

// BCFtools concat ~ concatenate all VarScan SNV/indel per chromosome VCFs
// groupTuple() gathers every per-chromosome VCF and index of a pair under its pair id; two `bcftools concat`
// calls then build one bgzipped SNV VCF and one bgzipped indel VCF per pair.
// The input files are listed by explicit name in the order chr1..chr22, chrX, chrY.
// NOTE(review): this assumes the chromosome list used for calling is exactly these 24 names — confirm
// against chromosome_list_forVarscanSamtoolsMpileup.
// Only executed when params.varscan is "on".
process concatenateVarscanPerChromosomeVcfs_bcftools {
	tag "${tumor_normal_sample_id}"

	input:
	tuple val(tumor_normal_sample_id), path(raw_per_chromosome_snv_vcf), path(raw_per_chromosome_snv_vcf_index), path(raw_per_chromosome_indel_vcf), path(raw_per_chromosome_indel_vcf_index) from raw_per_chromosome_vcfs_forVarscanBcftools.groupTuple()

	output:
	tuple val(tumor_normal_sample_id), path(raw_snv_vcf), path(raw_indel_vcf) into raw_vcfs_forVarscanHcFilter

	when:
	params.varscan == "on"

	script:
	raw_snv_vcf = "${tumor_normal_sample_id}.snv.vcf.gz"
	raw_indel_vcf = "${tumor_normal_sample_id}.indel.vcf.gz"
	"""
	bcftools concat \
	--threads ${task.cpus} \
	--output-type z \
	--output "${raw_snv_vcf}" \
	"${tumor_normal_sample_id}.chr1.snv.vcf.gz" \
	"${tumor_normal_sample_id}.chr2.snv.vcf.gz" \
	"${tumor_normal_sample_id}.chr3.snv.vcf.gz" \
	"${tumor_normal_sample_id}.chr4.snv.vcf.gz" \
	"${tumor_normal_sample_id}.chr5.snv.vcf.gz" \
	"${tumor_normal_sample_id}.chr6.snv.vcf.gz" \
	"${tumor_normal_sample_id}.chr7.snv.vcf.gz" \
	"${tumor_normal_sample_id}.chr8.snv.vcf.gz" \
	"${tumor_normal_sample_id}.chr9.snv.vcf.gz" \
	"${tumor_normal_sample_id}.chr10.snv.vcf.gz" \
	"${tumor_normal_sample_id}.chr11.snv.vcf.gz" \
	"${tumor_normal_sample_id}.chr12.snv.vcf.gz" \
	"${tumor_normal_sample_id}.chr13.snv.vcf.gz" \
	"${tumor_normal_sample_id}.chr14.snv.vcf.gz" \
	"${tumor_normal_sample_id}.chr15.snv.vcf.gz" \
	"${tumor_normal_sample_id}.chr16.snv.vcf.gz" \
	"${tumor_normal_sample_id}.chr17.snv.vcf.gz" \
	"${tumor_normal_sample_id}.chr18.snv.vcf.gz" \
	"${tumor_normal_sample_id}.chr19.snv.vcf.gz" \
	"${tumor_normal_sample_id}.chr20.snv.vcf.gz" \
	"${tumor_normal_sample_id}.chr21.snv.vcf.gz" \
	"${tumor_normal_sample_id}.chr22.snv.vcf.gz" \
	"${tumor_normal_sample_id}.chrX.snv.vcf.gz" \
	"${tumor_normal_sample_id}.chrY.snv.vcf.gz"

	bcftools concat \
	--threads ${task.cpus} \
	--output-type z \
	--output "${raw_indel_vcf}" \
	"${tumor_normal_sample_id}.chr1.indel.vcf.gz" \
	"${tumor_normal_sample_id}.chr2.indel.vcf.gz" \
	"${tumor_normal_sample_id}.chr3.indel.vcf.gz" \
	"${tumor_normal_sample_id}.chr4.indel.vcf.gz" \
	"${tumor_normal_sample_id}.chr5.indel.vcf.gz" \
	"${tumor_normal_sample_id}.chr6.indel.vcf.gz" \
	"${tumor_normal_sample_id}.chr7.indel.vcf.gz" \
	"${tumor_normal_sample_id}.chr8.indel.vcf.gz" \
	"${tumor_normal_sample_id}.chr9.indel.vcf.gz" \
	"${tumor_normal_sample_id}.chr10.indel.vcf.gz" \
	"${tumor_normal_sample_id}.chr11.indel.vcf.gz" \
	"${tumor_normal_sample_id}.chr12.indel.vcf.gz" \
	"${tumor_normal_sample_id}.chr13.indel.vcf.gz" \
	"${tumor_normal_sample_id}.chr14.indel.vcf.gz" \
	"${tumor_normal_sample_id}.chr15.indel.vcf.gz" \
	"${tumor_normal_sample_id}.chr16.indel.vcf.gz" \
	"${tumor_normal_sample_id}.chr17.indel.vcf.gz" \
	"${tumor_normal_sample_id}.chr18.indel.vcf.gz" \
	"${tumor_normal_sample_id}.chr19.indel.vcf.gz" \
	"${tumor_normal_sample_id}.chr20.indel.vcf.gz" \
	"${tumor_normal_sample_id}.chr21.indel.vcf.gz" \
	"${tumor_normal_sample_id}.chr22.indel.vcf.gz" \
	"${tumor_normal_sample_id}.chrX.indel.vcf.gz" \
	"${tumor_normal_sample_id}.chrY.indel.vcf.gz"
	"""
}

// VarScan processSomatic ~ filter the called SNVs and indels for confidence and somatic status assignment
// Streams each raw VCF (SNV, then indel) into `VarScan processSomatic`, then bgzip-compresses and
// tabix-indexes the high-confidence somatic calls. The result is emitted to two channels: one for
// bam-readcount and one for the false positive filter. Only executed when params.varscan is "on".
process filterRawSnvAndIndels_varscan {
	tag "${tumor_normal_sample_id}"

	input:
	tuple val(tumor_normal_sample_id), path(raw_snv_vcf), path(raw_indel_vcf) from raw_vcfs_forVarscanHcFilter

	output:
	tuple val(tumor_normal_sample_id), path(high_confidence_snv_vcf), path(high_confidence_snv_vcf_index), path(high_confidence_indel_vcf), path(high_confidence_indel_vcf_index) into high_confidence_vcfs_forVarscanBamReadcount, high_confidence_vcfs_forVarscanFpFilter

	when:
	params.varscan == "on"

	script:
	// Output names are the raw VCF names with `.vcf.gz` replaced by `.somatic.hc.vcf.gz`
	high_confidence_snv_vcf = "${raw_snv_vcf}".replaceFirst(/\.vcf\.gz/, ".somatic.hc.vcf.gz")
	high_confidence_snv_vcf_index = "${high_confidence_snv_vcf}.tbi"
	high_confidence_indel_vcf = "${raw_indel_vcf}".replaceFirst(/\.vcf\.gz/, ".somatic.hc.vcf.gz")
	high_confidence_indel_vcf_index = "${high_confidence_indel_vcf}.tbi"
	// The bgzip steps read `<pair id>.snv.Somatic.hc` / `<pair id>.indel.Somatic.hc`, i.e. processSomatic is
	// expected to derive its output file names from the positional name argument given to it
	"""
	zcat "${raw_snv_vcf}" \
	| \
	java -jar \${VARSCAN} processSomatic \
	"${tumor_normal_sample_id}.snv" \
	--min-tumor-freq 0.10 \
	--max-normal-freq 0.05 \
	--p-value 0.07

	bgzip < "${tumor_normal_sample_id}.snv.Somatic.hc" > "${high_confidence_snv_vcf}"
	tabix "${high_confidence_snv_vcf}"

	zcat "${raw_indel_vcf}" \
	| \
	java -jar \${VARSCAN} processSomatic \
	"${tumor_normal_sample_id}.indel" \
	--min-tumor-freq 0.10 \
	--max-normal-freq 0.05 \
	--p-value 0.07

	bgzip < "${tumor_normal_sample_id}.indel.Somatic.hc" > "${high_confidence_indel_vcf}"
	tabix "${high_confidence_indel_vcf}"
	"""
}

// Bundle the reference FASTA, its index, and its sequence dictionary into one channel for the bam-readcount process
reference_genome_fasta_forVarscanBamReadcount
	.combine( reference_genome_fasta_index_forVarscanBamReadcount )
	.combine( reference_genome_fasta_dict_forVarscanBamReadcount )
	.set{ reference_genome_bundle_forVarscanBamReadcount }

// bam-readcount / BCFtools concat ~ generate metrics at single nucleotide positions for filtering out false positive calls
// The BAMs of a pair are joined with that pair's high-confidence VCFs and combined with the reference bundle.
// The SNV and indel VCFs are merged into one indexed VCF, which bam_readcount_helper.py processes against
// the tumor BAM; the helper's TUMOR_* readcount tables are then renamed to carry the pair id.
// Only executed when params.varscan is "on".
process bamReadcountForVarscanFpFilter_bamreadcount {
	tag "${tumor_normal_sample_id}"

	input:
	tuple val(tumor_normal_sample_id), path(tumor_bam), path(tumor_bam_index), path(normal_bam), path(normal_bam_index), path(high_confidence_snv_vcf), path(high_confidence_snv_vcf_index), path(high_confidence_indel_vcf), path(high_confidence_indel_vcf_index), path(reference_genome_fasta_forVarscanBamReadcount), path(reference_genome_fasta_index_forVarscanBamReadcount), path(reference_genome_fasta_dict_forVarscanBamReadcount) from bams_forVarscanBamReadcount.join(high_confidence_vcfs_forVarscanBamReadcount).combine(reference_genome_bundle_forVarscanBamReadcount)

	output:
	tuple val(tumor_normal_sample_id), path(snv_readcount_file), path(indel_readcount_file) into readcount_forVarscanFpFilter

	when:
	params.varscan == "on"

	script:
	snv_readcount_file = "${tumor_normal_sample_id}_bam_readcount_snv.tsv"
	indel_readcount_file = "${tumor_normal_sample_id}_bam_readcount_indel.tsv"
	// The helper is called with the sample label TUMOR and the current directory as output location;
	// the `mv` steps expect it to write TUMOR_bam_readcount_{snv,indel}.tsv there
	"""
	bcftools concat \
	--threads ${task.cpus} \
	--allow-overlaps \
	--output-type z \
	--output "${tumor_normal_sample_id}.somatic.hc.vcf.gz" \
	"${high_confidence_snv_vcf}" "${high_confidence_indel_vcf}"

	tabix "${tumor_normal_sample_id}.somatic.hc.vcf.gz"

	bam_readcount_helper.py \
	"${tumor_normal_sample_id}.somatic.hc.vcf.gz" \
	TUMOR \
	"${reference_genome_fasta_forVarscanBamReadcount}" \
	"${tumor_bam}" \
	.

	mv TUMOR_bam_readcount_snv.tsv "${snv_readcount_file}"
	mv TUMOR_bam_readcount_indel.tsv "${indel_readcount_file}"
	"""
}

// VarScan fpfilter ~ filter out additional false positive variants
process falsePositivefilterSnvAndIndels_varscan {
	tag "${tumor_normal_sample_id}"

	input:
	tuple val(tumor_normal_sample_id), path(high_confidence_snv_vcf), path(high_confidence_snv_vcf_index), path(high_confidence_indel_vcf), path(high_confidence_indel_vcf_index), path(snv_readcount_file), path(indel_readcount_file) from high_confidence_vcfs_forVarscanFpFilter.join(readcount_forVarscanFpFilter)

	output:
	tuple val(tumor_normal_sample_id), path(fp_filtered_snv_vcf), path(fp_filtered_indel_vcf) into filtered_vcfs_forVarscanBcftools

	when:
	params.varscan == "on"

	script:
	unzipped_hc_snv_vcf = "${high_confidence_snv_vcf}".replaceFirst(/\.gz/, "")
	unzipped_hc_indel_vcf = "${high_confidence_indel_vcf}".replaceFirst(/\.gz/, "")
	fp_filtered_snv_vcf = "${unzipped_hc_snv_vcf}".replaceFirst(/\.hc\.vcf/, ".filtered.vcf")
	fp_filtered_indel_vcf = "${unzipped_hc_indel_vcf}".replaceFirst(/\.hc\.vcf/, ".filtered.vcf")
	"""
	gunzip -f "${high_confidence_snv_vcf}"

	java -jar \$VARSCAN fpfilter \
	"${unzipped_hc_snv_vcf}" \
	"${snv_readcount_file}" \
	--filtered-file "${tumor_normal_sample_id}.snv.failed.vcf" \
	--min-var-count 2 \
	--min-var-freq 0.01 \
	--min-ref-basequal 25 \
	--min-var-basequal 25 \
	--output-file "${fp_filtered_snv_vcf}"

	gunzip -f "${high_confidence_indel_vcf}"

	java -jar \$VARSCAN fpfilter \
	"${unzipped_hc_indel_vcf}" \
	"${indel_readcount_file}" \
	--filtered-file "${tumor_normal_sample_id}.indel.failed.vcf" \