@@ -18,8 +18,12 @@ workflow TrainGCNV {
1818 File reference_index # Index (.fai), must be in same dir as fasta
1919 File reference_dict # Dictionary (.dict), must be in same dir as fasta
2020
21+ # Options for subsetting samples for training. Both options require providing sv_pipeline_base_docker
22+ # Assumes all other inputs correspond to the full sample list. Intended for Terra
2123 Int ? n_samples_subsample # Number of samples to subsample from provided sample list for trainGCNV (rec: ~100)
2224 Int subsample_seed = 42
25+ # Subset of full sample list on which to train the gCNV model. Overrides n_samples_subsample if both provided
26+ Array [String ]? sample_ids_training_subset
2327
2428 # Condense read counts
2529 Int ? condense_num_bins
@@ -85,7 +89,7 @@ workflow TrainGCNV {
8589 String linux_docker
8690 String gatk_docker
8791 String condense_counts_docker
88- String ? sv_pipeline_base_docker # required if using n_samples_subsample to select samples
92+ String ? sv_pipeline_base_docker # required if using n_samples_subsample or sample_ids_training_subset to subset samples
8993
9094 # Runtime configuration overrides
9195 RuntimeAttr ? condense_counts_runtime_attr
@@ -100,20 +104,31 @@ workflow TrainGCNV {
100104 RuntimeAttr ? runtime_attr_explode
101105 }
102106
103- if (defined (n_samples_subsample )) {
107+ if (defined (sample_ids_training_subset )) {
108+ call util .GetSubsampledIndices {
109+ input :
110+ all_strings = write_lines (samples ),
111+ subset_strings = write_lines (select_first ([sample_ids_training_subset ])),
112+ prefix = cohort ,
113+ sv_pipeline_base_docker = select_first ([sv_pipeline_base_docker ])
114+ }
115+ }
116+
117+ if (defined (n_samples_subsample ) && !defined (sample_ids_training_subset )) {
104118 call util .RandomSubsampleStringArray {
105119 input :
106- strings = samples ,
120+ strings = write_lines ( samples ) ,
107121 seed = subsample_seed ,
108122 subset_size = select_first ([n_samples_subsample ]),
109123 prefix = cohort ,
110124 sv_pipeline_base_docker = select_first ([sv_pipeline_base_docker ])
111125 }
112126 }
113127
114- Array [Int ] sample_indices = select_first ([RandomSubsampleStringArray .subsample_indices_array , range (length (samples ))])
128+ Array [Int ] sample_indices = select_first ([GetSubsampledIndices . subsample_indices_array , RandomSubsampleStringArray .subsample_indices_array , range (length (samples ))])
115129
116130 scatter (i in sample_indices ) {
131+ String sample_ids_ = samples [i ]
117132 call cov .CondenseReadCounts as CondenseReadCounts {
118133 input :
119134 counts = count_files [i ],
@@ -138,7 +153,7 @@ workflow TrainGCNV {
138153 preprocessed_intervals = CountsToIntervals .out ,
139154 filter_intervals = filter_intervals ,
140155 counts = CondenseReadCounts .out ,
141- count_entity_ids = select_first ([ RandomSubsampleStringArray . subsampled_strings_array , samples ]) ,
156+ count_entity_ids = sample_ids_ ,
142157 cohort_entity_id = cohort ,
143158 contig_ploidy_priors = contig_ploidy_priors ,
144159 num_intervals_per_scatter = num_intervals_per_scatter ,
0 commit comments