Skip to content

Commit cf54ad5

Browse files
Module04b: parametrize sample overlap and minimum var count outlier threshold for regeno filtering (#133)
1 parent 6c8d835 commit cf54ad5

File tree

2 files changed

+16
-4
lines changed

2 files changed

+16
-4
lines changed

wdl/CombineReassess.wdl

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ workflow CombineReassess {
88
File regeno_file
99
File regeno_sample_ids_lookup
1010
Array[File] vcfs
11+
Int min_var_per_sample_outlier_threshold
12+
Float regeno_sample_overlap
1113
String sv_pipeline_base_docker
1214
String sv_pipeline_docker
1315
RuntimeAttr? runtime_attr_vcf2bed
@@ -28,6 +30,8 @@ workflow CombineReassess {
2830
regeno_file = regeno_file,
2931
regeno_sample_ids_lookup = regeno_sample_ids_lookup,
3032
samplelist = samplelist,
33+
min_var_per_sample_outlier_threshold = min_var_per_sample_outlier_threshold,
34+
regeno_sample_overlap = regeno_sample_overlap,
3135
runtime_attr_override = runtime_attr_merge_list_creassess,
3236
sv_pipeline_base_docker = sv_pipeline_base_docker
3337
}
@@ -82,6 +86,8 @@ task MergeList {
8286
File regeno_file
8387
Array[File] nonempty_txt
8488
File regeno_sample_ids_lookup
89+
Int min_var_per_sample_outlier_threshold
90+
Float regeno_sample_overlap
8591
String sv_pipeline_base_docker
8692
RuntimeAttr? runtime_attr_override
8793
}
@@ -130,7 +136,9 @@ task MergeList {
130136
count(line)
131137
counts=np.array([int(dct[x]) for x in dct.keys()])
132138
def reject_outliers(data, m=3):
133-
return data[abs(data - np.mean(data)) > m * np.std(data)]
139+
deviation_threshold = m * np.std(data)
140+
data_mean = np.mean(data)
141+
return data[np.logical_and(abs(data - data_mean) > deviation_threshold, data > ~{min_var_per_sample_outlier_threshold})]
134142
outliers=reject_outliers(counts)
135143
outlier_samples=set([x for x in dct.keys() if dct[x] in outliers])
136144
with open("reassess_nonzero_overlap.txt",'w') as g, open("reassesss_by_var.txt",'r') as f:
@@ -146,7 +154,7 @@ task MergeList {
146154
overlap_over_expected=str(len(regeno_in_expected)/len(expected))
147155
g.write(dat[0]+"\t"+",".join(regeno)+'\t'+",".join(expected)+'\t'+overlap_over_regeno+'\t'+overlap_over_expected+"\n")
148156
CODE
149-
awk '{if($4>0.7 && $5>0.7)print $1}' reassess_nonzero_overlap.txt > regeno_var_filtered.txt
157+
awk '{if($4>~{regeno_sample_overlap} && $5>~{regeno_sample_overlap})print $1}' reassess_nonzero_overlap.txt > regeno_var_filtered.txt
150158
# the OR clause below is to ignore return code = 1 because that isn't an error, it just means there were 0 matched lines
151159
# (but don't ignore real error codes > 1)
152160
fgrep -w -f regeno_var_filtered.txt ~{regeno_file}> regeno.filtered.bed || [[ $? == 1 ]]

wdl/Module04b.wdl

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,10 @@ workflow Module04b {
2424
String cohort # Cohort name or project prefix for all cohort-level outputs
2525
File contig_list
2626
Array[File] regeno_coverage_medians # one file per batch
27-
Float regeno_max_allele_freq = 0.01
28-
Int regeno_allele_count_threshold = 3
27+
Float regeno_max_allele_freq = 0.01 # Rare variant filter for regenotyping candidates: must be < AF threshold (this parameter) or <= AC threshold (below)
28+
Int regeno_allele_count_threshold = 3 # Rare variant filter for regenotyping candidates: must be < AF threshold (above) or <= AC threshold (this parameter)
29+
Int min_var_per_sample_outlier_threshold = 3 # Threshold below which regeno SV count per sample should not be considered an outlier (need when counts are sparse)
30+
Float regeno_sample_overlap = 0.7 # Minimum sample overlap required between raw and regenotyped calls
2931
3032
RuntimeAttr? runtime_attr_cluster_merged_depth_beds
3133
RuntimeAttr? runtime_attr_regeno_raw_combined_depth
@@ -188,6 +190,8 @@ workflow Module04b {
188190
regeno_file = MergeList.master_regeno,
189191
regeno_sample_ids_lookup = ConcatSampleIdLookupBed.concat_bed,
190192
vcfs = Genotype_2.genotyped_vcf,
193+
min_var_per_sample_outlier_threshold = min_var_per_sample_outlier_threshold,
194+
regeno_sample_overlap = regeno_sample_overlap,
191195
sv_pipeline_docker = sv_pipeline_docker,
192196
sv_pipeline_base_docker = sv_pipeline_base_docker,
193197
runtime_attr_merge_list_creassess = runtime_attr_merge_list_creassess,

0 commit comments

Comments
 (0)