1- #! /bin/bash
2-
3- # Exit on error
4- set -e
5-
6- vcf_input=" $1 "
7- vcf_names=" $2 "
8- MAX_MISSING_IND=" $3 "
9-
10- vcf_dir=" vcf_filtered_directory"
11- summ_dir=" summary"
12-
13- # #### Check output directories ####
14- if [[ ! -d " ${vcf_dir} " ]]; then
15- echo " ERROR: Failed to create output VCF directory" >&2
16- exit 1
17- fi
18-
19- if [[ ! -d " ${summ_dir} " ]]; then
20- echo " ERROR: Failed to create output VCF directory" >&2
21- exit 1
22- fi
23-
24- # #### Check input files #####
25- if [[ -z " $vcf_input " ]]; then
26- echo " ERROR: VCF file is not provided." >&2
27- exit 1
28- fi
29-
30- # Verify that input VCF contains at least one variant
31- if ! bcftools view -H " $vcf_input " | head -n 1 | grep -q . ; then
32- echo " ERROR: Input VCF contains no variant records."
33- exit 1
34- fi
35-
36- summary_file=" summary/n_individuals.tabular"
37- echo -e " Dataset\tN_individuals_before\tN_individuals_after" > " ${summary_file} "
38-
39- # ##################################################
40- # Function: vcf_filtering_IND_missing_data
41- # Description:
42- # ##################################################
43-
44- vcf_filtering_IND_missing_data (){
45- # #### Parameters #####
46- local vcf=" $1 "
47- local original_name=" $2 "
48- local MAX_MISSING_IND=" $3 "
49-
50- # #### Check if file exists #####
51- if [[ ! -f " $vcf " ]]; then
52- echo " File not found, ignored: $vcf "
53- return
54- fi
55-
56- # Extract base name (handle .vcf)
57- local base_name
58- local regex=' \(([^)]+)\)[[:space:]]*$'
59- if [[ " $original_name " =~ $regex ]]; then
60- # Extract content between last parentheses
61- base_name=" ${BASH_REMATCH[1]} "
62- else
63- # No parentheses, use original name
64- base_name=$( basename " $original_name " )
65- fi
66-
67- base_name=${base_name% .vcf}
68-
69- # #### Filtering on individuals with a high amount of missing data #####
70- output_file=" $vcf_dir /${base_name} _mdIND.vcf"
71-
72- intermed_files=" ${base_name} _IND_MISSING_DATA"
73-
74- vcftools --vcf " $vcf " --missing-indv --out " $intermed_files "
75-
76- imiss=" ${intermed_files} .imiss"
77-
78- ind_miss=" ${base_name} _ind_missing_SNPs.txt" # list of individuals to be retained
79-
80- awk -v threshold=" $MAX_MISSING_IND " ' NR > 1 && $5 < threshold { print $1 }' " $imiss " > " $ind_miss "
81-
82- bcftools view -S " $ind_miss " -O v -o " $output_file " " $vcf "
83-
84- # #### Verify that filtered VCF is not empty ######
85- if [[ ! -f " $output_file " ]]; then
86- echo " ERROR: Output VCF not created: $final_vcf " >&2
87- exit 1
88- fi
89-
90- if ! bcftools view -H " $output_file " | head -n 1 | grep -q . ; then
91- echo " ERROR: Filtered VCF contains no variants."
92- exit 1
93- fi
94-
95- # #### Count individuals before and after filtering #####
96- n_ind_before=$( bcftools query -l " $vcf " | wc -l)
97- n_ind_after=$( bcftools query -l " $output_file " | wc -l)
98-
99- # #### Append results to output file #####
100- echo -e " ${base_name} \t${n_ind_before} \t${n_ind_after} " >> " ${summary_file} "
101-
102- }
103-
104- # #################
105- # Main execution
106- # #################
107-
1+ #! /bin/bash
2+
3+ # Exit on error
4+ set -e
5+
6+ vcf_input=" $1 "
7+ vcf_names=" $2 "
8+ MAX_MISSING_IND=" $3 "
9+
10+ vcf_dir=" vcf_filtered_directory"
11+ summ_dir=" summary"
12+
13+ # #### Check output directories ####
14+ if [[ ! -d " ${vcf_dir} " ]]; then
15+ echo " ERROR: Failed to create output VCF directory" >&2
16+ exit 1
17+ fi
18+
19+ if [[ ! -d " ${summ_dir} " ]]; then
20+ echo " ERROR: Failed to create output VCF directory" >&2
21+ exit 1
22+ fi
23+
24+ # #### Check input files #####
25+ if [[ -z " $vcf_input " ]]; then
26+ echo " ERROR: VCF file is not provided." >&2
27+ exit 1
28+ fi
29+
30+ # Verify that input VCF contains at least one variant
31+ if ! bcftools view -H " $vcf_input " | head -n 1 | grep -q . ; then
32+ echo " ERROR: Input VCF contains no variant records."
33+ exit 1
34+ fi
35+
36+ summary_file=" summary/n_individuals.tabular"
37+ echo -e " Dataset\tN_individuals_before\tN_individuals_after" > " ${summary_file} "
38+
39+ # ##################################################
40+ # Function: vcf_filtering_IND_missing_data
41+ # Description:
42+ # ##################################################
43+
44+ vcf_filtering_IND_missing_data (){
45+ # #### Parameters #####
46+ local vcf=" $1 "
47+ local original_name=" $2 "
48+ local MAX_MISSING_IND=" $3 "
49+
50+ # #### Check if file exists #####
51+ if [[ ! -f " $vcf " ]]; then
52+ echo " File not found, ignored: $vcf "
53+ return
54+ fi
55+
56+ # Extract base name (handle .vcf)
57+ local base_name
58+ local regex=' \(([^)]+)\)[[:space:]]*$'
59+ if [[ " $original_name " =~ $regex ]]; then
60+ # Extract content between last parentheses
61+ base_name=" ${BASH_REMATCH[1]} "
62+ else
63+ # No parentheses, use original name
64+ base_name=$( basename " $original_name " )
65+ fi
66+
67+ base_name=${base_name% .vcf}
68+
69+ # #### Filtering on individuals with a high amount of missing data #####
70+ output_file=" $vcf_dir /${base_name} _mdIND.vcf"
71+
72+ intermed_files=" ${base_name} _IND_MISSING_DATA"
73+
74+ vcftools --vcf " $vcf " --missing-indv --out " $intermed_files "
75+
76+ imiss=" ${intermed_files} .imiss"
77+
78+ ind_miss=" ${base_name} _ind_missing_SNPs.txt" # list of individuals to be retained
79+
80+ awk -v threshold=" $MAX_MISSING_IND " ' NR > 1 && $5 < threshold { print $1 }' " $imiss " > " $ind_miss "
81+
82+ bcftools view -S " $ind_miss " -O v -o " $output_file " " $vcf "
83+
84+ # #### Verify that filtered VCF is not empty ######
85+ if [[ ! -f " $output_file " ]]; then
86+ echo " ERROR: Output VCF not created: $final_vcf " >&2
87+ exit 1
88+ fi
89+
90+ if ! bcftools view -H " $output_file " | head -n 1 | grep -q . ; then
91+ echo " ERROR: Filtered VCF contains no variants."
92+ exit 1
93+ fi
94+
95+ # #### Count individuals before and after filtering #####
96+ n_ind_before=$( bcftools query -l " $vcf " | wc -l)
97+ n_ind_after=$( bcftools query -l " $output_file " | wc -l)
98+
99+ # #### Append results to output file #####
100+ echo -e " ${base_name} \t${n_ind_before} \t${n_ind_after} " >> " ${summary_file} "
101+
102+ }
103+
104+ # #################
105+ # Main execution
106+ # #################
107+
108108vcf_filtering_IND_missing_data " $vcf_input " " $vcf_names " " $MAX_MISSING_IND "
0 commit comments