Skip to content

Commit 7af1636

Browse files
authored
Add files via upload
1 parent cd355c0 commit 7af1636

15 files changed

+1229
-1263
lines changed
Lines changed: 107 additions & 107 deletions
Original file line numberDiff line numberDiff line change
@@ -1,108 +1,108 @@
1-
#!/bin/bash
2-
3-
#Exit on error
4-
set -e
5-
6-
vcf_input="$1"
7-
vcf_names="$2"
8-
MAX_MISSING_IND="$3"
9-
10-
vcf_dir="vcf_filtered_directory"
11-
summ_dir="summary"
12-
13-
##### Check output directories ####
14-
if [[ ! -d "${vcf_dir}" ]]; then
15-
echo "ERROR: Failed to create output VCF directory" >&2
16-
exit 1
17-
fi
18-
19-
if [[ ! -d "${summ_dir}" ]]; then
20-
echo "ERROR: Failed to create output VCF directory" >&2
21-
exit 1
22-
fi
23-
24-
##### Check input files #####
25-
if [[ -z "$vcf_input" ]]; then
26-
echo "ERROR: VCF file is not provided." >&2
27-
exit 1
28-
fi
29-
30-
# Verify that input VCF contains at least one variant
31-
if ! bcftools view -H "$vcf_input" | head -n 1 | grep -q .; then
32-
echo "ERROR: Input VCF contains no variant records."
33-
exit 1
34-
fi
35-
36-
summary_file="summary/n_individuals.tabular"
37-
echo -e "Dataset\tN_individuals_before\tN_individuals_after" >"${summary_file}"
38-
39-
###################################################
40-
#Function: vcf_filtering_IND_missing_data
41-
#Description:
42-
###################################################
43-
44-
vcf_filtering_IND_missing_data(){
45-
##### Parameters #####
46-
local vcf="$1"
47-
local original_name="$2"
48-
local MAX_MISSING_IND="$3"
49-
50-
##### Check if file exists #####
51-
if [[ ! -f "$vcf" ]]; then
52-
echo "File not found, ignored: $vcf"
53-
return
54-
fi
55-
56-
# Extract base name (handle .vcf)
57-
local base_name
58-
local regex='\(([^)]+)\)[[:space:]]*$'
59-
if [[ "$original_name" =~ $regex ]]; then
60-
#Extract content between last parentheses
61-
base_name="${BASH_REMATCH[1]}"
62-
else
63-
# No parentheses, use original name
64-
base_name=$(basename "$original_name")
65-
fi
66-
67-
base_name=${base_name%.vcf}
68-
69-
##### Filtering on individuals with a high amount of missing data #####
70-
output_file="$vcf_dir/${base_name}_mdIND.vcf"
71-
72-
intermed_files="${base_name}_IND_MISSING_DATA"
73-
74-
vcftools --vcf "$vcf" --missing-indv --out "$intermed_files"
75-
76-
imiss="${intermed_files}.imiss"
77-
78-
ind_miss="${base_name}_ind_missing_SNPs.txt" #list of individuals to be retained
79-
80-
awk -v threshold="$MAX_MISSING_IND" 'NR > 1 && $5 < threshold { print $1 }' "$imiss" > "$ind_miss"
81-
82-
bcftools view -S "$ind_miss" -O v -o "$output_file" "$vcf"
83-
84-
##### Verify that filtered VCF is not empty ######
85-
if [[ ! -f "$output_file" ]]; then
86-
echo "ERROR: Output VCF not created: $final_vcf" >&2
87-
exit 1
88-
fi
89-
90-
if ! bcftools view -H "$output_file" | head -n 1 | grep -q .; then
91-
echo "ERROR: Filtered VCF contains no variants."
92-
exit 1
93-
fi
94-
95-
##### Count individuals before and after filtering #####
96-
n_ind_before=$(bcftools query -l "$vcf" | wc -l)
97-
n_ind_after=$(bcftools query -l "$output_file" | wc -l)
98-
99-
##### Append results to output file #####
100-
echo -e "${base_name}\t${n_ind_before}\t${n_ind_after}" >> "${summary_file}"
101-
102-
}
103-
104-
##################
105-
# Main execution
106-
##################
107-
1+
#!/bin/bash
2+
3+
#Exit on error
4+
set -e
5+
6+
vcf_input="$1"
7+
vcf_names="$2"
8+
MAX_MISSING_IND="$3"
9+
10+
vcf_dir="vcf_filtered_directory"
11+
summ_dir="summary"
12+
13+
##### Check output directories ####
14+
if [[ ! -d "${vcf_dir}" ]]; then
15+
echo "ERROR: Failed to create output VCF directory" >&2
16+
exit 1
17+
fi
18+
19+
if [[ ! -d "${summ_dir}" ]]; then
20+
echo "ERROR: Failed to create output VCF directory" >&2
21+
exit 1
22+
fi
23+
24+
##### Check input files #####
25+
if [[ -z "$vcf_input" ]]; then
26+
echo "ERROR: VCF file is not provided." >&2
27+
exit 1
28+
fi
29+
30+
# Verify that input VCF contains at least one variant
31+
if ! bcftools view -H "$vcf_input" | head -n 1 | grep -q .; then
32+
echo "ERROR: Input VCF contains no variant records."
33+
exit 1
34+
fi
35+
36+
summary_file="summary/n_individuals.tabular"
37+
echo -e "Dataset\tN_individuals_before\tN_individuals_after" >"${summary_file}"
38+
39+
###################################################
40+
#Function: vcf_filtering_IND_missing_data
41+
#Description:
42+
###################################################
43+
44+
vcf_filtering_IND_missing_data(){
45+
##### Parameters #####
46+
local vcf="$1"
47+
local original_name="$2"
48+
local MAX_MISSING_IND="$3"
49+
50+
##### Check if file exists #####
51+
if [[ ! -f "$vcf" ]]; then
52+
echo "File not found, ignored: $vcf"
53+
return
54+
fi
55+
56+
# Extract base name (handle .vcf)
57+
local base_name
58+
local regex='\(([^)]+)\)[[:space:]]*$'
59+
if [[ "$original_name" =~ $regex ]]; then
60+
#Extract content between last parentheses
61+
base_name="${BASH_REMATCH[1]}"
62+
else
63+
# No parentheses, use original name
64+
base_name=$(basename "$original_name")
65+
fi
66+
67+
base_name=${base_name%.vcf}
68+
69+
##### Filtering on individuals with a high amount of missing data #####
70+
output_file="$vcf_dir/${base_name}_mdIND.vcf"
71+
72+
intermed_files="${base_name}_IND_MISSING_DATA"
73+
74+
vcftools --vcf "$vcf" --missing-indv --out "$intermed_files"
75+
76+
imiss="${intermed_files}.imiss"
77+
78+
ind_miss="${base_name}_ind_missing_SNPs.txt" #list of individuals to be retained
79+
80+
awk -v threshold="$MAX_MISSING_IND" 'NR > 1 && $5 < threshold { print $1 }' "$imiss" > "$ind_miss"
81+
82+
bcftools view -S "$ind_miss" -O v -o "$output_file" "$vcf"
83+
84+
##### Verify that filtered VCF is not empty ######
85+
if [[ ! -f "$output_file" ]]; then
86+
echo "ERROR: Output VCF not created: $final_vcf" >&2
87+
exit 1
88+
fi
89+
90+
if ! bcftools view -H "$output_file" | head -n 1 | grep -q .; then
91+
echo "ERROR: Filtered VCF contains no variants."
92+
exit 1
93+
fi
94+
95+
##### Count individuals before and after filtering #####
96+
n_ind_before=$(bcftools query -l "$vcf" | wc -l)
97+
n_ind_after=$(bcftools query -l "$output_file" | wc -l)
98+
99+
##### Append results to output file #####
100+
echo -e "${base_name}\t${n_ind_before}\t${n_ind_after}" >> "${summary_file}"
101+
102+
}
103+
104+
##################
105+
# Main execution
106+
##################
107+
108108
vcf_filtering_IND_missing_data "$vcf_input" "$vcf_names" "$MAX_MISSING_IND"

0 commit comments

Comments
 (0)