Skip to content

Commit 0c13db8

Browse files
authored
Merge pull request #3 from GINAMO-EBVs/vcftoolbox
vcftoolbox
2 parents cdfb95f + 4796a85 commit 0c13db8

26 files changed

+6517
-0
lines changed

tools/vcftoolbox/.shed.yml

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
name: vcftoolbox
2+
owner: genomics
3+
description: Tools to filter, manipulate, and manage VCF files for population genomics analyses using vcftools and bcftools.
4+
homepage_url: https://github.com/lauramtzan/GINAMO/vcftoolbox
5+
long_description: |
6+
VCF Toolbox is a suite of tools for filtering and subsetting VCF files used in population genomics.
7+
The toolbox allows users to filter SNPs by read depth, genotype quality, missing data, heterozygosity, and minor allele count,
8+
extract or remove individuals, split VCFs by population and perform subsampling.
9+
remote_repository_url: https://github.com/galaxyecology/tools-ecology/tree/master/tools/vcftoolbox
10+
type: unrestricted
11+
categories:
12+
- Genomics
13+
auto_tool_repositories:
14+
name_template: "{{ tool_id }}"
15+
description_template: "{{ tool_name }}"
16+
suite:
17+
name: "suite_vcftoolbox"
18+
description: Tools to filter, manipulate, and manage VCF files for population genomics analyses using vcftools and bcftools.
19+
long_description: |
20+
VCF Toolbox is a suite of tools for filtering and subsetting VCF files used in population genomics.
21+
The toolbox allows users to filter SNPs by read depth, genotype quality, missing data, heterozygosity, and minor allele count,
22+
extract or remove individuals, split VCFs by population and perform subsampling.
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
#!/bin/bash
#
# Filter individuals with too much missing data out of a single VCF file.
#
# Usage:
#   vcf_filtering_IND_missing_data.sh <vcf_input> <vcf_name> <MAX_MISSING_IND>
#
#   vcf_input        path to the input VCF file
#   vcf_name         display name of the dataset (used to name the outputs)
#   MAX_MISSING_IND  maximum fraction of missing genotypes per individual
#
# The directories "vcf_filtered_directory" and "summary" must already exist
# in the current working directory; this script only checks for them.

# Exit on error.
# NOTE: "pipefail" is deliberately NOT enabled. The emptiness checks pipe
# bcftools into "head -n 1", which closes the pipe early; with pipefail the
# resulting SIGPIPE on bcftools would make a non-empty VCF look empty.
set -e

vcf_input="$1"
vcf_names="$2"
MAX_MISSING_IND="$3"

vcf_dir="vcf_filtered_directory"
summ_dir="summary"

##### Check output directories ####
if [[ ! -d "${vcf_dir}" ]]; then
  echo "ERROR: Failed to create output VCF directory" >&2
  exit 1
fi

if [[ ! -d "${summ_dir}" ]]; then
  # Report the directory that is actually missing (was a copy of the VCF message).
  echo "ERROR: Failed to create output summary directory" >&2
  exit 1
fi

##### Check input files #####
if [[ -z "$vcf_input" ]]; then
  echo "ERROR: VCF file is not provided." >&2
  exit 1
fi

# Verify that input VCF contains at least one variant
# (-H prints records without the header; grep -q . succeeds on any non-empty line).
if ! bcftools view -H "$vcf_input" | head -n 1 | grep -q .; then
  echo "ERROR: Input VCF contains no variant records." >&2
  exit 1
fi

# Start the summary table with its header line; the filtering function
# appends one row per processed dataset.
summary_file="${summ_dir}/n_individuals.tabular"
printf 'Dataset\tN_individuals_before\tN_individuals_after\n' >"${summary_file}"
38+
39+
###################################################
# Function: vcf_filtering_IND_missing_data
# Description:
#   Removes from a VCF file the individuals whose fraction of missing
#   genotypes is above a threshold, writes the filtered VCF and appends the
#   number of individuals before/after filtering to the summary table.
# Globals:
#   vcf_dir       (read)  directory receiving the filtered VCF
#   summary_file  (read)  tabular file the result row is appended to
# Arguments:
#   $1 - path to the input VCF file
#   $2 - dataset display name; if it ends with "(...)" the text inside the
#        last parentheses is used as base name, otherwise its basename
#   $3 - maximum fraction of missing genotypes allowed per individual;
#        individuals at or below this value are kept
# Outputs:
#   ${vcf_dir}/<base_name>_mdIND.vcf plus intermediate files in the current
#   directory; diagnostics go to stderr
# Returns:
#   0 on success, or when the input file does not exist (it is skipped);
#   exits 1 if no individual is retained, or if the filtered VCF is missing
#   or contains no variants
###################################################

vcf_filtering_IND_missing_data() {
  ##### Parameters #####
  local vcf="$1"
  local original_name="$2"
  local MAX_MISSING_IND="$3"

  # Keep all working variables local so nothing leaks into the caller.
  local base_name output_file intermed_files imiss ind_miss
  local n_ind_before n_ind_after
  local regex='\(([^)]+)\)[[:space:]]*$'

  ##### Check if file exists #####
  if [[ ! -f "$vcf" ]]; then
    # Deliberately non-fatal: a missing input is skipped, not an error.
    echo "File not found, ignored: $vcf" >&2
    return 0
  fi

  # Extract base name (handle .vcf)
  if [[ "$original_name" =~ $regex ]]; then
    # Extract content between last parentheses
    base_name="${BASH_REMATCH[1]}"
  else
    # No parentheses, use original name
    base_name=$(basename "$original_name")
  fi
  base_name=${base_name%.vcf}

  ##### Filtering on individuals with a high amount of missing data #####
  output_file="$vcf_dir/${base_name}_mdIND.vcf"
  intermed_files="${base_name}_IND_MISSING_DATA"
  imiss="${intermed_files}.imiss"
  ind_miss="${base_name}_ind_missing_SNPs.txt" # list of individuals to be retained

  # Per-individual missingness report, written to ${intermed_files}.imiss
  vcftools --vcf "$vcf" --missing-indv --out "$intermed_files"

  # Skip the header line; column 1 is the individual ID and column 5 its
  # fraction of missing genotypes. "<=" keeps individuals exactly at the
  # threshold, so 0.0 retains individuals without any missing data and 1.0
  # disables the filter, as the tool help describes.
  awk -v threshold="$MAX_MISSING_IND" \
    'NR > 1 && $5 <= threshold { print $1 }' "$imiss" >"$ind_miss"

  # Fail with an explicit message instead of handing bcftools an empty list.
  if [[ ! -s "$ind_miss" ]]; then
    echo "ERROR: No individual passes the missing-data threshold (${MAX_MISSING_IND})." >&2
    exit 1
  fi

  bcftools view -S "$ind_miss" -O v -o "$output_file" "$vcf"

  ##### Verify that filtered VCF is not empty ######
  if [[ ! -f "$output_file" ]]; then
    echo "ERROR: Output VCF not created: $output_file" >&2
    exit 1
  fi

  if ! bcftools view -H "$output_file" | head -n 1 | grep -q .; then
    echo "ERROR: Filtered VCF contains no variants." >&2
    exit 1
  fi

  ##### Count individuals before and after filtering #####
  n_ind_before=$(bcftools query -l "$vcf" | wc -l)
  n_ind_after=$(bcftools query -l "$output_file" | wc -l)

  ##### Append results to output file #####
  # %d normalises the count even if wc pads it with spaces; printf also keeps
  # a base name containing backslashes intact (echo -e would interpret them).
  printf '%s\t%d\t%d\n' "$base_name" "$n_ind_before" "$n_ind_after" \
    >>"${summary_file}"
}
103+
104+
##################
105+
# Main execution
106+
##################
107+
108+
# Apply the per-individual missing-data filter to the single input VCF,
# using the dataset name and threshold given on the command line.
vcf_filtering_IND_missing_data "${vcf_input}" "${vcf_names}" "${MAX_MISSING_IND}"
Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
<tool id="vcf_filtering_IND_missing_data" name="VCF filtering: Individuals with missing data" version="0.1.0" python_template_version="3.5">
2+
<description>
3+
above the user-defined threshold value
4+
</description>
5+
<requirements>
6+
<requirement type="package" version="1.22">bcftools</requirement>
7+
<requirement type="package" version="0.1.17">vcftools</requirement>
8+
</requirements>
9+
10+
<command detect_errors="exit_code"><![CDATA[
11+
12+
mkdir -p vcf_filtered_directory &&
13+
mkdir -p summary &&
14+
15+
bash '$__tool_directory__/vcf_filtering_IND_missing_data.sh'
16+
'${vcf_input}'
17+
'${vcf_input.element_identifier}'
18+
'$MAX_MISSING_IND'
19+
]]></command>
20+
21+
<inputs>
22+
<param name="vcf_input" type="data" format="vcf" label="Upload your VCF file"
23+
help="File containing DNA sequence data in VCF format. You can upload a single file, multiple files or a dataset collection."/>
24+
25+
<param name="MAX_MISSING_IND" type="float" min="0" max="1" value="0.25" label="Maximum missing data per individuals (MAX_MISSING_IND)"
26+
help="Maximum fraction of missing genotypes allowed per individual." />
27+
</inputs>
28+
29+
<outputs>
30+
<data name="vcf_filtered" format="vcf" label="${tool.name}">
31+
<discover_datasets pattern="(?P&lt;designation&gt;.+)\.vcf$" directory="vcf_filtered_directory" visible="true" assign_primary_output="true"/>
32+
</data>
33+
<data name="Individuals" from_work_dir="summary/n_individuals.tabular" format="tabular" label="Filtering on individuals : Individuals number"/>
34+
</outputs>
35+
36+
<tests>
37+
<test>
38+
<param name="vcf_input" value="test_vcf.vcf"/>
39+
<param name="MAX_MISSING_IND" value="0.2"/>
40+
<output name="vcf_filtered" file="vcf_filtered_directory/test_vcf_mdIND.vcf" count="1"/>
41+
<output name="Individuals" file="summary/n_individuals.tabular"/>
42+
</test>
43+
</tests>
44+
45+
<help><![CDATA[
46+
Usage
47+
=====
48+
As input, the user provides one or more VCF files (Danecek *et al.* 2011) and a threshold value indicating a maximum proportion of missing data per individual.
49+
50+
For each VCF file entered, the tool calculates the proportion of missing genotypes for each individual (or sample), and removes those with a value above the user-defined threshold, resulting in the output file.
51+
52+
Individuals that pass the filter are kept in the output VCF file.
53+
Example: if MAX_MISSING_IND = 0.25 (default value), all individuals with missing data on more than 25% of loci in the input dataset are removed from the output VCF file.
54+
55+
Tips
56+
====
57+
Threshold values for missing data depend on the study objectives.
58+
They generally vary between 20% and 75% of loci for low-stringency filters, and between 5% and 25% loci for high-stringency filters (Hemstrom *et al.* 2024).
59+
Using 0.0 (0%) means that only individuals with no missing data are retained for further analysis.
60+
Using 1.0 (100%) means that no filtering is applied.
61+
62+
See also examples of VCF formats in : https://samtools.github.io/hts-specs/VCFv4.5.pdf
63+
64+
]]></help>
65+
66+
<citations>
67+
<citation type="bibtex">
68+
@article{10.1093/gigascience/giab008,
69+
author = {Danecek, Petr and Bonfield, James K and Liddle, Jennifer and Marshall, John and Ohan, Valeriu and Pollard, Martin O and Whitwham, Andrew and Keane, Thomas and McCarthy, Shane A and Davies, Robert M and Li, Heng},
70+
title = "{Twelve years of SAMtools and BCFtools}",
71+
journal = {GigaScience},
72+
volume = {10},
73+
number = {2},
74+
year = {2021},
75+
abstract = "{SAMtools and BCFtools are widely used programs for processing and analysing high-throughput sequencing data. They include tools for file format conversion and manipulation, sorting, querying, statistics, variant calling, and effect analysis amongst other methods.The first version appeared online 12 years ago and has been maintained and further developed ever since, with many new features and improvements added over the years. The SAMtools and BCFtools packages represent a unique collection of tools that have been used in numerous other software projects and countless genomic pipelines.Both SAMtools and BCFtools are freely available on GitHub under the permissive MIT licence, free for both non-commercial and commercial use. Both packages have been installed \\&gt;1 million times via Bioconda. The source code and documentation are available from https://www.htslib.org.}",
76+
issn = {2047-217X},
77+
doi = {10.1093/gigascience/giab008},
78+
url = {https://doi.org/10.1093/gigascience/giab008},
79+
note = {giab008},
80+
eprint = {https://academic.oup.com/gigascience/article-pdf/10/2/giab008/36332246/giab008.pdf},
81+
}
82+
</citation>
83+
84+
<citation type="bibtex">
85+
@article{danecek2011vcf,
86+
author = {Danecek, Petr and Auton, Adam and Abecasis, Goncalo and Albers, Cornelis A. and Banks, Eric and DePristo, Mark A. and Handsaker, Robert and Lunter, Gerton and Marth, Gabor and Sherry, Stephen T. and McVean, Gilean and Durbin, Richard and 1000 Genomes Project Analysis Group},
87+
title = {The Variant Call Format and VCFtools},
88+
journal = {Bioinformatics},
89+
volume = {27},
90+
number = {15},
91+
year = {2011},
92+
pages = {2156--2158},
93+
doi = {10.1093/bioinformatics/btr330},
94+
url = {https://doi.org/10.1093/bioinformatics/btr330},
95+
}
96+
</citation>
97+
98+
<citation type="bibtex">
99+
@article{hemstrom2024next,
100+
author={Hemstrom, William and Grummer, Jared A. and Luikart, Gordon and Beja-Pereira, Albano and Waples, Robin S. and Funk, W. Chris and Shafer, Aaron B. A. and Allendorf, Frederick W.},
101+
title={Next-generation data filtering in the genomics era},
102+
journal={Nature Reviews Genetics},
103+
volume={25},
104+
number={11},
105+
pages={750--767},
106+
year={2024},
107+
doi={10.1038/s41576-024-00738-6},
108+
url={https://doi.org/10.1038/s41576-024-00738-6}
109+
}
110+
</citation>
111+
<citation type="bibtex">
112+
@article{Danecek2011,
113+
author = {Danecek, Petr and Auton, Adam and Abecasis, Goncalo and Albers, Cornelis A. and Banks, Erica and DePristo, Mark A. and Handsaker, Robert E. and Lunter, Gerton and Marth, Gabor T. and Sherry, Stephen T. and McVean, Gil and Durbin, Richard and {1000 Genomes Project Analysis Group}},
114+
title = {The variant call format and {VCF}tools},
115+
journal = {Bioinformatics},
116+
volume = {27},
117+
number = {15},
118+
pages = {2156-2158},
119+
year = {2011},
120+
doi = {10.1093/bioinformatics/btr330},
121+
URL = {https://doi.org/10.1093/bioinformatics/btr330},
122+
eprint = {https://academic.oup.com/bioinformatics/article-pdf/27/15/2156/600788/btr330.pdf},
123+
}
124+
</citation>
125+
<citation type="bibtex">
126+
@manual{VCFv45,
127+
title = {The Variant Call Format (VCF) Specification, Version 4.5},
128+
author = {{HTS-specs team}},
129+
organization = {Global Alliance for Genomics and Health},
130+
year = {2024},
131+
url = {https://samtools.github.io/hts-specs/VCFv4.5.pdf},
132+
note = {Accessed: 2025-02-11}
133+
}
134+
</citation>
135+
</citations>
136+
</tool>

0 commit comments

Comments
 (0)