galaxyecology
diff --git a/‎tools/vcftoolbox/.shed.yml‎
Lines changed: 22 additions & 0 deletions b/‎tools/vcftoolbox/.shed.yml‎
Lines changed: 22 additions & 0 deletions
diff --git a/‎tools/vcftoolbox/VCF_filtering_IND_missing_data.sh‎
Lines changed: 108 additions & 0 deletions b/‎tools/vcftoolbox/VCF_filtering_IND_missing_data.sh‎
Lines changed: 108 additions & 0 deletions
diff --git a/‎tools/vcftoolbox/VCF_filtering_IND_missing_data.xml‎
Lines changed: 136 additions & 0 deletions b/‎tools/vcftoolbox/VCF_filtering_IND_missing_data.xml‎
Lines changed: 136 additions & 0 deletions
@@ -0,0 +1,22 @@
+name: vcftoolbox
+owner: genomics
+description: Tools to filter, manipulate, and manage VCF files for population genomics analyses using vcftools and bcftools.
+homepage_url: https://github.com/lauramtzan/GINAMO/vcftoolbox
+long_description: |
+  VCF Toolbox is a suite of tools for filtering and subsetting VCF files used in population genomics. 
+  The toolbox allows users to filter SNPs by read depth, genotype quality, missing data, heterozygosity, and minor allele count, 
+  extract or remove individuals, split VCFs by population and perform subsampling. 
+remote_repository_url: https://github.com/galaxyecology/tools-ecology/tree/master/tools/vcftoolbox
+type: unrestricted
+categories:
+- Genomics 
+auto_tool_repositories:
+  name_template: "{{ tool_id }}"
+  description_template: "{{ tool_name }}"
+suite:
+  name: "suite_vcftoolbox"
+  description: Tools to filter, manipulate, and manage VCF files for population genomics analyses using vcftools and bcftools.
+  long_description: |
+    VCF Toolbox is a suite of tools for filtering and subsetting VCF files used in population genomics. 
+    The toolbox allows users to filter SNPs by read depth, genotype quality, missing data, heterozygosity, and minor allele count, 
+    extract or remove individuals, split VCFs by population and perform subsampling. 
@@ -0,0 +1,108 @@
+#!/bin/bash
+
+#Exit on error
+set -e
+
+vcf_input="$1"
+vcf_names="$2"
+MAX_MISSING_IND="$3"
+
+vcf_dir="vcf_filtered_directory"
+summ_dir="summary"
+
+##### Check output directories ####
+if [[ ! -d "${vcf_dir}" ]]; then
+    echo "ERROR: Failed to create output VCF directory" >&2
+    exit 1
+fi
+
+if [[ ! -d "${summ_dir}" ]]; then
+    echo "ERROR: Failed to create output VCF directory" >&2
+    exit 1
+fi
+
+##### Check input files #####
+if [[ -z "$vcf_input" ]]; then
+    echo "ERROR: VCF file is not provided." >&2
+    exit 1
+fi
+
+# Verify that input VCF contains at least one variant
+if ! bcftools view -H "$vcf_input" | head -n 1 | grep -q .; then
+    echo "ERROR: Input VCF contains no variant records."
+    exit 1
+fi
+
+summary_file="summary/n_individuals.tabular"
+echo -e "Dataset\tN_individuals_before\tN_individuals_after" >"${summary_file}"
+
+###################################################
+#Function: vcf_filtering_IND_missing_data
+#Description: 
+###################################################
+
+vcf_filtering_IND_missing_data(){
+    ##### Parameters #####
+    local vcf="$1"
+    local original_name="$2"
+    local MAX_MISSING_IND="$3"
+        
+        ##### Check if file exists #####
+        if [[ ! -f "$vcf" ]]; then
+            echo "File not found, ignored: $vcf"
+            return
+        fi
+
+        # Extract base name (handle .vcf)
+        local base_name
+        local regex='\(([^)]+)\)[[:space:]]*$'
+        if [[ "$original_name" =~ $regex ]]; then
+            #Extract content between last parentheses
+            base_name="${BASH_REMATCH[1]}"
+        else
+            # No parentheses, use original name
+            base_name=$(basename "$original_name")
+        fi
+        
+        base_name=${base_name%.vcf}
+
+        ##### Filtering on individuals with a high amount of missing data #####
+        output_file="$vcf_dir/${base_name}_mdIND.vcf"
+
+        intermed_files="${base_name}_IND_MISSING_DATA"
+
+        vcftools --vcf  "$vcf" --missing-indv --out "$intermed_files"
+
+        imiss="${intermed_files}.imiss"
+
+        ind_miss="${base_name}_ind_missing_SNPs.txt" #list of individuals to be retained
+        
+        awk -v threshold="$MAX_MISSING_IND" 'NR > 1 && $5 < threshold { print $1 }' "$imiss" > "$ind_miss"
+
+        bcftools view -S "$ind_miss" -O v -o "$output_file" "$vcf"
+
+        ##### Verify that filtered VCF is not empty ######
+        if [[ ! -f "$output_file" ]]; then
+            echo "ERROR: Output VCF not created: $final_vcf" >&2
+            exit 1
+        fi
+
+        if ! bcftools view -H "$output_file" | head -n 1 | grep -q .; then
+            echo "ERROR: Filtered VCF contains no variants."
+            exit 1
+        fi  
+
+        ##### Count individuals before and after filtering #####
+        n_ind_before=$(bcftools query -l "$vcf" | wc -l)
+        n_ind_after=$(bcftools query -l "$output_file" | wc -l)
+
+        ##### Append results to output file #####
+        echo -e "${base_name}\t${n_ind_before}\t${n_ind_after}" >> "${summary_file}"
+        
+}
+
+##################
+# Main execution
+##################
+
+vcf_filtering_IND_missing_data "$vcf_input" "$vcf_names" "$MAX_MISSING_IND"
@@ -0,0 +1,136 @@
+<tool id="vcf_filtering_IND_missing_data" name="VCF filtering: Individuals with missing data" version="0.1.0" python_template_version="3.5">
+    <description>
+        above the user-defined threshold value
+    </description>
+    <requirements>
+        <requirement type="package" version="1.22">bcftools</requirement>
+        <requirement type="package" version="0.1.17">vcftools</requirement>
+    </requirements>
+
+    <!-- Creates the two output directories, then runs the filtering script with:
+         the input VCF path, its display name (used to name the outputs) and
+         the missing-data threshold. The script file name must match the file
+         on disk exactly (case-sensitive file systems). -->
+    <command detect_errors="exit_code"><![CDATA[
+
+        mkdir -p vcf_filtered_directory &&
+        mkdir -p summary &&
+
+        bash '$__tool_directory__/VCF_filtering_IND_missing_data.sh'
+            '${vcf_input}'
+            '${vcf_input.element_identifier}'
+            '$MAX_MISSING_IND'
+        ]]></command>
+    
+    <inputs>
+        <param name="vcf_input" type="data" format="vcf" label="Upload your VCF file"
+            help="File containing DNA sequence data in VCF format. You can upload a simple file, multiple files or a dataset collection."/>
+
+        <param name="MAX_MISSING_IND" type="float" min="0" max="1" value="0.25" label="Maximum missing data per individuals (MAX_MISSING_IND)"
+            help="Maximum fraction of missing genotypes allowed per individual." />
+    </inputs>
+
+        <outputs>
+            <data name="vcf_filtered" format="vcf" label="${tool.name}">
+                <discover_datasets pattern="(?P&lt;designation&gt;.+)\.vcf$" directory="vcf_filtered_directory" visible="true" assign_primary_output="true"/>
+            </data>
+            <data name="Individuals" from_work_dir="summary/n_individuals.tabular" format="tabular" label="Filtering on individuals : Individuals number"/>
+        </outputs>
+        
+        <tests>
+        <test>
+            <param name="vcf_input" value="test_vcf.vcf"/>
+            <param name="MAX_MISSING_IND" value="0.2"/>
+            <output name="vcf_filtered"  file="vcf_filtered_directory/test_vcf_mdIND.vcf" count="1"/>
+            <output name="Individuals" file="summary/n_individuals.tabular"/>
+        </test>
+    </tests>
+
+    <help><![CDATA[
+Usage
+=====
+As input, the user provides one or more VCF files (Danecek *et al.* 2011) and a threshold value indicating a maximum proportion of missing data per individual.
+
+For each VCF file entered, the tool calculates the proportion of missing genotypes for each individual (or sample) and removes those with a value above the user-defined threshold, producing the output VCF file.
+
+Example: if MAX_MISSING_IND = 0.25 (default value), all individuals with missing data on more than 25% of loci in the input dataset are removed from the output VCF file.
+
+Tips
+====
+Threshold values for missing data depend on the study objectives. 
+They generally vary between 20% and 75% of loci for low-stringency filters, and between 5% and 25% of loci for high-stringency filters (Hemstrom *et al.* 2024).
+Using 0.0 (0%) means that only individuals with no missing data are retained for further analysis.
+Using 1.0 (100%) means that no filtering is applied. 
+
+See also examples of the VCF format at: https://samtools.github.io/hts-specs/VCFv4.5.pdf
+
+]]></help>
+
+    <citations>
+        <citation type="bibtex">
+        @article{10.1093/gigascience/giab008,
+    author = {Danecek, Petr and Bonfield, James K and Liddle, Jennifer and Marshall, John and Ohan, Valeriu and Pollard, Martin O and Whitwham, Andrew and Keane, Thomas and McCarthy, Shane A and Davies, Robert M and Li, Heng},
+    title = "{Twelve years of SAMtools and BCFtools}",
+    journal = {GigaScience},
+    volume = {10},
+    number = {2},
+    year = {2021},
+    abstract = "{SAMtools and BCFtools are widely used programs for processing and analysing high-throughput sequencing data. They include tools for file format conversion and manipulation, sorting, querying, statistics, variant calling, and effect analysis amongst other methods.The first version appeared online 12 years ago and has been maintained and further developed ever since, with many new features and improvements added over the years. The SAMtools and BCFtools packages represent a unique collection of tools that have been used in numerous other software projects and countless genomic pipelines.Both SAMtools and BCFtools are freely available on GitHub under the permissive MIT licence, free for both non-commercial and commercial use. Both packages have been installed \\&gt;1 million times via Bioconda. The source code and documentation are available from https://www.htslib.org.}",
+    issn = {2047-217X},
+    doi = {10.1093/gigascience/giab008},
+    url = {https://doi.org/10.1093/gigascience/giab008},
+    note = {giab008},
+    eprint = {https://academic.oup.com/gigascience/article-pdf/10/2/giab008/36332246/giab008.pdf},
+        }
+        </citation>
+
+        <citation type="bibtex">
+        @article{danecek2011vcf,
+    author = {Danecek, Petr and Auton, Adam and Abecasis, Goncalo and Albers, Cornelis A. and Banks, Eric and DePristo, Mark A. and Handsaker, Robert and Lunter, Gerton and Marth, Gabor and Sherry, Stephen T. and McVean, Gilean and Durbin, Richard and 1000 Genomes Project Analysis Group},
+    title = {The Variant Call Format and VCFtools},
+    journal = {Bioinformatics},
+    volume = {27},
+    number = {15},
+    year = {2011},
+    pages = {2156--2158},
+    doi = {10.1093/bioinformatics/btr330},
+    url = {https://doi.org/10.1093/bioinformatics/btr330},
+    }
+        </citation>
+
+        <citation type="bibtex">
+            @article{hemstrom2024next,
+  author={Hemstrom, William and Grummer, Jared A. and Luikart, Gordon and Beja-Pereira, Albano and Waples, Robin S. and Funk, W. Chris and Shafer, Aaron B. A. and Allendorf, Frederick W.},
+  title={Next-generation data filtering in the genomics era},
+  journal={Nature Reviews Genetics},
+  volume={25},
+  number={11},
+  pages={750--767},
+  year={2024},
+  doi={10.1038/s41576-024-00738-6},
+  url={https://doi.org/10.1038/s41576-024-00738-6}
+}
+        </citation>
+        <citation type="bibtex">
+            @article{Danecek2011,
+  author          = {Danecek, Petr and Auton, Adam and Abecasis, Goncalo and Albers, Cornelis A. and Banks, Eric and DePristo, Mark A. and Handsaker, Robert E. and Lunter, Gerton and Marth, Gabor T. and Sherry, Stephen T. and McVean, Gil and Durbin, Richard and {1000 Genomes Project Analysis Group}},
+  title           = {The variant call format and {VCF}tools},
+  journal         = {Bioinformatics},
+  volume          = {27},
+  number          = {15},
+  pages           = {2156-2158},
+  year            = {2011},
+  doi             = {10.1093/bioinformatics/btr330},
+  URL             = {https://doi.org/10.1093/bioinformatics/btr330},
+  eprint          = {https://academic.oup.com/bioinformatics/article-pdf/27/15/2156/600788/btr330.pdf},
+}
+        </citation>
+    <citation type="bibtex">
+        @manual{VCFv45,
+  title        = {The Variant Call Format (VCF) Specification, Version 4.5},
+  author       = {{HTS-specs team}},
+  organization = {Global Alliance for Genomics and Health},
+  year         = {2024},
+  url          = {https://samtools.github.io/hts-specs/VCFv4.5.pdf},
+  note         = {Accessed: 2025-02-11}
+}
+    </citation>
+    </citations>    
+</tool>