bgruening · neo417 · Jun 15, 2025 · Jun 15, 2025 · Jun 16, 2025
diff --git a/tools/fastspar/.shed.yml b/tools/fastspar/.shed.yml
@@ -0,0 +1,9 @@
+name: fastspar
+owner: bgruening
+homepage_url: https://github.com/scwatts/fastspar/
+remote_repository_url: https://github.com/bgruening/galaxytools/tree/master/tools/fastspar
+description: "Tool for rapid and scalable correlation estimation for compositional data."
+long_description: "FastSpar is a C++ implementation of the SparCC algorithm for the inference of interaction networks from sparse and compositional data. It rapidly infers correlation networks and calculates P-values using an unbiased estimator."
+categories:
+  - Metagenomics
+  - Statistics
diff --git a/tools/fastspar/fastspar.xml b/tools/fastspar/fastspar.xml
@@ -0,0 +1,107 @@
+<tool id="fastspar" name="FastSpar" version="@TOOL_VERSION@+@VERSION_SUFFIX@" profile="24.2">
+    <description>
+        correlation estimation for compositional data
+    </description>
+    <macros> <!-- macros.xml supplies the version tokens and the macros expanded in this wrapper -->
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="biotools"/> <!-- bio.tools cross-reference defined in macros.xml -->
+    <expand macro="requirements"/> <!-- fastspar package requirement defined in macros.xml -->
+    <command detect_errors="exit_code"><![CDATA[
+        fastspar
+            --otu_table '${otu_table}'
+            --iterations ${iterations}
+            --exclude_iterations ${exclude_iterations}
+            --threshold ${threshold}
+            --seed ${seed}
+            --correlation '${correlation}'
+            --covariance '${covariance}'
+            --threads \${GALAXY_SLOTS:-1}
+            --yes
+    ]]></command> <!-- single fastspar call; both matrices are written straight to the Galaxy output datasets and the thread count comes from GALAXY_SLOTS (default 1) -->
+    <inputs>
+        <!-- TODO: confirm whether Galaxy offers a dedicated OTU-table datatype; plain tabular is accepted for now. Numeric parameters carry lower bounds so zero or negative values are rejected in the form. -->
+        <param argument="--otu_table" type="data" format="tabular" label="Input OTU table"
+               help="The table must contain absolute OTU counts in plain tabular (TSV) format, with OTUs as rows and samples as columns. Do not include any metadata rows or columns."/>
+        <param argument="--iterations" type="integer" value="50" min="1" max="1000" label="Number of iterations"
+               help="Rounds of SparCC correlation estimation."/>
+        <param argument="--exclude_iterations" type="integer" value="10" min="0" max="100" label="Number of exclusion iterations"
+               help="The number of times highly correlated OTU pairs are excluded."/>
+        <param argument="--threshold" type="float" value="0.1" min="0.0" max="1.0" label="Exclusion threshold"
+               help="Correlation strength above which to exclude OTU pairs."/>
+        <param argument="--seed" type="integer" value="1" min="0" label="Random number seed"/>
+    </inputs>
+    <outputs> <!-- the command passes both dataset paths to fastspar, which writes the matrices directly -->
+        <data format="tabular" name="correlation" label="${tool.name} on ${on_string}: median_correlation.tsv"/>
+        <data format="tabular" name="covariance" label="${tool.name} on ${on_string}: median_covariance.tsv"/>
+    </outputs>
+    <tests>
+        <test expect_num_outputs="2"> <!-- default parameters; both matrices are diffed against stored reference files -->
+            <param name="otu_table" value="fake_data.tsv" ftype="tabular"/>
+            <output name="correlation" compare="diff" file="fake_data_cor.tsv"/>
+            <output name="covariance" compare="diff" file="fake_data_cov.tsv"/>
+        </test>
+        <test expect_num_outputs="2"> <!-- non-default exclusion settings; outputs are checked by content assertions only -->
+            <param name="otu_table" value="fake_data.tsv" ftype="tabular"/>
+            <param value="20" name="exclude_iterations"/>
+            <param value="0.2" name="threshold"/>
+            <output ftype="tabular" name="correlation">
+                <assert_contents>
+                    <has_n_columns n="51"/>
+                    <has_text text="1.0000"/>
+                </assert_contents>
+            </output>
+            <output ftype="tabular" name="covariance">
+                <assert_contents>
+                    <has_n_columns n="51"/>
+                    <has_text text="OTU ID"/>
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help><![CDATA[
+What it does
+============
+
+FastSpar is a C++ implementation of the SparCC algorithm for estimating correlations from compositional data.
+This tool performs the **initial correlation and covariance matrix estimation** as the first step in the FastSpar pipeline.
+**If you also want to estimate p-values**, use `fastspar_pvalues` with the "Recalculate the correlation matrix" option, which computes the correlation matrix and the p-values in a single run.
+
+Required Inputs
+===============
+
+- **OTU table** (TSV format): Contains absolute OTU counts (not relative abundances). Must be a plain tabular file with samples in columns and OTUs in rows. Metadata is not supported.
+
+Main Parameters
+===============
+
+- **Iterations** (`--iterations`): Number of correlation estimation rounds. More iterations improve stability.
+- **Exclude iterations** (`--exclude_iterations`): Number of times highly correlated OTU pairs are removed.
+- **Correlation threshold** (`--threshold`): Correlation strength above which to exclude OTU pairs.
+- **Seed** (`--seed`): Random seed for reproducibility.
+
+Main Features
+=============
+
+- Efficient and fast computation of sparse correlations.
+- Customizable exclusion and thresholding strategy.
+- Designed to handle compositional count data from microbiome studies.
+
+Generated Outputs
+=================
+
+- `median_correlation.tsv`: Correlation matrix between all OTUs.
+- `median_covariance.tsv`: Covariance matrix between all OTUs.
+
+Additional Resources
+====================
+
+- FastSpar GitHub: https://github.com/scwatts/fastspar
+
+For a complete FastSpar analysis, follow up with:
+
+1. `fastspar_pvalues`: Estimate empirical p-values from bootstrap correlations.
+2. `fastspar_reduce`: Filter correlation and p-value matrices to produce sparse networks.
+    ]]></help>
+    <expand macro="citations"/>
+</tool>
diff --git a/tools/fastspar/fastspar_pvalues.xml b/tools/fastspar/fastspar_pvalues.xml
@@ -0,0 +1,158 @@
+<tool id="fastspar_pvalues" name="FastSpar: estimate p-values" version="@TOOL_VERSION@+@VERSION_SUFFIX@" profile="24.2">
+    <description>
+        Bootstrap-based estimation of p-values from FastSpar correlations
+    </description>
+    <macros> <!-- macros.xml supplies the version tokens and the macros expanded in this wrapper -->
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="biotools"/> <!-- bio.tools cross-reference defined in macros.xml -->
+    <expand macro="requirements_pvalues"/> <!-- fastspar plus the parallel package, which the command uses to process the bootstrap replicates -->
+    <command detect_errors="exit_code"><![CDATA[
+        #if $correlation.select == "new"
+            fastspar
+                --otu_table '$otu_table'
+                --iterations $iterations
+                --exclude_iterations $exclude_iterations
+                --threshold $threshold
+                --seed $seed
+                --correlation '$output_correlation'
+                --covariance '$output_covariance'
+                --threads \${GALAXY_SLOTS:-1}
+                --yes &&
+            #set $correlation_file = $output_correlation
+        #else
+            #set $correlation_file = $correlation.input_file
+        #end if
+        mkdir bootstrap_counts
+     && fastspar_bootstrap
+            --otu_table '$otu_table'
+            --number $number
+            --prefix bootstrap_counts/data
+            --seed $seed
+            --threads \${GALAXY_SLOTS:-1}
+
+     && mkdir bootstrap_correlation
+     && parallel
+            --max-procs \${GALAXY_SLOTS:-1}
+            fastspar
+                --otu_table {}
+                --correlation bootstrap_correlation/cor_{/}
+                --covariance bootstrap_correlation/cov_{/}
+                --iterations $iterations
+                --exclude_iterations $exclude_iterations
+                --threshold $threshold
+                --seed $seed
+                --yes
+            ::: bootstrap_counts/*
+
+     && fastspar_pvalues
+            --otu_table '$otu_table'
+            --correlation '$correlation_file'
+            --prefix bootstrap_correlation/cor_data_
+            --permutations $number
+            $pseudo
+            --threads \${GALAXY_SLOTS:-1}
+            --outfile '$pvalues'
+    ]]></command> <!-- steps: optional correlation run, bootstrap resampling, one fastspar run per replicate via parallel, then p-value estimation; every fastspar call passes the yes flag so no replicate can wait on an interactive prompt -->
+    <inputs>
+        <param argument="--otu_table" type="data" format="tabular" label="Input OTU table" help="The table must contain absolute OTU counts in plain tabular (TSV) format, with OTUs as rows and samples as columns. Do not include any metadata rows or columns."/>
+        <conditional name="correlation">
+            <param name="select" type="select" label="Tested correlation matrix"
+                   help="For meaningful p-values, the parameters used during bootstrapped correlation estimation should be identical to those used for the FastSpar run which produced the correlation matrix. &lt;br&gt;For your convenience you can choose to calculate the correlation matrix here. In that case the seed used for the calculation is the same one used for generating the bootstrapped samples.">
+                <option value="new">Recalculate the correlation matrix</option>
+                <option value="original">Use an existing correlation matrix</option>
+            </param>
+            <when value="new"/>
+            <when value="original">
+                <param name="input_file" type="data" format="tabular" argument="--correlation" label="Correlation table" help="The correlation matrix generated by the original FastSpar analysis."/>
+            </when>
+        </conditional>
+        <param argument="--number" type="integer" value="1000" min="10" max="10000" label="Number of bootstrap samples" help="Recommended minimum: 1000 bootstrap samples for robust estimation."/>
+        <param argument="--iterations" type="integer" value="50" min="1" max="1000" label="Number of iterations" help="Must match the value used in the original FastSpar run."/>
+        <param argument="--exclude_iterations" type="integer" value="10" min="0" max="100" label="Number of exclusion iterations" help="Must match the value used in the original FastSpar run."/>
+        <param argument="--threshold" type="float" value="0.1" min="0.0" max="1.0" label="Exclusion threshold" help="Must match the value used in the original FastSpar run."/>
+        <param argument="--seed" type="integer" value="1" min="0" label="Random number seed" help="Used to ensure reproducibility of bootstrapped samples."/>
+        <param argument="--pseudo" type="boolean" truevalue="--pseudo" falsevalue="" label="Use pseudo p-values" help="If selected, pseudo p-values are calculated instead of exact p-values. This can provide faster estimates but may be less precise."/>
+    </inputs> <!-- numeric parameters carry lower bounds so zero or negative values are rejected in the form; the seed text now sits in help with a short label -->
+    <outputs> <!-- the two matrix datasets exist only when the correlation matrix is recalculated in this job -->
+        <data format="tabular" name="output_correlation" label="${tool.name} on ${on_string}: median_correlation.tsv">
+            <filter>correlation['select'] == "new"</filter>
+        </data>
+        <data format="tabular" name="output_covariance" label="${tool.name} on ${on_string}: median_covariance.tsv">
+            <filter>correlation['select'] == "new"</filter>
+        </data>
+        <data format="tabular" name="pvalues" label="${tool.name} on ${on_string}: pvalues.tsv"/>
+    </outputs>
+    <tests>
+        <test expect_num_outputs="1"> <!-- existing correlation matrix supplied; only the p-value table is produced -->
+            <param name="otu_table" value="fake_data.tsv" ftype="tabular"/>
+            <conditional name="correlation">
+                <param name="select" value="original"/>
+                <param name="input_file" value="fake_data_cor.tsv" ftype="tabular"/>
+            </conditional>
+            <param value="10" name="number"/>
+            <output name="pvalues" compare="diff" file="fake_pvalues.tsv"/>
+        </test>
+        <test expect_num_outputs="3"> <!-- correlation matrix recalculated; all three outputs are diffed against reference files -->
+            <param name="otu_table" value="fake_data.tsv" ftype="tabular"/>
+            <conditional name="correlation">
+                <param name="select" value="new"/>
+            </conditional>
+            <param value="10" name="number"/>
+            <output name="output_correlation" compare="diff" file="fake_data_cor.tsv"/>
+            <output name="output_covariance" compare="diff" file="fake_data_cov.tsv"/>
+            <output name="pvalues" compare="diff" file="fake_pvalues.tsv"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+What it does
+============
+
+This tool estimates **empirical p-values** for correlation values generated by FastSpar. It uses a **bootstrap-based permutation approach** to assess the statistical significance of observed correlations.
+
+You can choose to recalculate the correlation matrix with the same parameters or use an existing correlation matrix.
+
+How it works
+============
+
+1. Generates multiple bootstrapped versions of the OTU table.
+2. Runs FastSpar on each bootstrap replicate.
+3. Compares bootstrapped correlations to the original correlation matrix to calculate empirical p-values.
+
+Required Inputs
+===============
+
+- **OTU table**: TSV file with absolute counts (no metadata).
+- **Correlation table** (optional): Output from the original FastSpar run.
+- **Bootstrap samples**: Number of bootstrap replicates (≥1000 recommended).
+
+Important Parameters
+====================
+
+- **Iterations**: Must match the number used in the original FastSpar run.
+- **Exclude Iterations** and **Threshold**: Should also match the original settings, if used.
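+- **Seed**: Random seed passed to the bootstrap resampling and to every FastSpar correlation run in this job; keep it fixed for reproducible results.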
+- **Pseudo**: Choose whether to calculate pseudo p-values instead of exact values.
+
+IMPORTANT
+=========
+
+For meaningful p-values, the parameters used during bootstrapped correlation estimation (**iterations, exclude iterations, threshold**) should be identical to those used in the original FastSpar run.
+
+Output
+======
+
+- `pvalues.tsv`: A table of empirical p-values for all pairwise correlations.
+
+When "Recalculate the correlation matrix" is selected the tool will also output:
+
+- `median_correlation.tsv`: Correlation matrix between all OTUs.
+- `median_covariance.tsv`: Covariance matrix between all OTUs.
+
+Additional Resources
+====================
+
+- FastSpar GitHub: https://github.com/scwatts/fastspar
+    ]]></help>
+    <expand macro="citations"/>
+</tool>
diff --git a/tools/fastspar/fastspar_reduce.xml b/tools/fastspar/fastspar_reduce.xml
@@ -0,0 +1,76 @@
+<tool id="fastspar_reduce" name="FastSpar: Reduce correlation table" version="@TOOL_VERSION@+@VERSION_SUFFIX@" profile="24.2">
+    <description>
+         Filter correlation and p-value table into sparse matrices
+    </description>
+    <macros> <!-- macros.xml supplies the version tokens and the macros expanded in this wrapper -->
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="biotools"/> <!-- bio.tools cross-reference defined in macros.xml -->
+    <expand macro="requirements"/> <!-- fastspar package requirement defined in macros.xml -->
+    <command detect_errors="exit_code"><![CDATA[
+        fastspar_reduce
+            --correlation_table '${correlation_table}'
+            --pvalue_table '${pvalue_table}'
+            --correlation ${correlation}
+            --pvalue ${pvalue}
+            --output_prefix sparse
+         && cp sparse_filtered_correlation.tsv '${correlations}'
+         && cp sparse_filtered_pvalue.tsv '${pvalues}'
+    ]]></command> <!-- fastspar_reduce writes files named from the output prefix; they are then copied onto the Galaxy output datasets -->
+    <inputs> <!-- both thresholds are bounded to [0, 1], the range of an absolute correlation and of a p-value -->
+        <param argument="--correlation_table" type="data" format="tabular" label="Correlation table"/>
+        <param argument="--pvalue_table" type="data" format="tabular" label="P-value table"/>
+        <param argument="--correlation" type="float" value="0.1" min="0.0" max="1.0" label="Absolute correlation threshold" help="Only OTU pairs whose absolute correlation exceeds this value are retained."/>
+        <param argument="--pvalue" type="float" value="0.05" min="0.0" max="1.0" label="P-value threshold" help="Only OTU pairs whose p-value is below this cutoff are retained."/>
+    </inputs>
+    <outputs> <!-- filled by the cp steps at the end of the command -->
+        <data format="tabular" name="correlations" label="${tool.name} on ${on_string}: filtered_correlations.tsv"/>
+        <data format="tabular" name="pvalues" label="${tool.name} on ${on_string}: filtered_pvalues.tsv"/>
+    </outputs>
+    <tests>
+        <test expect_num_outputs="2"> <!-- default thresholds; both filtered tables are diffed against reference files -->
+            <param name="correlation_table" value="fake_data_cor.tsv" ftype="tabular"/>
+            <param name="pvalue_table" value="pvalues.tsv" ftype="tabular"/> <!-- NOTE(review): the fastspar_pvalues tests use fake_pvalues.tsv; confirm pvalues.tsv exists in test-data -->
+            <output name="correlations" compare="diff" file="filtered_correlations.tsv"/>
+            <output name="pvalues" compare="diff" file="filtered_pvalues.tsv"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+What it does
+============
+
+This tool filters pairwise correlations and p-values from FastSpar outputs to generate sparse matrices suitable for network construction or visualization. It is typically used as the final step in a FastSpar pipeline.
+
+Filtering Criteria
+==================
+
+- **Absolute correlation threshold**: Only retain OTU pairs whose absolute correlation exceeds this value.
+- **P-value threshold**: Only retain OTU pairs whose empirical p-value is below this cutoff.
+
+Both conditions must be satisfied (logical AND).
+
+Required Inputs
+===============
+
+- **Correlation table**: A symmetric matrix from FastSpar.
+- **P-value table**: A matching symmetric matrix from FastSpar p-value estimation.
+
+Generated Outputs
+=================
+
+- `filtered_correlations.tsv`: Correlation values that passed both thresholds.
+- `filtered_pvalues.tsv`: Matching p-values for retained entries.
+
+Notes
+=====
+
+- Both input matrices must have identical dimensions and OTU order.
+- The output tables are still symmetric and retain all diagonal values (e.g., self-correlations).
+
+Additional Resources
+====================
+
+- FastSpar GitHub: https://github.com/scwatts/fastspar
+    ]]></help>
+    <expand macro="citations"/>
+</tool>
diff --git a/tools/fastspar/macros.xml b/tools/fastspar/macros.xml
@@ -0,0 +1,26 @@
+<macros>
+    <token name="@TOOL_VERSION@">1.0.0</token> <!-- also pins the fastspar package version in the requirements below -->
+    <token name="@VERSION_SUFFIX@">galaxy0</token> <!-- appended after a plus sign in each tool's version attribute -->
+    <xml name="requirements"> <!-- expanded by fastspar.xml and fastspar_reduce.xml -->
+        <requirements>
+            <requirement version="@TOOL_VERSION@" type="package">fastspar</requirement>
+        </requirements>
+    </xml>
+    <xml name="requirements_pvalues"> <!-- expanded by fastspar_pvalues.xml, whose command also calls parallel -->
+        <requirements>
+            <requirement version="@TOOL_VERSION@" type="package">fastspar</requirement>
+            <requirement version="20250422" type="package">parallel</requirement>
+        </requirements>
+    </xml>
+    <xml name="biotools">
+        <xrefs>
+            <xref type="bio.tools">FastSpar</xref> <!-- NOTE(review): confirm this matches the registered bio.tools ID exactly, including letter case -->
+        </xrefs>
+    </xml>
+    <xml name="citations"> <!-- shared by all three tool wrappers -->
+        <citations>
+            <citation type="doi">10.1093/bioinformatics/bty734</citation>
+            <citation type="doi">10.1371/journal.pcbi.1002687</citation>
+        </citations>
+    </xml>
+</macros>