<!-- Source: bgruening-galaxytools/tools/fastspar/fastspar_pvalues.xml at f5c7a84ce1758b1762aa8d9736e31f94b43f6da8 (neo417/bgruening-galaxytools on GitHub) -->
<tool id="fastspar_pvalues" name="FastSpar: estimate p-values" version="@TOOL_VERSION@+@VERSION_SUFFIX@" profile="24.2">
    <!-- One-line summary displayed next to the tool name; kept on a single line so no indentation whitespace ends up in the text. -->
    <description>Bootstrap-based estimation of p-values from FastSpar correlations</description>
    <!-- Shared definitions are imported from macros.xml, which is a separate file. -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <!-- NOTE(review): the "biotools" and "requirements_pvalues" macros are not defined in this file; presumably they come from macros.xml. Confirm there. -->
    <expand macro="biotools"/>
    <expand macro="requirements_pvalues"/>
    <command detect_errors="exit_code"><![CDATA[
        ## Optional first step: compute the correlation matrix that will be tested.
        ## When an existing matrix is supplied, only the Cheetah variable is set.
        #if $correlation.select == "new"
            fastspar
                --otu_table '$otu_table'
                --iterations $iterations
                --exclude_iterations $exclude_iterations
                --threshold $threshold
                --seed $seed
                --correlation '$output_correlation'
                --covariance '$output_covariance'
                --threads \${GALAXY_SLOTS:-1}
                --yes &&
            #set $correlation_file = $output_correlation
        #else
            #set $correlation_file = $correlation.input_file
        #end if

        ## Step 1: write the bootstrapped count tables to bootstrap_counts/data_*.
        mkdir bootstrap_counts
     && fastspar_bootstrap
            --otu_table '$otu_table'
            --number $number
            --prefix bootstrap_counts/data
            --seed $seed
            --threads \${GALAXY_SLOTS:-1}

        ## Step 2: run fastspar once per bootstrapped table, one job per slot.
        ## {} is the table path and {/} its basename, so the correlation files
        ## are named bootstrap_correlation/cor_data_*, matching the prefix
        ## given to fastspar_pvalues below.
        ## --yes is passed here as in the first fastspar call: the workers are
        ## non-interactive, so a confirmation prompt could never be answered.
     && mkdir bootstrap_correlation
     && parallel
            --max-procs \${GALAXY_SLOTS:-1}
            fastspar
                --otu_table {}
                --correlation bootstrap_correlation/cor_{/}
                --covariance bootstrap_correlation/cov_{/}
                --iterations $iterations
                --exclude_iterations $exclude_iterations
                --threshold $threshold
                --seed $seed
                --yes
            ::: bootstrap_counts/*

        ## Step 3: derive the p-values from the tested correlation matrix and
        ## the bootstrap correlations.
     && fastspar_pvalues
            --otu_table '$otu_table'
            --correlation '$correlation_file'
            --prefix bootstrap_correlation/cor_data_
            --permutations $number
            $pseudo
            --threads \${GALAXY_SLOTS:-1}
            --outfile '$pvalues'
    ]]></command>
    <inputs>
        <!-- Count table used by every step of the command. -->
        <param argument="--otu_table" type="data" format="tabular" label="Input OTU table" help="The table must contain absolute OTU counts in plain tabular (TSV) format, with OTUs as rows and samples as columns. Do not include any metadata rows or columns."/>
        <!-- Either recalculate the tested correlation matrix in this job ("new") or take it from an earlier run ("original"). -->
        <conditional name="correlation">
            <param name="select" type="select" label="Tested correlation matrix"
                   help="For meaningful p-values, the parameters used during bootstrapped correlation estimation should be identical to those used for the FastSpar run which produced the correlation matrix. &lt;br&gt;For your convenience you can choose to calculate the correlation matrix here. In that case the seed used for the calculation is the same one used for generating the bootstrapped samples.">
                <option value="new">Recalculate the correlation matrix</option>
                <option value="original">Use an existing correlation matrix</option>
            </param>
            <when value="new"/>
            <when value="original">
                <param name="input_file" type="data" format="tabular" argument="--correlation" label="Correlation table" help="The correlation matrix generated by the original FastSpar analysis."/>
            </when>
        </conditional>
        <param argument="--number" type="integer" value="1000" min="10" max="10000" label="Number of bootstrap samples" help="Recommended minimum: 1000 bootstrap samples for robust estimation."/>
        <!-- Lower bounds keep negative values from reaching the command line. -->
        <param argument="--iterations" type="integer" value="50" min="1" max="1000" label="Number of iterations" help="Must match the value used in the original FastSpar run."/>
        <param argument="--exclude_iterations" type="integer" value="10" min="0" max="100" label="Number of exclusion iterations" help="Must match the value used in the original FastSpar run."/>
        <param argument="--threshold" type="float" value="0.1" min="0.0" max="1.0" label="Exclusion threshold" help="Must match the value used in the original FastSpar run."/>
        <!-- The same seed is handed to the bootstrap sampling and to every FastSpar run in the command. -->
        <param argument="--seed" type="integer" value="1" label="Random seed" help="Used to ensure reproducibility of bootstrapped samples. The same seed is also passed to every FastSpar correlation run of this job."/>
        <param argument="--pseudo" type="boolean" truevalue="--pseudo" falsevalue="" label="Use pseudo p-values" help="If selected, pseudo p-values are calculated instead of exact p-values. This can provide faster estimates but may be less precise."/>
    </inputs>
    <outputs>
        <!-- Median correlation matrix; created only when the matrix is recalculated in this job. -->
        <data format="tabular"
              name="output_correlation"
              label="${tool.name} on ${on_string}: median_correlation.tsv">
            <filter>correlation['select'] == "new"</filter>
        </data>
        <!-- Median covariance matrix; subject to the same filter as the correlation matrix. -->
        <data format="tabular"
              name="output_covariance"
              label="${tool.name} on ${on_string}: median_covariance.tsv">
            <filter>correlation['select'] == "new"</filter>
        </data>
        <!-- P-value table; always produced. -->
        <data format="tabular"
              name="pvalues"
              label="${tool.name} on ${on_string}: pvalues.tsv"/>
    </outputs>
    <tests>
        <!-- Case 1: an existing correlation matrix is supplied, so p-values are the only output. -->
        <test expect_num_outputs="1">
            <param name="otu_table" value="fake_data.tsv" ftype="tabular"/>
            <conditional name="correlation">
                <param name="select" value="original"/>
                <param name="input_file" value="fake_data_cor.tsv" ftype="tabular"/>
            </conditional>
            <param name="number" value="10"/>
            <output name="pvalues" compare="diff" file="fake_pvalues.tsv"/>
        </test>
        <!-- Case 2: the correlation matrix is recalculated, adding the correlation and covariance outputs. -->
        <test expect_num_outputs="3">
            <param name="otu_table" value="fake_data.tsv" ftype="tabular"/>
            <conditional name="correlation">
                <param name="select" value="new"/>
            </conditional>
            <param name="number" value="10"/>
            <output name="output_correlation" compare="diff" file="fake_data_cor.tsv"/>
            <output name="output_covariance" compare="diff" file="fake_data_cov.tsv"/>
            <output name="pvalues" compare="diff" file="fake_pvalues.tsv"/>
        </test>
    </tests>
    <help><![CDATA[
What it does
============

This tool estimates **empirical p-values** for correlation values generated by FastSpar. It uses a **bootstrap-based permutation approach** to assess the statistical significance of observed correlations.

You can choose to recalculate the correlation matrix with the same parameters or use an existing correlation matrix.

How it works
============

1. Generates multiple bootstrapped versions of the OTU table.
2. Runs FastSpar on each bootstrap replicate.
3. Compares bootstrapped correlations to the original correlation matrix to calculate empirical p-values.

Inputs
======

- **OTU table**: TSV file with absolute counts (no metadata).
- **Correlation table**: Output from the original FastSpar run. Only needed when "Use an existing correlation matrix" is selected.
- **Bootstrap samples**: Number of bootstrap replicates (≥1000 recommended).

Important Parameters
====================

- **Iterations**: Must match the number used in the original FastSpar run.
- **Exclude Iterations** and **Threshold**: Should also match the original settings, if used.
- **Seed**: Seed for the random number generator (default: 1). Keep it fixed to obtain reproducible results.
- **Pseudo**: Choose whether to calculate pseudo p-values instead of exact values.

IMPORTANT
=========

For meaningful p-values, the parameters used during bootstrapped correlation estimation (**iterations, exclude iterations, threshold**) should be identical to those used in the original FastSpar run.

Output
======

- `pvalues.tsv`: A table of empirical p-values for all pairwise correlations.

When "Recalculate the correlation matrix" is selected the tool will also output:

- `median_correlation.tsv`: Correlation matrix between all OTUs.
- `median_covariance.tsv`: Covariance matrix between all OTUs.

Additional Resources
====================

- FastSpar GitHub: https://github.com/scwatts/fastspar
    ]]></help>
    <!-- NOTE(review): the "citations" macro is not defined in this file; presumably it comes from macros.xml. Confirm there. -->
    <expand macro="citations"/>
</tool>