Skip to content

Commit 431830f

Browse files
authored
Merge pull request #7576 from mvdbeek/ngsderive
Implement ``ngsderive strandedness`` command
2 parents adc61f2 + d644c4c commit 431830f

File tree

7 files changed

+186
-0
lines changed

7 files changed

+186
-0
lines changed

tools/ngsderive/.shed.yml

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
---
2+
auto_tool_repositories:
3+
name_template: "{{ tool_id }}"
4+
description_template: "{{ tool_name }} from the ngsderive suite"
5+
categories:
6+
- Sequence Analysis
7+
- RNA
8+
- Transcriptomics
9+
description: Forensic analysis tool for inferring properties from NGS data
10+
homepage_url: https://github.com/stjudecloud/ngsderive
11+
long_description: |
12+
ngsderive is a forensic analysis tool useful for backwards computing
13+
information from next-generation sequencing data. It includes subcommands
14+
for inferring strandedness, read length, encoding, and other properties
15+
from BAM files.
16+
name: ngsderive
17+
owner: iuc
18+
remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/main/tools/ngsderive
19+
suite:
20+
name: suite_ngsderive
21+
description: >
22+
ngsderive is a forensic analysis tool for inferring properties from NGS data
23+
long_description: >
24+
ngsderive is a forensic analysis tool useful for backwards computing
25+
information from next-generation sequencing data. It includes subcommands
26+
for inferring strandedness, read length, encoding, and other properties
27+
from BAM files.
28+
type: unrestricted
Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
<tool id="ngsderive_strandedness" name="ngsderive strandedness" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="23.0">
2+
<description>infers strandedness from RNA-seq BAM files</description>
3+
<!-- Version tokens: @TOOL_VERSION@ is used in the tool's version attribute and as the ngsderive package version; @VERSION_SUFFIX@ is used only in the tool's version attribute. -->
<macros>
4+
<token name="@TOOL_VERSION@">4.0.0</token>
5+
<token name="@VERSION_SUFFIX@">0</token>
6+
</macros>
7+
<!-- Single package dependency: ngsderive, pinned to the version held in the @TOOL_VERSION@ token. -->
<requirements>
8+
<requirement type="package" version="@TOOL_VERSION@">ngsderive</requirement>
9+
</requirements>
10+
<!--
  Command template. Symlinks the BAM, its index (taken from the dataset's bam_index metadata)
  and the annotation into the working directory under fixed names; the annotation link keeps
  the dataset extension as its suffix. Then runs "ngsderive strandedness" on input.bam with the
  -g, -n, -m and -q values from the inputs. $split_by_rg expands to the boolean param's
  truevalue or to an empty string. Standard output is redirected into the output dataset, and
  failures are detected from the exit code.
-->
<command detect_errors="exit_code"><![CDATA[
11+
ln -s '${alignment_input}' input.bam &&
12+
ln -s '${alignment_input.metadata.bam_index}' input.bam.bai &&
13+
ln -s '${gtf_input}' annotation.${gtf_input.ext} &&
14+
15+
ngsderive strandedness
16+
input.bam
17+
-g annotation.${gtf_input.ext}
18+
-n $n_genes
19+
-m $min_reads_per_gene
20+
-q $mapq
21+
$split_by_rg
22+
> '${output}'
23+
]]></command>
24+
<inputs>
25+
<!-- BAM dataset; linked as input.bam (with its index) by the command template. -->
<param name="alignment_input"
       type="data"
       format="bam"
       label="Input alignment file"
       help="Aligned paired-end RNA-seq reads in BAM format."/>
26+
<!-- Gene model, plain or gzipped GTF; linked as annotation.<ext> and passed to ngsderive with -g. -->
<param name="gtf_input"
       type="data"
       format="gtf,gtf.gz"
       label="Gene annotation file (GTF)"
       help="Gene model in GTF format. The file will be automatically sorted and indexed if necessary."/>
27+
<!-- Integer >= 1, default 1000; passed on the command line as -n. -->
<param argument="-n"
       name="n_genes"
       type="integer"
       value="1000"
       min="1"
       label="Number of genes to sample"
       help="Number of random genes to sample for strandedness inference."/>
28+
<!-- Integer >= 1, default 10; passed on the command line as -m. -->
<param argument="-m"
       name="min_reads_per_gene"
       type="integer"
       value="10"
       min="1"
       label="Minimum reads per gene"
       help="Minimum number of reads per gene required for inclusion in the analysis."/>
29+
<!-- Integer >= 0, default 30; passed on the command line as -q. -->
<param argument="-q"
       name="mapq"
       type="integer"
       value="30"
       min="0"
       label="Minimum mapping quality (MAPQ)"
       help="Minimum MAPQ score for a read to be considered."/>
30+
<!-- Boolean flag, off by default; referenced in the command as $split_by_rg, which yields the flag text when checked and an empty string otherwise. -->
<param argument="--split-by-rg"
       type="boolean"
       truevalue="--split-by-rg"
       falsevalue=""
       checked="false"
       label="Split results by read group"
       help="Output one entry per read group in addition to an overall entry."/>
31+
</inputs>
32+
<!-- One tabular dataset holding the command's standard output; the metadata action sets the six column names. -->
<outputs>
33+
<data name="output" format="tabular" label="${tool.name} on ${on_string}">
34+
<actions>
35+
<action name="column_names" type="metadata" default="File,ReadGroup,TotalReads,ForwardPct,ReversePct,Predicted"/>
36+
</actions>
37+
</data>
38+
</outputs>
39+
<tests>
40+
<!-- Forward-stranded library with default parameters: expect a Stranded-Forward call -->
41+
<test expect_num_outputs="1">
42+
<param name="alignment_input" value="forward_stranded.bam"/>
43+
<param name="gtf_input" value="strandedness_test.gtf"/>
44+
<output name="output">
45+
<assert_contents>
46+
<has_n_columns n="6"/>
47+
<has_text text="Stranded-Forward"/>
48+
</assert_contents>
49+
</output>
50+
</test>
51+
<!-- Reverse-stranded library with default parameters: expect a Stranded-Reverse call -->
52+
<test expect_num_outputs="1">
53+
<param name="alignment_input" value="reverse_stranded.bam"/>
54+
<param name="gtf_input" value="strandedness_test.gtf"/>
55+
<output name="output">
56+
<assert_contents>
57+
<has_n_columns n="6"/>
58+
<has_text text="Stranded-Reverse"/>
59+
</assert_contents>
60+
</output>
61+
</test>
62+
<!-- Unstranded library with default parameters: expect an Unstranded call -->
63+
<test expect_num_outputs="1">
64+
<param name="alignment_input" value="unstranded.bam"/>
65+
<param name="gtf_input" value="strandedness_test.gtf"/>
66+
<output name="output">
67+
<assert_contents>
68+
<has_n_columns n="6"/>
69+
<has_text text="Unstranded"/>
70+
</assert_contents>
71+
</output>
72+
</test>
73+
<!-- Gzipped GTF annotation (ftype gtf.gz) with the reverse-stranded BAM: expect the same Stranded-Reverse call as with the plain GTF -->
74+
<test expect_num_outputs="1">
75+
<param name="alignment_input" value="reverse_stranded.bam"/>
76+
<param name="gtf_input" value="strandedness_test.gtf.gz" ftype="gtf.gz"/>
77+
<output name="output">
78+
<assert_contents>
79+
<has_n_columns n="6"/>
80+
<has_text text="Stranded-Reverse"/>
81+
</assert_contents>
82+
</output>
83+
</test>
84+
</tests>
85+
<help><![CDATA[
86+
**What it does**
87+
88+
ngsderive strandedness infers the strandedness protocol used to generate RNA-seq data by
89+
analyzing read alignments against a gene model. It can determine whether your data was
90+
generated using a Stranded-Forward, Stranded-Reverse, or Unstranded protocol.
91+
92+
This tool is useful when you have RNA-seq data but are unsure about the library preparation
93+
protocol used. Knowing the correct strandedness is essential for accurate gene expression
94+
quantification.
95+
96+
**How it works**
97+
98+
The tool randomly samples genes from the provided gene model and examines how reads align
99+
to those genes. Based on the proportion of reads mapping in the forward vs reverse orientation,
100+
it classifies the library as:
101+
102+
- **Unstranded**: ~40-60% forward reads
103+
- **Stranded-Forward**: ≥80% forward reads
104+
- **Stranded-Reverse**: ≥80% reverse reads
105+
- **Inconclusive**: The forward/reverse split falls between the ranges above (roughly 20-40% or 60-80% forward reads), so no strandedness type can be called
106+
107+
**Inputs**
108+
109+
- **Alignment file**: Paired-end RNA-seq alignments in BAM format
110+
- **Gene annotation**: GTF file with gene models (gzipped GTF supported)
111+
112+
**Output**
113+
114+
A tabular file with the following columns:
115+
116+
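- **File**: Name of the input BAM file (in Galaxy this is always ``input.bam``, the name the dataset is linked under before ngsderive runs)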
117+
- **ReadGroup**: Read group identifier (or "overall" for combined results)
118+
- **TotalReads**: Number of reads used in the analysis
119+
- **ForwardPct**: Percentage of reads supporting forward strandedness
120+
- **ReversePct**: Percentage of reads supporting reverse strandedness
121+
- **Predicted**: The inferred strandedness (Stranded-Forward, Stranded-Reverse, Unstranded, or Inconclusive)
122+
123+
**Notes**
124+
125+
- Only paired-end reads are currently supported
126+
- For best results, ensure your BAM file has sufficient read depth: genes with fewer reads than the "Minimum reads per gene" setting are excluded from the analysis
127+
128+
For more information, see the `ngsderive documentation <https://stjudecloud.github.io/ngsderive/subcommands/strandedness/>`_.
129+
]]></help>
130+
<citations>
131+
<citation type="bibtex">
132+
@software{ngsderive,
133+
author = {{St. Jude Cloud Team}},
134+
title = {ngsderive: Forensic analysis tool for NGS data},
135+
url = {https://github.com/stjudecloud/ngsderive},
136+
year = {2020}
137+
}
138+
</citation>
139+
</citations>
140+
</tool>
13.3 KB
Binary file not shown.
13.2 KB
Binary file not shown.
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
chr1 test gene 1000 2000 . + . gene_id "GENE_PLUS_1"; gene_name "GENE_PLUS_1";
2+
chr1 test transcript 1000 2000 . + . gene_id "GENE_PLUS_1"; transcript_id "GENE_PLUS_1.1";
3+
chr1 test exon 1000 2000 . + . gene_id "GENE_PLUS_1"; transcript_id "GENE_PLUS_1.1"; exon_number "1";
4+
chr1 test gene 3000 4000 . - . gene_id "GENE_MINUS_1"; gene_name "GENE_MINUS_1";
5+
chr1 test transcript 3000 4000 . - . gene_id "GENE_MINUS_1"; transcript_id "GENE_MINUS_1.1";
6+
chr1 test exon 3000 4000 . - . gene_id "GENE_MINUS_1"; transcript_id "GENE_MINUS_1.1"; exon_number "1";
7+
chr1 test gene 5000 6000 . + . gene_id "GENE_PLUS_2"; gene_name "GENE_PLUS_2";
8+
chr1 test transcript 5000 6000 . + . gene_id "GENE_PLUS_2"; transcript_id "GENE_PLUS_2.1";
9+
chr1 test exon 5000 6000 . + . gene_id "GENE_PLUS_2"; transcript_id "GENE_PLUS_2.1"; exon_number "1";
10+
chr1 test gene 7000 8000 . - . gene_id "GENE_MINUS_2"; gene_name "GENE_MINUS_2";
11+
chr1 test transcript 7000 8000 . - . gene_id "GENE_MINUS_2"; transcript_id "GENE_MINUS_2.1";
12+
chr1 test exon 7000 8000 . - . gene_id "GENE_MINUS_2"; transcript_id "GENE_MINUS_2.1"; exon_number "1";
13+
chr1 test gene 9000 10000 . + . gene_id "GENE_PLUS_3"; gene_name "GENE_PLUS_3";
14+
chr1 test transcript 9000 10000 . + . gene_id "GENE_PLUS_3"; transcript_id "GENE_PLUS_3.1";
15+
chr1 test exon 9000 10000 . + . gene_id "GENE_PLUS_3"; transcript_id "GENE_PLUS_3.1"; exon_number "1";
16+
chr1 test gene 11000 12000 . - . gene_id "GENE_MINUS_3"; gene_name "GENE_MINUS_3";
17+
chr1 test transcript 11000 12000 . - . gene_id "GENE_MINUS_3"; transcript_id "GENE_MINUS_3.1";
18+
chr1 test exon 11000 12000 . - . gene_id "GENE_MINUS_3"; transcript_id "GENE_MINUS_3.1"; exon_number "1";
282 Bytes
Binary file not shown.
13.6 KB
Binary file not shown.

0 commit comments

Comments
 (0)