Skip to content

Commit fa29f13

Browse files
ewelsclaude
andcommitted
test: add diff_star_transcriptome.sh — byte-for-byte parity script
Generates a deterministic 2-chr / 4-transcript / 4-gene synthetic test case, runs both STAR and ruSTAR genomeGenerate, then diffs each of the 9 files byte-for-byte. Current result on STAR 2.7.11b: 9/9 files identical * chrName.txt, chrLength.txt, chrStart.txt, chrNameLength.txt * transcriptInfo.tab, exonInfo.tab, geneInfo.tab, exonGeTrInfo.tab, sjdbList.fromGTF.out.tab Runnable via: RUSTAR_BIN=/path/to/target/release/ruSTAR \\ test/diff_star_transcriptome.sh /tmp/workdir Reports with ✓/✗ per file and exits non-zero if any file differs. Requires STAR on PATH (`brew install rna-star` or bioconda star). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 0535f9d commit fa29f13

1 file changed

Lines changed: 96 additions & 0 deletions

File tree

test/diff_star_transcriptome.sh

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
#!/usr/bin/env bash
2+
# Byte-for-byte diff of ruSTAR vs STAR for the transcriptome index files
3+
# written at genomeGenerate time.
4+
#
5+
# Usage: ./diff_star_transcriptome.sh [WORKDIR]
6+
#
7+
# Generates a synthetic 2-chr / 4-transcript / 4-gene test case, runs both
8+
# ruSTAR and STAR genomeGenerate, then diffs each output file byte-for-byte.
9+
#
10+
# Requires STAR on PATH (`brew install rna-star`, bioconda star, etc.).
11+
# ruSTAR must already be built — the script expects `target/debug/ruSTAR`
12+
# in the repo root (or $RUSTAR_BIN set to a compiled binary).
13+
14+
set -euo pipefail
15+
16+
WORKDIR="${1:-/tmp/rustar_diff}"
17+
RUSTAR_BIN="${RUSTAR_BIN:-$(pwd)/target/debug/ruSTAR}"
18+
19+
if [[ ! -x "$RUSTAR_BIN" ]]; then
20+
echo "ERROR: ruSTAR binary not found at $RUSTAR_BIN"
21+
echo "Build with: cargo build (or set RUSTAR_BIN=/path/to/ruSTAR)"
22+
exit 1
23+
fi
24+
25+
if ! command -v STAR >/dev/null 2>&1; then
26+
echo "ERROR: STAR not on PATH. Install via 'brew install rna-star' or bioconda."
27+
exit 1
28+
fi
29+
30+
mkdir -p "$WORKDIR"
31+
cd "$WORKDIR"
32+
33+
# Deterministic test case.
34+
python3 - <<'PYEOF'
35+
BASES = "ACGT"
36+
def lcg(seed, length):
37+
state = seed & 0xFFFFFFFF
38+
out = []
39+
for _ in range(length):
40+
state = (state * 1103515245 + 12345) & 0xFFFFFFFF
41+
out.append(BASES[(state >> 16) & 3])
42+
return "".join(out)
43+
44+
CHR1 = lcg(11111, 3000)
45+
CHR2 = lcg(22222, 3000)
46+
47+
with open("genome.fa", "w") as f:
48+
f.write(">chr1\n")
49+
for i in range(0, len(CHR1), 60): f.write(CHR1[i:i+60] + "\n")
50+
f.write(">chr2\n")
51+
for i in range(0, len(CHR2), 60): f.write(CHR2[i:i+60] + "\n")
52+
53+
with open("annotations.gtf", "w") as f:
54+
f.write('chr1\ttest\texon\t101\t400\t.\t+\t.\tgene_id "G1"; transcript_id "T1";\n')
55+
f.write('chr1\ttest\texon\t501\t600\t.\t+\t.\tgene_id "G2"; transcript_id "T2"; gene_name "GENE2";\n')
56+
f.write('chr1\ttest\texon\t701\t800\t.\t+\t.\tgene_id "G2"; transcript_id "T2"; gene_name "GENE2";\n')
57+
f.write('chr1\ttest\texon\t901\t1000\t.\t+\t.\tgene_id "G2"; transcript_id "T2"; gene_name "GENE2";\n')
58+
f.write('chr2\ttest\texon\t301\t600\t.\t-\t.\tgene_id "G3"; transcript_id "T3"; gene_biotype "protein_coding";\n')
59+
f.write('chr2\ttest\texon\t1001\t1100\t.\t+\t.\tgene_id "G4"; transcript_id "T4";\n')
60+
f.write('chr2\ttest\texon\t1301\t1400\t.\t+\t.\tgene_id "G4"; transcript_id "T4";\n')
61+
PYEOF
62+
63+
# STAR genomeGenerate.
64+
rm -rf star_index
65+
mkdir star_index
66+
STAR --runMode genomeGenerate --genomeDir star_index \
67+
--genomeFastaFiles genome.fa --sjdbGTFfile annotations.gtf \
68+
--genomeSAindexNbases 5 --sjdbOverhang 49 --runThreadN 1 >/dev/null 2>&1
69+
70+
# ruSTAR genomeGenerate.
71+
rm -rf rustar_index
72+
"$RUSTAR_BIN" --runMode genomeGenerate --genomeDir rustar_index \
73+
--genomeFastaFiles genome.fa --sjdbGTFfile annotations.gtf \
74+
--genomeSAindexNbases 5 --sjdbOverhang 49 --runThreadN 1 >/dev/null 2>&1
75+
76+
# Diff each file.
77+
pass=0
78+
fail=0
79+
for f in chrName.txt chrLength.txt chrStart.txt chrNameLength.txt \
80+
transcriptInfo.tab exonInfo.tab geneInfo.tab exonGeTrInfo.tab \
81+
sjdbList.fromGTF.out.tab; do
82+
if diff -q "star_index/$f" "rustar_index/$f" >/dev/null 2>&1; then
83+
echo "$f"
84+
pass=$((pass+1))
85+
else
86+
echo "$f DIFFERS"
87+
diff "star_index/$f" "rustar_index/$f" | head -20
88+
fail=$((fail+1))
89+
fi
90+
done
91+
92+
echo
93+
echo "RESULT: $pass/$((pass+fail)) files identical to STAR's output"
94+
if [[ $fail -gt 0 ]]; then
95+
exit 1
96+
fi

0 commit comments

Comments
 (0)