|
| 1 | +#!/usr/bin/env bash |
| 2 | +# Byte-for-byte diff of ruSTAR vs STAR for the transcriptome index files |
| 3 | +# written at genomeGenerate time. |
| 4 | +# |
| 5 | +# Usage: ./diff_star_transcriptome.sh [WORKDIR] |
| 6 | +# |
| 7 | +# Generates a synthetic 2-chr / 4-transcript / 4-gene test case, runs both |
| 8 | +# ruSTAR and STAR genomeGenerate, then diffs each output file byte-for-byte. |
| 9 | +# |
| 10 | +# Requires STAR on PATH (`brew install rna-star`, bioconda star, etc.). |
| 11 | +# ruSTAR must already be built — the script expects `target/debug/ruSTAR` |
| 12 | +# in the repo root (or $RUSTAR_BIN set to a compiled binary). |
| 13 | + |
| 14 | +set -euo pipefail |
| 15 | + |
| 16 | +WORKDIR="${1:-/tmp/rustar_diff}" |
| 17 | +RUSTAR_BIN="${RUSTAR_BIN:-$(pwd)/target/debug/ruSTAR}" |
| 18 | + |
| 19 | +if [[ ! -x "$RUSTAR_BIN" ]]; then |
| 20 | + echo "ERROR: ruSTAR binary not found at $RUSTAR_BIN" |
| 21 | + echo "Build with: cargo build (or set RUSTAR_BIN=/path/to/ruSTAR)" |
| 22 | + exit 1 |
| 23 | +fi |
| 24 | + |
| 25 | +if ! command -v STAR >/dev/null 2>&1; then |
| 26 | + echo "ERROR: STAR not on PATH. Install via 'brew install rna-star' or bioconda." |
| 27 | + exit 1 |
| 28 | +fi |
| 29 | + |
| 30 | +mkdir -p "$WORKDIR" |
| 31 | +cd "$WORKDIR" |
| 32 | + |
| 33 | +# Deterministic test case. |
| 34 | +python3 - <<'PYEOF' |
| 35 | +BASES = "ACGT" |
| 36 | +def lcg(seed, length): |
| 37 | + state = seed & 0xFFFFFFFF |
| 38 | + out = [] |
| 39 | + for _ in range(length): |
| 40 | + state = (state * 1103515245 + 12345) & 0xFFFFFFFF |
| 41 | + out.append(BASES[(state >> 16) & 3]) |
| 42 | + return "".join(out) |
| 43 | +
|
| 44 | +CHR1 = lcg(11111, 3000) |
| 45 | +CHR2 = lcg(22222, 3000) |
| 46 | +
|
| 47 | +with open("genome.fa", "w") as f: |
| 48 | + f.write(">chr1\n") |
| 49 | + for i in range(0, len(CHR1), 60): f.write(CHR1[i:i+60] + "\n") |
| 50 | + f.write(">chr2\n") |
| 51 | + for i in range(0, len(CHR2), 60): f.write(CHR2[i:i+60] + "\n") |
| 52 | +
|
| 53 | +with open("annotations.gtf", "w") as f: |
| 54 | + f.write('chr1\ttest\texon\t101\t400\t.\t+\t.\tgene_id "G1"; transcript_id "T1";\n') |
| 55 | + f.write('chr1\ttest\texon\t501\t600\t.\t+\t.\tgene_id "G2"; transcript_id "T2"; gene_name "GENE2";\n') |
| 56 | + f.write('chr1\ttest\texon\t701\t800\t.\t+\t.\tgene_id "G2"; transcript_id "T2"; gene_name "GENE2";\n') |
| 57 | + f.write('chr1\ttest\texon\t901\t1000\t.\t+\t.\tgene_id "G2"; transcript_id "T2"; gene_name "GENE2";\n') |
| 58 | + f.write('chr2\ttest\texon\t301\t600\t.\t-\t.\tgene_id "G3"; transcript_id "T3"; gene_biotype "protein_coding";\n') |
| 59 | + f.write('chr2\ttest\texon\t1001\t1100\t.\t+\t.\tgene_id "G4"; transcript_id "T4";\n') |
| 60 | + f.write('chr2\ttest\texon\t1301\t1400\t.\t+\t.\tgene_id "G4"; transcript_id "T4";\n') |
| 61 | +PYEOF |
| 62 | + |
| 63 | +# STAR genomeGenerate. |
| 64 | +rm -rf star_index |
| 65 | +mkdir star_index |
| 66 | +STAR --runMode genomeGenerate --genomeDir star_index \ |
| 67 | + --genomeFastaFiles genome.fa --sjdbGTFfile annotations.gtf \ |
| 68 | + --genomeSAindexNbases 5 --sjdbOverhang 49 --runThreadN 1 >/dev/null 2>&1 |
| 69 | + |
| 70 | +# ruSTAR genomeGenerate. |
| 71 | +rm -rf rustar_index |
| 72 | +"$RUSTAR_BIN" --runMode genomeGenerate --genomeDir rustar_index \ |
| 73 | + --genomeFastaFiles genome.fa --sjdbGTFfile annotations.gtf \ |
| 74 | + --genomeSAindexNbases 5 --sjdbOverhang 49 --runThreadN 1 >/dev/null 2>&1 |
| 75 | + |
| 76 | +# Diff each file. |
| 77 | +pass=0 |
| 78 | +fail=0 |
| 79 | +for f in chrName.txt chrLength.txt chrStart.txt chrNameLength.txt \ |
| 80 | + transcriptInfo.tab exonInfo.tab geneInfo.tab exonGeTrInfo.tab \ |
| 81 | + sjdbList.fromGTF.out.tab; do |
| 82 | + if diff -q "star_index/$f" "rustar_index/$f" >/dev/null 2>&1; then |
| 83 | + echo "✓ $f" |
| 84 | + pass=$((pass+1)) |
| 85 | + else |
| 86 | + echo "✗ $f DIFFERS" |
| 87 | + diff "star_index/$f" "rustar_index/$f" | head -20 |
| 88 | + fail=$((fail+1)) |
| 89 | + fi |
| 90 | +done |
| 91 | + |
| 92 | +echo |
| 93 | +echo "RESULT: $pass/$((pass+fail)) files identical to STAR's output" |
| 94 | +if [[ $fail -gt 0 ]]; then |
| 95 | + exit 1 |
| 96 | +fi |
0 commit comments