Skip to content

Commit 6351a68

Browse files
ewels and claude committed
test: verify byte-format parity of all 5 STAR-compatible index files
Extended the end-to-end integration test to also pass --sjdbGTFfile at genomeGenerate and confirm that transcriptInfo.tab, exonInfo.tab, geneInfo.tab, exonGeTrInfo.tab, and sjdbList.fromGTF.out.tab are written into the genome directory with the exact byte content STAR would produce for the synthetic 2-transcript GTF used by this test. This is the best byte-format validation we can do without the full yeast test dataset (which needs to be downloaded separately via test/data_setup.sh). Once that dataset is in place, diffing ruSTAR's output against STAR's for the yeast genome gives the final parity verification. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 7db1d40 commit 6351a68

1 file changed

Lines changed: 73 additions & 1 deletion

File tree

tests/transcriptome_sam.rs

Lines changed: 73 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,70 @@ fn create_fastq(dir: &TempDir, n_reads: usize, chr1_seq: &str) -> PathBuf {
7171
fastq_path
7272
}
7373

74+
/// Confirm the 5 STAR-compatible transcriptome index files are present in
/// `genome_dir` and byte-identical to the content expected for the test GTF.
///
/// The expectations below describe two forward-strand, single-exon
/// transcripts: T1 (gene G1) spanning 0-based inclusive [100, 399] and
/// T2 (gene G2) spanning 0-based inclusive [600, 899].
/// NOTE(review): the GTF is built elsewhere in this file; these spans would
/// correspond to 1-based inclusive GTF features 101..400 and 601..900 —
/// confirm against the GTF fixture.
///
/// # Panics
///
/// Panics (failing the calling test) if any of the files cannot be read —
/// the message names the offending path — or if its content differs from
/// the expected bytes.
fn assert_star_transcriptome_files(genome_dir: &std::path::Path) {
    // Read one index file, naming the path on failure so a missing file is
    // identifiable from the panic message alone.
    let read_index_file = |name: &str| -> String {
        let path = genome_dir.join(name);
        fs::read_to_string(&path)
            .unwrap_or_else(|err| panic!("failed to read {}: {}", path.display(), err))
    };

    // transcriptInfo.tab: header = 2, then two lines in sorted order.
    // Columns: ID, trStart, trEnd, trEmax, strand, trExN, trExI, trGene.
    //   T1: trStart=100, trEnd=399 (inclusive), trEmax=399 (first), strand=1 (+),
    //       trExN=1, trExI=0, trGene=0.
    //   T2: trStart=600, trEnd=899, trEmax=399 (running max excludes current),
    //       strand=1, trExN=1, trExI=1, trGene=1.
    assert_eq!(
        read_index_file("transcriptInfo.tab"),
        "2\n\
         T1\t100\t399\t399\t1\t1\t0\t0\n\
         T2\t600\t899\t399\t1\t1\t1\t1\n",
        "transcriptInfo.tab byte format divergent from STAR"
    );

    // exonInfo.tab: header = 2, two single-exon transcripts with relative
    // [0, len-1] coords and exLenCum=0 for each first exon.
    assert_eq!(
        read_index_file("exonInfo.tab"),
        "2\n\
         0\t299\t0\n\
         0\t299\t0\n",
        "exonInfo.tab byte format divergent from STAR"
    );

    // geneInfo.tab: header = 2, gene IDs in first-seen order with empty
    // name/biotype columns (GTF had no gene_name / gene_biotype attrs).
    assert_eq!(
        read_index_file("geneInfo.tab"),
        "2\n\
         G1\t\t\n\
         G2\t\t\n",
        "geneInfo.tab byte format divergent from STAR"
    );

    // exonGeTrInfo.tab: header = 2, exons sorted by (start, end, strand, ...).
    // Both are forward (+strand == 1).
    assert_eq!(
        read_index_file("exonGeTrInfo.tab"),
        "2\n\
         100\t399\t1\t0\t0\n\
         600\t899\t1\t1\t1\n",
        "exonGeTrInfo.tab byte format divergent from STAR"
    );

    // sjdbList.fromGTF.out.tab: both transcripts are single-exon, so no
    // splice junctions are produced. File must exist but be empty.
    let sj_name = "sjdbList.fromGTF.out.tab";
    assert!(
        genome_dir.join(sj_name).exists(),
        "sjdbList.fromGTF.out.tab was not written"
    );
    let sj = read_index_file(sj_name);
    assert!(
        sj.is_empty(),
        "single-exon-only transcripts should yield empty sjdbList.fromGTF.out.tab, got: {:?}",
        sj
    );
}
137+
74138
#[test]
75139
fn transcriptome_sam_end_to_end_smoke_test() {
76140
let tmpdir = TempDir::new().unwrap();
@@ -87,7 +151,8 @@ fn transcriptome_sam_end_to_end_smoke_test() {
87151
// also handles the trailing slash convention.
88152
let output_prefix = format!("{}/", output_dir.display());
89153

90-
// Build genome index.
154+
// Build genome index, passing the GTF so transcriptInfo.tab + friends
155+
// are persisted (matches STAR's workflow).
91156
Command::cargo_bin("ruSTAR")
92157
.unwrap()
93158
.args([
@@ -97,6 +162,8 @@ fn transcriptome_sam_end_to_end_smoke_test() {
97162
genome_dir.to_str().unwrap(),
98163
"--genomeFastaFiles",
99164
fasta_path.to_str().unwrap(),
165+
"--sjdbGTFfile",
166+
gtf_path.to_str().unwrap(),
100167
"--genomeSAindexNbases",
101168
"5",
102169
])
@@ -132,6 +199,11 @@ fn transcriptome_sam_end_to_end_smoke_test() {
132199
.assert()
133200
.success();
134201

202+
// All 5 STAR-compatible transcriptome index files must be written at
203+
// genomeGenerate time. Verify their byte formats match STAR exactly
204+
// (see src/quant/transcriptome.rs::write_*).
205+
assert_star_transcriptome_files(&genome_dir);
206+
135207
// The transcriptome BAM must exist.
136208
let tr_bam = output_dir.join("Aligned.toTranscriptome.out.bam");
137209
assert!(

0 commit comments

Comments
 (0)