Skip to content

Commit 0535f9d

Browse files
ewelsclaude
andcommitted
fix(quant): match STAR's geneInfo.tab fallback strings
When a GTF exon record has no gene_name attribute, STAR's GTF.cpp uses the gene_id string as the name. When there's no gene_biotype, it uses the literal string "MissingGeneType". ruSTAR was using empty strings for both. Verified byte-for-byte parity against STAR 2.7.11b on a synthetic 2-chr / 4-transcript / 4-gene test case (/tmp/rustar_diff): all 9 small text files (chrName/chrLength/chrStart/chrNameLength.txt, transcriptInfo.tab, exonInfo.tab, geneInfo.tab, exonGeTrInfo.tab, sjdbList.fromGTF.out.tab) now diff-clean. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 6351a68 commit 0535f9d

2 files changed

Lines changed: 29 additions & 13 deletions

File tree

src/quant/transcriptome.rs

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -110,9 +110,12 @@ pub struct TranscriptomeIndex {
110110
pub tr_gene_idx: Vec<u32>,
111111
/// Unique gene IDs in first-seen order (STAR's `geneInfo.tab` column 1).
112112
pub gene_ids: Vec<String>,
113-
/// gene_name per gene (empty string if the GTF record had no `gene_name`).
113+
/// gene_name per gene. Falls back to the `gene_id` string when the GTF
114+
/// has no `gene_name` attribute — matches STAR's `GTF.cpp` behavior.
114115
pub gene_names: Vec<String>,
115-
/// gene_biotype per gene (empty if absent).
116+
/// gene_biotype per gene. Falls back to the literal string
117+
/// `"MissingGeneType"` when the GTF has no `gene_biotype` attribute —
118+
/// matches STAR's `GTF.cpp` behavior for `geneAttr[ig][1]`.
116119
pub gene_biotypes: Vec<String>,
117120
/// Absolute genome start of the first exon (0-based).
118121
pub tr_start: Vec<u64>,
@@ -256,16 +259,20 @@ impl TranscriptomeIndex {
256259
};
257260

258261
let gene_id = first.attributes.get("gene_id").cloned().unwrap_or_default();
262+
// STAR-faithful fallbacks: when the GTF record omits gene_name,
263+
// STAR's GTF.cpp writes geneAttr[ig][0] = gene_id (not empty).
264+
// When gene_biotype is omitted, it writes the literal string
265+
// "MissingGeneType".
259266
let gene_name = first
260267
.attributes
261268
.get("gene_name")
262269
.cloned()
263-
.unwrap_or_default();
270+
.unwrap_or_else(|| gene_id.clone());
264271
let gene_biotype = first
265272
.attributes
266273
.get("gene_biotype")
267274
.cloned()
268-
.unwrap_or_default();
275+
.unwrap_or_else(|| "MissingGeneType".to_string());
269276

270277
// Intern the gene — first transcript for each gene_id wins the
271278
// name/biotype slot. Subsequent transcripts with a richer name or
@@ -1546,9 +1553,14 @@ mod tests {
15461553
idx.gene_names,
15471554
vec!["GENE1".to_string(), "GENE2".to_string()]
15481555
);
1556+
// G2 has no gene_biotype attribute — STAR-faithful fallback is
1557+
// the literal "MissingGeneType".
15491558
assert_eq!(
15501559
idx.gene_biotypes,
1551-
vec!["protein_coding".to_string(), "".to_string()]
1560+
vec![
1561+
"protein_coding".to_string(),
1562+
"MissingGeneType".to_string()
1563+
]
15521564
);
15531565
assert_eq!(idx.tr_gene_idx, vec![0, 0, 1]);
15541566
}
@@ -1695,8 +1707,9 @@ mod tests {
16951707
built.write_gene_info(dir.path()).unwrap();
16961708

16971709
let loaded = TranscriptomeIndex::from_index_dir(dir.path(), &genome).unwrap();
1698-
assert_eq!(loaded.gene_names, vec!["".to_string()]);
1699-
assert_eq!(loaded.gene_biotypes, vec!["".to_string()]);
1710+
// STAR fallbacks: gene_name → gene_id, gene_biotype → "MissingGeneType".
1711+
assert_eq!(loaded.gene_names, vec!["G1".to_string()]);
1712+
assert_eq!(loaded.gene_biotypes, vec!["MissingGeneType".to_string()]);
17001713
}
17011714

17021715
#[test]
@@ -1821,12 +1834,14 @@ mod tests {
18211834
let dir = tempfile::tempdir().unwrap();
18221835
idx.write_gene_info(dir.path()).unwrap();
18231836
let body = std::fs::read_to_string(dir.path().join("geneInfo.tab")).unwrap();
1837+
// STAR-faithful fallbacks: missing gene_name → gene_id string,
1838+
// missing gene_biotype → literal "MissingGeneType".
18241839
assert_eq!(
18251840
body,
18261841
"3\n\
18271842
G1\tGENE1\tprotein_coding\n\
1828-
G2\tGENE2\t\n\
1829-
G3\t\t\n"
1843+
G2\tGENE2\tMissingGeneType\n\
1844+
G3\tG3\tMissingGeneType\n"
18301845
);
18311846
}
18321847

tests/transcriptome_sam.rs

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -101,14 +101,15 @@ fn assert_star_transcriptome_files(genome_dir: &std::path::Path) {
101101
"exonInfo.tab byte format divergent from STAR"
102102
);
103103

104-
// geneInfo.tab: header = 2, gene IDs in first-seen order with empty
105-
// name/biotype columns (GTF had no gene_name / gene_biotype attrs).
104+
// geneInfo.tab: header = 2, gene IDs in first-seen order. STAR falls
105+
// back to gene_id for missing gene_name, and the literal string
106+
// "MissingGeneType" for missing gene_biotype.
106107
let ge_info = fs::read_to_string(genome_dir.join("geneInfo.tab")).unwrap();
107108
assert_eq!(
108109
ge_info,
109110
"2\n\
110-
G1\t\t\n\
111-
G2\t\t\n",
111+
G1\tG1\tMissingGeneType\n\
112+
G2\tG2\tMissingGeneType\n",
112113
"geneInfo.tab byte format divergent from STAR"
113114
);
114115

0 commit comments

Comments
 (0)