-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathtranscriptome.rs
More file actions
2417 lines (2229 loc) · 92.2 KB
/
transcriptome.rs
File metadata and controls
2417 lines (2229 loc) · 92.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
//! Transcript-level quantification (`--quantMode TranscriptomeSAM`).
//!
//! Builds a per-transcript exon map from GTF records and projects genome-space
//! alignments onto the transcriptome coordinate system so downstream tools
//! (Salmon, RSEM) can read `Aligned.toTranscriptome.out.bam`.
//!
//! This module mirrors STAR's `Transcriptome` / `alignToTranscript` /
//! `quantAlign` logic (see `source/Transcriptome.cpp` and
//! `source/Transcriptome_quantAlign.cpp` in the upstream repo), with the
//! only substantive divergence that rustar-aligner builds the transcript tables on
//! the fly from the input GTF instead of loading persisted
//! `transcriptInfo.tab`/`exonInfo.tab` files.
use std::collections::HashMap;
use std::io::Write as _;
use std::path::Path;
use crate::align::transcript::{CigarOp, Exon, Transcript};
use crate::error::Error;
use crate::genome::Genome;
use crate::junction::gtf::GtfRecord;
use crate::params::Parameters;
/// GTF attribute names the transcriptome index reads.
const GTF_ATTR_TRANSCRIPT_ID: &str = "transcript_id";
const GTF_ATTR_GENE_ID: &str = "gene_id";
const GTF_ATTR_GENE_NAME: &str = "gene_name";
const GTF_ATTR_GENE_BIOTYPE: &str = "gene_biotype";
/// STAR's fallback for GTF records missing `gene_biotype`
/// (`source/GTF.cpp`).
const MISSING_GENE_TYPE: &str = "MissingGeneType";
// ---------------------------------------------------------------------------
// Filter-mode enum + softclip extension (subtask 3)
// ---------------------------------------------------------------------------
/// STAR's `--quantTranscriptomeSAMoutput` value.
///
/// Controls how alignments are filtered / adjusted before being projected
/// onto the transcriptome:
/// * `BanSingleEndBanIndelsExtendSoftclip` (STAR default, RSEM-compatible) —
/// reject alignments containing indels; reject PE alignments where both
/// sides of the read come from a single mate; extend soft-clipped bases
/// back into matched bases (re-counting mismatches).
/// * `BanSingleEnd` — only reject single-mate-only PE alignments; keep
/// indels and soft-clips as-is.
/// * `BanSingleEndExtendSoftclip` — reject single-mate-only PE alignments;
/// keep indels; extend soft-clips.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum QuantTranscriptomeSAMoutput {
/// Default: ban indels, ban single-mate PE hits, extend soft-clips.
#[default]
BanSingleEndBanIndelsExtendSoftclip,
/// Ban single-mate PE hits only; keep indels and soft-clips.
BanSingleEnd,
/// Ban single-mate PE hits; keep indels; extend soft-clips.
BanSingleEndExtendSoftclip,
}
impl std::str::FromStr for QuantTranscriptomeSAMoutput {
type Err = String;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"BanSingleEnd_BanIndels_ExtendSoftclip" => {
Ok(Self::BanSingleEndBanIndelsExtendSoftclip)
}
"BanSingleEnd" => Ok(Self::BanSingleEnd),
"BanSingleEnd_ExtendSoftclip" => Ok(Self::BanSingleEndExtendSoftclip),
_ => Err(format!(
"unknown --quantTranscriptomeSAMoutput '{s}'; expected \
'BanSingleEnd_BanIndels_ExtendSoftclip', 'BanSingleEnd', or \
'BanSingleEnd_ExtendSoftclip'"
)),
}
}
}
impl QuantTranscriptomeSAMoutput {
/// Whether indels are permitted in the output.
pub fn allow_indels(self) -> bool {
!matches!(self, Self::BanSingleEndBanIndelsExtendSoftclip)
}
/// Whether soft-clipped bases should be kept (true) or extended back into
/// matched bases (false).
pub fn allow_softclip(self) -> bool {
matches!(self, Self::BanSingleEnd)
}
}
/// Per-transcript exon in absolute genome coordinates (0-based half-open),
/// paired with the cumulative transcript-space length of all preceding exons
/// (STAR's `exLenCum`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TrExon {
/// Absolute genome start (0-based, inclusive).
pub genome_start: u64,
/// Absolute genome end (0-based, exclusive).
pub genome_end: u64,
/// Cumulative transcript-space length of all prior exons (0 for the first).
pub ex_len_cum: u32,
}
/// Transcriptome metadata built from GTF exon records.
///
/// Each transcript is a contiguous set of exons on a single chromosome and
/// strand, ordered by ascending genome start. `tr_order` / `tr_starts_sorted`
/// / `tr_end_max_sorted` support STAR's `binarySearch1a` + running-max
/// early-exit in `quantAlign`.
#[derive(Debug, Clone)]
pub struct TranscriptomeIndex {
/// Transcript IDs in insertion order (index = transcript_idx).
pub tr_ids: Vec<String>,
/// Genome chromosome index per transcript.
pub tr_chr_idx: Vec<usize>,
/// Strand per transcript: 1 = forward/`+`, 2 = reverse/`-`.
pub tr_strand: Vec<u8>,
/// Index into `gene_ids` per transcript — matches STAR's `trGene` column.
/// Look up the string form via `gene_ids[tr_gene_idx[tr] as usize]`.
pub tr_gene_idx: Vec<u32>,
/// Unique gene IDs in first-seen order (STAR's `geneInfo.tab` column 1).
pub gene_ids: Vec<String>,
/// gene_name per gene. Falls back to the `gene_id` string when the GTF
/// has no `gene_name` attribute — matches STAR's `GTF.cpp` behavior.
pub gene_names: Vec<String>,
/// gene_biotype per gene. Falls back to the literal string
/// `"MissingGeneType"` when the GTF has no `gene_biotype` attribute —
/// matches STAR's `GTF.cpp` behavior for `geneAttr[ig][1]`.
pub gene_biotypes: Vec<String>,
/// Absolute genome start of the first exon (0-based).
pub tr_start: Vec<u64>,
/// Absolute genome end of the last exon (0-based half-open).
pub tr_end: Vec<u64>,
/// Exons per transcript (sorted by genome_start, with `ex_len_cum`).
pub tr_exons: Vec<Vec<TrExon>>,
/// Total transcript-space length (sum of exon spans).
pub tr_length: Vec<u32>,
/// First-exon offset into a flat global exon array, per transcript,
/// computed in STAR's SORTED-BY-(trStart, trEnd) order — matches the
/// `trExI` column in `transcriptInfo.tab`. Indexed by insertion position.
pub tr_exi: Vec<u32>,
/// Permutation of `0..n_transcripts` sorted by `(tr_start, tr_end)`.
pub tr_order: Vec<usize>,
/// `tr_start[tr_order[i]]` — for `binary_search`.
pub tr_starts_sorted: Vec<u64>,
/// Running max of `tr_end` along `tr_order` (STAR's `trEmax`).
pub tr_end_max_sorted: Vec<u64>,
}
impl TranscriptomeIndex {
/// Build from already-parsed GTF exon records with configurable GTF attribute names.
///
/// `transcript_tag`: STAR `sjdbGTFtagExonParentTranscript` (default `"transcript_id"`).
/// `gene_tag`: STAR `sjdbGTFtagExonParentGene` (default `"gene_id"`).
pub fn from_gtf_exons_configured(
exons: &[GtfRecord],
genome: &Genome,
transcript_tag: &str,
gene_tag: &str,
) -> Result<Self, Error> {
// Group exons by transcript_tag, preserving first-seen insertion order.
let mut order: Vec<String> = Vec::new();
let mut groups: HashMap<String, Vec<&GtfRecord>> = HashMap::new();
for rec in exons {
let tid = match rec.attributes.get(transcript_tag) {
Some(id) => id.clone(),
None => continue,
};
if !groups.contains_key(&tid) {
order.push(tid.clone());
}
groups.entry(tid).or_default().push(rec);
}
let mut tr_ids: Vec<String> = Vec::new();
let mut tr_chr_idx: Vec<usize> = Vec::new();
let mut tr_strand: Vec<u8> = Vec::new();
let mut tr_gene_idx: Vec<u32> = Vec::new();
let mut tr_start: Vec<u64> = Vec::new();
let mut tr_end: Vec<u64> = Vec::new();
let mut tr_exons_vec: Vec<Vec<TrExon>> = Vec::new();
let mut tr_length: Vec<u32> = Vec::new();
// Gene-interning pass: each unique gene_id gets a position-based
// integer index, matching STAR's geneInfo.tab / trGene column.
let mut gene_ids: Vec<String> = Vec::new();
let mut gene_names: Vec<String> = Vec::new();
let mut gene_biotypes: Vec<String> = Vec::new();
let mut gene_id_to_idx: HashMap<String, u32> = HashMap::new();
for tid in &order {
let mut recs = groups.remove(tid).unwrap();
if recs.is_empty() {
continue;
}
// Validate consistent chromosome + strand.
let first = recs[0];
let chr_name = &first.seqname;
let strand_char = first.strand;
let mut inconsistent = false;
for r in &recs {
if &r.seqname != chr_name || r.strand != strand_char {
inconsistent = true;
break;
}
}
if inconsistent {
log::warn!(
"quantMode TranscriptomeSAM: transcript {} has inconsistent chromosome/strand across exons — skipping",
tid
);
continue;
}
// Map chromosome name to genome index.
let chr_idx = match genome.chr_name.iter().position(|n| n == chr_name) {
Some(i) if i < genome.n_chr_real => i,
_ => {
log::warn!(
"quantMode TranscriptomeSAM: transcript {} on unknown chromosome {} — skipping",
tid,
chr_name
);
continue;
}
};
let chr_offset = genome.chr_start[chr_idx];
// Sort exons by GTF start ASC.
recs.sort_by_key(|r| r.start);
// Build TrExon list in absolute genome coords (0-based half-open).
let mut tr_exons: Vec<TrExon> = Vec::with_capacity(recs.len());
let mut ex_len_cum: u32 = 0;
for r in &recs {
// GTF is 1-based inclusive; rustar-aligner uses 0-based half-open.
let abs_start = chr_offset + r.start.saturating_sub(1);
let abs_end = chr_offset + r.end;
if abs_end <= abs_start {
log::warn!(
"quantMode TranscriptomeSAM: transcript {} has invalid exon {}-{} — skipping",
tid,
r.start,
r.end
);
tr_exons.clear();
break;
}
let len = (abs_end - abs_start) as u32;
tr_exons.push(TrExon {
genome_start: abs_start,
genome_end: abs_end,
ex_len_cum,
});
ex_len_cum = ex_len_cum.saturating_add(len);
}
if tr_exons.is_empty() {
continue;
}
let first_ex = tr_exons.first().unwrap();
let last_ex = tr_exons.last().unwrap();
let start_abs = first_ex.genome_start;
let end_abs = last_ex.genome_end;
let total_len = ex_len_cum;
let strand_u8 = match strand_char {
'+' => 1u8,
'-' => 2u8,
_ => 1u8, // unknown → treat as forward (STAR default)
};
let gene_id = first.attributes.get(gene_tag).cloned().unwrap_or_default();
// STAR-faithful fallbacks: when the GTF record omits gene_name,
// STAR's GTF.cpp writes geneAttr[ig][0] = gene_id (not empty).
// When gene_biotype is omitted, it writes `MISSING_GENE_TYPE`.
let gene_name = first
.attributes
.get(GTF_ATTR_GENE_NAME)
.cloned()
.unwrap_or_else(|| gene_id.clone());
let gene_biotype = first
.attributes
.get(GTF_ATTR_GENE_BIOTYPE)
.cloned()
.unwrap_or_else(|| MISSING_GENE_TYPE.to_string());
// Intern the gene — first transcript for each gene_id wins the
// name/biotype slot. Subsequent transcripts with a richer name or
// biotype do NOT overwrite (STAR's Transcriptome writer is
// first-seen-wins too).
let gene_idx = match gene_id_to_idx.get(&gene_id) {
Some(&i) => i,
None => {
let i = gene_ids.len() as u32;
gene_id_to_idx.insert(gene_id.clone(), i);
gene_ids.push(gene_id.clone());
gene_names.push(gene_name);
gene_biotypes.push(gene_biotype);
i
}
};
tr_ids.push(tid.clone());
tr_chr_idx.push(chr_idx);
tr_strand.push(strand_u8);
tr_gene_idx.push(gene_idx);
tr_start.push(start_abs);
tr_end.push(end_abs);
tr_exons_vec.push(tr_exons);
tr_length.push(total_len);
}
let n_tr = tr_ids.len();
// Sorted view by (tr_start, tr_end) for binary-search +
// running-max early-exit.
let mut tr_order: Vec<usize> = (0..n_tr).collect();
tr_order.sort_by(|&a, &b| {
tr_start[a]
.cmp(&tr_start[b])
.then_with(|| tr_end[a].cmp(&tr_end[b]))
});
// Cumulative exon offset in SORTED order — matches STAR's `trExI`,
// where exons in `exonInfo.tab` are grouped by transcript in sorted
// transcript order. `tr_exi[i]` (insertion position i) = sum of exon
// counts of transcripts that come BEFORE `i` in sorted order.
let mut tr_exi: Vec<u32> = vec![0; n_tr];
let mut cum: u32 = 0;
for &sorted_idx in &tr_order {
tr_exi[sorted_idx] = cum;
cum = cum.saturating_add(tr_exons_vec[sorted_idx].len() as u32);
}
let tr_starts_sorted: Vec<u64> = tr_order.iter().map(|&i| tr_start[i]).collect();
let mut tr_end_max_sorted: Vec<u64> = Vec::with_capacity(n_tr);
let mut running_max: u64 = 0;
for &i in &tr_order {
running_max = running_max.max(tr_end[i]);
tr_end_max_sorted.push(running_max);
}
Ok(TranscriptomeIndex {
tr_ids,
tr_chr_idx,
tr_strand,
tr_gene_idx,
gene_ids,
gene_names,
gene_biotypes,
tr_start,
tr_end,
tr_exons: tr_exons_vec,
tr_length,
tr_exi,
tr_order,
tr_starts_sorted,
tr_end_max_sorted,
})
}
/// Build from already-parsed GTF exon records using default STAR attribute names.
pub fn from_gtf_exons(exons: &[GtfRecord], genome: &Genome) -> Result<Self, Error> {
Self::from_gtf_exons_configured(exons, genome, GTF_ATTR_TRANSCRIPT_ID, GTF_ATTR_GENE_ID)
}
/// Number of transcripts indexed.
pub fn n_transcripts(&self) -> usize {
self.tr_ids.len()
}
/// Load from STAR-compatible index files in `dir`.
///
/// Requires `transcriptInfo.tab`, `exonInfo.tab`, `geneInfo.tab` to be
/// present — all written by either STAR's `genomeGenerate` or rustar-aligner's
/// `write_*` methods. Returns a fully-populated index equivalent to
/// what `from_gtf_exons` would produce, except transcripts come back
/// in STAR's SORTED `(tr_start, tr_end)` order (not GTF insertion
/// order), because that's how `transcriptInfo.tab` is written.
pub fn from_index_dir(dir: &Path, genome: &Genome) -> Result<Self, Error> {
let (tr_ids, tr_start, tr_end, tr_strand, tr_exn, tr_exi, tr_gene_idx) =
read_transcript_info(&dir.join("transcriptInfo.tab"))?;
let (ex_start_rel, ex_end_rel_incl, ex_len_cum_flat) =
read_exon_info(&dir.join("exonInfo.tab"))?;
let (gene_ids, gene_names, gene_biotypes) = read_gene_info(&dir.join("geneInfo.tab"))?;
let n_tr = tr_ids.len();
let n_exons_total = ex_start_rel.len();
// Sanity: exon count in header must equal sum of trExN.
let sum_exn: u64 = tr_exn.iter().map(|&n| n as u64).sum();
if sum_exn != n_exons_total as u64 {
return Err(Error::Index(format!(
"transcriptome index inconsistent: sum(trExN)={} but exonInfo has {} rows",
sum_exn, n_exons_total
)));
}
// Reconstruct per-transcript TrExon lists + tr_length from flat exon
// arrays. `ex_start_rel[k]` is transcript-relative; add tr_start[i] to
// get absolute 0-based coords. `ex_end_rel_incl` is inclusive; add 1
// for rustar-aligner's exclusive convention.
let mut tr_exons: Vec<Vec<TrExon>> = Vec::with_capacity(n_tr);
let mut tr_length: Vec<u32> = Vec::with_capacity(n_tr);
for i in 0..n_tr {
let start = tr_exi[i] as usize;
let end = start + tr_exn[i] as usize;
let tr_s = tr_start[i];
let mut exons: Vec<TrExon> = Vec::with_capacity(end - start);
let mut total: u32 = 0;
for k in start..end {
let gs = tr_s + ex_start_rel[k];
let ge = tr_s + ex_end_rel_incl[k] + 1;
exons.push(TrExon {
genome_start: gs,
genome_end: ge,
ex_len_cum: ex_len_cum_flat[k],
});
total = total.saturating_add((ge - gs) as u32);
}
tr_exons.push(exons);
tr_length.push(total);
}
// Derive tr_chr_idx from each transcript's first exon genome_start
// using the existing `Genome::position_to_chr` binary-search helper.
let mut tr_chr_idx: Vec<usize> = Vec::with_capacity(n_tr);
for exs in &tr_exons {
let pos = exs
.first()
.map(|e| e.genome_start)
.ok_or_else(|| Error::Index("transcript with zero exons".into()))?;
let (chr_idx, _) = genome.position_to_chr(pos).ok_or_else(|| {
Error::Index(format!(
"transcript first-exon position {pos} does not fall inside any chromosome"
))
})?;
tr_chr_idx.push(chr_idx);
}
// Sorted view: transcripts are already in sorted order on disk, so
// tr_order = identity.
let tr_order: Vec<usize> = (0..n_tr).collect();
let tr_starts_sorted: Vec<u64> = tr_start.clone();
let mut tr_end_max_sorted: Vec<u64> = Vec::with_capacity(n_tr);
let mut m: u64 = 0;
for &i in &tr_order {
m = m.max(tr_end[i]);
tr_end_max_sorted.push(m);
}
Ok(TranscriptomeIndex {
tr_ids,
tr_chr_idx,
tr_strand,
tr_gene_idx,
gene_ids,
gene_names,
gene_biotypes,
tr_start,
tr_end,
tr_exons,
tr_length,
tr_exi,
tr_order,
tr_starts_sorted,
tr_end_max_sorted,
})
}
/// Write `transcriptInfo.tab` into `dir`, byte-for-byte matching STAR's
/// `GTF_transcriptGeneSJ.cpp:86-112` format:
///
/// - Header line: integer transcript count.
/// - Per-transcript line (in sorted order by `(tr_start, tr_end)`):
/// `trID\ttrStart\ttrEnd\ttrEmax\ttrStrand\ttrExN\ttrExI\ttrGene\n`
///
/// `trStart` / `trEnd` are 0-based absolute genome coordinates
/// (STAR's `exonLoci` convention: chrStart + 1-based GTF pos − 1),
/// with `trEnd` INCLUSIVE (last base of transcript). `trEmax` is the
/// running max of prior transcripts' `trEnd` in sorted order — with
/// the initial value set to the first sorted transcript's `trEnd`
/// (so sorted position 0 has `trEmax == trEnd`, matching STAR's
/// `trend=extrLoci[GTF_extrTrEnd(0)]` initializer). `trStrand` uses
/// STAR's GTF.cpp encoding: `+`→1, `-`→2, other→0.
pub fn write_transcript_info(&self, dir: &Path) -> Result<(), Error> {
let path = dir.join("transcriptInfo.tab");
let file = std::fs::File::create(&path).map_err(|e| Error::io(e, &path))?;
let mut out = std::io::BufWriter::new(file);
let n_tr = self.tr_ids.len();
writeln!(out, "{n_tr}").map_err(|e| Error::io(e, &path))?;
let first_end_inclusive = self.tr_order.first().map(|&i| self.tr_end_inclusive(i));
let mut tr_emax = first_end_inclusive.unwrap_or(0);
for &i in &self.tr_order {
let tr_end_incl = self.tr_end_inclusive(i);
let tr_exn = self.tr_exons[i].len();
writeln!(
out,
"{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}",
self.tr_ids[i],
self.tr_start[i],
tr_end_incl,
tr_emax,
self.tr_strand[i],
tr_exn,
self.tr_exi[i],
self.tr_gene_idx[i],
)
.map_err(|e| Error::io(e, &path))?;
// Running-max update matches STAR's `trend=max(trend, ...)`
// that runs AFTER the write.
tr_emax = tr_emax.max(tr_end_incl);
}
Ok(())
}
/// 0-based inclusive end of transcript `i` — STAR's `exE` convention.
fn tr_end_inclusive(&self, i: usize) -> u64 {
self.tr_end[i].saturating_sub(1)
}
/// Write `sjdbList.fromGTF.out.tab` into `dir`, byte-for-byte matching
/// STAR's `GTF_transcriptGeneSJ.cpp:115-171` format:
///
/// - No header line.
/// - Per-junction line: `chr\tstart(1-based)\tend(1-based)\tstrand\tgenes\n`
///
/// Junctions come from consecutive exon pairs within each transcript
/// where an intron exists. `start` and `end` are 1-based chromosome-local
/// positions of the first and last base of the intron. `strand` is `.`,
/// `+`, or `-` (mapping STAR's `transcriptStrand` 0/1/2). Duplicate
/// junctions (same chr/start/end/strand) across multiple transcripts
/// share a single record with comma-separated 1-based gene indices.
pub fn write_sjdb_list_from_gtf(&self, dir: &Path, genome: &Genome) -> Result<(), Error> {
let path = dir.join("sjdbList.fromGTF.out.tab");
let file = std::fs::File::create(&path).map_err(|e| Error::io(e, &path))?;
let mut out = std::io::BufWriter::new(file);
// Collect (sjStart_abs_0based_excl, sjEnd_abs_0based_incl, chr_idx,
// strand, gene_idx_1based) for every intron within every transcript.
let mut junctions: Vec<(u64, u64, usize, u8, u32)> = Vec::new();
for (tr_idx, exs) in self.tr_exons.iter().enumerate() {
let chr_idx = self.tr_chr_idx[tr_idx];
let strand = self.tr_strand[tr_idx];
let gene1 = self.tr_gene_idx[tr_idx] + 1;
for pair in exs.windows(2) {
let e0 = &pair[0];
let e1 = &pair[1];
// STAR: if e1.start <= e0.end+1, exons touch (no intron). In
// rustar-aligner's 0-based half-open: e1.genome_start <= e0.genome_end.
if e1.genome_start <= e0.genome_end {
continue;
}
junctions.push((
e0.genome_end, // sjStart: 0-based first intron base
e1.genome_start - 1, // sjEnd: 0-based last intron base
chr_idx,
strand,
gene1,
));
}
}
// STAR sorts by sjStart only (funCompareUint2 on the first uint64).
// Keep it stable so gene list order across duplicates matches
// transcript-insertion order.
junctions.sort_by_key(|&(s, _, _, _, _)| s);
let strand_char = |s: u8| match s {
1 => '+',
2 => '-',
_ => '.',
};
// Dedup pass: merge genes across identical (chr, start, end, strand).
let mut i = 0;
while i < junctions.len() {
let (sj_start, sj_end, chr_idx, strand, gene1) = junctions[i];
let chr_offset = genome.chr_start[chr_idx];
let start_1based = sj_start + 1 - chr_offset;
let end_1based = (sj_end + 1) - chr_offset;
write!(
out,
"{}\t{}\t{}\t{}\t{}",
genome.chr_name[chr_idx],
start_1based,
end_1based,
strand_char(strand),
gene1
)
.map_err(|e| Error::io(e, &path))?;
// Append genes from subsequent entries with the same key.
let mut j = i + 1;
let mut seen: std::collections::BTreeSet<u32> = std::collections::BTreeSet::new();
seen.insert(gene1);
while j < junctions.len() {
let (s2, e2, c2, st2, g2) = junctions[j];
if s2 == sj_start && e2 == sj_end && c2 == chr_idx && st2 == strand {
if seen.insert(g2) {
write!(out, ",{g2}").map_err(|e| Error::io(e, &path))?;
}
j += 1;
} else {
break;
}
}
writeln!(out).map_err(|e| Error::io(e, &path))?;
i = j;
}
Ok(())
}
/// Write `exonGeTrInfo.tab` into `dir`, byte-for-byte matching STAR's
/// `GTF_transcriptGeneSJ.cpp:33-53` format:
///
/// - Header: total exon count.
/// - Per-exon line: `exStart\texEnd\texStrand\tgeID\ttrID\n`
///
/// Coordinates are 0-based absolute with `exEnd` INCLUSIVE (STAR's
/// `exE`). Exons are sorted by `(exStart, exEnd, exStrand, geID, trID)`
/// — STAR's `funCompareArrays<uint64,5>` order.
///
/// Note: STAR writes the insertion-order transcript index here, not the
/// sorted one. Its own comment at line 51 calls this "wrong" because
/// transcripts are re-sorted in `transcriptInfo.tab`. For byte parity
/// we replicate the behavior.
pub fn write_exon_ge_tr_info(&self, dir: &Path) -> Result<(), Error> {
let path = dir.join("exonGeTrInfo.tab");
let file = std::fs::File::create(&path).map_err(|e| Error::io(e, &path))?;
let mut out = std::io::BufWriter::new(file);
// Flatten (exStart, exEnd_inclusive, strand, gene_idx, tr_insertion_idx).
let mut records: Vec<(u64, u64, u8, u32, u32)> =
Vec::with_capacity(self.tr_exons.iter().map(|e| e.len()).sum());
for (tr_idx, exs) in self.tr_exons.iter().enumerate() {
let strand = self.tr_strand[tr_idx];
let gene_idx = self.tr_gene_idx[tr_idx];
for ex in exs {
records.push((
ex.genome_start,
ex.genome_end.saturating_sub(1),
strand,
gene_idx,
tr_idx as u32,
));
}
}
records.sort_unstable();
writeln!(out, "{}", records.len()).map_err(|e| Error::io(e, &path))?;
for (ex_start, ex_end, strand, gene_idx, tr_idx) in records {
writeln!(
out,
"{}\t{}\t{}\t{}\t{}",
ex_start, ex_end, strand, gene_idx, tr_idx
)
.map_err(|e| Error::io(e, &path))?;
}
Ok(())
}
/// Write `geneInfo.tab` into `dir`, byte-for-byte matching STAR's
/// `GTF_transcriptGeneSJ.cpp:55-60` format:
///
/// - Header: integer gene count.
/// - Per-gene line: `geneID\tgeneName\tgeneBiotype\n`
///
/// Order is first-seen (same as STAR, which accumulates genes during
/// GTF parsing). Empty string if the GTF record had no `gene_name` /
/// `gene_biotype` attribute.
pub fn write_gene_info(&self, dir: &Path) -> Result<(), Error> {
let path = dir.join("geneInfo.tab");
let file = std::fs::File::create(&path).map_err(|e| Error::io(e, &path))?;
let mut out = std::io::BufWriter::new(file);
writeln!(out, "{}", self.gene_ids.len()).map_err(|e| Error::io(e, &path))?;
for ((id, name), biotype) in self
.gene_ids
.iter()
.zip(&self.gene_names)
.zip(&self.gene_biotypes)
{
writeln!(out, "{}\t{}\t{}", id, name, biotype).map_err(|e| Error::io(e, &path))?;
}
Ok(())
}
/// Write `exonInfo.tab` into `dir`, byte-for-byte matching STAR's
/// `GTF_transcriptGeneSJ.cpp:86-112` format:
///
/// - Header: total exon count across all transcripts.
/// - Per-exon line: `exStart_relative\texEnd_relative\texLenCum\n`
///
/// Coordinates are transcript-relative (exon start/end minus transcript
/// start), 0-based, with `exEnd` INCLUSIVE (matches STAR's `exE`).
/// Exons are emitted in STAR's sort order: transcripts in `(tr_start,
/// tr_end)` order, exons within each transcript in genome order.
pub fn write_exon_info(&self, dir: &Path) -> Result<(), Error> {
let path = dir.join("exonInfo.tab");
let file = std::fs::File::create(&path).map_err(|e| Error::io(e, &path))?;
let mut out = std::io::BufWriter::new(file);
let total_exons: usize = self.tr_exons.iter().map(|e| e.len()).sum();
writeln!(out, "{total_exons}").map_err(|e| Error::io(e, &path))?;
for &i in &self.tr_order {
let tr_s = self.tr_start[i];
for ex in &self.tr_exons[i] {
// STAR's exEnd is 0-based INCLUSIVE; rustar-aligner stores exclusive.
let ex_start_rel = ex.genome_start - tr_s;
let ex_end_rel = ex.genome_end.saturating_sub(1) - tr_s;
writeln!(out, "{}\t{}\t{}", ex_start_rel, ex_end_rel, ex.ex_len_cum)
.map_err(|e| Error::io(e, &path))?;
}
}
Ok(())
}
}
// ---------------------------------------------------------------------------
// Loaders for STAR-compatible transcriptome index files.
// ---------------------------------------------------------------------------
/// Flat exon columns: `(starts_transcript_relative, ends_transcript_relative_inclusive, ex_len_cum)`.
type ExonInfoColumns = (Vec<u64>, Vec<u64>, Vec<u32>);
/// Gene table columns: `(gene_ids, gene_names, gene_biotypes)`.
type GeneInfoColumns = (Vec<String>, Vec<String>, Vec<String>);
type TrInfoColumns = (
Vec<String>, // tr_ids
Vec<u64>, // tr_start (0-based absolute)
Vec<u64>, // tr_end (0-based exclusive, converted from STAR's inclusive)
Vec<u8>, // tr_strand
Vec<u32>, // tr_exn
Vec<u32>, // tr_exi
Vec<u32>, // tr_gene_idx
);
/// Read `transcriptInfo.tab` back into column vectors. Converts STAR's
/// 0-based-inclusive `trEnd` to rustar-aligner's 0-based-exclusive convention.
/// Ignores the `trEmax` column (reconstructed from `tr_end` + sort order).
fn read_transcript_info(path: &Path) -> Result<TrInfoColumns, Error> {
let body = std::fs::read_to_string(path).map_err(|e| Error::io(e, path))?;
let mut lines = body.lines();
let header = lines
.next()
.ok_or_else(|| Error::Index(format!("{}: missing header line", path.display())))?;
let n_tr: usize = header
.trim()
.parse()
.map_err(|_| Error::Index(format!("{}: header is not an integer", path.display())))?;
let mut tr_ids = Vec::with_capacity(n_tr);
let mut tr_start = Vec::with_capacity(n_tr);
let mut tr_end = Vec::with_capacity(n_tr);
let mut tr_strand = Vec::with_capacity(n_tr);
let mut tr_exn = Vec::with_capacity(n_tr);
let mut tr_exi = Vec::with_capacity(n_tr);
let mut tr_gene_idx = Vec::with_capacity(n_tr);
for (row, line) in lines.enumerate() {
let fields: Vec<&str> = line.split('\t').collect();
if fields.len() != 8 {
return Err(Error::Index(format!(
"{}: row {} has {} fields, expected 8",
path.display(),
row + 1,
fields.len()
)));
}
let parse_u64 = |s: &str, col: &str| -> Result<u64, Error> {
s.parse().map_err(|_| {
Error::Index(format!(
"{}: row {} column {} not an integer: '{}'",
path.display(),
row + 1,
col,
s
))
})
};
tr_ids.push(fields[0].to_string());
tr_start.push(parse_u64(fields[1], "trStart")?);
// STAR's trEnd is 0-based INCLUSIVE; rustar-aligner stores exclusive.
tr_end.push(parse_u64(fields[2], "trEnd")? + 1);
tr_strand.push(parse_u64(fields[4], "trStrand")? as u8);
tr_exn.push(parse_u64(fields[5], "trExN")? as u32);
tr_exi.push(parse_u64(fields[6], "trExI")? as u32);
tr_gene_idx.push(parse_u64(fields[7], "trGene")? as u32);
}
if tr_ids.len() != n_tr {
return Err(Error::Index(format!(
"{}: header says {} transcripts but found {}",
path.display(),
n_tr,
tr_ids.len()
)));
}
Ok((
tr_ids,
tr_start,
tr_end,
tr_strand,
tr_exn,
tr_exi,
tr_gene_idx,
))
}
/// Read `exonInfo.tab` into flat arrays. Start/end are transcript-relative,
/// 0-based; exEnd is INCLUSIVE (STAR's convention).
fn read_exon_info(path: &Path) -> Result<ExonInfoColumns, Error> {
let body = std::fs::read_to_string(path).map_err(|e| Error::io(e, path))?;
let mut lines = body.lines();
let header = lines
.next()
.ok_or_else(|| Error::Index(format!("{}: missing header line", path.display())))?;
let n_ex: usize = header
.trim()
.parse()
.map_err(|_| Error::Index(format!("{}: header is not an integer", path.display())))?;
let mut starts = Vec::with_capacity(n_ex);
let mut ends = Vec::with_capacity(n_ex);
let mut len_cums = Vec::with_capacity(n_ex);
for (row, line) in lines.enumerate() {
let fields: Vec<&str> = line.split('\t').collect();
if fields.len() != 3 {
return Err(Error::Index(format!(
"{}: row {} has {} fields, expected 3",
path.display(),
row + 1,
fields.len()
)));
}
let parse = |s: &str, col: &str| -> Result<u64, Error> {
s.parse().map_err(|_| {
Error::Index(format!(
"{}: row {} column {} not an integer: '{}'",
path.display(),
row + 1,
col,
s
))
})
};
starts.push(parse(fields[0], "exStart")?);
ends.push(parse(fields[1], "exEnd")?);
len_cums.push(parse(fields[2], "exLenCum")? as u32);
}
if starts.len() != n_ex {
return Err(Error::Index(format!(
"{}: header says {} exons but found {}",
path.display(),
n_ex,
starts.len()
)));
}
Ok((starts, ends, len_cums))
}
/// Read `geneInfo.tab`. Each record: geneID \t geneName \t geneBiotype.
fn read_gene_info(path: &Path) -> Result<GeneInfoColumns, Error> {
let body = std::fs::read_to_string(path).map_err(|e| Error::io(e, path))?;
let mut lines = body.lines();
let header = lines
.next()
.ok_or_else(|| Error::Index(format!("{}: missing header line", path.display())))?;
let n_ge: usize = header
.trim()
.parse()
.map_err(|_| Error::Index(format!("{}: header is not an integer", path.display())))?;
let mut ids = Vec::with_capacity(n_ge);
let mut names = Vec::with_capacity(n_ge);
let mut biotypes = Vec::with_capacity(n_ge);
for (row, line) in lines.enumerate() {
let fields: Vec<&str> = line.splitn(3, '\t').collect();
if fields.len() != 3 {
return Err(Error::Index(format!(
"{}: row {} has {} fields, expected 3",
path.display(),
row + 1,
fields.len()
)));
}
ids.push(fields[0].to_string());
names.push(fields[1].to_string());
biotypes.push(fields[2].to_string());
}
if ids.len() != n_ge {
return Err(Error::Index(format!(
"{}: header says {} genes but found {}",
path.display(),
n_ge,
ids.len()
)));
}
Ok((ids, names, biotypes))
}
// ---------------------------------------------------------------------------
// Projection — STAR's `Transcriptome::quantAlign` + `alignToTranscript`.
// ---------------------------------------------------------------------------
/// Project a genome-space `Transcript` onto every transcript whose coordinates
/// fully contain the alignment. Returns transcript-space alignments ready for
/// BAM emission.
///
/// Mirrors STAR `Transcriptome_quantAlign.cpp`:
/// * binary-search `tr_starts_sorted` for the greatest `tr_start <=
/// align.genome_start`,
/// * walk back while `tr_end_max_sorted[i] >= align.genome_end`,
/// * for each candidate whose `[tr_start, tr_end]` fully contains the align,
/// call `align_to_one_transcript`.
///
/// `lread` is the total read length and is needed to flip read-space
/// coordinates on reverse-strand transcripts (`read_pos' = Lread - (read_pos +
/// len)`). For paired-end inputs pass the sum of per-mate lengths.
pub fn align_to_transcripts(
align: &Transcript,
idx: &TranscriptomeIndex,
lread: u32,
) -> Vec<Transcript> {
let mut out: Vec<Transcript> = Vec::new();
if idx.n_transcripts() == 0 || align.exons.is_empty() {
return out;
}
let a_start = align.genome_start;
let a_end = align.genome_end;
// Binary-search `tr_starts_sorted` for greatest tr_start <= a_start.
// partition_point returns the first index with tr_start > a_start; subtract 1.
let upper = idx.tr_starts_sorted.partition_point(|&s| s <= a_start);
if upper == 0 {
return out; // a_start is to the left of all transcripts
}
let mut sorted_i = upper - 1;
// Walk backwards while the running-max end still covers this alignment.
loop {
if idx.tr_end_max_sorted[sorted_i] < a_end {
break;
}
let tr_idx = idx.tr_order[sorted_i];
if idx.tr_chr_idx[tr_idx] == align.chr_idx
&& idx.tr_start[tr_idx] <= a_start
&& idx.tr_end[tr_idx] >= a_end
&& let Some(projected) = align_to_one_transcript(align, tr_idx, idx, lread)
{
out.push(projected);
}
if sorted_i == 0 {
break;
}
sorted_i -= 1;
}
out
}
/// Project a single alignment onto a single transcript. Returns `None` if the
/// alignment is inconsistent with the transcript's exon structure (block
/// extends past an exon boundary, splice boundary doesn't match a transcript
/// junction, etc.).
///
/// Direct port of STAR's `alignToTranscript` (see
/// `source/Transcriptome_quantAlign.cpp:5-89`).
fn align_to_one_transcript(
align: &Transcript,
tr_idx: usize,
idx: &TranscriptomeIndex,
lread: u32,
) -> Option<Transcript> {
let tr_exons = &idx.tr_exons[tr_idx];
let tr_strand = idx.tr_strand[tr_idx];
let tr_length = idx.tr_length[tr_idx];