Skip to content

Commit ed59a6c

Browse files
committed
feat(bam): populate @PG header with PN/VN/CL fields
Per SAM spec §1.3, the @PG line conventionally carries PN (program name), VN (version), and CL (command line) alongside ID. rustar was emitting only ID:rustar-aligner, leaving downstream provenance tools (MultiQC's program-version table, dx-toolkit lineage tracking) with a blank entry. Expand the header writer to emit: @PG\tID:rustar-aligner\tPN:rustar-aligner\tVN:<cargo pkg version>\tCL:<args>. The full command line is captured in main() before clap parses it, then threaded into Parameters via a new #[arg(skip)] field so it reaches the SAM header builder. Version comes from CARGO_PKG_VERSION at compile time. This matches STAR's @PG format and gives downstream tools the provenance they need. Fixes #33 (the @PG header gap; AS divergence is a separate item).
1 parent 70be24d commit ed59a6c

3 files changed

Lines changed: 85 additions & 4 deletions

File tree

src/io/sam.rs

Lines changed: 75 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ use noodles::sam::alignment::record_buf::data::field::value::Array;
1717
use noodles::sam::alignment::record_buf::{QualityScores, RecordBuf, Sequence};
1818
use noodles::sam::header::record::value::{
1919
Map,
20-
map::{Program, ReadGroup, tag::Other as HeaderOtherTag},
20+
map::{Program, ReadGroup, program::tag as program_tag, tag::Other as HeaderOtherTag},
2121
};
2222
use std::collections::HashSet;
2323
use std::fmt::Write as FmtWrite;
@@ -843,8 +843,23 @@ where
843843
builder = builder.add_read_group(id, map);
844844
}
845845

846-
// @PG line
847-
builder = builder.add_program("rustar-aligner", Map::<Program>::default());
846+
// @PG line. Per SAM spec §1.3, populate PN/VN/CL alongside ID so downstream
847+
// provenance tools (MultiQC program-version table, etc.) see a fully
848+
// populated record matching STAR's @PG format.
849+
let mut pg = Map::<Program>::default();
850+
pg.other_fields_mut()
851+
.insert(program_tag::NAME, BString::from("rustar-aligner"));
852+
pg.other_fields_mut().insert(
853+
program_tag::VERSION,
854+
BString::from(env!("CARGO_PKG_VERSION")),
855+
);
856+
let cl = params
857+
.command_line
858+
.clone()
859+
.unwrap_or_else(|| "rustar-aligner".to_string());
860+
pg.other_fields_mut()
861+
.insert(program_tag::COMMAND_LINE, BString::from(cl));
862+
builder = builder.add_program("rustar-aligner", pg);
848863

849864
Ok(builder.build())
850865
}
@@ -1373,6 +1388,63 @@ mod tests {
13731388
assert_eq!(header.reference_sequences().len(), 1);
13741389
}
13751390

1391+
#[test]
1392+
fn test_build_sam_header_pg_line_populated() {
1393+
// The @PG line must carry PN, VN and CL alongside ID per SAM spec §1.3,
1394+
// so downstream provenance tools (MultiQC etc.) get a non-blank entry.
1395+
let genome = make_test_genome();
1396+
let mut params = Parameters::parse_from(vec!["rustar-aligner", "--readFilesIn", "test.fq"]);
1397+
params.command_line =
1398+
Some("rustar-aligner --readFilesIn test.fq --runThreadN 4".to_string());
1399+
1400+
let header = build_sam_header(&genome, &params).unwrap();
1401+
let programs = header.programs().as_ref();
1402+
let pg = programs
1403+
.get(&b"rustar-aligner"[..])
1404+
.expect("@PG line with ID:rustar-aligner must be present");
1405+
1406+
let pn: &[u8] = pg
1407+
.other_fields()
1408+
.get(&program_tag::NAME)
1409+
.expect("PN field must be present")
1410+
.as_ref();
1411+
assert_eq!(pn, b"rustar-aligner");
1412+
1413+
let vn: &[u8] = pg
1414+
.other_fields()
1415+
.get(&program_tag::VERSION)
1416+
.expect("VN field must be present")
1417+
.as_ref();
1418+
assert_eq!(vn, env!("CARGO_PKG_VERSION").as_bytes());
1419+
1420+
let cl: &[u8] = pg
1421+
.other_fields()
1422+
.get(&program_tag::COMMAND_LINE)
1423+
.expect("CL field must be present")
1424+
.as_ref();
1425+
assert!(!cl.is_empty(), "CL field must be non-empty");
1426+
assert_eq!(cl, b"rustar-aligner --readFilesIn test.fq --runThreadN 4");
1427+
}
1428+
1429+
#[test]
1430+
fn test_build_sam_header_pg_line_default_cl_when_unset() {
1431+
// When command_line is None (e.g. tests, library use), fall back to
1432+
// the program name so CL is still non-empty.
1433+
let genome = make_test_genome();
1434+
let params = Parameters::parse_from(vec!["rustar-aligner", "--readFilesIn", "test.fq"]);
1435+
assert!(params.command_line.is_none());
1436+
1437+
let header = build_sam_header(&genome, &params).unwrap();
1438+
let programs = header.programs().as_ref();
1439+
let pg = programs.get(&b"rustar-aligner"[..]).unwrap();
1440+
let cl: &[u8] = pg
1441+
.other_fields()
1442+
.get(&program_tag::COMMAND_LINE)
1443+
.expect("CL field must be present even when command_line is None")
1444+
.as_ref();
1445+
assert!(!cl.is_empty());
1446+
}
1447+
13761448
#[test]
13771449
fn test_build_sam_header_with_rg() {
13781450
let genome = make_test_genome();

src/main.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ fn main() -> anyhow::Result<()> {
88

99
cpu::check_cpu_compat()?;
1010

11-
let params = Parameters::parse();
11+
let command_line = std::env::args().collect::<Vec<_>>().join(" ");
12+
let mut params = Parameters::parse();
13+
params.command_line = Some(command_line);
1214
rustar_aligner::run(&params)
1315
}

src/params.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -701,6 +701,13 @@ pub struct Parameters {
701701
/// Chimeric output type
702702
#[arg(long = "chimOutType", num_args = 1..=2, default_values_t = vec!["Junctions".to_string()])]
703703
pub chim_out_type: Vec<String>,
704+
705+
/// Full command line as invoked (captured in `main` before clap parsing).
706+
/// Not a CLI argument; populated programmatically and embedded in the
707+
/// BAM `@PG` `CL:` field for provenance. STAR captures the same string
708+
/// in `P.commandLineFull`.
709+
#[arg(skip)]
710+
pub command_line: Option<String>,
704711
}
705712

706713
impl Parameters {

0 commit comments

Comments
 (0)