diff --git a/workflow/rules/aatrnaseq-process.smk b/workflow/rules/aatrnaseq-process.smk index b2eb390..612f6b2 100644 --- a/workflow/rules/aatrnaseq-process.smk +++ b/workflow/rules/aatrnaseq-process.smk @@ -125,6 +125,7 @@ rule bwa_align: input: reads=get_alignment_fastq, idx=rules.bwa_idx.output, + ref_dict=rules.ref_dict.output, output: bam=maybe_temp( os.path.join(outdir, "bam", "aln", "{sample}", "{sample}.aln.bam") @@ -144,6 +145,16 @@ rule bwa_align: | samtools view -F 20 -Sb - \ | samtools sort -m 2G -@ 4 -o {output.bam} + # Embed reference M5 checksums into BAM header + {{ + samtools view -H {output.bam} | head -1 + grep '^@SQ' {input.ref_dict} + samtools view -H {output.bam} | grep -v -e '^@SQ' -e '^@HD' || true + }} > {output.bam}.header.sam + samtools reheader --no-PG {output.bam}.header.sam {output.bam} > {output.bam}.reheader + mv {output.bam}.reheader {output.bam} + rm {output.bam}.header.sam + samtools index {output.bam} """ diff --git a/workflow/rules/aatrnaseq-reference.smk b/workflow/rules/aatrnaseq-reference.smk index f09bcbf..792b07e 100644 --- a/workflow/rules/aatrnaseq-reference.smk +++ b/workflow/rules/aatrnaseq-reference.smk @@ -236,3 +236,19 @@ rule trim_reference: --adapter-3p "{params.adapter_3p}" \ 2>&1 | tee {log} """ + + +rule ref_dict: + """ + Generate sequence dictionary with MD5 checksums for embedding in BAM headers. + """ + input: + get_validated_reference(), + output: + os.path.join(outdir, "reference", "reference.dict"), + log: + os.path.join(outdir, "logs", "reference", "dict.log"), + shell: + """ + samtools dict {input} -o {output} 2> {log} + """